diff mbox series

[4/4] event/utils: Avoid deadlock from lock_timeout() and recursive events

Message ID 20250303130628.1656131-4-richard.purdie@linuxfoundation.org
State Accepted, archived
Commit 2c590ff1aff89d23b25ce808650f200013a1e6af
Headers show
Series [1/4] utils: Print information about lock issue before exiting | expand

Commit Message

Richard Purdie March 3, 2025, 1:06 p.m. UTC
We've been seeing intermittent failures on Ubuntu 22.04 in oe-selftest which
were problematic to debug. The failure was inside lock_timeout and once that was
identified and the backtrace obtained, the problem becomes clearer:

  File "X/bitbake/lib/bb/server/process.py", line 466, in idle_thread_internal
    retval = function(self, data, False)
  File "X/bitbake/lib/bb/command.py", line 123, in runAsyncCommand
    self.cooker.updateCache()
  File "X/bitbake/lib/bb/cooker.py", line 1629, in updateCache
    self.parser = CookerParser(self, mcfilelist, total_masked)
  File "X/bitbake/lib/bb/cooker.py", line 2141, in __init__
    self.bb_caches = bb.cache.MulticonfigCache(self.cfgbuilder, self.cfghash, cooker.caches_array)
  File "X/bitbake/lib/bb/cache.py", line 772, in __init__
    loaded += c.prepare_cache(progress)
  File "X/bitbake/lib/bb/cache.py", line 435, in prepare_cache
    loaded = self.load_cachefile(progress)
  File "X/bitbake/lib/bb/cache.py", line 516, in load_cachefile
    progress(cachefile.tell() + previous_progress)
  File "X/bitbake/lib/bb/cache.py", line 751, in progress
    bb.event.fire(bb.event.CacheLoadProgress(current_progress, cachesize),
  File "X/bitbake/lib/bb/event.py", line 234, in fire
    fire_ui_handlers(event, d)
  File "X/bitbake/lib/bb/event.py", line 210, in fire_ui_handlers
    _ui_handlers[h].event.send(event)
  File "X/bitbake/lib/bb/cooker.py", line 117, in send
    str_event = codecs.encode(pickle.dumps(event), \'base64\').decode(\'utf-8\')
  File "/usr/lib/python3.10/asyncio/sslproto.py", line 320, in __del__
    _warn(f"unclosed transport {self!r}", ResourceWarning, source=self)
  File "/usr/lib/python3.10/warnings.py", line 109, in _showwarnmsg
    sw(msg.message, msg.category, msg.filename, msg.lineno,
  File "X/bitbake/lib/bb/main.py", line 113, in _showwarning
    warnlog.warning(s)
  File "/usr/lib/python3.10/logging/__init__.py", line 1489, in warning
    self._log(WARNING, msg, args, **kwargs)
  File "/usr/lib/python3.10/logging/__init__.py", line 1624, in _log
    self.handle(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 1634, in handle
    self.callHandlers(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 1696, in callHandlers
    hdlr.handle(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 968, in handle
    self.emit(record)
  File "X/bitbake/lib/bb/event.py", line 778, in emit
    fire(record, None)
  File "X/bitbake/lib/bb/event.py", line 234, in fire
    fire_ui_handlers(event, d)
  File "X/bitbake/lib/bb/event.py", line 197, in fire_ui_handlers
    with bb.utils.lock_timeout(_thread_lock):
  File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
    return next(self.gen)
  File "X/bitbake/lib/bb/utils.py", line 1888, in lock_timeout
    bb.server.process.serverlog("Couldn\'t get the lock for 5 mins, timed out, exiting. %s" % traceback.format_stack())

or put in simpler terms, whilst sending an event(), an unrelated warning
message happens to be triggered from asyncio:

/usr/lib/python3.10/asyncio/sslproto.py:320: ResourceWarning: unclosed transport <asyncio.sslproto._SSLProtocolTransport object at 0x7f0e797d3100>

which triggers a second event() which can't be sent as we're already
in the critcal section and already hold the lock.

That warning is due to the version of asyncio used on Ubuntu 22.04 with
python 3.10 and that comined with timing issues explains why we don't
see it on other python versions or distros.

We can't handle the second event as the lock is there to serialise the
events. Instead, we queue the event and then process the queue later.

Add a new version of lock_timeout which allows us to handle the situation
more gracefully.

Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
---
 lib/bb/event.py | 10 +++++++++-
 lib/bb/utils.py | 15 +++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

Comments

Joshua Watt March 4, 2025, 4:21 p.m. UTC | #1
On Mon, Mar 3, 2025 at 6:06 AM Richard Purdie via
lists.openembedded.org
<richard.purdie=linuxfoundation.org@lists.openembedded.org> wrote:
>
> We've been seeing intermittent failures on Ubuntu 22.04 in oe-selftest which
> were problematic to debug. The failure was inside lock_timeout and once that was
> identified and the backtrace obtained, the problem becomes clearer:
>
>   File "X/bitbake/lib/bb/server/process.py", line 466, in idle_thread_internal
>     retval = function(self, data, False)
>   File "X/bitbake/lib/bb/command.py", line 123, in runAsyncCommand
>     self.cooker.updateCache()
>   File "X/bitbake/lib/bb/cooker.py", line 1629, in updateCache
>     self.parser = CookerParser(self, mcfilelist, total_masked)
>   File "X/bitbake/lib/bb/cooker.py", line 2141, in __init__
>     self.bb_caches = bb.cache.MulticonfigCache(self.cfgbuilder, self.cfghash, cooker.caches_array)
>   File "X/bitbake/lib/bb/cache.py", line 772, in __init__
>     loaded += c.prepare_cache(progress)
>   File "X/bitbake/lib/bb/cache.py", line 435, in prepare_cache
>     loaded = self.load_cachefile(progress)
>   File "X/bitbake/lib/bb/cache.py", line 516, in load_cachefile
>     progress(cachefile.tell() + previous_progress)
>   File "X/bitbake/lib/bb/cache.py", line 751, in progress
>     bb.event.fire(bb.event.CacheLoadProgress(current_progress, cachesize),
>   File "X/bitbake/lib/bb/event.py", line 234, in fire
>     fire_ui_handlers(event, d)
>   File "X/bitbake/lib/bb/event.py", line 210, in fire_ui_handlers
>     _ui_handlers[h].event.send(event)
>   File "X/bitbake/lib/bb/cooker.py", line 117, in send
>     str_event = codecs.encode(pickle.dumps(event), \'base64\').decode(\'utf-8\')
>   File "/usr/lib/python3.10/asyncio/sslproto.py", line 320, in __del__
>     _warn(f"unclosed transport {self!r}", ResourceWarning, source=self)
>   File "/usr/lib/python3.10/warnings.py", line 109, in _showwarnmsg
>     sw(msg.message, msg.category, msg.filename, msg.lineno,
>   File "X/bitbake/lib/bb/main.py", line 113, in _showwarning
>     warnlog.warning(s)
>   File "/usr/lib/python3.10/logging/__init__.py", line 1489, in warning
>     self._log(WARNING, msg, args, **kwargs)
>   File "/usr/lib/python3.10/logging/__init__.py", line 1624, in _log
>     self.handle(record)
>   File "/usr/lib/python3.10/logging/__init__.py", line 1634, in handle
>     self.callHandlers(record)
>   File "/usr/lib/python3.10/logging/__init__.py", line 1696, in callHandlers
>     hdlr.handle(record)
>   File "/usr/lib/python3.10/logging/__init__.py", line 968, in handle
>     self.emit(record)
>   File "X/bitbake/lib/bb/event.py", line 778, in emit
>     fire(record, None)
>   File "X/bitbake/lib/bb/event.py", line 234, in fire
>     fire_ui_handlers(event, d)
>   File "X/bitbake/lib/bb/event.py", line 197, in fire_ui_handlers
>     with bb.utils.lock_timeout(_thread_lock):
>   File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
>     return next(self.gen)
>   File "X/bitbake/lib/bb/utils.py", line 1888, in lock_timeout
>     bb.server.process.serverlog("Couldn\'t get the lock for 5 mins, timed out, exiting. %s" % traceback.format_stack())
>
> or put in simpler terms, whilst sending an event(), an unrelated warning
> message happens to be triggered from asyncio:
>
> /usr/lib/python3.10/asyncio/sslproto.py:320: ResourceWarning: unclosed transport <asyncio.sslproto._SSLProtocolTransport object at 0x7f0e797d3100>
>
> which triggers a second event() which can't be sent as we're already
> in the critcal section and already hold the lock.
>
> That warning is due to the version of asyncio used on Ubuntu 22.04 with
> python 3.10 and that comined with timing issues explains why we don't
> see it on other python versions or distros.
>
> We can't handle the second event as the lock is there to serialise the
> events. Instead, we queue the event and then process the queue later.
>
> Add a new version of lock_timeout which allows us to handle the situation
> more gracefully.
>
> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
> ---
>  lib/bb/event.py | 10 +++++++++-
>  lib/bb/utils.py | 15 +++++++++++++++
>  2 files changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/lib/bb/event.py b/lib/bb/event.py
> index e9834d01c7..da07a1dd64 100644
> --- a/lib/bb/event.py
> +++ b/lib/bb/event.py
> @@ -194,7 +194,12 @@ def fire_ui_handlers(event, d):
>          ui_queue.append(event)
>          return
>
> -    with bb.utils.lock_timeout(_thread_lock):
> +    with bb.utils.lock_timeout_nocheck(_thread_lock) as lock:
> +        if not lock:
> +            # If we can't get the lock, we may be recursively called, queue and return
> +            ui_queue.append(event)
> +            return
> +
>          errors = []
>          for h in _ui_handlers:
>              #print "Sending event %s" % event
> @@ -213,6 +218,9 @@ def fire_ui_handlers(event, d):
>          for h in errors:
>              del _ui_handlers[h]
>
> +    while ui_queue:
> +        fire_ui_handlers(ui_queue.pop(), d)
> +
>  def fire(event, d):
>      """Fire off an Event"""
>
> diff --git a/lib/bb/utils.py b/lib/bb/utils.py
> index 3a4b29181e..5486f9599d 100644
> --- a/lib/bb/utils.py
> +++ b/lib/bb/utils.py
> @@ -1879,6 +1879,9 @@ def path_is_descendant(descendant, ancestor):
>  # If we don't have a timeout of some kind and a process/thread exits badly (for example
>  # OOM killed) and held a lock, we'd just hang in the lock futex forever. It is better
>  # we exit at some point than hang. 5 minutes with no progress means we're probably deadlocked.
> +# This function can still deadlock python since it can't signal the other threads to exit
> +# (signals are handled in the main thread) and even os._exit() will wait on non-daemon threads
> +# to exit.
>  @contextmanager
>  def lock_timeout(lock):
>      try:
> @@ -1891,3 +1894,15 @@ def lock_timeout(lock):
>      finally:
>          lock.release()
>          signal.pthread_sigmask(signal.SIG_SETMASK, s)
> +
> +# A version of lock_timeout without the check that the lock was locked and a shorter timeout
> +@contextmanager
> +def lock_timeout_nocheck(lock):
> +    try:
> +        s = signal.pthread_sigmask(signal.SIG_BLOCK, signal.valid_signals())
> +        l = lock.acquire(timeout=10)

If either of these lines raises an exception, the finally will fail.
You probably want something like:

s = signal.pthread_sigmask(...)
try:
   l = lock.acquire(...)
   try:
       ...

> +        yield l
> +    finally:
> +        if l:
> +            lock.release()
> +        signal.pthread_sigmask(signal.SIG_SETMASK, s)
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#17372): https://lists.openembedded.org/g/bitbake-devel/message/17372
> Mute This Topic: https://lists.openembedded.org/mt/111487229/3616693
> Group Owner: bitbake-devel+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/bitbake-devel/unsub [JPEWhacker@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
diff mbox series

Patch

diff --git a/lib/bb/event.py b/lib/bb/event.py
index e9834d01c7..da07a1dd64 100644
--- a/lib/bb/event.py
+++ b/lib/bb/event.py
@@ -194,7 +194,12 @@  def fire_ui_handlers(event, d):
         ui_queue.append(event)
         return
 
-    with bb.utils.lock_timeout(_thread_lock):
+    with bb.utils.lock_timeout_nocheck(_thread_lock) as lock:
+        if not lock:
+            # If we can't get the lock, we may be recursively called, queue and return
+            ui_queue.append(event)
+            return
+
         errors = []
         for h in _ui_handlers:
             #print "Sending event %s" % event
@@ -213,6 +218,9 @@  def fire_ui_handlers(event, d):
         for h in errors:
             del _ui_handlers[h]
 
+    while ui_queue:
+        fire_ui_handlers(ui_queue.pop(), d)
+
 def fire(event, d):
     """Fire off an Event"""
 
diff --git a/lib/bb/utils.py b/lib/bb/utils.py
index 3a4b29181e..5486f9599d 100644
--- a/lib/bb/utils.py
+++ b/lib/bb/utils.py
@@ -1879,6 +1879,9 @@  def path_is_descendant(descendant, ancestor):
 # If we don't have a timeout of some kind and a process/thread exits badly (for example
 # OOM killed) and held a lock, we'd just hang in the lock futex forever. It is better
 # we exit at some point than hang. 5 minutes with no progress means we're probably deadlocked.
+# This function can still deadlock python since it can't signal the other threads to exit
+# (signals are handled in the main thread) and even os._exit() will wait on non-daemon threads
+# to exit.
 @contextmanager
 def lock_timeout(lock):
     try:
@@ -1891,3 +1894,15 @@  def lock_timeout(lock):
     finally:
         lock.release()
         signal.pthread_sigmask(signal.SIG_SETMASK, s)
+
+# A version of lock_timeout without the check that the lock was locked and a shorter timeout
+@contextmanager
+def lock_timeout_nocheck(lock):
+    try:
+        s = signal.pthread_sigmask(signal.SIG_BLOCK, signal.valid_signals())
+        l = lock.acquire(timeout=10)
+        yield l
+    finally:
+        if l:
+            lock.release()
+        signal.pthread_sigmask(signal.SIG_SETMASK, s)