Message ID | 20250702222437.3733042-1-richard.purdie@linuxfoundation.org |
---|---|
State | New |
Headers | show |
Series | [1/2] cooker: Try and avoid parsing hangs | expand |
LGTM Reviewed-by: Joshua Watt <JPEWhacker@gmail.com> On Wed, Jul 2, 2025 at 4:24 PM Richard Purdie via lists.openembedded.org <richard.purdie=linuxfoundation.org@lists.openembedded.org> wrote: > > We sometimes see hangs in parsing during automated testing. It appears that > SIGINT was sent to the underlying processes which see KeyboardInterrupt but > they're stuck trying to write into the results pipe. The SIGINT was probably > from some kind of parsing failure which doesn't happen often, hence the hang > being rare (in the incompatible license selftests from OE). > > This patch: > * sets a flag to indicate exit upon SIGINT so the exit is more graceful > and a defined exit path > * empties the results queue after we send the quit event > * empties the results queue after the SIGINT for good measure > * increases the 0.5s timeout to 2s since we now have some very slow to > parse recipes due to class extensions (ptests) > > This should hopefully make the parsing failure codepaths more robust. 
> > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org> > --- > lib/bb/cooker.py | 24 +++++++++++++++--------- > 1 file changed, 15 insertions(+), 9 deletions(-) > > diff --git a/lib/bb/cooker.py b/lib/bb/cooker.py > index 1810bcc6049..91e3ee025ea 100644 > --- a/lib/bb/cooker.py > +++ b/lib/bb/cooker.py > @@ -2009,6 +2009,7 @@ class Parser(multiprocessing.Process): > self.queue_signals = False > self.signal_received = [] > self.signal_threadlock = threading.Lock() > + self.exit = False > > def catch_sig(self, signum, frame): > if self.queue_signals: > @@ -2021,7 +2022,7 @@ class Parser(multiprocessing.Process): > signal.signal(signal.SIGTERM, signal.SIG_DFL) > os.kill(os.getpid(), signal.SIGTERM) > elif signum == signal.SIGINT: > - signal.default_int_handler(signum, frame) > + self.exit = True > > def run(self): > > @@ -2059,7 +2060,7 @@ class Parser(multiprocessing.Process): > pending = [] > havejobs = True > try: > - while havejobs or pending: > + while (havejobs or pending) and not self.exit: > if self.quit.is_set(): > break > > @@ -2196,11 +2197,12 @@ class CookerParser(object): > > # Cleanup the queue before call process.join(), otherwise there might be > # deadlocks. 
> - while True: > - try: > - self.result_queue.get(timeout=0.25) > - except queue.Empty: > - break > + def read_results(): > + while True: > + try: > + self.result_queue.get(timeout=0.25) > + except queue.Empty: > + break > > def sync_caches(): > for c in self.bb_caches.values(): > @@ -2212,15 +2214,19 @@ class CookerParser(object): > > self.parser_quit.set() > > + read_results() > + > for process in self.processes: > - process.join(0.5) > + process.join(2) > > for process in self.processes: > if process.exitcode is None: > os.kill(process.pid, signal.SIGINT) > > + read_results() > + > for process in self.processes: > - process.join(0.5) > + process.join(2) > > for process in self.processes: > if process.exitcode is None: > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#17737): https://lists.openembedded.org/g/bitbake-devel/message/17737 > Mute This Topic: https://lists.openembedded.org/mt/113957082/3616693 > Group Owner: bitbake-devel+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/bitbake-devel/unsub [JPEWhacker@gmail.com] > -=-=-=-=-=-=-=-=-=-=-=- >
diff --git a/lib/bb/cooker.py b/lib/bb/cooker.py index 1810bcc6049..91e3ee025ea 100644 --- a/lib/bb/cooker.py +++ b/lib/bb/cooker.py @@ -2009,6 +2009,7 @@ class Parser(multiprocessing.Process): self.queue_signals = False self.signal_received = [] self.signal_threadlock = threading.Lock() + self.exit = False def catch_sig(self, signum, frame): if self.queue_signals: @@ -2021,7 +2022,7 @@ class Parser(multiprocessing.Process): signal.signal(signal.SIGTERM, signal.SIG_DFL) os.kill(os.getpid(), signal.SIGTERM) elif signum == signal.SIGINT: - signal.default_int_handler(signum, frame) + self.exit = True def run(self): @@ -2059,7 +2060,7 @@ class Parser(multiprocessing.Process): pending = [] havejobs = True try: - while havejobs or pending: + while (havejobs or pending) and not self.exit: if self.quit.is_set(): break @@ -2196,11 +2197,12 @@ class CookerParser(object): # Cleanup the queue before call process.join(), otherwise there might be # deadlocks. - while True: - try: - self.result_queue.get(timeout=0.25) - except queue.Empty: - break + def read_results(): + while True: + try: + self.result_queue.get(timeout=0.25) + except queue.Empty: + break def sync_caches(): for c in self.bb_caches.values(): @@ -2212,15 +2214,19 @@ class CookerParser(object): self.parser_quit.set() + read_results() + for process in self.processes: - process.join(0.5) + process.join(2) for process in self.processes: if process.exitcode is None: os.kill(process.pid, signal.SIGINT) + read_results() + for process in self.processes: - process.join(0.5) + process.join(2) for process in self.processes: if process.exitcode is None:
We sometimes see hangs in parsing during automated testing. It appears that SIGINT was sent to the underlying processes which see KeyboardInterrupt but they're stuck trying to write into the results pipe. The SIGINT was probably from some kind of parsing failure which doesn't happen often, hence the hang being rare (in the incompatible license selftests from OE). This patch: * sets a flag to indicate exit upon SIGINT so the exit is more graceful and follows a defined exit path * empties the results queue after we send the quit event * empties the results queue after the SIGINT for good measure * increases the 0.5s timeout to 2s since we now have some very slow to parse recipes due to class extensions (ptests) This should hopefully make the parsing failure codepaths more robust. Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org> --- lib/bb/cooker.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-)