Message ID | 20220706205048.2492443-1-aryaman.gupta@windriver.com |
---|---|
State | Accepted, archived |
Commit | 502e05cbe67fb7a0e804dcc2cc0764a2e05c014f |
Headers | show |
Series | [v3] runqueue: add cpu/io pressure regulation | expand |
On Wed, Jul 6, 2022 at 01:51 PM, Aryaman Gupta wrote: > > Stop the scheduler from starting new tasks if the current cpu or io > pressure is above a certain threshold and there is at least one active > task. This threshold can be specified through the > "BB_PRESSURE_MAX_SOME_{CPU|IO}" variables in conf/local.conf. > > If the thresholds aren't specified, pressure is not monitored and hence > there is no impact on build times. > A value below the arbitrary lower limit of 1.0 results in a fatal error to avoid extremely > long builds. If the percentage limits are higher than 100, then the > values are rounded down to 100 and warnings are issued to inform users > that the specified limit is out of bounds. > > The current bitbake scheduling algorithm requires that at least one > task be active. This means that if high pressure is seen, then new tasks > will not be started and pressure will be checked only for as long as at > least one task is active. When there are no active tasks, an additional > task > will be started and pressure checking resumed. This behaviour means that > if an external source is causing the pressure to exceed the threshold, > bitbake will continue to make some progress towards the requested target. > This violates the intent of limiting pressure but, given the current > scheduling algorithm as described above, there seems to be no other > option. > In the case where only one bitbake build is running, the implications of > the scheduler requirement will likely result in pressure being higher > than the threshold. More work would be required to ensure that > the pressure threshold is never exceeded, for example adding pressure > monitoring to make/ninja. > > Signed-off-by: Aryaman Gupta <aryaman.gupta@windriver.com> > Signed-off-by: Randy Macleod <randy.macleod@windriver.com> > --- > * Changes in V3: > - Ensure that there is always at least one active task before monitoring > pressure. > - Fix formatting issues and make code more Pythonic. 
> > * Changes in V2: > - Replace subprocess() calls with open() > - Rename BB variables to BB_PRESSURE_MAX_SOME_{CPU|IO} > - Skip the checking of pressure when no value is provided. > > bitbake/lib/bb/runqueue.py | 48 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 48 insertions(+) > > diff --git a/bitbake/lib/bb/runqueue.py b/bitbake/lib/bb/runqueue.py > index 1e47fe70ef..eb3bea07a9 100644 > --- a/bitbake/lib/bb/runqueue.py > +++ b/bitbake/lib/bb/runqueue.py > @@ -159,6 +159,28 @@ class RunQueueScheduler(object): > self.buildable.append(tid) > > self.rev_prio_map = None > + # Some hosts like openSUSE have readable /proc/pressure files but throw > errors when these files are opened > + # and can't actually be read so don't check pressure in that case. > + if self.rq.max_cpu_pressure or self.rq.max_io_pressure: > + try: > + with open("/proc/pressure/cpu") as cpu_pressure_fds, > open("/proc/pressure/io") as io_pressure_fds: > + cpu_pressure_fds.read() > + io_pressure_fds.read() > + self.check_pressure = True > + except: > + bb.warn("The /proc/pressure files can't be read. Continuing build > without monitoring pressure") > + self.check_pressure = False > + else: > + self.check_pressure = False > + Can you extract the check into a standalone function returning a boolean? It would be more readable and maintainable. 
> > + def exceeds_max_pressure(self): > + if self.check_pressure: > + # Extract 'some avg10 percent values' from /proc/pressure/{cpu|io} > + with open("/proc/pressure/cpu") as cpu_pressure_fds, > open("/proc/pressure/io") as io_pressure_fds: > + curr_cpu_pressure = cpu_pressure_fds.readline().split()[1].split("=")[1] > > + curr_io_pressure = io_pressure_fds.readline().split()[1].split("=")[1] > + return float(curr_cpu_pressure) > self.rq.max_cpu_pressure or > float(curr_io_pressure) > self.rq.max_io_pressure > + return False > > def next_buildable_task(self): > """ > @@ -172,6 +194,12 @@ class RunQueueScheduler(object): > if not buildable: > return None > > + # Bitbake requires that at least one task be active. Only check for > pressure if > + # this is the case, otherwise the pressure limitation could result in no > tasks > + # being active and no new tasks started. > + if self.rq.stats.active and self.exceeds_max_pressure(): > + return None > + > # Filter out tasks that have a max number of threads that have been > exceeded > skip_buildable = {} > for running in self.rq.runq_running.difference(self.rq.runq_complete): > @@ -1699,6 +1727,8 @@ class RunQueueExecute: > > self.number_tasks = int(self.cfgData.getVar("BB_NUMBER_THREADS") or 1) > self.scheduler = self.cfgData.getVar("BB_SCHEDULER") or "speed" > + self.max_cpu_pressure = self.cfgData.getVar("BB_PRESSURE_MAX_SOME_CPU") > + self.max_io_pressure = self.cfgData.getVar("BB_PRESSURE_MAX_SOME_IO") > > self.sq_buildable = set() > self.sq_running = set() > @@ -1733,6 +1763,24 @@ class RunQueueExecute: > if self.number_tasks <= 0: > bb.fatal("Invalid BB_NUMBER_THREADS %s" % self.number_tasks) > > + lower_limit = 1.0 > + upper_limit = 100.0 > + if self.max_cpu_pressure: > + self.max_cpu_pressure = float(self.max_cpu_pressure) > + if self.max_cpu_pressure < lower_limit: > + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_CPU %s, minimum value is %s" % > (self.max_cpu_pressure, lower_limit)) > + if self.max_cpu_pressure > 
upper_limit: > + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_CPU %s rounded down to > %s" % (self.max_cpu_pressure, upper_limit)) > + self.max_cpu_pressure = upper_limit > + > + if self.max_io_pressure: > + self.max_io_pressure = float(self.max_io_pressure) > + if self.max_io_pressure < lower_limit: > + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_IO %s, minimum value is %s" % > (self.max_io_pressure, lower_limit)) > + if self.max_io_pressure > upper_limit: > + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_IO %s rounded down to > %s" % (self.max_io_pressure, upper_limit)) > + self.max_io_pressure = upper_limit > + > # List of setscene tasks which we've covered > self.scenequeue_covered = set() > # List of tasks which are covered (including setscene ones) > -- > 2.35.1
On 2022-07-09 07:22, Paulo Neves wrote: > On Wed, Jul 6, 2022 at 01:51 PM, Aryaman Gupta wrote: > > Stop the scheduler from starting new tasks if the current cpu or io > pressure is above a certain threshold and there is at least one active > task. This threshold can be specified through the > "BB_PRESSURE_MAX_SOME_{CPU|IO}" variables in conf/local.conf. > > If the thresholds aren't specified, pressure is not monitored and > hence > there is no impact on build times. > A value below the arbitrary lower limit of 1.0 results in a fatal error to avoid > extremely > long builds. If the percentage limits are higher than 100, then the > values are rounded down to 100 and warnings are issued to inform users > that the specified limit is out of bounds. > > The current bitbake scheduling algorithm requires that at least one > task be active. This means that if high pressure is seen, then new > tasks > will not be started and pressure will be checked only for as long > as at > least one task is active. When there are no active tasks, an > additional task > will be started and pressure checking resumed. This behaviour > means that > if an external source is causing the pressure to exceed the threshold, > bitbake will continue to make some progress towards the requested > target. > This violates the intent of limiting pressure but, given the current > scheduling algorithm as described above, there seems to be no > other option. > In the case where only one bitbake build is running, the > implications of > the scheduler requirement will likely result in pressure being higher > than the threshold. More work would be required to ensure that > the pressure threshold is never exceeded, for example adding pressure > monitoring to make/ninja. > > Signed-off-by: Aryaman Gupta <aryaman.gupta@windriver.com> > Signed-off-by: Randy Macleod <randy.macleod@windriver.com> > --- > * Changes in V3: > - Ensure that there is always at least one active task before > monitoring > pressure. 
> - Fix formatting issues and make code more Pythonic. > > * Changes in V2: > - Replace subprocess() calls with open() > - Rename BB variables to BB_PRESSURE_MAX_SOME_{CPU|IO} > - Skip the checking of pressure when no value is provided. > > bitbake/lib/bb/runqueue.py > <https://urldefense.com/v3/__http://runqueue.py__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvDNAxhxSw$> > | 48 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 48 insertions(+) > > diff --git a/bitbake/lib/bb/runqueue.py > <https://urldefense.com/v3/__http://runqueue.py__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvDNAxhxSw$> > b/bitbake/lib/bb/runqueue.py > <https://urldefense.com/v3/__http://runqueue.py__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvDNAxhxSw$> > index 1e47fe70ef..eb3bea07a9 100644 > --- a/bitbake/lib/bb/runqueue.py > <https://urldefense.com/v3/__http://runqueue.py__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvDNAxhxSw$> > +++ b/bitbake/lib/bb/runqueue.py > <https://urldefense.com/v3/__http://runqueue.py__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvDNAxhxSw$> > @@ -159,6 +159,28 @@ class RunQueueScheduler(object): > self.buildable.append(tid) > > self.rev_prio_map = None > + # Some hosts like openSUSE have readable /proc/pressure files > but throw errors when these files are opened > + # and can't actually be read so don't check pressure in that case. > + if self.rq.max_cpu_pressure or self.rq.max_io_pressure: > + try: > + with open("/proc/pressure/cpu") as cpu_pressure_fds, > open("/proc/pressure/io") as io_pressure_fds: > + cpu_pressure_fds.read() > + io_pressure_fds.read() > + self.check_pressure = True > + except: > + bb.warn("The /proc/pressure files can't be read. 
Continuing > build without monitoring pressure") > + self.check_pressure = False > + else: > + self.check_pressure = False > + > > Can you extract the check into a standalone function returning a > boolean? It would be more readable and maintainable. Good idea; Aryaman or I will implement it that way. Btw, we've seen, as expected, that using the avg10 is effective but tends to allow more work to start while pressure is ramping up from below threshold and delays starting work when the average is above threshold but declining. We've done some experiments with the 'some cpu total' value, which responds to changes almost instantaneously. We take the diff every second and if the delta exceeds a threshold, new work is not started (unless there is no active task). More to come in v4. Hopefully that'll be later this week pending some tests. It would be nice to teach make and ninja about /proc/pressure and it shouldn't be that difficult since they already monitor load when fed the -l flag. In the shorter term, we want to focus on understanding what's causing the high (> 15 second!) IO latency spikes on the Yocto autobuilder machines. eg: ../Randy > > + def exceeds_max_pressure(self): > + if self.check_pressure: > + # Extract 'some avg10 percent values' from /proc/pressure/{cpu|io} > + with open("/proc/pressure/cpu") as cpu_pressure_fds, > open("/proc/pressure/io") as io_pressure_fds: > + curr_cpu_pressure = > cpu_pressure_fds.readline().split()[1].split("=")[1] > + curr_io_pressure = > io_pressure_fds.readline().split()[1].split("=")[1] > + return float(curr_cpu_pressure) > self.rq.max_cpu_pressure or > float(curr_io_pressure) > self.rq.max_io_pressure > + return False > > def next_buildable_task(self): > """ > @@ -172,6 +194,12 @@ class RunQueueScheduler(object): > if not buildable: > return None > > + # Bitbake requires that at least one task be active. 
Only check > for pressure if > + # this is the case, otherwise the pressure limitation could > result in no tasks > + # being active and no new tasks started. > + if self.rq.stats.active > <https://urldefense.com/v3/__http://self.rq.stats.active__;!!AjveYdw8EvQ!f65av5f_qIYMyIUWYyaF0PNd1UCdScGSGjGxc9BqGXoUFyz4ri56lFcyZy2KW4fxnf385-4EHu_lmvA4pIS4TQ$> > and self.exceeds_max_pressure(): > + return None > + > # Filter out tasks that have a max number of threads that have > been exceeded > skip_buildable = {} > for running in self.rq.runq_running.difference(self.rq.runq_complete): > @@ -1699,6 +1727,8 @@ class RunQueueExecute: > > self.number_tasks = int(self.cfgData.getVar("BB_NUMBER_THREADS") or 1) > self.scheduler = self.cfgData.getVar("BB_SCHEDULER") or "speed" > + self.max_cpu_pressure = > self.cfgData.getVar("BB_PRESSURE_MAX_SOME_CPU") > + self.max_io_pressure = > self.cfgData.getVar("BB_PRESSURE_MAX_SOME_IO") > > self.sq_buildable = set() > self.sq_running = set() > @@ -1733,6 +1763,24 @@ class RunQueueExecute: > if self.number_tasks <= 0: > bb.fatal("Invalid BB_NUMBER_THREADS %s" % self.number_tasks) > > + lower_limit = 1.0 > + upper_limit = 100.0 > + if self.max_cpu_pressure: > + self.max_cpu_pressure = float(self.max_cpu_pressure) > + if self.max_cpu_pressure < lower_limit: > + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_CPU %s, minimum value is > %s" % (self.max_cpu_pressure, lower_limit)) > + if self.max_cpu_pressure > upper_limit: > + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_CPU %s rounded > down to %s" % (self.max_cpu_pressure, upper_limit)) > + self.max_cpu_pressure = upper_limit > + > + if self.max_io_pressure: > + self.max_io_pressure = float(self.max_io_pressure) > + if self.max_io_pressure < lower_limit: > + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_IO %s, minimum value is > %s" % (self.max_io_pressure, lower_limit)) > + if self.max_io_pressure > upper_limit: > + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_IO %s rounded > down to %s" % 
(self.max_io_pressure, upper_limit)) > + self.max_io_pressure = upper_limit > + > # List of setscene tasks which we've covered > self.scenequeue_covered = set() > # List of tasks which are covered (including setscene ones) > -- > 2.35.1 > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#13817):https://lists.openembedded.org/g/bitbake-devel/message/13817 > Mute This Topic:https://lists.openembedded.org/mt/92215602/3616765 > Group Owner:bitbake-devel+owner@lists.openembedded.org > Unsubscribe:https://lists.openembedded.org/g/bitbake-devel/unsub [randy.macleod@windriver.com] > -=-=-=-=-=-=-=-=-=-=-=- >
diff --git a/bitbake/lib/bb/runqueue.py b/bitbake/lib/bb/runqueue.py index 1e47fe70ef..eb3bea07a9 100644 --- a/bitbake/lib/bb/runqueue.py +++ b/bitbake/lib/bb/runqueue.py @@ -159,6 +159,28 @@ class RunQueueScheduler(object): self.buildable.append(tid) self.rev_prio_map = None + # Some hosts like openSUSE have readable /proc/pressure files but throw errors when these files are opened + # and can't actually be read so don't check pressure in that case. + if self.rq.max_cpu_pressure or self.rq.max_io_pressure: + try: + with open("/proc/pressure/cpu") as cpu_pressure_fds, open("/proc/pressure/io") as io_pressure_fds: + cpu_pressure_fds.read() + io_pressure_fds.read() + self.check_pressure = True + except: + bb.warn("The /proc/pressure files can't be read. Continuing build without monitoring pressure") + self.check_pressure = False + else: + self.check_pressure = False + + def exceeds_max_pressure(self): + if self.check_pressure: + # Extract 'some avg10 percent values' from /proc/pressure/{cpu|io} + with open("/proc/pressure/cpu") as cpu_pressure_fds, open("/proc/pressure/io") as io_pressure_fds: + curr_cpu_pressure = cpu_pressure_fds.readline().split()[1].split("=")[1] + curr_io_pressure = io_pressure_fds.readline().split()[1].split("=")[1] + return float(curr_cpu_pressure) > self.rq.max_cpu_pressure or float(curr_io_pressure) > self.rq.max_io_pressure + return False def next_buildable_task(self): """ @@ -172,6 +194,12 @@ class RunQueueScheduler(object): if not buildable: return None + # Bitbake requires that at least one task be active. Only check for pressure if + # this is the case, otherwise the pressure limitation could result in no tasks + # being active and no new tasks started. 
+ if self.rq.stats.active and self.exceeds_max_pressure(): + return None + # Filter out tasks that have a max number of threads that have been exceeded skip_buildable = {} for running in self.rq.runq_running.difference(self.rq.runq_complete): @@ -1699,6 +1727,8 @@ class RunQueueExecute: self.number_tasks = int(self.cfgData.getVar("BB_NUMBER_THREADS") or 1) self.scheduler = self.cfgData.getVar("BB_SCHEDULER") or "speed" + self.max_cpu_pressure = self.cfgData.getVar("BB_PRESSURE_MAX_SOME_CPU") + self.max_io_pressure = self.cfgData.getVar("BB_PRESSURE_MAX_SOME_IO") self.sq_buildable = set() self.sq_running = set() @@ -1733,6 +1763,24 @@ class RunQueueExecute: if self.number_tasks <= 0: bb.fatal("Invalid BB_NUMBER_THREADS %s" % self.number_tasks) + lower_limit = 1.0 + upper_limit = 100.0 + if self.max_cpu_pressure: + self.max_cpu_pressure = float(self.max_cpu_pressure) + if self.max_cpu_pressure < lower_limit: + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_CPU %s, minimum value is %s" % (self.max_cpu_pressure, lower_limit)) + if self.max_cpu_pressure > upper_limit: + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_CPU %s rounded down to %s" % (self.max_cpu_pressure, upper_limit)) + self.max_cpu_pressure = upper_limit + + if self.max_io_pressure: + self.max_io_pressure = float(self.max_io_pressure) + if self.max_io_pressure < lower_limit: + bb.fatal("Invalid BB_PRESSURE_MAX_SOME_IO %s, minimum value is %s" % (self.max_io_pressure, lower_limit)) + if self.max_io_pressure > upper_limit: + bb.warn("Percentage value of BB_PRESSURE_MAX_SOME_IO %s rounded down to %s" % (self.max_io_pressure, upper_limit)) + self.max_io_pressure = upper_limit + # List of setscene tasks which we've covered self.scenequeue_covered = set() # List of tasks which are covered (including setscene ones)