Message ID | 20230523020905.2911543-1-Qi.Chen@windriver.com
State      | New
Series     | [bitbake-devel,V2] runqueue.py: fix PSI check logic
On 2023-05-22 22:09, Qi.Chen@windriver.com wrote:
> From: Chen Qi <Qi.Chen@windriver.com>
>
> The current logic is not correct, because if the time interval
> between the current check and the last check is very small, the PSI
> checker is not likely to block things even if the system is heavily
> loaded.
>
> It's not good to calculate the value too often, so we change to a 1s
> check. As a build will usually take at least minutes, using the 1s
> value seems reasonable.

I don't have time to check today, but I'm not convinced that this is true.

I suspect that in the typical case it would be okay, but there will be
workloads (perhaps rare ones) where you want to check more often.

The 1-second intervals where we no longer launch new work may result in a
significantly longer build. IIRC, our benchmark was building
core-image-minimal on a 24-core system without PSI regulation and then
with it set to 300. I'd look at the elapsed time and the
buildchart/bootchart:
scripts/pybootchartgui/README.pybootchart

Ola, have you been able to test this and if not, can you?

All for today.

../Randy

> Signed-off-by: Chen Qi <Qi.Chen@windriver.com>
> ---
>  bitbake/lib/bb/runqueue.py | 29 ++++++++++++++---------------
>  1 file changed, 14 insertions(+), 15 deletions(-)
>
> [patch snipped; the full diff appears at the end of this page]
On 5/24/23 07:24, Randy MacLeod wrote:
> On 2023-05-22 22:09, Qi.Chen@windriver.com wrote:
>> [commit message snipped]
>
> I don't have time to check today, but I'm not convinced that this is true.

I'd like to give two reasons to try to convince you :)

1. Think of this as a kind of sampling rate on the PSI data. Picture the
PSI data as a wave that we sample. Only in a situation where the PSI data
changes very rapidly (high -> low and low -> high) on a time scale of less
than 1s (e.g., 0.1s) would we want to check more often. I don't think any
real-world workload behaves like that.

2. The build time is typically related to the number of tasks. For
example, a core-image-minimal build involves about 2000+ tasks and takes
about 20 minutes (1200s) to finish. In that case, we get about 1000+
checks over the whole build for 2000+ tasks, which seems like plenty.

> I suspect that in the typical case it would be okay, but there will be
> workloads (perhaps rare ones) where you want to check more often.
>
> The 1-second intervals where we no longer launch new work may result in
> a significantly longer build. IIRC, our benchmark was building
> core-image-minimal on a 24-core system without PSI regulation and then
> with it set to 300.

The reason we currently have to use a very low number (e.g., 300;
300/1000000 = 0.03%) to achieve the effect of restricting task spawning is
the problem Ola pointed out: even if a task is blocked by one check, it is
not blocked by the next check if that check happens soon enough. We need
to apply this patch, or something similar, before we can start comparing
the data.

Maybe 5% (50,000) or 10% (100,000) would serve as a start?

Regards,
Qi

> I'd look at the elapsed time and the buildchart/bootchart:
> scripts/pybootchartgui/README.pybootchart
>
> Ola, have you been able to test this and if not, can you?
>
> [patch snipped]
On 2023-05-23 22:18, ChenQi wrote:
> On 5/24/23 07:24, Randy MacLeod wrote:
>> [snipped]
>
> I'd like to give two reasons to try to convince you :)
>
> 1. Think of this as a kind of sampling rate on the PSI data. Picture the
> PSI data as a wave that we sample. Only in a situation where the PSI
> data changes very rapidly (high -> low and low -> high) on a time scale
> of less than 1s (e.g., 0.1s) would we want to check more often. I don't
> think any real-world workload behaves like that.

Hmm, really?
https://photos.app.goo.gl/PDTRkGQ4LMnz8Qw26

You can see that the pressure does change quickly. Remember, some people
describe bitbake as a somewhat controlled fork-bomb!

> 2. The build time is typically related to the number of tasks. For
> example, a core-image-minimal build involves about 2000+ tasks and
> takes about 20 minutes (1200s) to finish. In that case, we get about
> 1000+ checks over the whole build for 2000+ tasks, which seems like
> plenty.

Sure, but task creation is not at all uniformly distributed over that time.

❯ pwd
/home/rmacleod/wrs/yocto/summit/22.11/build-time/buildstats
❯ find 20221125231705 -type f | wc -l
4379

And see the graph of the attached data below.

> The reason we currently have to use a very low number (e.g., 300;
> 300/1000000 = 0.03%) to achieve the effect of restricting task spawning
> is the problem Ola pointed out: even if a task is blocked by one check,
> it is not blocked by the next check if that check happens soon enough.
> We need to apply this patch, or something similar, before we can start
> comparing the data.

Maybe. Remember that the pressure is a lagging indicator of resource
contention. The goal is to have the system come up to 100% utilization but
not enter the contention state. The main reason we need low limits is that
the tasks can introduce lots of load to an already busy system, and the
recipe build programs (make, ninja, ...) do not check /proc/pressure.
Zheng and I started on that but his term finished. Paul Smith, who
maintains gmake, suggested that I take a closer look at the "make -l"
algorithm, which has been improved recently, but I haven't been able to
take time to work on that.

> Maybe 5% (50,000) or 10% (100,000) would serve as a start?

These are good ideas and may even be what we go with, but what they lack
is data to back them up. I have a core-image-minimal build starting now on
a 24-core machine similar to the one I used for the data above.

I'll do some builds with and without your patch and get back to you,
hopefully tomorrow, but if not, next week, since I'm taking Friday as a
holiday.

../Randy

> [patch snipped]

--
# Randy MacLeod
# Wind River Linux

[attached data: task starts per second over a ~1700 s build, as
"second, count" pairs, e.g. "19, 120" means 120 tasks started during
second 19; the full list of several hundred pairs is omitted here]
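The attached data reads as "second, count" pairs: a histogram of task starts per second, which is what makes the non-uniform distribution visible. Assuming the usual buildstats layout, where each per-task file records a "Started:" line with an epoch timestamp, a similar histogram can be generated with a short script (a sketch; the directory layout and field name are assumptions based on typical buildstats output):

    import os
    import sys
    from collections import Counter

    def task_start_histogram(buildstats_dir):
        """Count task starts in each one-second bucket, relative to the first start."""
        starts = []
        for root, _, files in os.walk(buildstats_dir):
            for name in files:
                try:
                    with open(os.path.join(root, name)) as f:
                        for line in f:
                            if line.startswith("Started:"):
                                starts.append(float(line.split(":", 1)[1]))
                                break
                except (OSError, UnicodeDecodeError, ValueError):
                    continue    # skip files that are not task buildstats
        if not starts:
            return []
        t0 = min(starts)
        return sorted(Counter(int(t - t0) for t in starts).items())

    if __name__ == "__main__":
        for second, count in task_start_histogram(sys.argv[1]):
            print("%d, %d" % (second, count))

Run against a buildstats directory such as the 20221125231705 one above, this would print pairs in the same "second, count" form as the attachment.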
On Tue, May 23 2023, Randy MacLeod via lists.openembedded.org wrote:
> On 2023-05-22 22:09, Qi.Chen@windriver.com wrote:
>> [commit message snipped]
>
> I don't have time to check today, but I'm not convinced that this is true.
>
> I suspect that in the typical case it would be okay, but there will be
> workloads (perhaps rare ones) where you want to check more often.
>
> The 1-second intervals where we no longer launch new work may result in
> a significantly longer build. IIRC, our benchmark was building
> core-image-minimal on a 24-core system without PSI regulation and then
> with it set to 300. I'd look at the elapsed time and the
> buildchart/bootchart:
> scripts/pybootchartgui/README.pybootchart
>
> Ola, have you been able to test this and if not, can you?

This patch is identical to the V1 except for the commit message, so I
consider it

Tested-By: Ola Nilsson <ola.x.nilsson@axis.com>

/Ola

> All for today.
>
> ../Randy
>
> [patch snipped]
On 2023-05-25 04:17, Ola x Nilsson wrote:
> On Tue, May 23 2023, Randy MacLeod via lists.openembedded.org wrote:
>> [snipped]
>>
>> Ola, have you been able to test this and if not, can you?
>
> This patch is identical to the V1 except for the commit message, so I
> consider it
>
> Tested-By: Ola Nilsson <ola.x.nilsson@axis.com>
>
> /Ola

Richard and Alex were waiting to merge this, but I think we need some
indication that this produces a better outcome than before the patch.

I've pencilled in some time next Tuesday to do that, but we're working on
a release, so higher-priority work may interfere.

Qi and I will get to this eventually and reply here or with a new patch.

../Randy

> [patch snipped]
On 2023-06-08 09:25, Randy MacLeod via lists.openembedded.org wrote:
> On 2023-05-25 04:17, Ola x Nilsson wrote:
>> [snipped]
>>
>> Tested-By: Ola Nilsson <ola.x.nilsson@axis.com>
>
> Richard and Alex were waiting to merge this, but I think we need some
> indication that this produces a better outcome than before the patch.
>
> I've pencilled in some time next Tuesday to do that, but we're working
> on a release, so higher-priority work may interfere.
>
> Qi and I will get to this eventually and reply here or with a new patch.

This change was merged back in April, despite my comments above.

We're still working towards a product milestone later this month, so my
time to look into this type of problem is still limited - sigh.

Recently some people have noticed slow builds when pressure regulation is
enabled, but I'm not sure if this change is the root cause. I'll be doing
some builds to test that idea out. Has anyone else noticed very long build
times that are, say, > 2x the expected time? I saw some unexpectedly slow
builds on a new laptop two weeks ago, but then went on vacation and I
haven't gotten back to review the situation.

If you look at the buildstats from a recent slow YP autobuilder build:

https://autobuilder.yoctoproject.org/typhoon/#/builders/100/builds/4707
https://autobuilder.yocto.io/pub/failed-builds-data/slow-buildstats.tgz

$ .../poky.git/scripts/pybootchartgui/pybootchartgui.py 20230710094248/

you can see a staircase effect of a group of tasks being started and then
the system waiting for a while (1). While that's not unexpected, I don't
recall the effect being so significant.

Furthermore, in the middle ~1/3 of the build (2), very few tasks are
started and yet the CPU usage and contention seem to remain high. This
effect could be due to a kirkstone build running and the current algorithm
being nicer than what is in the older branches.

Also, we're going to add some logging around pressure regulation, but
it'll have to be written to ensure that the logs aren't flooded.

../Randy

1) build staircase effect: [chart not included]

2) 50,000 m view, with not much happening in the middle of the build:
[chart not included]

> [quoted thread and patch snipped]
On 2023-07-10 19:22, Randy MacLeod wrote:
> On 2023-06-08 09:25, Randy MacLeod via lists.openembedded.org wrote:
>> [snipped]
>
> This change was merged back in April, despite my comments above.
>
> [snipped]
>
> Furthermore, in the middle ~1/3 of the build (2), very few tasks are
> started and yet the CPU usage and contention seem to remain high. This
> effect could be due to a kirkstone build running and the current
> algorithm being nicer than what is in the older branches.

This middle part of the build is only running do_package_qa tasks.

The individual QA runs don't seem to take long to complete, but the CPU
usage and contention do seem to be high, so perhaps the pressure
regulation is limiting the launch of new tasks even when the system could
do more work.

Logging will help, so I'll work on that tomorrow if I can find the time.

../Randy

> Also, we're going to add some logging around pressure regulation, but
> it'll have to be written to ensure that the logs aren't flooded.
>
> [quoted thread, patch, and mailing-list footer snipped]

--
# Randy MacLeod
# Wind River Linux
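On the logging plan mentioned above: one way to instrument pressure regulation without flooding the console is to log only when the blocked/unblocked verdict flips, and otherwise at a bounded rate. This is a sketch of one possible approach, not what was ultimately implemented in bitbake; the note callback stands in for bb.note:

    import time

    class PressureLogger:
        """Log pressure-regulation decisions without flooding: report only
        when the blocked/unblocked state flips, or once per min_interval."""

        def __init__(self, note, min_interval=10.0):
            self.note = note                  # e.g. bb.note in a real scheduler
            self.min_interval = min_interval
            self.last_state = None
            self.last_log = 0.0

        def log(self, exceeded, cpu_rate, io_rate, mem_rate):
            now = time.time()
            changed = exceeded != self.last_state
            if not changed and now - self.last_log < self.min_interval:
                return                        # suppress repetitive messages
            self.note("pressure %s: cpu=%.0f io=%.0f mem=%.0f us/s"
                      % ("exceeded" if exceeded else "ok",
                         cpu_rate, io_rate, mem_rate))
            self.last_state, self.last_log = exceeded, now

The state-change trigger captures exactly the transitions that matter for debugging a stalled build, while the interval cap keeps a persistently saturated system from emitting one line per scheduler pass.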
On Mon, 2023-07-10 at 19:22 -0400, Randy MacLeod via lists.openembedded.org wrote:
> This change was merged back in April, despite my comments above.

No, it did not. The change in this thread was posted in May and we decided
not to merge it based upon your feedback. There was a different change
posted in April:

https://git.yoctoproject.org/poky/commit/bitbake/lib?id=ba94f9a3b1960cc0fdc831c20a9d2f8ad289f307

which did merge. Whether it should have done, I don't know, but I would
like to be clear about what happened!

Cheers,
Richard
diff --git a/bitbake/lib/bb/runqueue.py b/bitbake/lib/bb/runqueue.py
index 02f1474540..4d49d25153 100644
--- a/bitbake/lib/bb/runqueue.py
+++ b/bitbake/lib/bb/runqueue.py
@@ -179,6 +179,7 @@ class RunQueueScheduler(object):
                 self.prev_memory_pressure = memory_pressure_fds.readline().split()[4].split("=")[1]
                 self.prev_pressure_time = time.time()
                 self.check_pressure = True
+                self.psi_exceeded = False
             except:
                 bb.note("The /proc/pressure files can't be read. Continuing build without monitoring pressure")
                 self.check_pressure = False
@@ -191,6 +192,10 @@ class RunQueueScheduler(object):
         BB_PRESSURE_MAX_{CPU|IO|MEMORY} are set, return True if above threshold.
         """
         if self.check_pressure:
+            now = time.time()
+            tdiff = now - self.prev_pressure_time
+            if tdiff < 1.0:
+                return self.psi_exceeded
             with open("/proc/pressure/cpu") as cpu_pressure_fds, \
                 open("/proc/pressure/io") as io_pressure_fds, \
                 open("/proc/pressure/memory") as memory_pressure_fds:
@@ -198,21 +203,15 @@ class RunQueueScheduler(object):
                 curr_cpu_pressure = cpu_pressure_fds.readline().split()[4].split("=")[1]
                 curr_io_pressure = io_pressure_fds.readline().split()[4].split("=")[1]
                 curr_memory_pressure = memory_pressure_fds.readline().split()[4].split("=")[1]
-                now = time.time()
-                tdiff = now - self.prev_pressure_time
-                if tdiff > 1.0:
-                    exceeds_cpu_pressure = self.rq.max_cpu_pressure and (float(curr_cpu_pressure) - float(self.prev_cpu_pressure)) / tdiff > self.rq.max_cpu_pressure
-                    exceeds_io_pressure = self.rq.max_io_pressure and (float(curr_io_pressure) - float(self.prev_io_pressure)) / tdiff > self.rq.max_io_pressure
-                    exceeds_memory_pressure = self.rq.max_memory_pressure and (float(curr_memory_pressure) - float(self.prev_memory_pressure)) / tdiff > self.rq.max_memory_pressure
-                    self.prev_cpu_pressure = curr_cpu_pressure
-                    self.prev_io_pressure = curr_io_pressure
-                    self.prev_memory_pressure = curr_memory_pressure
-                    self.prev_pressure_time = now
-                else:
-                    exceeds_cpu_pressure = self.rq.max_cpu_pressure and (float(curr_cpu_pressure) - float(self.prev_cpu_pressure)) > self.rq.max_cpu_pressure
-                    exceeds_io_pressure = self.rq.max_io_pressure and (float(curr_io_pressure) - float(self.prev_io_pressure)) > self.rq.max_io_pressure
-                    exceeds_memory_pressure = self.rq.max_memory_pressure and (float(curr_memory_pressure) - float(self.prev_memory_pressure)) > self.rq.max_memory_pressure
-                return (exceeds_cpu_pressure or exceeds_io_pressure or exceeds_memory_pressure)
+                exceeds_cpu_pressure = self.rq.max_cpu_pressure and (float(curr_cpu_pressure) - float(self.prev_cpu_pressure)) / tdiff > self.rq.max_cpu_pressure
+                exceeds_io_pressure = self.rq.max_io_pressure and (float(curr_io_pressure) - float(self.prev_io_pressure)) / tdiff > self.rq.max_io_pressure
+                exceeds_memory_pressure = self.rq.max_memory_pressure and (float(curr_memory_pressure) - float(self.prev_memory_pressure)) / tdiff > self.rq.max_memory_pressure
+                self.prev_cpu_pressure = curr_cpu_pressure
+                self.prev_io_pressure = curr_io_pressure
+                self.prev_memory_pressure = curr_memory_pressure
+                self.prev_pressure_time = now
+                self.psi_exceeded = exceeds_cpu_pressure or exceeds_io_pressure or exceeds_memory_pressure
+                return self.psi_exceeded
         return False

     def next_buildable_task(self):
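For anyone reproducing the experiments discussed in this thread: pressure regulation is driven by the BB_PRESSURE_MAX_* variables, set for example in conf/local.conf. The values below reflect the 5% (50,000) starting point Qi proposed and are illustrative rather than a recommendation:

    # conf/local.conf -- each threshold is cumulative stall time in
    # microseconds per second of wall-clock time (1000000 == 100%)
    BB_PRESSURE_MAX_CPU = "50000"
    BB_PRESSURE_MAX_IO = "50000"
    BB_PRESSURE_MAX_MEMORY = "50000"

Leaving a variable unset disables that check, which is why each exceeds_* expression in the patch first tests whether the corresponding max value is set.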