diff mbox series

[layerindex-web,5/5] layerindex/views.py: fix DuplicatesView 504 from per-row correlated subquery

Message ID 20260510004706.81282-5-tim.orling@konsulko.com
State New
Headers show
Series [layerindex-web,1/5] layerindex/views.py: fix StatsView statistics page timeout | expand

Commit Message

Tim Orling May 10, 2026, 12:47 a.m. UTC
The previous fix (subquery IN clause + select_related) resolved the Python
memory overhead but the 504 Gateway Timeout persisted because
recipes_preferred_count() uses .extra(select={...}) to embed a correlated
multi-table subquery that the database evaluates once for every result row.
With hundreds of duplicate recipes on the master branch this creates an
O(N) compounding scan that dominates page load time.

Replace the call to recipes_preferred_count() in DuplicatesView.get_recipes
with a two-step batch approach:

1. Fetch all duplicate recipes into a list using the existing subquery +
   select_related (one SQL query).

2. Run a single aggregating query to find, for each pn, the maximum
   index_preference among S/A-type layers in this branch that carry that
   pn (one SQL query returning one row per distinct pn).

3. Annotate each recipe object in Python: preferred_count is set to 1 if
   another S/A layer has a strictly higher index_preference for the same
   pn, otherwise 0.  The template only checks preferred_count > 0, so the
   boolean semantics are identical to the original COUNT.

recipes_preferred_count() is intentionally left unchanged — it is still
used by RecipeSearchView where result sets are paginated to 50 rows and
the per-row cost is negligible.

Also add Max to the django.db.models import line.

[YOCTO #16175]

AI-Generated: Claude Cowork Sonnet 4.6
Signed-off-by: Tim Orling <tim.orling@konsulko.com>
---
 layerindex/views.py           |  30 ++++++++-
 tests/test_duplicates_view.py | 113 ++++++++++++++++++++++------------
 2 files changed, 100 insertions(+), 43 deletions(-)
diff mbox series

Patch

diff --git a/layerindex/views.py b/layerindex/views.py
index 742e5d5..84750f7 100644
--- a/layerindex/views.py
+++ b/layerindex/views.py
@@ -25,7 +25,7 @@  from django.contrib.sites.models import Site
 from django.core.exceptions import PermissionDenied
 from django.urls import resolve, reverse, reverse_lazy
 from django.db import transaction
-from django.db.models import Count, Q
+from django.db.models import Count, Max, Q
 from django.db.models.functions import Lower
 from django.db.models.query import QuerySet
 from django.db.models.signals import pre_save
@@ -661,8 +661,32 @@  class DuplicatesView(TemplateView):
         # into memory and building a potentially huge IN (...) clause.
         # See: https://bugzilla.yoctoproject.org/show_bug.cgi?id=16175
         dupes = init_qs.values('pn').annotate(Count('layerbranch', distinct=True)).filter(layerbranch__count__gt=1).values('pn')
-        qs = init_qs.filter(pn__in=dupes).select_related('layerbranch__layer').order_by('pn', 'layerbranch__layer', '-pv')
-        return recipes_preferred_count(qs)
+        recipes = list(init_qs.filter(pn__in=dupes).select_related('layerbranch__layer').order_by('pn', 'layerbranch__layer', '-pv'))
+        if not recipes:
+            return recipes
+        # recipes_preferred_count() attaches a correlated subquery via .extra() that
+        # the database evaluates once per result row.  With hundreds of duplicate
+        # recipes that compounds into a slow multi-table correlated scan that causes
+        # a 504 Gateway Timeout.  Instead, compute preferred_count with a single
+        # batch MAX query across all pns and annotate the results in Python.
+        #
+        # A recipe is "non-preferred" (preferred_count > 0) when another layer of
+        # type S or A carries the same pn and has a higher index_preference.
+        pns = {r.pn for r in recipes}
+        pn_max_pref = dict(
+            Recipe.objects.filter(
+                layerbranch__branch__name=self.kwargs['branch'],
+                pn__in=pns,
+                layerbranch__layer__layer_type__in=['S', 'A'],
+            ).values('pn').annotate(
+                max_pref=Max('layerbranch__layer__index_preference')
+            ).values_list('pn', 'max_pref')
+        )
+        for recipe in recipes:
+            own_pref = recipe.layerbranch.layer.index_preference
+            max_pref = pn_max_pref.get(recipe.pn)
+            recipe.preferred_count = 1 if (max_pref is not None and max_pref > own_pref) else 0
+        return recipes
 
     def get_classes(self, layer_ids):
         init_qs = BBClass.objects.filter(layerbranch__branch__name=self.kwargs['branch'])
diff --git a/tests/test_duplicates_view.py b/tests/test_duplicates_view.py
index 304a35c..78d1036 100644
--- a/tests/test_duplicates_view.py
+++ b/tests/test_duplicates_view.py
@@ -9,9 +9,17 @@ 
 # Tests for bug #16175 - Duplicates page timeout
 # https://bugzilla.yoctoproject.org/show_bug.cgi?id=16175
 #
-# The fix replaces Python list comprehensions used to build IN (...) clauses
-# with Django ORM subqueries, and adds select_related to avoid N+1 queries
-# when the template resolves layerbranch__layer names.
+# The original code had two performance problems:
+#
+# 1. Python list comprehension IN clause: each get_* method pulled all
+#    duplicate names into Python memory and built a huge IN (...) literal.
+#    Fixed by passing the 'dupes' queryset directly so Django emits a subquery.
+#
+# 2. Per-row correlated subquery: get_recipes() called recipes_preferred_count()
+#    which embeds a correlated subquery via .extra() evaluated once per result
+#    row by the database.  With hundreds of duplicate recipes this caused a 504
+#    Gateway Timeout.  Fixed by computing preferred_count with a single batch
+#    MAX query over all pns, then annotating results in Python.
 
 import pytest
 from django.test import TestCase
@@ -35,7 +43,7 @@  class TestDuplicatesView(TestCase):
             updates_enabled=True,
         )
 
-        # Two layers — objects in both will appear as duplicates
+        # layer_a: base layer (type A), lower preference
         self.layer_a = LayerItem.objects.create(
             name='meta-alpha',
             status='P',
@@ -43,7 +51,9 @@  class TestDuplicatesView(TestCase):
             summary='Alpha layer',
             description='Alpha test layer',
             vcs_url='git://example.com/meta-alpha.git',
+            index_preference=0,
         )
+        # layer_b: software layer (type S), higher preference — recipes here win
         self.layer_b = LayerItem.objects.create(
             name='meta-beta',
             status='P',
@@ -51,8 +61,9 @@  class TestDuplicatesView(TestCase):
             summary='Beta layer',
             description='Beta test layer',
             vcs_url='git://example.com/meta-beta.git',
+            index_preference=10,
         )
-        # A third layer whose objects are unique (should NOT appear as duplicates)
+        # layer_c: unique objects only, should never appear as duplicates
         self.layer_c = LayerItem.objects.create(
             name='meta-gamma',
             status='P',
@@ -60,36 +71,42 @@  class TestDuplicatesView(TestCase):
             summary='Gamma layer',
             description='Gamma test layer',
             vcs_url='git://example.com/meta-gamma.git',
+            index_preference=5,
         )
 
         self.lb_a = LayerBranch.objects.create(layer=self.layer_a, branch=self.branch)
         self.lb_b = LayerBranch.objects.create(layer=self.layer_b, branch=self.branch)
         self.lb_c = LayerBranch.objects.create(layer=self.layer_c, branch=self.branch)
 
-        # Duplicate recipe (same pn in layer_a and layer_b)
-        Recipe.objects.create(layerbranch=self.lb_a, filename='shared_1.0.bb',
-                               pn='shared', pv='1.0', filepath='recipes-test')
-        Recipe.objects.create(layerbranch=self.lb_b, filename='shared_1.0.bb',
-                               pn='shared', pv='1.0', filepath='recipes-test')
-        # Unique recipe (only in layer_c — should NOT appear)
-        Recipe.objects.create(layerbranch=self.lb_c, filename='unique_1.0.bb',
-                               pn='unique', pv='1.0', filepath='recipes-test')
-
-        # Duplicate class (same name in layer_a and layer_b)
+        # Duplicate recipe in both layer_a (pref=0) and layer_b (pref=10)
+        self.recipe_a = Recipe.objects.create(
+            layerbranch=self.lb_a, filename='shared_1.0.bb',
+            pn='shared', pv='1.0', filepath='recipes-test')
+        self.recipe_b = Recipe.objects.create(
+            layerbranch=self.lb_b, filename='shared_1.0.bb',
+            pn='shared', pv='1.0', filepath='recipes-test')
+        # Unique recipe only in layer_c — must NOT appear
+        Recipe.objects.create(
+            layerbranch=self.lb_c, filename='unique_1.0.bb',
+            pn='unique', pv='1.0', filepath='recipes-test')
+
+        # Duplicate class in layer_a and layer_b
         BBClass.objects.create(layerbranch=self.lb_a, name='sharedclass')
         BBClass.objects.create(layerbranch=self.lb_b, name='sharedclass')
-        # Unique class (only in layer_c — should NOT appear)
+        # Unique class only in layer_c — must NOT appear
         BBClass.objects.create(layerbranch=self.lb_c, name='uniqueclass')
 
-        # Duplicate include file (same path in layer_a and layer_b)
+        # Duplicate include file in layer_a and layer_b
         IncFile.objects.create(layerbranch=self.lb_a, path='conf/shared.inc')
         IncFile.objects.create(layerbranch=self.lb_b, path='conf/shared.inc')
-        # Unique include file (only in layer_c — should NOT appear)
+        # Unique include file only in layer_c — must NOT appear
         IncFile.objects.create(layerbranch=self.lb_c, path='conf/unique.inc')
 
     def _url(self, branch='main'):
         return reverse('duplicates', kwargs={'branch': branch})
 
+    # --- Basic page health ---
+
     def test_duplicates_view_returns_200(self):
         """DuplicatesView should return HTTP 200."""
         response = self.client.get(self._url())
@@ -110,11 +127,38 @@  class TestDuplicatesView(TestCase):
         self.assertNotIn('unique', pns)
 
     def test_duplicate_recipes_have_two_rows(self):
-        """Each layer's entry for the duplicate recipe should be present."""
+        """Each layer's copy of the duplicate recipe should be present."""
         response = self.client.get(self._url())
         shared = [r for r in response.context['recipes'] if r.pn == 'shared']
         self.assertEqual(len(shared), 2)
 
+    # --- preferred_count (batch MAX computation, replaces per-row correlated subquery) ---
+
+    def test_lower_preference_recipe_is_deemphasised(self):
+        """The layer_a recipe (pref=0) should have preferred_count > 0 because
+        layer_b (pref=10, type S) has a higher-preference copy of the same pn."""
+        response = self.client.get(self._url())
+        recipe_from_a = next(
+            r for r in response.context['recipes']
+            if r.pn == 'shared' and r.layerbranch.layer.name == 'meta-alpha')
+        self.assertGreater(recipe_from_a.preferred_count, 0)
+
+    def test_higher_preference_recipe_is_not_deemphasised(self):
+        """The layer_b recipe (pref=10) should have preferred_count == 0 because
+        no other S/A layer has a higher preference for the same pn."""
+        response = self.client.get(self._url())
+        recipe_from_b = next(
+            r for r in response.context['recipes']
+            if r.pn == 'shared' and r.layerbranch.layer.name == 'meta-beta')
+        self.assertEqual(recipe_from_b.preferred_count, 0)
+
+    def test_preferred_count_attribute_present_on_all_recipes(self):
+        """Every recipe in the context must have a preferred_count attribute."""
+        response = self.client.get(self._url())
+        for recipe in response.context['recipes']:
+            self.assertTrue(hasattr(recipe, 'preferred_count'),
+                            f"preferred_count missing on {recipe.pn}")
+
     # --- Classes ---
 
     def test_duplicate_classes_included(self):
@@ -158,20 +202,15 @@  class TestDuplicatesView(TestCase):
     # --- Layer filter ---
 
     def test_layer_filter_restricts_recipes(self):
-        """Passing ?l=<layer_id> should restrict results to that layer."""
+        """With only one layer selected, nothing can be a duplicate."""
         response = self.client.get(self._url() + f'?l={self.layer_a.id}')
         self.assertEqual(response.status_code, 200)
-        # With only one layer selected, nothing can be a duplicate
         pns = [r.pn for r in response.context['recipes']]
         self.assertNotIn('shared', pns)
 
     def test_wrong_branch_returns_empty(self):
-        """A branch with no data should return empty querysets."""
-        other = Branch.objects.create(
-            name='other',
-            bitbake_branch='other',
-            sort_priority=50,
-        )
+        """A branch with no data should return empty result sets."""
+        Branch.objects.create(name='other', bitbake_branch='other', sort_priority=50)
         response = self.client.get(reverse('duplicates', kwargs={'branch': 'other'}))
         self.assertEqual(response.status_code, 200)
         self.assertEqual(len(list(response.context['recipes'])), 0)
@@ -179,30 +218,24 @@  class TestDuplicatesView(TestCase):
         self.assertEqual(len(list(response.context['incfiles'])), 0)
 
     def test_select_related_avoids_extra_queries(self):
-        """Ensure layerbranch and layer are fetched with select_related.
-
-        Checks that accessing layerbranch.layer.name on query results does not
-        trigger additional database queries (i.e. select_related is working).
-        """
+        """Accessing layerbranch.layer.name after list evaluation fires no new queries."""
         from django.db import connection, reset_queries
         from django.conf import settings
+        import layerindex.views as views_module
 
         settings.DEBUG = True
         reset_queries()
 
-        view = __import__('layerindex.views', fromlist=['DuplicatesView']).DuplicatesView
-        v = view()
+        v = views_module.DuplicatesView()
         v.kwargs = {'branch': 'main'}
-        v.request = None
+        v.request = type('req', (), {'GET': {}})()
 
-        recipes = list(v.get_recipes([]))
+        recipes = v.get_recipes([])
         classes = list(v.get_classes([]))
         incfiles = list(v.get_incfiles([]))
 
-        # Record query count after fetching
         query_count_after_fetch = len(connection.queries)
 
-        # Access layerbranch.layer.name on every result — should NOT fire new queries
         for r in recipes:
             _ = r.layerbranch.layer.name
         for c in classes:
@@ -214,5 +247,5 @@  class TestDuplicatesView(TestCase):
         settings.DEBUG = False
 
         self.assertEqual(query_count_after_fetch, query_count_after_access,
-                         "Unexpected extra queries when accessing layerbranch.layer.name — "
-                         "select_related may not be working")
+                         "Extra queries fired when accessing layerbranch.layer.name — "
+                         "select_related may not be effective")