From patchwork Tue Mar 3 00:43:54 2026 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Joshua Watt X-Patchwork-Id: 82301 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from aws-us-west-2-korg-lkml-1.web.codeaurora.org (localhost.localdomain [127.0.0.1]) by smtp.lore.kernel.org (Postfix) with ESMTP id C2A6FEB364E for ; Tue, 3 Mar 2026 00:46:03 +0000 (UTC) Received: from mail-oi1-f182.google.com (mail-oi1-f182.google.com [209.85.167.182]) by mx.groups.io with SMTP id smtpd.msgproc01-g2.7689.1772498760545146402 for ; Mon, 02 Mar 2026 16:46:00 -0800 Authentication-Results: mx.groups.io; dkim=pass header.i=@gmail.com header.s=20230601 header.b=B6u+v2rP; spf=pass (domain: gmail.com, ip: 209.85.167.182, mailfrom: jpewhacker@gmail.com) Received: by mail-oi1-f182.google.com with SMTP id 5614622812f47-45f09874c4cso3775433b6e.3 for ; Mon, 02 Mar 2026 16:46:00 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1772498759; x=1773103559; darn=lists.openembedded.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=RgZWWYUETcp+KAK54Y1gHYnk1xGLeXOCGllecsNhDhw=; b=B6u+v2rPF22gYNeJglrL/k+gq9iyfdiro4MfRfPYxyMHvXXU96SLXMNGdMUcqkdh0W OtT9dbwF71E7J4pk2qoP9dGhU4KOvyoFEE9IdEP0fpESaJAy4KYqLcc6I+kciJvFgyJX Bv+z2zw8goJNpkTbWR1q+DEbSwRahf3sZ8Z3bspN54twJdP036g4siOXJTf5/quZI3Ol x36tdVUPVSWluP92N5c+zmVKxHofbrynNIH5QX0ywYlXPaPul1uPaVVkNzAu4lZW70BP De3KZCxJmVFLVw94PlQu1KIu/ApK7uiNuiMsdtxmZd+S07eBbyygn5GsvCcMT+wvGh+P aPvw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1772498759; x=1773103559; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-gg:x-gm-message-state:from :to:cc:subject:date:message-id:reply-to; bh=RgZWWYUETcp+KAK54Y1gHYnk1xGLeXOCGllecsNhDhw=; b=pOOsbRPgpWropCut7SbJCXlcztDDpswSeI+Hc1c5C8oqAVEndLj/3k+tYNjf+doOk7 J5CYvWHkLHcukccctgH6WzjaiUYf7LBrVDISCE5BCedJZdOkwFKIsds9JhLV1CNQbY+w EM2Fk3rX5hSOtt21B9MTxSKh+udiPobJ4P/KCagHu8/f4HDZq+FH3cVPN98iJED3KFQd ouUCp+MEaQ52sgzdrnV39xa3gfLGBddw07pr/P+330LKSiN4UYYG0wh+TB3HxZef5SH7 arAMte19B7tQIEoITnAlyXp0JZqmA0Bl/5lcnFLb+PyWOXPWnQUjlGSA+ClME19auEad +CHQ== X-Gm-Message-State: AOJu0Yz/SfGG9nB8Bi+j3yHdUcdmoMhvDS9udDJ8CAPgRE6esU/1rlZb QhdF+r4X1UnggemjnDiosrvOlQetkMS2UV/ZcKsAz+76nim9fBRDVJ3C18MTQg== X-Gm-Gg: ATEYQzzUA9igtsKdsqsYSZzVbbbHChykY432GsMwLiLe1VjjJyDeStS6bYvHiewsax2 PTP4gm2Y/YZeuT+ZanKiHA1VN7US72pWIE4HE3RtbvMBq7cmny2CFmSWlDNIPNNHS2Zm5ZGD81z mGb6nyoWe64AOncDwoqo5WDenaWLgoSIWed2HfddlF4vDw6ifAxHCXYBtGWlIK8u/+cNbb8myyv DQH8S4ASN0ml/KutGsgKKJ5B6BlebayLB2gGhMuHBhidzRywGN7SGytvUoNCcaACJWRX3asTkjk SCdJAvUK466k7olUdmGuXlXfZTgNOl0WJOnBzReRqXp9xHV3XeQXysc93/tOrkYhqaadq2MAN34 Ox0enKe7pqSoxMM6uhV1ZX94q66X8PXBP9aQzRbgA3LT5ntqGazaR3EltpAqLBv/Dxi1K+YHlKh ojZ50HqzPCoij+xe758O9a X-Received: by 2002:a05:6808:171a:b0:460:fce5:2fcf with SMTP id 5614622812f47-464beb43d2dmr7246701b6e.37.1772498759630; Mon, 02 Mar 2026 16:45:59 -0800 (PST) Received: from localhost.localdomain ([2601:282:4200:11c0::ba6c]) by smtp.gmail.com with ESMTPSA id 5614622812f47-464bb59b66fsm8637446b6e.10.2026.03.02.16.45.58 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 02 Mar 2026 16:45:59 -0800 (PST) From: Joshua Watt X-Google-Original-From: Joshua Watt To: openembedded-core@lists.openembedded.org Cc: benjamin.robin@bootlin.com, ross.burton@arm.com, Joshua Watt Subject: [OE-core][PATCH v4 7/9] spdx: De-duplicate CreationInfo Date: Mon, 2 Mar 2026 17:43:54 -0700 Message-ID: <20260303004550.650726-8-JPEWhacker@gmail.com> X-Mailer: git-send-email 2.53.0 In-Reply-To: <20260303004550.650726-1-JPEWhacker@gmail.com> References: <20260226173930.2847872-1-JPEWhacker@gmail.com> <20260303004550.650726-1-JPEWhacker@gmail.com> MIME-Version: 1.0 List-Id: X-Webhook-Received: from 45-33-107-173.ip.linodeusercontent.com [45.33.107.173] by aws-us-west-2-korg-lkml-1.web.codeaurora.org with HTTPS for ; Tue, 03 Mar 2026 00:46:03 -0000 X-Groupsio-URL: https://lists.openembedded.org/g/openembedded-core/message/232230 De-duplicates CreationInfo objects that are identical (except for ID) when writing out an SBoM. This significantly reduces the number of CreationInfo objects that end up in the final document. Signed-off-by: Joshua Watt --- meta/lib/oe/sbom30.py | 112 ++++++++++++++++++++++++++++++------------ meta/lib/oe/spdx30.py | 2 +- 2 files changed, 81 insertions(+), 33 deletions(-) diff --git a/meta/lib/oe/sbom30.py b/meta/lib/oe/sbom30.py index 21f084dc16..55a2863d2d 100644 --- a/meta/lib/oe/sbom30.py +++ b/meta/lib/oe/sbom30.py @@ -14,6 +14,7 @@ import uuid import os import oe.spdx_common from datetime import datetime, timezone +from contextlib import contextmanager OE_SPDX_BASE = "https://rdf.openembedded.org/spdx/3.0/" @@ -191,6 +192,25 @@ def to_list(l): return l +class Dedup(object): + def __init__(self, objset): + self.unique = set() + self.dedup = {} + self.objset = objset + + def find_duplicates(self, cmp, typ, **kwargs): + for o in self.objset.foreach_filter(typ, **kwargs): + for u in self.unique: + if cmp(u, o): + self.dedup[o] = u + break + else: + self.unique.add(o) + + def get(self, o): + return self.dedup.get(o, o) + + class ObjectSet(oe.spdx30.SHACLObjectSet): def __init__(self, d): super().__init__() @@ -895,6 +915,45 @@ class ObjectSet(oe.spdx30.SHACLObjectSet): self.missing_ids -= set(imports.keys()) return self.missing_ids + @contextmanager + def deduplicate(self): + d = Dedup(self) + + yield d + + visited = set() + + def visit(o, path): + if isinstance(o, oe.spdx30.SHACLObject): + if o in visited: + return False + visited.add(o) + + for k in o: + v = o[k] + if isinstance(v, oe.spdx30.SHACLObject): + o[k] = d.get(v) + + elif isinstance(o, oe.spdx30.ListProxy): + for idx, v in enumerate(o): + if isinstance(v, oe.spdx30.SHACLObject): + o[idx] = d.get(v) + + return True + + if d.dedup: + for o in self.objects: + o.walk(visit) + + for k, v in d.dedup.items(): + bb.debug( + 1, + f"Removing duplicate {k.__class__.__name__} {k._id or id(k)} -> {v._id or id(v)}", + ) + self.objects.discard(k) + + self.create_index() + def load_jsonld(d, path, required=False): deserializer = oe.spdx30.JSONLDDeserializer() @@ -1080,39 +1139,28 @@ def create_sbom(d, name, root_elements, add_objectsets=[]): # SBoM should be the only root element of the document objset.doc.rootElement = [sbom] - # De-duplicate licenses - unique = set() - dedup = {} - for lic in objset.foreach_type(oe.spdx30.simplelicensing_LicenseExpression): - for u in unique: - if ( - u.simplelicensing_licenseExpression - == lic.simplelicensing_licenseExpression - and u.simplelicensing_licenseListVersion - == lic.simplelicensing_licenseListVersion - ): - dedup[lic] = u - break - else: - unique.add(lic) - - if dedup: - for rel in objset.foreach_filter( - oe.spdx30.Relationship, - relationshipType=oe.spdx30.RelationshipType.hasDeclaredLicense, - ): - rel.to = [dedup.get(to, to) for to in rel.to] - - for rel in objset.foreach_filter( - oe.spdx30.Relationship, - relationshipType=oe.spdx30.RelationshipType.hasConcludedLicense, - ): - rel.to = [dedup.get(to, to) for to in rel.to] + def cmp_license_expression(a, b): + return ( + a.simplelicensing_licenseExpression == b.simplelicensing_licenseExpression + and a.simplelicensing_licenseListVersion + == b.simplelicensing_licenseListVersion + ) - for k, v in dedup.items(): - bb.debug(1, f"Removing duplicate License {k._id} -> {v._id}") - objset.objects.remove(k) + def cmp_creation_info(a, b): + data_a = {k: a[k] for k in a} + data_b = {k: b[k] for k in b} + data_a["@id"] = "" + data_b["@id"] = "" + return data_a == data_b + + with objset.deduplicate() as dedup: + # De-duplicate licenses + dedup.find_duplicates( + cmp_license_expression, + oe.spdx30.simplelicensing_LicenseExpression, + ) - objset.create_index() + # Deduplicate creation info + dedup.find_duplicates(cmp_creation_info, oe.spdx30.CreationInfo) return objset, sbom diff --git a/meta/lib/oe/spdx30.py b/meta/lib/oe/spdx30.py index cd97eebd18..1f58402ffc 100644 --- a/meta/lib/oe/spdx30.py +++ b/meta/lib/oe/spdx30.py @@ -701,7 +701,7 @@ class SHACLObject(object): self.__dict__["_obj_data"][iri] = prop.init() def __iter__(self): - return self._OBJ_PROPERTIES.keys() + return iter(self._OBJ_PROPERTIES.keys()) def walk(self, callback, path=None): """