@@ -14,6 +14,7 @@ import uuid
 import os
 import oe.spdx_common
 from datetime import datetime, timezone
+from contextlib import contextmanager
 
 OE_SPDX_BASE = "https://rdf.openembedded.org/spdx/3.0/"
@@ -191,6 +192,25 @@ def to_list(l):
     return l
 
 
+class Dedup(object):
+    def __init__(self, objset):
+        self.unique = set()
+        self.dedup = {}
+        self.objset = objset
+
+    def find_duplicates(self, cmp, typ, **kwargs):
+        for o in self.objset.foreach_filter(typ, **kwargs):
+            for u in self.unique:
+                if cmp(u, o):
+                    self.dedup[o] = u
+                    break
+            else:
+                self.unique.add(o)
+
+    def get(self, o):
+        return self.dedup.get(o, o)
+
+
 class ObjectSet(oe.spdx30.SHACLObjectSet):
     def __init__(self, d):
         super().__init__()
@@ -895,6 +915,45 @@ class ObjectSet(oe.spdx30.SHACLObjectSet):
         self.missing_ids -= set(imports.keys())
         return self.missing_ids
 
+    @contextmanager
+    def deduplicate(self):
+        d = Dedup(self)
+
+        yield d
+
+        visited = set()
+
+        def visit(o, path):
+            if isinstance(o, oe.spdx30.SHACLObject):
+                if o in visited:
+                    return False
+                visited.add(o)
+
+                for k in o:
+                    v = o[k]
+                    if isinstance(v, oe.spdx30.SHACLObject):
+                        o[k] = d.get(v)
+
+            elif isinstance(o, oe.spdx30.ListProxy):
+                for idx, v in enumerate(o):
+                    if isinstance(v, oe.spdx30.SHACLObject):
+                        o[idx] = d.get(v)
+
+            return True
+
+        if d.dedup:
+            for o in self.objects:
+                o.walk(visit)
+
+            for k, v in d.dedup.items():
+                bb.debug(
+                    1,
+                    f"Removing duplicate {k.__class__.__name__} {k._id or id(k)} -> {v._id or id(v)}",
+                )
+                self.objects.discard(k)
+
+            self.create_index()
+
 
 def load_jsonld(d, path, required=False):
     deserializer = oe.spdx30.JSONLDDeserializer()
@@ -1080,39 +1139,28 @@ def create_sbom(d, name, root_elements, add_objectsets=[]):
     # SBoM should be the only root element of the document
     objset.doc.rootElement = [sbom]
 
-    # De-duplicate licenses
-    unique = set()
-    dedup = {}
-    for lic in objset.foreach_type(oe.spdx30.simplelicensing_LicenseExpression):
-        for u in unique:
-            if (
-                u.simplelicensing_licenseExpression
-                == lic.simplelicensing_licenseExpression
-                and u.simplelicensing_licenseListVersion
-                == lic.simplelicensing_licenseListVersion
-            ):
-                dedup[lic] = u
-                break
-        else:
-            unique.add(lic)
-
-    if dedup:
-        for rel in objset.foreach_filter(
-            oe.spdx30.Relationship,
-            relationshipType=oe.spdx30.RelationshipType.hasDeclaredLicense,
-        ):
-            rel.to = [dedup.get(to, to) for to in rel.to]
-
-        for rel in objset.foreach_filter(
-            oe.spdx30.Relationship,
-            relationshipType=oe.spdx30.RelationshipType.hasConcludedLicense,
-        ):
-            rel.to = [dedup.get(to, to) for to in rel.to]
+    def cmp_license_expression(a, b):
+        return (
+            a.simplelicensing_licenseExpression == b.simplelicensing_licenseExpression
+            and a.simplelicensing_licenseListVersion
+            == b.simplelicensing_licenseListVersion
+        )
 
-        for k, v in dedup.items():
-            bb.debug(1, f"Removing duplicate License {k._id} -> {v._id}")
-            objset.objects.remove(k)
+    def cmp_creation_info(a, b):
+        data_a = {k: a[k] for k in a}
+        data_b = {k: b[k] for k in b}
+        data_a["@id"] = ""
+        data_b["@id"] = ""
+        return data_a == data_b
+
+    with objset.deduplicate() as dedup:
+        # De-duplicate licenses
+        dedup.find_duplicates(
+            cmp_license_expression,
+            oe.spdx30.simplelicensing_LicenseExpression,
+        )
 
-        objset.create_index()
+        # Deduplicate creation info
+        dedup.find_duplicates(cmp_creation_info, oe.spdx30.CreationInfo)
 
     return objset, sbom
@@ -701,7 +701,7 @@ class SHACLObject(object):
             self.__dict__["_obj_data"][iri] = prop.init()
 
     def __iter__(self):
-        return self._OBJ_PROPERTIES.keys()
+        return iter(self._OBJ_PROPERTIES.keys())
 
     def walk(self, callback, path=None):
         """
De-duplicates CreationInfo objects that are identical (except for ID) when
writing out an SBoM. This significantly reduces the number of CreationInfo
objects that end up in the final document.

Signed-off-by: Joshua Watt <JPEWhacker@gmail.com>
---
 meta/lib/oe/sbom30.py | 112 ++++++++++++++++++++++++++++++------------
 meta/lib/oe/spdx30.py |   2 +-
 2 files changed, 81 insertions(+), 33 deletions(-)
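
For reviewers, a minimal standalone sketch of the pattern the new Dedup helper
implements (compare each object against the set of already-seen unique objects
using a caller-supplied cmp callback, record duplicates in a map, then rewrite
references through that map). The Item class and names below are hypothetical
and only illustrate the approach, not the sbom30 API:

    # Hypothetical illustration of the duplicate-detection pattern used above.
    class Item:
        def __init__(self, ident, payload):
            self.ident = ident      # differs between duplicates (like @id)
            self.payload = payload  # the fields that actually matter

    def find_duplicates(items, cmp):
        unique = set()
        dedup = {}
        for o in items:
            for u in unique:
                if cmp(u, o):
                    dedup[o] = u  # o duplicates the already-seen u
                    break
            else:
                unique.add(o)
        return dedup

    items = [Item("a", 1), Item("b", 1), Item("c", 2)]
    dup_map = find_duplicates(items, lambda a, b: a.payload == b.payload)
    # Callers can now rewrite references with dup_map.get(obj, obj) and drop
    # the duplicate objects; prints {'b': 'a'}.
    print({k.ident: v.ident for k, v in dup_map.items()})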