@@ -822,7 +822,52 @@ class ObjectSet(oe.spdx30.SHACLObjectSet):
if not e.externalSpdxId in imports:
imports[e.externalSpdxId] = e
- self.objects |= other.objects
+ # Merge objects by SPDX ID: when both sets contain the same ID, keep the object with more externalIdentifier entries
+ #
+ # WHY DUPLICATES OCCUR: When consolidating SPDX documents (e.g., recipe -> package -> image),
+ # the same package can be referenced at different build stages, each with varying levels of
+ # detail. Early stages may have basic PURLs, while later stages add Git metadata qualifiers.
+ # This is architectural - multi-stage builds naturally create multiple representations of
+ # the same entity.
+ #
+ # However, preserve object identity for types that get referenced (like CreationInfo)
+ # to avoid breaking serialization
+ other_by_id = {}
+ for obj in other.objects:
+ obj_id = getattr(obj, '_id', None)
+ if obj_id:
+ other_by_id[obj_id] = obj
+
+ self_by_id = {}
+ for obj in self.objects:
+ obj_id = getattr(obj, '_id', None)
+ if obj_id:
+ self_by_id[obj_id] = obj
+
+ # Merge: for duplicate IDs, prefer the object with strictly more externalIdentifier entries,
+ # but only when the existing object is an Element (non-Element objects such as CreationInfo are kept)
+ for obj_id, other_obj in other_by_id.items():
+ if obj_id in self_by_id:
+ self_obj = self_by_id[obj_id]
+ # Only replace Elements with more complete data
+ # Do NOT replace CreationInfo or other supporting types to preserve object identity
+ if isinstance(self_obj, oe.spdx30.Element):
+ # Compare externalIdentifier counts (a missing attribute counts as empty); on a tie keep the existing object
+ self_ext_ids = getattr(self_obj, 'externalIdentifier', [])
+ other_ext_ids = getattr(other_obj, 'externalIdentifier', [])
+ if len(other_ext_ids) > len(self_ext_ids):
+ # Replace self object with other (more complete) object
+ self.objects.discard(self_obj)
+ self.objects.add(other_obj)
+ # Non-Element objects (e.g. CreationInfo) are not replaced: keep the existing one to preserve identity
+ else:
+ # New object, just add it
+ self.objects.add(other_obj)
+
+ # Add any objects without IDs
+ for obj in other.objects:
+ if not getattr(obj, '_id', None):
+ self.objects.add(obj)
for o in add_objectsets:
merge_doc(o)