diff mbox series

[3/4] spdx30_tasks: Use recipe metadata for dependency PURL generation

Message ID 20260107180951.140895-3-stondo@gmail.com
State Changes Requested
Headers show
Series None | expand

Commit Message

Stefano Tondo Jan. 7, 2026, 6:09 p.m. UTC
From: Stefano Tondo <stefano.tondo.ext@siemens.com>

Use recipe metadata (PV, inherited classes) to determine package ecosystem
and version instead of unreliable filename parsing.

Previous implementation used greedy regex patterns matching any
name-version.tar.gz file, causing false positives:
  zlib-1.3.1.tar.gz → pkg:pypi/zlib (WRONG - zlib is not from PyPI)

Changes:
- Always use d.getVar("PV") for version (addresses review feedback)
- Determine ecosystem via inherits_class() checks (pypi, npm, cpan, etc.)
- Only parse filenames for unambiguous cases (.crate extension)
- Support all major ecosystems: Rust, Go, PyPI, NPM, CPAN, NuGet, Maven
- Use pkg:generic for C/C++ libraries and other non-ecosystem sources

Example results:
- zlib source: pkg:generic/zlib@1.3.1
- zlib built package: pkg:yocto/core/zlib@1.3.1
- Python with pypi class: pkg:pypi/requests@2.31.0
- Rust crate: pkg:cargo/serde@1.0.0

This approach aligns with Yocto's metadata system and ensures every
source download gets a PURL for supply chain tracking.

Signed-off-by: Stefano Tondo <stefano.tondo.ext@siemens.com>
---
 meta/lib/oe/spdx30_tasks.py | 160 ++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
diff mbox series

Patch

diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py
index 86430c7008..c685b649b3 100644
--- a/meta/lib/oe/spdx30_tasks.py
+++ b/meta/lib/oe/spdx30_tasks.py
@@ -357,6 +357,155 @@  def collect_dep_sources(dep_objsets, dest):
             index_sources_by_hash(e.to, dest)
 
 
+def extract_dependency_metadata(d, file_name):
+    """
+    Extract version and generate PURL for dependency packages.
+
+    Uses recipe metadata (PV, inherited classes) to determine package ecosystem
+    rather than guessing from filenames. Only parses filenames for unambiguous
+    cases where the file extension definitively identifies the ecosystem.
+
+    Supported ecosystems:
+    - Rust crates (.crate extension is unambiguous)
+    - Go modules (when GO_IMPORT is set or domain pattern is explicit)
+    - PyPI packages (when recipe inherits pypi class)
+    - NPM packages (when recipe inherits npm class)
+    - CPAN packages (when recipe inherits cpan class)
+    - NuGet packages (when recipe inherits nuget/dotnet class)
+    - Maven packages (when recipe inherits maven class)
+
+    Returns: (version, purl) tuple, or (None, None) if cannot determine
+    """
+    import re
+
+    # Get version from recipe PV (always prefer recipe metadata over filename parsing)
+    pv = d.getVar("PV")
+    version = pv if pv else None
+    purl = None
+
+    # Case 1: Rust crate - .crate extension is unambiguous
+    if file_name.endswith('.crate'):
+        crate_match = re.match(r'^(.+?)-(\d+\.\d+\.\d+(?:\.\d+)?(?:[-+][\w.]+)?)\.crate$', file_name)
+        if crate_match:
+            name = crate_match.group(1)
+            # Use filename version for crates (they embed version in filename)
+            version = crate_match.group(2)
+            purl = f"pkg:cargo/{name}@{version}"
+            return (version, purl)
+
+    # Case 2: Go module - check if GO_IMPORT is set (most reliable)
+    go_import = d.getVar("GO_IMPORT")
+    if go_import and version:
+        # GO_IMPORT contains the module path (e.g., github.com/containers/storage)
+        purl = f"pkg:golang/{go_import}@{version}"
+        return (version, purl)
+
+    # Case 3: Go module from filename - only for explicit hosting domains with version in filename
+    # Patterns like github.com.user.repo-v1.2.3.tar.gz where the domain is explicit
+    go_match = re.match(
+        r'^((?:github|gitlab|gopkg|golang|go\.googlesource)\.com\.[\w.]+(?:\.[\w-]+)*?)-(v?\d+\.\d+\.\d+(?:[-+][\w.]+)?)\.',
+        file_name
+    )
+    if go_match:
+        # Convert dots to slashes for proper Go module path
+        # github.com.containers.storage → github.com/containers/storage
+        module_path = go_match.group(1).replace('.', '/', 1)  # First dot only
+        parts = module_path.split('/', 1)
+        if len(parts) == 2:
+            domain = parts[0]
+            path = parts[1].replace('.', '/')
+            module_path = f"{domain}/{path}"
+
+        version = go_match.group(2)
+        purl = f"pkg:golang/{module_path}@{version}"
+        return (version, purl)
+
+    # Case 4: PyPI package - check if recipe inherits pypi class
+    if bb.data.inherits_class("pypi", d) and version:
+        # Get the PyPI package name from PYPI_PACKAGE variable (handles python3- prefix removal)
+        pypi_package = d.getVar("PYPI_PACKAGE")
+        if pypi_package:
+            # Normalize package name per PEP 503
+            name = re.sub(r"[-_.]+", "-", pypi_package).lower()
+            purl = f"pkg:pypi/{name}@{version}"
+            return (version, purl)
+
+    # Case 5: NPM package - check if recipe inherits npm class
+    if bb.data.inherits_class("npm", d) and version:
+        # Get package name from recipe
+        bpn = d.getVar("BPN")
+        if bpn:
+            # Remove npm- prefix if present
+            name = bpn[4:] if bpn.startswith('npm-') else bpn
+            purl = f"pkg:npm/{name}@{version}"
+            return (version, purl)
+
+    # Case 6: CPAN package - check if recipe inherits cpan class
+    if bb.data.inherits_class("cpan", d) and version:
+        # Get package name from recipe
+        bpn = d.getVar("BPN")
+        if bpn:
+            # Remove perl- or libperl- prefixes if present
+            if bpn.startswith('perl-'):
+                name = bpn[5:]
+            elif bpn.startswith('libperl-'):
+                name = bpn[8:]
+            else:
+                name = bpn
+            purl = f"pkg:cpan/{name}@{version}"
+            return (version, purl)
+
+    # Case 7: NuGet package - check if recipe inherits nuget/dotnet class
+    if (bb.data.inherits_class("nuget", d) or bb.data.inherits_class("dotnet", d)) and version:
+        bpn = d.getVar("BPN")
+        if bpn:
+            # Remove dotnet- or nuget- prefix if present
+            if bpn.startswith('dotnet-'):
+                name = bpn[7:]
+            elif bpn.startswith('nuget-'):
+                name = bpn[6:]
+            else:
+                name = bpn
+            purl = f"pkg:nuget/{name}@{version}"
+            return (version, purl)
+
+    # Case 8: Maven package - check if recipe inherits maven class
+    if bb.data.inherits_class("maven", d) and version:
+        # Maven PURLs require group:artifact format
+        # Check for MAVEN_GROUP_ID and MAVEN_ARTIFACT_ID variables
+        group_id = d.getVar("MAVEN_GROUP_ID")
+        artifact_id = d.getVar("MAVEN_ARTIFACT_ID")
+
+        if group_id and artifact_id:
+            # Proper Maven PURL: pkg:maven/group.id/artifact@version
+            purl = f"pkg:maven/{group_id}/{artifact_id}@{version}"
+            return (version, purl)
+        else:
+            # Fallback: use BPN as artifact name without group
+            bpn = d.getVar("BPN")
+            if bpn:
+                # Remove maven- or java- prefix if present
+                if bpn.startswith('maven-'):
+                    name = bpn[6:]
+                elif bpn.startswith('java-'):
+                    name = bpn[5:]
+                else:
+                    name = bpn
+                purl = f"pkg:maven/{name}@{version}"
+                return (version, purl)
+
+    # Fallback: use pkg:generic for source downloads without specific ecosystem
+    # This covers C/C++ libraries and other non-ecosystem packages
+    bpn = d.getVar("BPN")
+    if version and bpn:
+        # Generic PURL for source tarballs (e.g., zlib, openssl, curl)
+        # The built package will have pkg:yocto/... PURL
+        purl = f"pkg:generic/{bpn}@{version}"
+        return (version, purl)
+
+    return (version, None)
+
+
 def add_download_files(d, objset):
     inputs = set()
 
@@ -408,6 +557,9 @@  def add_download_files(d, objset):
                 inputs.add(file)
 
         else:
+            # Extract version and PURL for dependency packages using recipe metadata
+            dep_version, dep_purl = extract_dependency_metadata(d, file_name)
+
             dl = objset.add(
                 oe.spdx30.software_Package(
                     _id=objset.new_spdxid("source", str(download_idx + 1)),
@@ -420,6 +572,14 @@  def add_download_files(d, objset):
                 )
             )
 
+            # Add version if extracted
+            if dep_version:
+                dl.software_packageVersion = dep_version
+
+            # Add PURL if generated
+            if dep_purl:
+                dl.software_packageUrl = dep_purl
+
             if fd.method.supports_checksum(fd):
                 # TODO Need something better than hard coding this
                 for checksum_id in ["sha256", "sha1"]: