Message ID | 20220128220305.29675-1-saul.wold@windriver.com |
---|---|
State | New |
Headers | show |
Series | [WIP/RFC] create-spdx: Get SPDX-License-Identifier from source | expand |
On 1/28/22 4:03 PM, Saul Wold wrote: > This patch will read the begining of source files and try to find > the SPDX-License-Identifier to populate the licenseInfoInFiles > field for each source file. This does not populate licenseConculed > at this time, nor rolls it up to package level. > > Signed-off-by: Saul Wold <saul.wold@windriver.com> > --- > classes/create-spdx.bbclass | 25 +++++++++++++++++++++++++ > lib/oe/spdx.py | 2 +- > 2 files changed, 26 insertions(+), 1 deletion(-) > > diff --git a/classes/create-spdx.bbclass b/classes/create-spdx.bbclass > index 180d667..9c11945 100644 > --- a/classes/create-spdx.bbclass > +++ b/classes/create-spdx.bbclass > @@ -30,6 +30,21 @@ SPDX_LICENSES ??= "${COREBASE}/meta/files/spdx-licenses.json" > > do_image_complete[depends] = "virtual/kernel:do_create_spdx" > > +def extract_licenses(filename): > + import re > + lic_regex = re.compile('SPDX-License-Identifier:\s+([-A-Za-z\d. ]+)[ |\n|\r\n]*?') > + > + try: > + with open(filename, 'r') as f: > + size = min(15000, os.stat(filename).st_size) > + txt = f.read(size) > + licenses = re.findall(lic_regex, txt) > + if licenses: > + return licenses > + except Exception as e: > + bb.warn(f"Exception on {filename}: {e}") > + return None > + > def get_doc_namespace(d, doc): > import uuid > namespace_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, d.getVar("SPDX_UUID_NAMESPACE")) > @@ -232,6 +247,16 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv > checksumValue=bb.utils.sha256_file(filepath), > )) > > + if "SOURCES" in spdx_file.fileTypes: > + licenses = extract_licenses(filepath) > + if licenses is not None: > + for lic in licenses: > + spdx_file.licenseInfoInFiles.append(lic.strip()) > + else: > + spdx_file.licenseInfoInFiles.append("NOASSERTATION") "NOASSERTION" > + else: > + spdx_file.licenseInfoInFiles.append("NOASSERTATION") "NOASSERTION" > + > doc.files.append(spdx_file) > doc.add_relationship(spdx_pkg, "CONTAINS", spdx_file) > 
spdx_pkg.hasFiles.append(spdx_file.SPDXID) > diff --git a/lib/oe/spdx.py b/lib/oe/spdx.py > index 9e7ced5..71e7c1c 100644 > --- a/lib/oe/spdx.py > +++ b/lib/oe/spdx.py > @@ -236,7 +236,7 @@ class SPDXFile(SPDXObject): > fileName = _String() > licenseConcluded = _String(default="NOASSERTION") > copyrightText = _String(default="NOASSERTION") > - licenseInfoInFiles = _StringList(default=["NOASSERTION"]) > + licenseInfoInFiles = _StringList() It's required to have "NOASSERTION" as the default if you don't do anything, so we shouldn't change the default here (by and large, this file should capture the spec over our use of it). It's on my TODO list to make the "default" lists behave like default scalars, where appending replaces the default instead of appending to it, but I haven't gotten there yet; it hasn't come up as a problem before. Probably need to do something like: license_info_from_files = [] # scan files here if license_info_from_files: spdx_file.licenseInfoInFiles = license_info_from_files > checksums = _ObjectList(SPDXChecksum) > fileTypes = _StringList() >
diff --git a/classes/create-spdx.bbclass b/classes/create-spdx.bbclass index 180d667..9c11945 100644 --- a/classes/create-spdx.bbclass +++ b/classes/create-spdx.bbclass @@ -30,6 +30,21 @@ SPDX_LICENSES ??= "${COREBASE}/meta/files/spdx-licenses.json" do_image_complete[depends] = "virtual/kernel:do_create_spdx" +def extract_licenses(filename): + import re + lic_regex = re.compile('SPDX-License-Identifier:\s+([-A-Za-z\d. ]+)[ |\n|\r\n]*?') + + try: + with open(filename, 'r') as f: + size = min(15000, os.stat(filename).st_size) + txt = f.read(size) + licenses = re.findall(lic_regex, txt) + if licenses: + return licenses + except Exception as e: + bb.warn(f"Exception on {filename}: {e}") + return None + def get_doc_namespace(d, doc): import uuid namespace_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, d.getVar("SPDX_UUID_NAMESPACE")) @@ -232,6 +247,16 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv checksumValue=bb.utils.sha256_file(filepath), )) + if "SOURCES" in spdx_file.fileTypes: + licenses = extract_licenses(filepath) + if licenses is not None: + for lic in licenses: + spdx_file.licenseInfoInFiles.append(lic.strip()) + else: + spdx_file.licenseInfoInFiles.append("NOASSERTATION") + else: + spdx_file.licenseInfoInFiles.append("NOASSERTATION") + doc.files.append(spdx_file) doc.add_relationship(spdx_pkg, "CONTAINS", spdx_file) spdx_pkg.hasFiles.append(spdx_file.SPDXID) diff --git a/lib/oe/spdx.py b/lib/oe/spdx.py index 9e7ced5..71e7c1c 100644 --- a/lib/oe/spdx.py +++ b/lib/oe/spdx.py @@ -236,7 +236,7 @@ class SPDXFile(SPDXObject): fileName = _String() licenseConcluded = _String(default="NOASSERTION") copyrightText = _String(default="NOASSERTION") - licenseInfoInFiles = _StringList(default=["NOASSERTION"]) + licenseInfoInFiles = _StringList() checksums = _ObjectList(SPDXChecksum) fileTypes = _StringList()
This patch will read the beginning of source files and try to find the SPDX-License-Identifier to populate the licenseInfoInFiles field for each source file. This does not populate licenseConcluded at this time, nor does it roll it up to the package level. Signed-off-by: Saul Wold <saul.wold@windriver.com> --- classes/create-spdx.bbclass | 25 +++++++++++++++++++++++++ lib/oe/spdx.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-)