| Message ID | 20250604112133.2581063-3-daniel.turull@ericsson.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series | Check compiled files to filter kernel CVEs | expand |
On Wed, Jun 4, 2025 at 5:21 AM <daniel.turull@ericsson.com> wrote: > > From: Daniel Turull <daniel.turull@ericsson.com> > > When SPDX_INCLUDE_COMPILED_SOURCES is enabled, only include the > source code files that are used during compilation. > > It uses debugsource information generated during do_package. > > This enables an external tool to use the SPDX information to disregard > vulnerabilities that are not compiled. > > As example, when used with the default config with linux-yocto, the spdx size is > reduced from 156MB to 61MB. > > Tested with bitbake world on oe-core. > > CC: Quentin Schulz <quentin.schulz@cherry.de> > CC: Joshua Watt <JPEWhacker@gmail.com> > CC: Peter Marko <peter.marko@siemens.com> > Signed-off-by: Daniel Turull <daniel.turull@ericsson.com> > --- > meta/classes/create-spdx-2.2.bbclass | 9 ++++++ > meta/classes/spdx-common.bbclass | 3 ++ > meta/lib/oe/spdx30_tasks.py | 10 +++++++ > meta/lib/oe/spdx_common.py | 41 ++++++++++++++++++++++++++++ > 4 files changed, 63 insertions(+) > > diff --git a/meta/classes/create-spdx-2.2.bbclass b/meta/classes/create-spdx-2.2.bbclass > index 7e8f8b9ff5..6fc60a1d97 100644 > --- a/meta/classes/create-spdx-2.2.bbclass > +++ b/meta/classes/create-spdx-2.2.bbclass > @@ -137,6 +137,11 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv > spdx_files = [] > > file_counter = 1 > + > + check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" > + if check_compiled_sources: > + compiled_sources, types = oe.spdx_common.get_compiled_sources(d) > + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") > for subdir, dirs, files in os.walk(topdir): > dirs[:] = [d for d in dirs if d not in ignore_dirs] > if subdir == str(topdir): > @@ -147,6 +152,10 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv > filename = str(filepath.relative_to(topdir)) > > if not filepath.is_symlink() and filepath.is_file(): > + # Check if file is compiled > + 
if check_compiled_sources: > + if not oe.spdx_common.is_compiled_source(filename, compiled_sources, types): > + continue > spdx_file = oe.spdx.SPDXFile() > spdx_file.SPDXID = get_spdxid(file_counter) > for t in get_types(filepath): > diff --git a/meta/classes/spdx-common.bbclass b/meta/classes/spdx-common.bbclass > index 713a7fc651..ca0416d1c7 100644 > --- a/meta/classes/spdx-common.bbclass > +++ b/meta/classes/spdx-common.bbclass > @@ -26,6 +26,7 @@ SPDX_TOOL_VERSION ??= "1.0" > SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy" > > SPDX_INCLUDE_SOURCES ??= "0" > +SPDX_INCLUDE_COMPILED_SOURCES ??= "0" > > SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org" > SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs" > @@ -40,6 +41,8 @@ SPDX_MULTILIB_SSTATE_ARCHS ??= "${SSTATE_ARCHS}" > python () { > from oe.cve_check import extend_cve_status > extend_cve_status(d) > + if d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1": > + d.setVar("SPDX_INCLUDE_SOURCES", "1") > } > > def create_spdx_source_deps(d): > diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py > index 61d7ba45e3..beeafc2bb7 100644 > --- a/meta/lib/oe/spdx30_tasks.py > +++ b/meta/lib/oe/spdx30_tasks.py > @@ -156,6 +156,11 @@ def add_package_files( > bb.note(f"Skip {topdir}") > return spdx_files > > + check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" > + if check_compiled_sources: > + compiled_sources, types = oe.spdx_common.get_compiled_sources(d) > + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") > + > for subdir, dirs, files in os.walk(topdir, onerror=walk_error): > dirs[:] = [d for d in dirs if d not in ignore_dirs] > if subdir == str(topdir): > @@ -171,6 +176,11 @@ def add_package_files( > filename = str(filepath.relative_to(topdir)) > file_purposes = get_purposes(filepath) > > + # Check if file is compiled > + if check_compiled_sources: > + if not oe.spdx_common.is_compiled_source(filename, compiled_sources, types): > + continue > + > spdx_file = 
objset.new_file( > get_spdxid(file_counter), > filename, > diff --git a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py > index 4caefc7673..daf43bce56 100644 > --- a/meta/lib/oe/spdx_common.py > +++ b/meta/lib/oe/spdx_common.py > @@ -242,3 +242,44 @@ def fetch_data_to_uri(fd, name): > uri = uri + "@" + fd.revision > > return uri > + > +def is_compiled_source (filename, compiled_sources, types): > + """ > + Check if the file is a compiled file > + """ > + import os > + # If we don't have compiled source, we assume all are compiled. > + if len(compiled_sources) == 0: idiomatically, this would be: if not compiled_sources: return True > + return True > + # We remove the top directory, to match the format in compiled sources This feels a little bit magic, can you explain why removing the top directory is necessary and or correct to always perform? > + relative = filename[filename.find("/")+1:] you use .find() a lot, but I think split() is more idiomatic: _, relative = filename.split("/", 1) > + basename = os.path.basename(filename) > + # We return always true if the file type is not in the list of compiled files Why is this? > + if basename[basename.find("."):] not in types: if "." not in basename or basename.split(".", 1)[1] not in types: return True > + return True > + # Check that the file is in the list > + return relative in compiled_sources > + > +def get_compiled_sources(d): > + """ > + Get list of compiled sources from debug information and normalize the paths > + """ > + import itertools > + source_info = oe.package.read_debugsources_file(d) > + if not source_info: > + bb.debug(1, "Do not have debugsources.list. 
Skipping") > + return [], [] > + > + # Sources are not split now in SPDX, so we aggregate them > + sources = list(set(itertools.chain.from_iterable(source_info.values()))) sources should be a set, since you are using the "in" operator, which is much more efficient for sets than lists (especially when they are large) > + # Check extensions of files > + types = [] > + for src in sources: > + basename = os.path.basename(src) > + # We check that the basename has an extension > + if basename.find(".") > 0: > + ext = basename[basename.find("."):] Similarly: if "." in basename: stem, ext = basename.split(".", 1) > + if ext not in types and len(ext)>0: > + types.append(ext) Making types a set would be more efficient, e.g. types = set() ... if ext: types.add(ext) The "in" operator is much more efficient for sets than lists > + bb.debug(1, f"Num of sources: {len(sources)} and types: {len(types)} {str(types)}") > + return sources, types
Thanks Joshua for the comments. I'll resend the patch with the fixes after I have verified that it works with a world build. > -----Original Message----- > From: Joshua Watt <jpewhacker@gmail.com> > Sent: Thursday, 5 June 2025 17:15 > To: Daniel Turull <daniel.turull@ericsson.com> > Cc: openembedded-core@lists.openembedded.org; Quentin Schulz > <quentin.schulz@cherry.de>; Peter Marko <peter.marko@siemens.com> > Subject: Re: [PATCH v6 2/3] spdx: add option to include only compiled sources > > On Wed, Jun 4, 2025 at 5:21 AM <daniel.turull@ericsson.com> wrote: > > > > From: Daniel Turull <daniel.turull@ericsson.com> > > > > When SPDX_INCLUDE_COMPILED_SOURCES is enabled, only include the > source > > code files that are used during compilation. > > > > It uses debugsource information generated during do_package. > > > > This enables an external tool to use the SPDX information to disregard > > vulnerabilities that are not compiled. > > > > As example, when used with the default config with linux-yocto, the > > spdx size is reduced from 156MB to 61MB. > > > > Tested with bitbake world on oe-core. 
> > > > CC: Quentin Schulz <quentin.schulz@cherry.de> > > CC: Joshua Watt <JPEWhacker@gmail.com> > > CC: Peter Marko <peter.marko@siemens.com> > > Signed-off-by: Daniel Turull <daniel.turull@ericsson.com> > > --- > > meta/classes/create-spdx-2.2.bbclass | 9 ++++++ > > meta/classes/spdx-common.bbclass | 3 ++ > > meta/lib/oe/spdx30_tasks.py | 10 +++++++ > > meta/lib/oe/spdx_common.py | 41 ++++++++++++++++++++++++++++ > > 4 files changed, 63 insertions(+) > > > > diff --git a/meta/classes/create-spdx-2.2.bbclass > > b/meta/classes/create-spdx-2.2.bbclass > > index 7e8f8b9ff5..6fc60a1d97 100644 > > --- a/meta/classes/create-spdx-2.2.bbclass > > +++ b/meta/classes/create-spdx-2.2.bbclass > > @@ -137,6 +137,11 @@ def add_package_files(d, doc, spdx_pkg, topdir, > get_spdxid, get_types, *, archiv > > spdx_files = [] > > > > file_counter = 1 > > + > > + check_compiled_sources = > d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" > > + if check_compiled_sources: > > + compiled_sources, types = > oe.spdx_common.get_compiled_sources(d) > > + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") > > for subdir, dirs, files in os.walk(topdir): > > dirs[:] = [d for d in dirs if d not in ignore_dirs] > > if subdir == str(topdir): > > @@ -147,6 +152,10 @@ def add_package_files(d, doc, spdx_pkg, topdir, > get_spdxid, get_types, *, archiv > > filename = str(filepath.relative_to(topdir)) > > > > if not filepath.is_symlink() and filepath.is_file(): > > + # Check if file is compiled > > + if check_compiled_sources: > > + if not oe.spdx_common.is_compiled_source(filename, > compiled_sources, types): > > + continue > > spdx_file = oe.spdx.SPDXFile() > > spdx_file.SPDXID = get_spdxid(file_counter) > > for t in get_types(filepath): > > diff --git a/meta/classes/spdx-common.bbclass > > b/meta/classes/spdx-common.bbclass > > index 713a7fc651..ca0416d1c7 100644 > > --- a/meta/classes/spdx-common.bbclass > > +++ b/meta/classes/spdx-common.bbclass > > @@ -26,6 +26,7 @@ SPDX_TOOL_VERSION 
??= "1.0" > > SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy" > > > > SPDX_INCLUDE_SOURCES ??= "0" > > +SPDX_INCLUDE_COMPILED_SOURCES ??= "0" > > > > SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org" > > SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs" > > @@ -40,6 +41,8 @@ SPDX_MULTILIB_SSTATE_ARCHS ??= > "${SSTATE_ARCHS}" > > python () { > > from oe.cve_check import extend_cve_status > > extend_cve_status(d) > > + if d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1": > > + d.setVar("SPDX_INCLUDE_SOURCES", "1") > > } > > > > def create_spdx_source_deps(d): > > diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py > > index 61d7ba45e3..beeafc2bb7 100644 > > --- a/meta/lib/oe/spdx30_tasks.py > > +++ b/meta/lib/oe/spdx30_tasks.py > > @@ -156,6 +156,11 @@ def add_package_files( > > bb.note(f"Skip {topdir}") > > return spdx_files > > > > + check_compiled_sources = > d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" > > + if check_compiled_sources: > > + compiled_sources, types = > oe.spdx_common.get_compiled_sources(d) > > + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") > > + > > for subdir, dirs, files in os.walk(topdir, onerror=walk_error): > > dirs[:] = [d for d in dirs if d not in ignore_dirs] > > if subdir == str(topdir): > > @@ -171,6 +176,11 @@ def add_package_files( > > filename = str(filepath.relative_to(topdir)) > > file_purposes = get_purposes(filepath) > > > > + # Check if file is compiled > > + if check_compiled_sources: > > + if not oe.spdx_common.is_compiled_source(filename, > compiled_sources, types): > > + continue > > + > > spdx_file = objset.new_file( > > 
get_spdxid(file_counter), > > filename, > > diff --git a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py > > index 4caefc7673..daf43bce56 100644 > > --- a/meta/lib/oe/spdx_common.py > > +++ b/meta/lib/oe/spdx_common.py > > @@ -242,3 +242,44 @@ def fetch_data_to_uri(fd, name): > > uri = uri + "@" + fd.revision > > > > return uri > > + > > +def is_compiled_source (filename, compiled_sources, types): > > + """ > > + Check if the file is a compiled file > > + """ > > + import os > > + # If we don't have compiled source, we assume all are compiled. > > + if len(compiled_sources) == 0: > > idiomatically, this would be: > > if not compiled_sources: > return True Thanks. I'll change it. > > + return True > > + # We remove the top directory, to match the format in compiled > > + sources > > This feels a little bit magic, can you explain why removing the top directory is > necessary and or correct to always perform? From the data that we extract in the do_packaging, the files could be in a different place, mostly for the kernel, therefore I have normalized them removing the top directory. For example for linux-yocto, if I take tcp_ipv4.c SPDX path: linux-yocto-6.12.30+git/net/ipv4/tcp_ipv4.c Packaging path from debugsources without normalization: "/usr/src/kernel/net/ipv4/tcp_ipv4.c", Packaging with normalization: net/ipv4/tcp_ipv4.c For ncurses: SPDX: git/ncurses/tinfo/comp_scan.c Packaging: "/usr/src/debug/ncurses/git/ncurses/tinfo/comp_scan.c" Packaging with normalization: ncurses/tinfo/comp_scan.c > > + relative = filename[filename.find("/")+1:] > > you use .find() a lot, but I think split() is more idiomatic: > > _, relative = filename.split("/", 1) I'll change it for this one. > > + basename = os.path.basename(filename) > > + # We return always true if the file type is not in the list of > > + compiled files > > Why is this? 
For example, Makefiles, Python files or config files are not actually compiled but should be included, since they are part of the build > > + if basename[basename.find("."):] not in types: > > if "." not in basename or basename.split(".", 1)[1] not in types: > return True > > > + return True > > + # Check that the file is in the list > > + return relative in compiled_sources > > + > > +def get_compiled_sources(d): > > + """ > > + Get list of compiled sources from debug information and normalize the > paths > > + """ > > + import itertools > > + source_info = oe.package.read_debugsources_file(d) > > + if not source_info: > > + bb.debug(1, "Do not have debugsources.list. Skipping") > > + return [], [] > > + > > + # Sources are not split now in SPDX, so we aggregate them > > + sources = > > + list(set(itertools.chain.from_iterable(source_info.values()))) > > sources should be a set, since you are using the "in" operator, which is much > more efficient for sets than lists (especially when they are > large) I'll change it. That's a good tip. > > > + # Check extensions of files > > + types = [] > > + for src in sources: > > + basename = os.path.basename(src) > > + # We check that the basename has an extension > > + if basename.find(".") > 0: > > + ext = basename[basename.find("."):] > > Similarly: > > if "." in basename: > stem, ext = basename.split(".", 1) > > > + if ext not in types and len(ext)>0: > > + types.append(ext) > > Making types a set would be more efficient, e.g. Thanks for the tip. I'll replace the lists with sets. > types = set() > ... > > if ext: > types.add(ext) > > The "in" operator is much more efficient for sets than lists > > > + bb.debug(1, f"Num of sources: {len(sources)} and types: {len(types)} {str(types)}") > > + return sources, types
diff --git a/meta/classes/create-spdx-2.2.bbclass b/meta/classes/create-spdx-2.2.bbclass index 7e8f8b9ff5..6fc60a1d97 100644 --- a/meta/classes/create-spdx-2.2.bbclass +++ b/meta/classes/create-spdx-2.2.bbclass @@ -137,6 +137,11 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv spdx_files = [] file_counter = 1 + + check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" + if check_compiled_sources: + compiled_sources, types = oe.spdx_common.get_compiled_sources(d) + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") for subdir, dirs, files in os.walk(topdir): dirs[:] = [d for d in dirs if d not in ignore_dirs] if subdir == str(topdir): @@ -147,6 +152,10 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv filename = str(filepath.relative_to(topdir)) if not filepath.is_symlink() and filepath.is_file(): + # Check if file is compiled + if check_compiled_sources: + if not oe.spdx_common.is_compiled_source(filename, compiled_sources, types): + continue spdx_file = oe.spdx.SPDXFile() spdx_file.SPDXID = get_spdxid(file_counter) for t in get_types(filepath): diff --git a/meta/classes/spdx-common.bbclass b/meta/classes/spdx-common.bbclass index 713a7fc651..ca0416d1c7 100644 --- a/meta/classes/spdx-common.bbclass +++ b/meta/classes/spdx-common.bbclass @@ -26,6 +26,7 @@ SPDX_TOOL_VERSION ??= "1.0" SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy" SPDX_INCLUDE_SOURCES ??= "0" +SPDX_INCLUDE_COMPILED_SOURCES ??= "0" SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org" SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs" @@ -40,6 +41,8 @@ SPDX_MULTILIB_SSTATE_ARCHS ??= "${SSTATE_ARCHS}" python () { from oe.cve_check import extend_cve_status extend_cve_status(d) + if d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1": + d.setVar("SPDX_INCLUDE_SOURCES", "1") } def create_spdx_source_deps(d): diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py index 61d7ba45e3..beeafc2bb7 100644 
--- a/meta/lib/oe/spdx30_tasks.py +++ b/meta/lib/oe/spdx30_tasks.py @@ -156,6 +156,11 @@ def add_package_files( bb.note(f"Skip {topdir}") return spdx_files + check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1" + if check_compiled_sources: + compiled_sources, types = oe.spdx_common.get_compiled_sources(d) + bb.debug(1, f"Total compiled files: {len(compiled_sources)}") + for subdir, dirs, files in os.walk(topdir, onerror=walk_error): dirs[:] = [d for d in dirs if d not in ignore_dirs] if subdir == str(topdir): @@ -171,6 +176,11 @@ def add_package_files( filename = str(filepath.relative_to(topdir)) file_purposes = get_purposes(filepath) + # Check if file is compiled + if check_compiled_sources: + if not oe.spdx_common.is_compiled_source(filename, compiled_sources, types): + continue + spdx_file = objset.new_file( get_spdxid(file_counter), filename, diff --git a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py index 4caefc7673..daf43bce56 100644 --- a/meta/lib/oe/spdx_common.py +++ b/meta/lib/oe/spdx_common.py @@ -242,3 +242,44 @@ def fetch_data_to_uri(fd, name): uri = uri + "@" + fd.revision return uri + +def is_compiled_source (filename, compiled_sources, types): + """ + Check if the file is a compiled file + """ + import os + # If we don't have compiled source, we assume all are compiled. 
+ if len(compiled_sources) == 0: + return True + # We remove the top directory, to match the format in compiled sources + relative = filename[filename.find("/")+1:] + basename = os.path.basename(filename) + # We return always true if the file type is not in the list of compiled files + if basename[basename.find("."):] not in types: + return True + # Check that the file is in the list + return relative in compiled_sources + +def get_compiled_sources(d): + """ + Get list of compiled sources from debug information and normalize the paths + """ + import itertools + source_info = oe.package.read_debugsources_file(d) + if not source_info: + bb.debug(1, "Do not have debugsources.list. Skipping") + return [], [] + + # Sources are not split now in SPDX, so we aggregate them + sources = list(set(itertools.chain.from_iterable(source_info.values()))) + # Check extensions of files + types = [] + for src in sources: + basename = os.path.basename(src) + # We check that the basename has an extension + if basename.find(".") > 0: + ext = basename[basename.find("."):] + if ext not in types and len(ext)>0: + types.append(ext) + bb.debug(1, f"Num of sources: {len(sources)} and types: {len(types)} {str(types)}") + return sources, types