diff mbox series

[v4,1/3] spdx: add option to include only compiled sources

Message ID 20250514125706.495571-2-daniel.turull@ericsson.com
State New
Headers show
Series Check compiled files to filter kernel CVEs | expand

Commit Message

Daniel Turull May 14, 2025, 12:57 p.m. UTC
From: Daniel Turull <daniel.turull@ericsson.com>

When SPDX_INCLUDE_COMPILED_SOURCES is enabled, only include the
source code (.c) files that are used during compilation.

This enables an external tool to use the SPDX information to disregard
vulnerabilities that are not compiled.

This commit adds the basics, so recipes can implement their own methods.

CC: Joshua Watt <JPEWhacker@gmail.com>
CC: Peter Marko <peter.marko@siemens.com>
Signed-off-by: Daniel Turull <daniel.turull@ericsson.com>
---
 meta/classes/create-spdx-2.2.bbclass |  9 ++++++++
 meta/classes/spdx-common.bbclass     |  3 +++
 meta/lib/oe/spdx30_tasks.py          |  9 ++++++++
 meta/lib/oe/spdx_common.py           | 33 ++++++++++++++++++++++++++++
 4 files changed, 54 insertions(+)

Comments

Quentin Schulz May 15, 2025, 12:10 p.m. UTC | #1
Hi Daniel,

On 5/14/25 2:57 PM, Daniel Turull via lists.openembedded.org wrote:
> From: Daniel Turull <daniel.turull@ericsson.com>
> 
> When SPDX_INCLUDE_COMPILED_SOURCES is enabled, only include the
> source code (.c) files that are used during compilation.
> 

Header files also have C code in them sometimes, e.g. 
https://elixir.bootlin.com/linux/v6.14.6/source/drivers/net/ethernet/stmicro/stmmac/descs_com.h

Also, CVEs could apply to macros or constants too, which could also be 
in header files.

Also, there could be assembly code compiled in that could have a CVE as 
well.

Finally, the kernel also has rust code, which could also have CVEs.

All in all, I'm not sure discriminating on the

> This enables an external tool to use the SPDX information to disregard
> vulnerabilities that are not compiled.
> 
> This commit adds the basics, so recipes can implement it own methods.
> 
> CC: Joshua Watt <JPEWhacker@gmail.com>
> CC: Peter Marko <peter.marko@siemens.com>
> Signed-off-by: Daniel Turull <daniel.turull@ericsson.com>
> ---
>   meta/classes/create-spdx-2.2.bbclass |  9 ++++++++
>   meta/classes/spdx-common.bbclass     |  3 +++
>   meta/lib/oe/spdx30_tasks.py          |  9 ++++++++
>   meta/lib/oe/spdx_common.py           | 33 ++++++++++++++++++++++++++++
>   4 files changed, 54 insertions(+)
> 
> diff --git a/meta/classes/create-spdx-2.2.bbclass b/meta/classes/create-spdx-2.2.bbclass
> index 7e8f8b9ff5..dd8ee6ecbe 100644
> --- a/meta/classes/create-spdx-2.2.bbclass
> +++ b/meta/classes/create-spdx-2.2.bbclass
> @@ -137,6 +137,11 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv
>       spdx_files = []
>   
>       file_counter = 1
> +
> +    check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
> +    if check_compiled_sources:
> +        compiled_sources = oe.spdx_common.get_compiled_sources(d)
> +        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
>       for subdir, dirs, files in os.walk(topdir):
>           dirs[:] = [d for d in dirs if d not in ignore_dirs]
>           if subdir == str(topdir):
> @@ -147,6 +152,10 @@ def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv
>               filename = str(filepath.relative_to(topdir))
>   
>               if not filepath.is_symlink() and filepath.is_file():
> +                # Check if file is compiled
> +                if check_compiled_sources:
> +                     if not oe.spdx_common.is_compiled_source(file, compiled_sources):
> +                          break
>                   spdx_file = oe.spdx.SPDXFile()
>                   spdx_file.SPDXID = get_spdxid(file_counter)
>                   for t in get_types(filepath):
> diff --git a/meta/classes/spdx-common.bbclass b/meta/classes/spdx-common.bbclass
> index 713a7fc651..e9dde34513 100644
> --- a/meta/classes/spdx-common.bbclass
> +++ b/meta/classes/spdx-common.bbclass
> @@ -26,6 +26,9 @@ SPDX_TOOL_VERSION ??= "1.0"
>   SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy"
>   
>   SPDX_INCLUDE_SOURCES ??= "0"
> +SPDX_INCLUDE_COMPILED_SOURCES ??= "0"
> +SPDX_COMPILED_SOURCES_DIR ??= "${LOG_DIR}/spdx-compiled/${PN}"
> +SPDX_COMPILED_SOURCES ??= "${SPDX_FILES_DIR}/compiled_src-${BP}.txt"
>   
>   SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org"
>   SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs"
> diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py
> index 61d7ba45e3..083e004330 100644
> --- a/meta/lib/oe/spdx30_tasks.py
> +++ b/meta/lib/oe/spdx30_tasks.py
> @@ -156,6 +156,11 @@ def add_package_files(
>           bb.note(f"Skip {topdir}")
>           return spdx_files
>   
> +    check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
> +    if check_compiled_sources:
> +        compiled_sources = oe.spdx_common.get_compiled_sources(d)
> +        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
> +
>       for subdir, dirs, files in os.walk(topdir, onerror=walk_error):
>           dirs[:] = [d for d in dirs if d not in ignore_dirs]
>           if subdir == str(topdir):
> @@ -167,6 +172,10 @@ def add_package_files(
>               filepath = Path(subdir) / file
>               if filepath.is_symlink() or not filepath.is_file():
>                   continue
> +            # Check if file is compiled
> +            if check_compiled_sources:
> +                 if not oe.spdx_common.is_compiled_source(file, compiled_sources):
> +                      break
>   
>               filename = str(filepath.relative_to(topdir))
>               file_purposes = get_purposes(filepath)
> diff --git a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py
> index 4caefc7673..e1b7f576dd 100644
> --- a/meta/lib/oe/spdx_common.py
> +++ b/meta/lib/oe/spdx_common.py
> @@ -242,3 +242,36 @@ def fetch_data_to_uri(fd, name):
>           uri = uri + "@" + fd.revision
>   
>       return uri
> +
> +
> +def is_compiled_source (filename, compiled_sources):
> +    """
> +    Check if the file, is a compiled file
> +    """
> +    import os
> +    # If we don't have compiled source, we asume all are compiled.

s/asume/assume/

> +    if len(compiled_sources) == 0:
> +        return True
> +    _, extension = os.path.splitext(filename)
> +    # Special case, that we need to ignore, since this is not a source file
> +    # We filter .c files
> +    if filename.rfind(".mod.c") > 0 or extension != ".c":
> +        return True
> +    # Check that the c file is in the list
> +    if filename in compiled_sources:
> +        return True
> +    return False
> +
> +def get_compiled_sources(d):
> +    """
> +    Return compiled files from the SPDX_COMPILED_FILES file

s/SPDX_COMPILED_FILES/SPDX_COMPILED_SOURCES/

Cheers,
Quentin
Daniel Turull May 15, 2025, 1:12 p.m. UTC | #2
Hi Quentin,
Thanks for the feedback. That's a good point about the header files and the Rust files. I'll need to find a better way to extract them, since scripts/clang-tools/gen_compile_commands.py only extracts the commands and includes only the C files. So unless we have better info on the files used, we should not exclude any header file. Do you know of a better script to extract the compiled files from the kernel?

The current code in the spdx class is supposed to ignore only the C files that are not compiled (so it is conservative about what to remove), but the script that I have in [PATCH v4 3/3] improve_kernel_cve_report: add script for postprocesing of kernel CVE data

probably needs to be updated so that a CVE that is not in a C file is not ignored, unless we have the header files in the list of compiled files.

I'll correct the minor things in a newer patch; it probably needs another iteration to make it more generic and flexible.

Thanks
Daniel

> -----Original Message-----
> From: Quentin Schulz <quentin.schulz@cherry.de>
> Sent: Thursday, 15 May 2025 14:11
> To: Daniel Turull <daniel.turull@ericsson.com>; openembedded-
> core@lists.openembedded.org
> Cc: Joshua Watt <JPEWhacker@gmail.com>; Peter Marko
> <peter.marko@siemens.com>
> Subject: Re: [OE-core] [PATCH v4 1/3] spdx: add option to include only compiled
> sources
>
> [You don't often get email from quentin.schulz@cherry.de. Learn why this is
> important at https://aka.ms/LearnAboutSenderIdentification ]
>
> Hi Daniel,
>
> On 5/14/25 2:57 PM, Daniel Turull via lists.openembedded.org wrote:
> > From: Daniel Turull <daniel.turull@ericsson.com>
> >
> > When SPDX_INCLUDE_COMPILED_SOURCES is enabled, only include the source
> > code (.c) files that are used during compilation.
> >
>
> Header files also have C code in them sometimes, e.g.
> https://elixir.bootlin.com/linux/v6.14.6/source/drivers/net/ethernet/stmicro/stmmac/descs_com.h
>
> Also, CVEs could apply to macros or constants too, which could also be in header
> files.
>
> Also, there could be assembly code compiled in that could have a CVE as well.
>
> Finally, the kernel also has rust code, which could also have CVEs.
>
> All in all, I'm not sure discriminating on the
>
> > This enables an external tool to use the SPDX information to disregard
> > vulnerabilities that are not compiled.
> >
> > This commit adds the basics, so recipes can implement it own methods.
> >
> > CC: Joshua Watt <JPEWhacker@gmail.com>
> > CC: Peter Marko <peter.marko@siemens.com>
> > Signed-off-by: Daniel Turull <daniel.turull@ericsson.com>
> > ---
> >   meta/classes/create-spdx-2.2.bbclass |  9 ++++++++
> >   meta/classes/spdx-common.bbclass     |  3 +++
> >   meta/lib/oe/spdx30_tasks.py          |  9 ++++++++
> >   meta/lib/oe/spdx_common.py           | 33 ++++++++++++++++++++++++++++
> >   4 files changed, 54 insertions(+)
> >
> > diff --git a/meta/classes/create-spdx-2.2.bbclass
> > b/meta/classes/create-spdx-2.2.bbclass
> > index 7e8f8b9ff5..dd8ee6ecbe 100644
> > --- a/meta/classes/create-spdx-2.2.bbclass
> > +++ b/meta/classes/create-spdx-2.2.bbclass
> > @@ -137,6 +137,11 @@ def add_package_files(d, doc, spdx_pkg, topdir,
> get_spdxid, get_types, *, archiv
> >       spdx_files = []
> >
> >       file_counter = 1
> > +
> > +    check_compiled_sources =
> d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
> > +    if check_compiled_sources:
> > +        compiled_sources = oe.spdx_common.get_compiled_sources(d)
> > +        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
> >       for subdir, dirs, files in os.walk(topdir):
> >           dirs[:] = [d for d in dirs if d not in ignore_dirs]
> >           if subdir == str(topdir):
> > @@ -147,6 +152,10 @@ def add_package_files(d, doc, spdx_pkg, topdir,
> get_spdxid, get_types, *, archiv
> >               filename = str(filepath.relative_to(topdir))
> >
> >               if not filepath.is_symlink() and filepath.is_file():
> > +                # Check if file is compiled
> > +                if check_compiled_sources:
> > +                     if not oe.spdx_common.is_compiled_source(file,
> compiled_sources):
> > +                          break
> >                   spdx_file = oe.spdx.SPDXFile()
> >                   spdx_file.SPDXID = get_spdxid(file_counter)
> >                   for t in get_types(filepath):
> > diff --git a/meta/classes/spdx-common.bbclass
> > b/meta/classes/spdx-common.bbclass
> > index 713a7fc651..e9dde34513 100644
> > --- a/meta/classes/spdx-common.bbclass
> > +++ b/meta/classes/spdx-common.bbclass
> > @@ -26,6 +26,9 @@ SPDX_TOOL_VERSION ??= "1.0"
> >   SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy"
> >
> >   SPDX_INCLUDE_SOURCES ??= "0"
> > +SPDX_INCLUDE_COMPILED_SOURCES ??= "0"
> > +SPDX_COMPILED_SOURCES_DIR ??= "${LOG_DIR}/spdx-compiled/${PN}"
> > +SPDX_COMPILED_SOURCES ??= "${SPDX_FILES_DIR}/compiled_src-${BP}.txt"
> >
> >   SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org"
> >   SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs"
> > diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py
> > index 61d7ba45e3..083e004330 100644
> > --- a/meta/lib/oe/spdx30_tasks.py
> > +++ b/meta/lib/oe/spdx30_tasks.py
> > @@ -156,6 +156,11 @@ def add_package_files(
> >           bb.note(f"Skip {topdir}")
> >           return spdx_files
> >
> > +    check_compiled_sources =
> d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
> > +    if check_compiled_sources:
> > +        compiled_sources = oe.spdx_common.get_compiled_sources(d)
> > +        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
> > +
> >       for subdir, dirs, files in os.walk(topdir, onerror=walk_error):
> >           dirs[:] = [d for d in dirs if d not in ignore_dirs]
> >           if subdir == str(topdir):
> > @@ -167,6 +172,10 @@ def add_package_files(
> >               filepath = Path(subdir) / file
> >               if filepath.is_symlink() or not filepath.is_file():
> >                   continue
> > +            # Check if file is compiled
> > +            if check_compiled_sources:
> > +                 if not oe.spdx_common.is_compiled_source(file, compiled_sources):
> > +                      break
> >
> >               filename = str(filepath.relative_to(topdir))
> >               file_purposes = get_purposes(filepath) diff --git
> > a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py index
> > 4caefc7673..e1b7f576dd 100644
> > --- a/meta/lib/oe/spdx_common.py
> > +++ b/meta/lib/oe/spdx_common.py
> > @@ -242,3 +242,36 @@ def fetch_data_to_uri(fd, name):
> >           uri = uri + "@" + fd.revision
> >
> >       return uri
> > +
> > +
> > +def is_compiled_source (filename, compiled_sources):
> > +    """
> > +    Check if the file, is a compiled file
> > +    """
> > +    import os
> > +    # If we don't have compiled source, we asume all are compiled.
>
> s/asume/assume/
>
> > +    if len(compiled_sources) == 0:
> > +        return True
> > +    _, extension = os.path.splitext(filename)
> > +    # Special case, that we need to ignore, since this is not a source file
> > +    # We filter .c files
> > +    if filename.rfind(".mod.c") > 0 or extension != ".c":
> > +        return True
> > +    # Check that the c file is in the list
> > +    if filename in compiled_sources:
> > +        return True
> > +    return False
> > +
> > +def get_compiled_sources(d):
> > +    """
> > +    Return compiled files from the SPDX_COMPILED_FILES file
>
> s/SPDX_COMPILED_FILES/SPDX_COMPILED_SOURCES/
>
> Cheers,
> Quentin
Quentin Schulz May 15, 2025, 1:32 p.m. UTC | #3
Hi Daniel,

On 5/15/25 3:12 PM, Daniel Turull wrote:
> Hi Quentin,
> Thanks for the feedback. That's a good point for the header files and the rust files. I'll need to find a better way to extract them, since scripts/clang-tools/gen_compile_commands.py only extracts the commands and includes only the c files. So unless we don't have better info on the files used, we should not exclude any header file. Do you 

I would then suggest to include all files by default and only exclude C 
files if they do not appear in the output of 
scripts/clang-tools/gen_compile_commands.py so you cover everything and 
allowlist only things you know are safe.

> know any better script to extract the compiled files from the kernel?
> 

No clue, sorry.

> The current code in the spdx class is supposed to only ignore the c files that are not compiled, (so been conservative on what to remove) but probably the script that I have in [PATCH v4 3/3] improve_kernel_cve_report: add script for postprocesing of kernel CVE data.
> 
> Needs to be updated, that if the CVE is not in a c file is not ignored, unless we have the header files in the list of compiled files.
> 

Anything that is NOT a C file should NOT be ignored (unless you have 
a tool that tells you which non-C files are compiled in). Anything 
you can guarantee is never compiled in can be ignored. For now it 
seems you're saying gen_compile_commands.py only returns C files, so 
that's all we can filter out.

Cheers,
Quentin
Daniel Turull May 15, 2025, 1:44 p.m. UTC | #4
Hi Quentin,
Thanks. I'll dig a bit more for the header files if I can modify the 2nd patch in the series. [PATCH v4 2/3] kernel: add support to extract compiled files

About only excluding the C files: this is what the current code in spdx is doing (unless I made a mistake). For anything other than a C file, is_compiled_source always returns True, so the file is included in the SPDX.

Probably we could look at the file type that we have in the list, so it can be more dynamic.

I'll send a new version in a few days, in case there are more comments.

Currently there are 399 CVEs with header files (.h) and none with Rust files (.rs) in the data from https://git.kernel.org/pub/scm/linux/security/vulns.git/

Best regards,

Daniel
> -----Original Message-----
> From: Quentin Schulz <quentin.schulz@cherry.de>
> Sent: Thursday, 15 May 2025 15:33
> To: Daniel Turull <daniel.turull@ericsson.com>; openembedded-
> core@lists.openembedded.org
> Cc: Joshua Watt <JPEWhacker@gmail.com>; Peter Marko
> <peter.marko@siemens.com>
> Subject: Re: [OE-core] [PATCH v4 1/3] spdx: add option to include only compiled
> sources
> 
> Hi Daniel,
> 
> On 5/15/25 3:12 PM, Daniel Turull wrote:
> > Hi Quentin,
> > Thanks for the feedback. That's a good point for the header files and
> > the rust files. I'll need to find a better way to extract them, since
> > scripts/clang-tools/gen_compile_commands.py only extracts the commands
> > and includes only the c files. So unless we don't have better info on
> > the files used, we should not exclude any header file. Do you
> 
> I would then suggest to include all files by default and only exclude C files if they
> do not appear in the output of scripts/clang-tools/gen_compile_commands.py so
> you cover everything and allowlist only things you know are safe.
> 
> know any better script to extract the compiled files from the kernel?
> >
> 
> No clue, sorry.
> 
> > The current code in the spdx class is supposed to only ignore the c files that are
> not compiled, (so been conservative on what to remove) but probably the script
> that I have in [PATCH v4 3/3] improve_kernel_cve_report: add script for
> postprocesing of kernel CVE data.
> >
> > Needs to be updated, that if the CVE is not in a c file is not ignored, unless we
> have the header files in the list of compiled files.
> >
> 
> Anything that is NOT a C file should NOT be ignored (except if you have a tool that
> allows to know which non-C files are compiled in). Anything you can guarantee is
> never compiled in, then you can ignore. For now it seems you're saying
> gen_compile_commands.py only returns C files so that's all we can filter out.
> 
> Cheers,
> Quentin
Quentin Schulz May 15, 2025, 1:58 p.m. UTC | #5
Hi Daniel,

On 5/15/25 3:44 PM, Daniel Turull wrote:
> Hi Quentin,
> Thanks. I'll dig a bit more for the header files if I can modify the 2nd patch in the series. [PATCH v4 2/3] kernel: add support to extract compiled files
> 
> About only excluding the c files, this is what the current code in spdx is doing (unless I made a mistake), anything other than c file, it returns always true in the is_compiled_source, so it is included in the spdx.
> 

My apologies, I clearly misread the code.

Please make the comment a bit clearer, like saying that we currently 
only support filtering C files, so we need to assume non-C files are 
compiled.

Re-reading the code, I'm wondering what you're actually trying to filter 
out with

if filename.rfind(".mod.c") > 0

? I see there are hidden files in my local build dir matching *.mod.cmd. 
Were you trying to catch those as well as *.mod.c?

Otherwise, you could simply check whether the file ends with .c with:

filename.endswith(".c")

If you want to check for mod.cmd as well:

filename.endswith((".c", "mod.cmd"))

You could also simply replace

     if filename in compiled_sources:
         return True
     return False

with

     return filename in compiled_sources

Cheers,
Quentin
Richard Purdie May 15, 2025, 2:03 p.m. UTC | #6
On Thu, 2025-05-15 at 13:12 +0000, Daniel Turull via lists.openembedded.org wrote:
> Hi Quentin,
> Thanks for the feedback. That's a good point for the header files and
> the rust files. I'll need to find a better way to extract them, since
> scripts/clang-tools/gen_compile_commands.py only extracts the
> commands and includes only the c files. So unless we don't have
> better info on the files used, we should not exclude any header file.
> Do you know any better script to extract the compiled files from the
> kernel?
> 
> The current code in the spdx class is supposed to only ignore the c
> files that are not compiled, (so been conservative on what to remove)
> but probably the script that I have in [PATCH v4 3/3]
> improve_kernel_cve_report: add script for postprocesing of kernel CVE
> data.
> 
> Needs to be updated, that if the CVE is not in a c file is not
> ignored, unless we have the header files in the list of compiled
> files.
> 
> I'll correct the minor things in a newer patch, and probably needs
> another iteration to have it more generic and flexible.

Don't we already have tooling which looks at the debug data and extracts
the list of source files from that as part of do_package? This is how
we know what to put into the source debug packages.

Cheers,

Richard
Daniel Turull May 15, 2025, 2:04 p.m. UTC | #7
Hi,
I'll clarify the comment, clearly it was not good enough.

I had to include the .mod.c check since the kernel includes some .mod.c files that were filtered out when looking only for .c files.
I probably need to use another function, or another search that looks at the whole filename and takes the extension from the first "."

Thanks
Daniel

> -----Original Message-----
> From: Quentin Schulz <quentin.schulz@cherry.de>
> Sent: Thursday, 15 May 2025 15:59
> To: Daniel Turull <daniel.turull@ericsson.com>; openembedded-
> core@lists.openembedded.org
> Cc: Joshua Watt <JPEWhacker@gmail.com>; Peter Marko
> <peter.marko@siemens.com>
> Subject: Re: [OE-core] [PATCH v4 1/3] spdx: add option to include only compiled
> sources
> 
> Hi Daniel,
> 
> On 5/15/25 3:44 PM, Daniel Turull wrote:
> > Hi Quentin,
> > Thanks. I'll dig a bit more for the header files if I can modify the
> > 2nd patch in the series. [PATCH v4 2/3] kernel: add support to extract
> > compiled files
> >
> > About only excluding the c files, this is what the current code in spdx is doing
> (unless I made a mistake), anything other than c file, it returns always true in the
> is_compiled_source, so it is included in the spdx.
> >
> 
> My apologies, I clearly misread the code.
> 
> Please make the comment a bit clearer, like saying that we currently only support
> filtering C files, so we need to assume non-C files are compiled.
> 
> Re-reading the code, I'm wondering what you're actually trying to filter out with
> 
> if filename.rfind(".mod.c") > 0
> 
> ? I see there are hidden files in my local build dir matching *.mod.cmd.
> Were you trying to catch those as well as *.mod.c?
> 
> Otherwise, You could simply check whether the file ends with .c with:
> 
> filename.endswith(".c")
> 
> If you want to check for mod.cmd as well:
> 
> filename.endswith((".c", "mod.cmd"))
> 
> You could also simply replace
> 
>      if filename in compiled_sources:
>          return True
>      return False
> 
> with
> 
>      return filename in compiled_sources
> 
> Cheers,
> Quentin
Daniel Turull May 15, 2025, 2:09 p.m. UTC | #8
Hi,
Thanks for the pointer Richard. I'll look into it, I clearly missed it.

I want the patches to be as unintrusive as possible, and if the data is already there, we need less code to extract the used source code.
Thanks
Daniel

> -----Original Message-----
> From: Richard Purdie <richard.purdie@linuxfoundation.org>
> Sent: Thursday, 15 May 2025 16:03
> To: Daniel Turull <daniel.turull@ericsson.com>; Quentin Schulz
> <quentin.schulz@cherry.de>; openembedded-core@lists.openembedded.org
> Cc: Joshua Watt <JPEWhacker@gmail.com>; Peter Marko
> <peter.marko@siemens.com>
> Subject: Re: [OE-core] [PATCH v4 1/3] spdx: add option to include only compiled
> sources
> 
> On Thu, 2025-05-15 at 13:12 +0000, Daniel Turull via lists.openembedded.org
> wrote:
> > Hi Quentin,
> > Thanks for the feedback. That's a good point for the header files and
> > the rust files. I'll need to find a better way to extract them, since
> > scripts/clang-tools/gen_compile_commands.py only extracts the commands
> > and includes only the c files. So unless we don't have better info on
> > the files used, we should not exclude any header file.
> > Do you know any better script to extract the compiled files from the
> > kernel?
> >
> > The current code in the spdx class is supposed to only ignore the c
> > files that are not compiled, (so been conservative on what to remove)
> > but probably the script that I have in [PATCH v4 3/3]
> > improve_kernel_cve_report: add script for postprocesing of kernel CVE
> > data.
> >
> > Needs to be updated, that if the CVE is not in a c file is not
> > ignored, unless we have the header files in the list of compiled
> > files.
> >
> > I'll correct the minor things in a newer patch, and probably needs
> > another iteration to have it more generic and flexible.
> 
> Don't we already have tooling which look at the debug data and extract the list of
> source files from that as part of do_package? This is how we know what to put
> into the source debug packages?
> 
> Cheers,
> 
> Richard
Richard Purdie May 15, 2025, 2:21 p.m. UTC | #9
On Thu, 2025-05-15 at 14:09 +0000, Daniel Turull wrote:
> Thanks for the pointer Richard. I'll look into it, I clearly missed
> it.
> 
> I want to have the patches as less intrusive as possible and if the
> data is there, the less code we need to extract the used source code.

Have a look at:

./meta/recipes-devtools/dwarfsrcfiles/files/dwarfsrcfiles.c

and how do_package ends up using that.

Cheers,

Richard
Daniel Turull May 15, 2025, 2:24 p.m. UTC | #10
Thank you very much for the pointer Richard 
diff mbox series

Patch

diff --git a/meta/classes/create-spdx-2.2.bbclass b/meta/classes/create-spdx-2.2.bbclass
index 7e8f8b9ff5..dd8ee6ecbe 100644
--- a/meta/classes/create-spdx-2.2.bbclass
+++ b/meta/classes/create-spdx-2.2.bbclass
@@ -137,6 +137,11 @@  def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv
     spdx_files = []
 
     file_counter = 1
+
+    check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
+    if check_compiled_sources:
+        compiled_sources = oe.spdx_common.get_compiled_sources(d)
+        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
     for subdir, dirs, files in os.walk(topdir):
         dirs[:] = [d for d in dirs if d not in ignore_dirs]
         if subdir == str(topdir):
@@ -147,6 +152,10 @@  def add_package_files(d, doc, spdx_pkg, topdir, get_spdxid, get_types, *, archiv
             filename = str(filepath.relative_to(topdir))
 
             if not filepath.is_symlink() and filepath.is_file():
+                # Check if file is compiled
+                if check_compiled_sources:
+                     if not oe.spdx_common.is_compiled_source(file, compiled_sources):
+                          break
                 spdx_file = oe.spdx.SPDXFile()
                 spdx_file.SPDXID = get_spdxid(file_counter)
                 for t in get_types(filepath):
diff --git a/meta/classes/spdx-common.bbclass b/meta/classes/spdx-common.bbclass
index 713a7fc651..e9dde34513 100644
--- a/meta/classes/spdx-common.bbclass
+++ b/meta/classes/spdx-common.bbclass
@@ -26,6 +26,9 @@  SPDX_TOOL_VERSION ??= "1.0"
 SPDXRUNTIMEDEPLOY = "${SPDXDIR}/runtime-deploy"
 
 SPDX_INCLUDE_SOURCES ??= "0"
+SPDX_INCLUDE_COMPILED_SOURCES ??= "0"
+SPDX_COMPILED_SOURCES_DIR ??= "${LOG_DIR}/spdx-compiled/${PN}"
+SPDX_COMPILED_SOURCES ??= "${SPDX_FILES_DIR}/compiled_src-${BP}.txt"
 
 SPDX_UUID_NAMESPACE ??= "sbom.openembedded.org"
 SPDX_NAMESPACE_PREFIX ??= "http://spdx.org/spdxdocs"
diff --git a/meta/lib/oe/spdx30_tasks.py b/meta/lib/oe/spdx30_tasks.py
index 61d7ba45e3..083e004330 100644
--- a/meta/lib/oe/spdx30_tasks.py
+++ b/meta/lib/oe/spdx30_tasks.py
@@ -156,6 +156,11 @@  def add_package_files(
         bb.note(f"Skip {topdir}")
         return spdx_files
 
+    check_compiled_sources = d.getVar("SPDX_INCLUDE_COMPILED_SOURCES") == "1"
+    if check_compiled_sources:
+        compiled_sources = oe.spdx_common.get_compiled_sources(d)
+        bb.debug(1, f"Total compiled files: {len(compiled_sources)}")
+
     for subdir, dirs, files in os.walk(topdir, onerror=walk_error):
         dirs[:] = [d for d in dirs if d not in ignore_dirs]
         if subdir == str(topdir):
@@ -167,6 +172,10 @@  def add_package_files(
             filepath = Path(subdir) / file
             if filepath.is_symlink() or not filepath.is_file():
                 continue
+            # Check if file is compiled
+            if check_compiled_sources:
+                 if not oe.spdx_common.is_compiled_source(file, compiled_sources):
+                      break
 
             filename = str(filepath.relative_to(topdir))
             file_purposes = get_purposes(filepath)
diff --git a/meta/lib/oe/spdx_common.py b/meta/lib/oe/spdx_common.py
index 4caefc7673..e1b7f576dd 100644
--- a/meta/lib/oe/spdx_common.py
+++ b/meta/lib/oe/spdx_common.py
@@ -242,3 +242,36 @@  def fetch_data_to_uri(fd, name):
         uri = uri + "@" + fd.revision
 
     return uri
+
+
+def is_compiled_source (filename, compiled_sources):
+    """
+    Check if the file, is a compiled file
+    """
+    import os
+    # If we don't have compiled source, we asume all are compiled.
+    if len(compiled_sources) == 0:
+        return True
+    _, extension = os.path.splitext(filename)
+    # Special case, that we need to ignore, since this is not a source file
+    # We filter .c files
+    if filename.rfind(".mod.c") > 0 or extension != ".c":
+        return True
+    # Check that the c file is in the list
+    if filename in compiled_sources:
+        return True
+    return False
+
+def get_compiled_sources(d):
+    """
+    Return compiled files from the SPDX_COMPILED_FILES file
+    """
+    cfiles = []
+    sources = d.getVar('SPDX_COMPILED_SOURCES')
+    if not sources:
+        return cfiles
+    if not os.path.isfile(sources):
+        return cfiles
+    with open(sources, 'r') as f:
+        cfiles = [line.strip() for line in f]
+    return cfiles