[v3] checksum: Support uri formatted file list

Message ID 20220206134507.123558-1-ptsneves@gmail.com
State New
Headers show
Series [v3] checksum: Support uri formatted file list | expand

Commit Message

Paulo Neves Feb. 6, 2022, 1:45 p.m. UTC
Before this commit, if the file list for checksumming had
files names with spaces there would be a crash. This happened
due to filelist.split breaking on the file names instead of
on k:v boundaries. Now we validate this case and emit a fatal
error if such case is found. This needs to be fatal as the split
will generate broken k:v from then on.

Instead of putting literal spaces in the file list the user
should urlencode the file names and if they contain coded spaces
they will be decoded. This is consistent with the current
practice where file names are urlencoded. A reproducer of the
issue this commit fixes, was to pass a do_compile[file-checksums]
list with files containing spaces in their names, urlencoded or
literal.

Change-Id: I6ac4f1cffbb86e913883491d46e8cc69a028e992
Signed-off-by: Paulo Neves <ptsneves@gmail.com>
---
 bitbake/lib/bb/checksum.py                   |  9 +++++++--
 bitbake/lib/bb/fetch2/__init__.py            |  6 +++---
 meta/classes/base.bbclass                    |  4 +++-
 meta/lib/oeqa/selftest/cases/lic_checksum.py | 18 ++++++++++++++++++
 4 files changed, 31 insertions(+), 6 deletions(-)

Comments

Paulo Neves Feb. 6, 2022, 1:50 p.m. UTC | #1
The current patch passes the newly created test: both on the SRC_URI as 
well as LIC_FILES_CHKSUM. Let me know if there is anything else needed 
for a merge :)


Paulo Neves
On 2/6/22 14:45, Paulo Neves wrote:
> Before this commit, if the file list for checksumming had
> files names with spaces there would be a crash. This happened
> due to filelist.split breaking on the file names instead of
> on k:v boundaries. Now we validate this case and emit a fatal
> error if such case is found. This needs to be fatal as the split
> will generate broken k:v from then on.
>
> Instead of putting literal spaces in the file list the user
> should urlencode the file names and if they contain coded spaces
> they will be decoded. This is consistent with the current
> practice where file names are urlencoded. A reproducer of the
> issue this commit fixes, was to pass a do_compile[file-checksums]
> list with files containing spaces in their names, urlencoded or
> literal.
>
> Change-Id: I6ac4f1cffbb86e913883491d46e8cc69a028e992
> Signed-off-by: Paulo Neves <ptsneves@gmail.com>
> ---
>   bitbake/lib/bb/checksum.py                   |  9 +++++++--
>   bitbake/lib/bb/fetch2/__init__.py            |  6 +++---
>   meta/classes/base.bbclass                    |  4 +++-
>   meta/lib/oeqa/selftest/cases/lic_checksum.py | 18 ++++++++++++++++++
>   4 files changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/bitbake/lib/bb/checksum.py b/bitbake/lib/bb/checksum.py
> index fb8a77f6ab..4d34baffcc 100644
> --- a/bitbake/lib/bb/checksum.py
> +++ b/bitbake/lib/bb/checksum.py
> @@ -8,6 +8,7 @@
>   import glob
>   import operator
>   import os
> +import urllib
>   import stat
>   import bb.utils
>   import logging
> @@ -110,10 +111,14 @@ class FileChecksumCache(MultiProcessCache):
>   
>           checksums = []
>           for pth in filelist.split():
> -            exist = pth.split(":")[1]
> +            spl = pth.split(':')
> +            if len(spl) != 2:
> +                bb.fatal("found unformatted path in filelist " + pth)
> +
> +            exist = spl[1]
>               if exist == "False":
>                   continue
> -            pth = pth.split(":")[0]
> +            pth = urllib.parse.unquote(spl[0])
>               if '*' in pth:
>                   # Handle globs
>                   for f in glob.glob(pth):
> diff --git a/bitbake/lib/bb/fetch2/__init__.py b/bitbake/lib/bb/fetch2/__init__.py
> index d37174185a..5cdd150d6b 100644
> --- a/bitbake/lib/bb/fetch2/__init__.py
> +++ b/bitbake/lib/bb/fetch2/__init__.py
> @@ -1214,13 +1214,13 @@ def get_checksum_file_list(d):
>               paths = ud.method.localpaths(ud, d)
>               for f in paths:
>                   pth = ud.decodedurl
> -                if f.startswith(dl_dir):
> +                if pth.startswith(dl_dir):
>                       # The local fetcher's behaviour is to return a path under DL_DIR if it couldn't find the file anywhere else
> -                    if os.path.exists(f):
> +                    if os.path.exists(pth):
>                           bb.warn("Getting checksum for %s SRC_URI entry %s: file not found except in DL_DIR" % (d.getVar('PN'), os.path.basename(f)))
>                       else:
>                           bb.warn("Unable to get checksum for %s SRC_URI entry %s: file could not be found" % (d.getVar('PN'), os.path.basename(f)))
> -                filelist.append(f + ":" + str(os.path.exists(f)))
> +                filelist.append(urllib.parse.quote(f) + ":" + str(os.path.exists(pth)))
>   
>       return " ".join(filelist)
>   
> diff --git a/meta/classes/base.bbclass b/meta/classes/base.bbclass
> index 5f4956a1d3..8e6cf84479 100644
> --- a/meta/classes/base.bbclass
> +++ b/meta/classes/base.bbclass
> @@ -94,6 +94,8 @@ def extra_path_elements(d):
>   PATH:prepend = "${@extra_path_elements(d)}"
>   
>   def get_lic_checksum_file_list(d):
> +    import urllib
> +
>       filelist = []
>       lic_files = d.getVar("LIC_FILES_CHKSUM") or ''
>       tmpdir = d.getVar("TMPDIR")
> @@ -113,7 +115,7 @@ def get_lic_checksum_file_list(d):
>               if path[0] == '/':
>                   if path.startswith((tmpdir, s, b, workdir)):
>                       continue
> -                filelist.append(path + ":" + str(os.path.exists(path)))
> +                filelist.append(urllib.parse.quote(path) + ":" + str(os.path.exists(path)))
>           except bb.fetch.MalformedUrl:
>               bb.fatal(d.getVar('PN') + ": LIC_FILES_CHKSUM contains an invalid URL: " + url)
>       return " ".join(filelist)
> diff --git a/meta/lib/oeqa/selftest/cases/lic_checksum.py b/meta/lib/oeqa/selftest/cases/lic_checksum.py
> index 91021ac335..6bdbac3ba2 100644
> --- a/meta/lib/oeqa/selftest/cases/lic_checksum.py
> +++ b/meta/lib/oeqa/selftest/cases/lic_checksum.py
> @@ -4,6 +4,7 @@
>   
>   import os
>   import tempfile
> +import urllib
>   
>   from oeqa.selftest.case import OESelftestTestCase
>   from oeqa.utils.commands import bitbake
> @@ -11,6 +12,23 @@ from oeqa.utils import CommandError
>   
>   class LicenseTests(OESelftestTestCase):
>   
> +    def test_checksum_with_space(self):
> +        bitbake_cmd = '-c populate_lic emptytest'
> +
> +        lic_file, lic_path = tempfile.mkstemp(" -afterspace")
> +        os.close(lic_file)
> +        #self.track_for_cleanup(lic_path)
> +
> +        self.write_config("INHERIT:remove = \"report-error\"")
> +
> +        self.write_recipeinc('emptytest', """
> +INHIBIT_DEFAULT_DEPS = "1"
> +LIC_FILES_CHKSUM = "file://%s;md5=d41d8cd98f00b204e9800998ecf8427e"
> +SRC_URI = "file://%s;md5=d41d8cd98f00b204e9800998ecf8427e"
> +""" % (urllib.parse.quote(lic_path), urllib.parse.quote(lic_path)))
> +        result = bitbake(bitbake_cmd)
> +
> +
>       # Verify that changing a license file that has an absolute path causes
>       # the license qa to fail due to a mismatched md5sum.
>       def test_nonmatching_checksum(self):
Richard Purdie Feb. 7, 2022, 10:06 a.m. UTC | #2
On Sun, 2022-02-06 at 14:45 +0100, Paulo Neves wrote:
> Before this commit, if the file list for checksumming had
> files names with spaces there would be a crash. This happened
> due to filelist.split breaking on the file names instead of
> on k:v boundaries. Now we validate this case and emit a fatal
> error if such case is found. This needs to be fatal as the split
> will generate broken k:v from then on.
> 
> Instead of putting literal spaces in the file list the user
> should urlencode the file names and if they contain coded spaces
> they will be decoded. This is consistent with the current
> practice where file names are urlencoded. A reproducer of the
> issue this commit fixes, was to pass a do_compile[file-checksums]
> list with files containing spaces in their names, urlencoded or
> literal.
> 
> Change-Id: I6ac4f1cffbb86e913883491d46e8cc69a028e992
> Signed-off-by: Paulo Neves <ptsneves@gmail.com>
> ---
>  bitbake/lib/bb/checksum.py                   |  9 +++++++--
>  bitbake/lib/bb/fetch2/__init__.py            |  6 +++---
>  meta/classes/base.bbclass                    |  4 +++-
>  meta/lib/oeqa/selftest/cases/lic_checksum.py | 18 ++++++++++++++++++
>  4 files changed, 31 insertions(+), 6 deletions(-)
> 

Since this touches both bitbake and OE-Core I had to split the patch into two to
be able to test it. I did that and found that one of the other selftests fails
when it is applied:

oe-selftest -r layerappend.LayerAppendTests.test_layer_appends

e.g.:

https://autobuilder.yoctoproject.org/typhoon/#/builders/79/builds/3128/steps/14/logs/stdio

but it does this consistently and reproducibly. We'll need to get to the bottom
of that potential regression...

Cheers,

Richard
Paulo Neves Feb. 7, 2022, 10:16 a.m. UTC | #3
Thanks Richard.

I will try to have a look and fix it :)

Paulo Neves

On 2/7/22 11:06, Richard Purdie wrote:
> oe-selftest -r layerappend.LayerAppendTests.test_layer_appends
Richard Purdie April 11, 2022, 2:28 p.m. UTC | #4
On Mon, 2022-02-07 at 10:06 +0000, Richard Purdie via lists.openembedded.org
wrote:
> On Sun, 2022-02-06 at 14:45 +0100, Paulo Neves wrote:
> > Before this commit, if the file list for checksumming had
> > files names with spaces there would be a crash. This happened
> > due to filelist.split breaking on the file names instead of
> > on k:v boundaries. Now we validate this case and emit a fatal
> > error if such case is found. This needs to be fatal as the split
> > will generate broken k:v from then on.
> > 
> > Instead of putting literal spaces in the file list the user
> > should urlencode the file names and if they contain coded spaces
> > they will be decoded. This is consistent with the current
> > practice where file names are urlencoded. A reproducer of the
> > issue this commit fixes, was to pass a do_compile[file-checksums]
> > list with files containing spaces in their names, urlencoded or
> > literal.
> > 
> > Change-Id: I6ac4f1cffbb86e913883491d46e8cc69a028e992
> > Signed-off-by: Paulo Neves <ptsneves@gmail.com>
> > ---
> >  bitbake/lib/bb/checksum.py                   |  9 +++++++--
> >  bitbake/lib/bb/fetch2/__init__.py            |  6 +++---
> >  meta/classes/base.bbclass                    |  4 +++-
> >  meta/lib/oeqa/selftest/cases/lic_checksum.py | 18 ++++++++++++++++++
> >  4 files changed, 31 insertions(+), 6 deletions(-)
> > 
> 
> Since this touches both bitbake and OE-Core I had to split the patch into two to
> be able to test it. I did that and found that one of the other selftests fails
> when it is applied:
> 
> oe-selftest -r layerappend.LayerAppendTests.test_layer_appends
> 
> e.g.:
> 
> https://autobuilder.yoctoproject.org/typhoon/#/builders/79/builds/3128/steps/14/logs/stdio
> 
> but it does this consistently and reproducibly. We'll need to get to the bottom
> of that potential regression...

I'm a little sad we never got this resolved, I took a bit of a further look as
I'd been carrying the patches for a bit hoping this would get some follow up.

The issue is the change in get_checksum_file_list() always comes up as false for
str(os.path.exists(pth)) as it isn't a complete path. As such the change
definitely has a problem which needs to be resolved.

Cheers,

Richard
Paulo Neves April 11, 2022, 3:23 p.m. UTC | #5
Hey Richard,

Thanks for the ping. I will try to have a look at it this week. Sorry I 
got a bit lost with private stuff and forgot about it :(

Paulo Neves

On 4/11/22 16:28, Richard Purdie wrote:
> On Mon, 2022-02-07 at 10:06 +0000, Richard Purdie via lists.openembedded.org
> wrote:
>> On Sun, 2022-02-06 at 14:45 +0100, Paulo Neves wrote:
>>> Before this commit, if the file list for checksumming had
>>> files names with spaces there would be a crash. This happened
>>> due to filelist.split breaking on the file names instead of
>>> on k:v boundaries. Now we validate this case and emit a fatal
>>> error if such case is found. This needs to be fatal as the split
>>> will generate broken k:v from then on.
>>>
>>> Instead of putting literal spaces in the file list the user
>>> should urlencode the file names and if they contain coded spaces
>>> they will be decoded. This is consistent with the current
>>> practice where file names are urlencoded. A reproducer of the
>>> issue this commit fixes, was to pass a do_compile[file-checksums]
>>> list with files containing spaces in their names, urlencoded or
>>> literal.
>>>
>>> Change-Id: I6ac4f1cffbb86e913883491d46e8cc69a028e992
>>> Signed-off-by: Paulo Neves <ptsneves@gmail.com>
>>> ---
>>>   bitbake/lib/bb/checksum.py                   |  9 +++++++--
>>>   bitbake/lib/bb/fetch2/__init__.py            |  6 +++---
>>>   meta/classes/base.bbclass                    |  4 +++-
>>>   meta/lib/oeqa/selftest/cases/lic_checksum.py | 18 ++++++++++++++++++
>>>   4 files changed, 31 insertions(+), 6 deletions(-)
>>>
>> Since this touches both bitbake and OE-Core I had to split the patch into two to
>> be able to test it. I did that and found that one of the other selftests fails
>> when it is applied:
>>
>> oe-selftest -r layerappend.LayerAppendTests.test_layer_appends
>>
>> e.g.:
>>
>> https://autobuilder.yoctoproject.org/typhoon/#/builders/79/builds/3128/steps/14/logs/stdio
>>
>> but it does this consistently and reproducibly. We'll need to get to the bottom
>> of that potential regression...
> I'm a little sad we never got this resolved, I took a bit of a further look as
> I'd been carrying the patches for a bit hoping this would get some follow up.
>
> The issue is the change in get_checksum_file_list() always comes up as false for
> str(os.path.exists(pth)) as it isn't a complete path. As such the change
> definitely has a problem which needs to be resolved.
>
> Cheers,
>
> Richard
>
>
Richard Purdie April 11, 2022, 3:56 p.m. UTC | #6
On Mon, 2022-04-11 at 17:23 +0200, Paulo Neves wrote:
> Hey Richard,
> 
> Thanks for the ping. I will try to have a look at it this week. Sorry I 
> got a bit lost with private stuff and forgot about it :(

I've just sent a different patch which I think should fix the issue. I'll run it
through testing and may be able to merge my bitbake change with your test for
OE-Core. Changes are in master-next.

Cheers,

Richard
Paulo Neves April 12, 2022, 7:51 a.m. UTC | #7
Thanks. I see that it is [1]. I would not have come up with the regex 
fix, so your patch looks much better than mine. Still it looks like the 
whole logic should rely on the url being passed around in encoded form, 
and be decoded only at usage time. Is this what you mean by the code 
refactor?

Paulo Neves


[1] 
https://git.yoctoproject.org/poky/commit/?h=master-next&id=450878eb43f378431066d9e9fd6ae2ce5cc70565

On 4/11/22 17:56, Richard Purdie wrote:
> On Mon, 2022-04-11 at 17:23 +0200, Paulo Neves wrote:
>> Hey Richard,
>>
>> Thanks for the ping. I will try to have a look at it this week. Sorry I
>> got a bit lost with private stuff and forgot about it :(
> I've just sent a different patch which I think should fix the issue. I'll run it
> through testing and may be able to merge my bitbake change with your test for
> OE-Core. Changes are in master-next.
>
> Cheers,
>
> Richard
>
Richard Purdie April 12, 2022, 8:16 a.m. UTC | #8
On Tue, 2022-04-12 at 09:51 +0200, Paulo Neves wrote:
> Thanks. I see that it is [1]. I would not have come up with the regex 
> fix, so your patch looks much better than mine. Still it looks like the 
> whole logic should rely on the url being passed around in encoded form, 
> and be decoded only at usage time. Is this what you mean by the code 
> refactor?
> 
> Paulo Neves
> 
> 
> [1] 
> https://git.yoctoproject.org/poky/commit/?h=master-next&id=450878eb43f378431066d9e9fd6ae2ce5cc70565

That regex is also already used in cache.py so at the very least we should
refactor that code and merge with this new piece, probably in the form of a
generator.

The issue with changing the encoding is the number of callers. You'd only
changed a small number of the call sites and it was going to cause problems
because of that. There was also confusion in the patch between the decode and
encode points. The data at one point was not always an absolute path leading to
"False" for 'exists' tests when the file did actually exist. This is why one of
the other selftests broke.

Cheers,

Richard

Patch

diff --git a/bitbake/lib/bb/checksum.py b/bitbake/lib/bb/checksum.py
index fb8a77f6ab..4d34baffcc 100644
--- a/bitbake/lib/bb/checksum.py
+++ b/bitbake/lib/bb/checksum.py
@@ -8,6 +8,7 @@ 
 import glob
 import operator
 import os
+import urllib
 import stat
 import bb.utils
 import logging
@@ -110,10 +111,14 @@  class FileChecksumCache(MultiProcessCache):
 
         checksums = []
         for pth in filelist.split():
-            exist = pth.split(":")[1]
+            spl = pth.split(':')
+            if len(spl) != 2:
+                bb.fatal("found unformatted path in filelist " + pth)
+
+            exist = spl[1]
             if exist == "False":
                 continue
-            pth = pth.split(":")[0]
+            pth = urllib.parse.unquote(spl[0])
             if '*' in pth:
                 # Handle globs
                 for f in glob.glob(pth):
diff --git a/bitbake/lib/bb/fetch2/__init__.py b/bitbake/lib/bb/fetch2/__init__.py
index d37174185a..5cdd150d6b 100644
--- a/bitbake/lib/bb/fetch2/__init__.py
+++ b/bitbake/lib/bb/fetch2/__init__.py
@@ -1214,13 +1214,13 @@  def get_checksum_file_list(d):
             paths = ud.method.localpaths(ud, d)
             for f in paths:
                 pth = ud.decodedurl
-                if f.startswith(dl_dir):
+                if pth.startswith(dl_dir):
                     # The local fetcher's behaviour is to return a path under DL_DIR if it couldn't find the file anywhere else
-                    if os.path.exists(f):
+                    if os.path.exists(pth):
                         bb.warn("Getting checksum for %s SRC_URI entry %s: file not found except in DL_DIR" % (d.getVar('PN'), os.path.basename(f)))
                     else:
                         bb.warn("Unable to get checksum for %s SRC_URI entry %s: file could not be found" % (d.getVar('PN'), os.path.basename(f)))
-                filelist.append(f + ":" + str(os.path.exists(f)))
+                filelist.append(urllib.parse.quote(f) + ":" + str(os.path.exists(pth)))
 
     return " ".join(filelist)
 
diff --git a/meta/classes/base.bbclass b/meta/classes/base.bbclass
index 5f4956a1d3..8e6cf84479 100644
--- a/meta/classes/base.bbclass
+++ b/meta/classes/base.bbclass
@@ -94,6 +94,8 @@  def extra_path_elements(d):
 PATH:prepend = "${@extra_path_elements(d)}"
 
 def get_lic_checksum_file_list(d):
+    import urllib
+
     filelist = []
     lic_files = d.getVar("LIC_FILES_CHKSUM") or ''
     tmpdir = d.getVar("TMPDIR")
@@ -113,7 +115,7 @@  def get_lic_checksum_file_list(d):
             if path[0] == '/':
                 if path.startswith((tmpdir, s, b, workdir)):
                     continue
-                filelist.append(path + ":" + str(os.path.exists(path)))
+                filelist.append(urllib.parse.quote(path) + ":" + str(os.path.exists(path)))
         except bb.fetch.MalformedUrl:
             bb.fatal(d.getVar('PN') + ": LIC_FILES_CHKSUM contains an invalid URL: " + url)
     return " ".join(filelist)
diff --git a/meta/lib/oeqa/selftest/cases/lic_checksum.py b/meta/lib/oeqa/selftest/cases/lic_checksum.py
index 91021ac335..6bdbac3ba2 100644
--- a/meta/lib/oeqa/selftest/cases/lic_checksum.py
+++ b/meta/lib/oeqa/selftest/cases/lic_checksum.py
@@ -4,6 +4,7 @@ 
 
 import os
 import tempfile
+import urllib
 
 from oeqa.selftest.case import OESelftestTestCase
 from oeqa.utils.commands import bitbake
@@ -11,6 +12,23 @@  from oeqa.utils import CommandError
 
 class LicenseTests(OESelftestTestCase):
 
+    def test_checksum_with_space(self):
+        bitbake_cmd = '-c populate_lic emptytest'
+
+        lic_file, lic_path = tempfile.mkstemp(" -afterspace")
+        os.close(lic_file)
+        #self.track_for_cleanup(lic_path)
+
+        self.write_config("INHERIT:remove = \"report-error\"")
+
+        self.write_recipeinc('emptytest', """
+INHIBIT_DEFAULT_DEPS = "1"
+LIC_FILES_CHKSUM = "file://%s;md5=d41d8cd98f00b204e9800998ecf8427e"
+SRC_URI = "file://%s;md5=d41d8cd98f00b204e9800998ecf8427e"
+""" % (urllib.parse.quote(lic_path), urllib.parse.quote(lic_path)))
+        result = bitbake(bitbake_cmd)
+
+
     # Verify that changing a license file that has an absolute path causes
     # the license qa to fail due to a mismatched md5sum.
     def test_nonmatching_checksum(self):