diff mbox series

[05/13] fetch2: remove duplicated code in url decode and encode

Message ID 20250207124701.14787-5-stefan.herbrechtsmeier-oss@weidmueller.com
State Accepted, archived
Commit a5d569c94700f04b8193c6bccae5af619931b00f
Headers show
Series [01/13] fetch2: do not decode user from file URI | expand

Commit Message

Stefan Herbrechtsmeier Feb. 7, 2025, 12:46 p.m. UTC
From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>

Use the URI class to decode and encode an URL. Remove duplicate code and
unify the behavior.

Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
---

 lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------
 1 file changed, 14 insertions(+), 52 deletions(-)

Comments

Peter Kjellerstedt Feb. 27, 2025, 2:58 a.m. UTC | #1
> -----Original Message-----
> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org
> Sent: den 7 februari 2025 13:47
> To: bitbake-devel@lists.openembedded.org
> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode
> 
> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> 
> Use the URI class to decode and encode an URL. Remove duplicate code and
> unify the behavior.
> 
> Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> ---
> 
>  lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------
>  1 file changed, 14 insertions(+), 52 deletions(-)
> 
> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
> index ab992b7ea..2a60e94ed 100644
> --- a/lib/bb/fetch2/__init__.py
> +++ b/lib/bb/fetch2/__init__.py
> @@ -353,49 +353,9 @@ def decodeurl(url):
>      user, password, parameters).
>      """
> 
> -    m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url)
> -    if not m:
> -        raise MalformedUrl(url)
> -
> -    type = m.group('type')
> -    location = m.group('location')
> -    if not location:
> -        raise MalformedUrl(url)
> -    user = m.group('user')
> -    parm = m.group('parm')
> -
> -    locidx = location.find('/')
> -    if locidx != -1 and type.lower() != 'file':
> -        host = location[:locidx]
> -        path = location[locidx:]
> -    elif type.lower() == 'file':
> -        host = ""
> -        path = location
> -        if user:
> -            path = user + '@' + path
> -            user = ""
> -    else:
> -        host = location
> -        path = "/"
> -    if user:
> -        m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user)
> -        if m:
> -            user = m.group('user')
> -            pswd = m.group('pswd')
> -    else:
> -        user = ''
> -        pswd = ''
> -
> -    p = collections.OrderedDict()
> -    if parm:
> -        for s in parm.split(';'):
> -            if s:
> -                if not '=' in s:
> -                    raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s))
> -                s1, s2 = s.split('=', 1)
> -                p[s1] = s2
> -
> -    return type, host, urllib.parse.unquote(path), user, pswd, p
> +    uri = URI(url)
> +    path = uri.path if uri.path else "/"
> +    return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params

Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a 
regular expression that contains a negative lookahead, which we do. We 
have the following in our PREMIRRORS:

git://(?!our\.internal\.git\.server).*/.*    http://downloads.yoctoproject.org/mirror/sources/ \n \

since there is never any point in trying to fetch our internal repositories 
from Yocto's server.

After the above change this now fails with:

  Exception: re.error: missing ), unterminated subpattern at position 0

since decodeurl() has stopped parsing the host part once it sees the "?", 
which the old code did not do. 

And while regular expressions with negative lookups may be a little 
esoteric, this change also silently breaks all the mirrors in the 
default MIRRORS that use npm://.*/?.* and https?://...

My recommendation is to revert the change to decodeurl() because the way 
the regular expression "URLs" in MIRRORS and PREMIRRORS are written do 
not match the expectations in urllib.parse.urlparse().

> 
>  def encodeurl(decoded):
>      """Encodes a URL from tokens (scheme, network location, path,
> @@ -406,24 +366,26 @@ def encodeurl(decoded):
> 
>      if not type:
>          raise MissingParameterError('type', "encoded from the data %s" % str(decoded))
> -    url = ['%s://' % type]
> +    uri = URI()
> +    uri.scheme = type
>      if user and type != "file":
> -        url.append("%s" % user)
> +        uri.username = user
>          if pswd:
> -            url.append(":%s" % pswd)
> -        url.append("@")
> +            uri.password = pswd
>      if host and type != "file":
> -        url.append("%s" % host)
> +        uri.hostname = host
>      if path:
>          # Standardise path to ensure comparisons work
>          while '//' in path:
>              path = path.replace("//", "/")
> -        url.append("%s" % urllib.parse.quote(path))
> +        uri.path = path
> +        if type == "file":
> +            # Use old not IETF compliant style
> +            uri.relative = False
>      if p:
> -        for parm in p:
> -            url.append(";%s=%s" % (parm, p[parm]))
> +        uri.params = p
> 
> -    return "".join(url)
> +    return str(uri)

I believe using the URI class for encodeurl() is fine so this part 
of the change should be ok to keep.

> 
>  def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None):
>      if not ud.url or not uri_find or not uri_replace:
> --
> 2.39.5

//Peter
Stefan Herbrechtsmeier Feb. 28, 2025, 7:12 a.m. UTC | #2
Am 27.02.2025 um 03:58 schrieb Peter Kjellerstedt:
>> -----Original Message-----
>> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org
>> Sent: den 7 februari 2025 13:47
>> To: bitbake-devel@lists.openembedded.org
>> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
>> Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode
>>
>> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
>>
>> Use the URI class to decode and encode an URL. Remove duplicate code and
>> unify the behavior.
>>
>> Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
>> ---
>>
>>   lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------
>>   1 file changed, 14 insertions(+), 52 deletions(-)
>>
>> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
>> index ab992b7ea..2a60e94ed 100644
>> --- a/lib/bb/fetch2/__init__.py
>> +++ b/lib/bb/fetch2/__init__.py
>> @@ -353,49 +353,9 @@ def decodeurl(url):
>>       user, password, parameters).
>>       """
>>
>> -    m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url)
>> -    if not m:
>> -        raise MalformedUrl(url)
>> -
>> -    type = m.group('type')
>> -    location = m.group('location')
>> -    if not location:
>> -        raise MalformedUrl(url)
>> -    user = m.group('user')
>> -    parm = m.group('parm')
>> -
>> -    locidx = location.find('/')
>> -    if locidx != -1 and type.lower() != 'file':
>> -        host = location[:locidx]
>> -        path = location[locidx:]
>> -    elif type.lower() == 'file':
>> -        host = ""
>> -        path = location
>> -        if user:
>> -            path = user + '@' + path
>> -            user = ""
>> -    else:
>> -        host = location
>> -        path = "/"
>> -    if user:
>> -        m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user)
>> -        if m:
>> -            user = m.group('user')
>> -            pswd = m.group('pswd')
>> -    else:
>> -        user = ''
>> -        pswd = ''
>> -
>> -    p = collections.OrderedDict()
>> -    if parm:
>> -        for s in parm.split(';'):
>> -            if s:
>> -                if not '=' in s:
>> -                    raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s))
>> -                s1, s2 = s.split('=', 1)
>> -                p[s1] = s2
>> -
>> -    return type, host, urllib.parse.unquote(path), user, pswd, p
>> +    uri = URI(url)
>> +    path = uri.path if uri.path else "/"
>> +    return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params
> Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a
> regular expression that contains a negative lookahead, which we do. We
> have the following in our PREMIRRORS:
>
> git://(?!our\.internal\.git\.server).*/.*    http://downloads.yoctoproject.org/mirror/sources/ \n \
>
> since there is never any point in trying to fetch our internal repositories
> from Yocto's server.
>
> After the above change this now fails with:
>
>    Exception: re.error: missing ), unterminated subpattern at position 0
>
> since decodeurl() has stopped parsing the host part once it sees the "?",
> which the old code did not do.
>
> And while regular expressions with negative lookups may be a little
> esoteric, this change also silently breaks all the mirrors in the
> default MIRRORS that use npm://.*/?.* and https?://...
>
> My recommendation is to revert the change to decodeurl() because the way
> the regular expression "URLs" in MIRRORS and PREMIRRORS are written do
> not match the expectations in urllib.parse.urlparse().

Please revert. The decodeurl, encodeurl and mirror code doesn't support 
query parameters. We have to rework the mirror code. It shouldn't 
interpret the regex as an url. Afterwards we could reapply the patch and 
add query support to the url function.

Can you please add the missing use cases to the MirrorUriTest to detect 
the breakage earlier.

>>   def encodeurl(decoded):
>>       """Encodes a URL from tokens (scheme, network location, path,
>> @@ -406,24 +366,26 @@ def encodeurl(decoded):
>>
>>       if not type:
>>           raise MissingParameterError('type', "encoded from the data %s" % str(decoded))
>> -    url = ['%s://' % type]
>> +    uri = URI()
>> +    uri.scheme = type
>>       if user and type != "file":
>> -        url.append("%s" % user)
>> +        uri.username = user
>>           if pswd:
>> -            url.append(":%s" % pswd)
>> -        url.append("@")
>> +            uri.password = pswd
>>       if host and type != "file":
>> -        url.append("%s" % host)
>> +        uri.hostname = host
>>       if path:
>>           # Standardise path to ensure comparisons work
>>           while '//' in path:
>>               path = path.replace("//", "/")
>> -        url.append("%s" % urllib.parse.quote(path))
>> +        uri.path = path
>> +        if type == "file":
>> +            # Use old not IETF compliant style
>> +            uri.relative = False
>>       if p:
>> -        for parm in p:
>> -            url.append(";%s=%s" % (parm, p[parm]))
>> +        uri.params = p
>>
>> -    return "".join(url)
>> +    return str(uri)
> I believe using the URI class for encodeurl() is fine so this part
> of the change should be ok to keep.
>
>>   def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None):
>>       if not ud.url or not uri_find or not uri_replace:
>> --
>> 2.39.5
> //Peter
>
Peter Kjellerstedt Feb. 28, 2025, 12:50 p.m. UTC | #3
> -----Original Message-----
> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier-oss@weidmueller.com>
> Sent: den 28 februari 2025 08:12
> To: Peter Kjellerstedt <peter.kjellerstedt@axis.com>; bitbake-devel@lists.openembedded.org
> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> Subject: Re: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode
> 
> Am 27.02.2025 um 03:58 schrieb Peter Kjellerstedt:
> >> -----Original Message-----
> >> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org
> >> Sent: den 7 februari 2025 13:47
> >> To: bitbake-devel@lists.openembedded.org
> >> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> >> Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode
> >>
> >> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> >>
> >> Use the URI class to decode and encode an URL. Remove duplicate code and
> >> unify the behavior.
> >>
> >> Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com>
> >> ---
> >>
> >>   lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------
> >>   1 file changed, 14 insertions(+), 52 deletions(-)
> >>
> >> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
> >> index ab992b7ea..2a60e94ed 100644
> >> --- a/lib/bb/fetch2/__init__.py
> >> +++ b/lib/bb/fetch2/__init__.py
> >> @@ -353,49 +353,9 @@ def decodeurl(url):
> >>       user, password, parameters).
> >>       """
> >>
> >> -    m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url)
> >> -    if not m:
> >> -        raise MalformedUrl(url)
> >> -
> >> -    type = m.group('type')
> >> -    location = m.group('location')
> >> -    if not location:
> >> -        raise MalformedUrl(url)
> >> -    user = m.group('user')
> >> -    parm = m.group('parm')
> >> -
> >> -    locidx = location.find('/')
> >> -    if locidx != -1 and type.lower() != 'file':
> >> -        host = location[:locidx]
> >> -        path = location[locidx:]
> >> -    elif type.lower() == 'file':
> >> -        host = ""
> >> -        path = location
> >> -        if user:
> >> -            path = user + '@' + path
> >> -            user = ""
> >> -    else:
> >> -        host = location
> >> -        path = "/"
> >> -    if user:
> >> -        m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user)
> >> -        if m:
> >> -            user = m.group('user')
> >> -            pswd = m.group('pswd')
> >> -    else:
> >> -        user = ''
> >> -        pswd = ''
> >> -
> >> -    p = collections.OrderedDict()
> >> -    if parm:
> >> -        for s in parm.split(';'):
> >> -            if s:
> >> -                if not '=' in s:
> >> -                    raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s))
> >> -                s1, s2 = s.split('=', 1)
> >> -                p[s1] = s2
> >> -
> >> -    return type, host, urllib.parse.unquote(path), user, pswd, p
> >> +    uri = URI(url)
> >> +    path = uri.path if uri.path else "/"
> >> +    return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params
> >
> > Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a
> > regular expression that contains a negative lookahead, which we do. We
> > have the following in our PREMIRRORS:
> >
> > git://(?!our\.internal\.git\.server).*/.* http://downloads.yoctoproject.org/mirror/sources/ \n \
> >
> > since there is never any point in trying to fetch our internal repositories
> > from Yocto's server.
> >
> > After the above change this now fails with:
> >
> >    Exception: re.error: missing ), unterminated subpattern at position 0
> >
> > since decodeurl() has stopped parsing the host part once it sees the "?",
> > which the old code did not do.
> >
> > And while regular expressions with negative lookups may be a little
> > esoteric, this change also silently breaks all the mirrors in the
> > default MIRRORS that use npm://.*/?.* and https?://...
> >
> > My recommendation is to revert the change to decodeurl() because the way
> > the regular expression "URLs" in MIRRORS and PREMIRRORS are written do
> > not match the expectations in urllib.parse.urlparse().
> 
> Please revert. The decodeurl, encodeurl and mirror code doesn't support
> query parameters. We have to rework the mirror code. It shouldn't
> interpret the regex as an url. Afterwards we could reapply the patch and
> add query support to the url function.

Ok, I will send a revert for decodeurl().

> Can you please add the missing use cases to the MirrorUriTest to detect
> the breakage earlier.

Yes, that was my plan.

> >>   def encodeurl(decoded):
> >>       """Encodes a URL from tokens (scheme, network location, path,
> >> @@ -406,24 +366,26 @@ def encodeurl(decoded):
> >>
> >>       if not type:
> >>           raise MissingParameterError('type', "encoded from the data %s" % str(decoded))
> >> -    url = ['%s://' % type]
> >> +    uri = URI()
> >> +    uri.scheme = type
> >>       if user and type != "file":
> >> -        url.append("%s" % user)
> >> +        uri.username = user
> >>           if pswd:
> >> -            url.append(":%s" % pswd)
> >> -        url.append("@")
> >> +            uri.password = pswd
> >>       if host and type != "file":
> >> -        url.append("%s" % host)
> >> +        uri.hostname = host
> >>       if path:
> >>           # Standardise path to ensure comparisons work
> >>           while '//' in path:
> >>               path = path.replace("//", "/")
> >> -        url.append("%s" % urllib.parse.quote(path))
> >> +        uri.path = path
> >> +        if type == "file":
> >> +            # Use old not IETF compliant style
> >> +            uri.relative = False
> >>       if p:
> >> -        for parm in p:
> >> -            url.append(";%s=%s" % (parm, p[parm]))
> >> +        uri.params = p
> >>
> >> -    return "".join(url)
> >> +    return str(uri)
> >
> > I believe using the URI class for encodeurl() is fine so this part
> > of the change should be ok to keep.
> >
> >>   def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None):
> >>       if not ud.url or not uri_find or not uri_replace:
> >> --
> >> 2.39.5
> >
> > //Peter

//Peter
diff mbox series

Patch

diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
index ab992b7ea..2a60e94ed 100644
--- a/lib/bb/fetch2/__init__.py
+++ b/lib/bb/fetch2/__init__.py
@@ -353,49 +353,9 @@  def decodeurl(url):
     user, password, parameters).
     """
 
-    m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url)
-    if not m:
-        raise MalformedUrl(url)
-
-    type = m.group('type')
-    location = m.group('location')
-    if not location:
-        raise MalformedUrl(url)
-    user = m.group('user')
-    parm = m.group('parm')
-
-    locidx = location.find('/')
-    if locidx != -1 and type.lower() != 'file':
-        host = location[:locidx]
-        path = location[locidx:]
-    elif type.lower() == 'file':
-        host = ""
-        path = location
-        if user:
-            path = user + '@' + path
-            user = ""
-    else:
-        host = location
-        path = "/"
-    if user:
-        m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user)
-        if m:
-            user = m.group('user')
-            pswd = m.group('pswd')
-    else:
-        user = ''
-        pswd = ''
-
-    p = collections.OrderedDict()
-    if parm:
-        for s in parm.split(';'):
-            if s:
-                if not '=' in s:
-                    raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s))
-                s1, s2 = s.split('=', 1)
-                p[s1] = s2
-
-    return type, host, urllib.parse.unquote(path), user, pswd, p
+    uri = URI(url)
+    path = uri.path if uri.path else "/"
+    return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params
 
 def encodeurl(decoded):
     """Encodes a URL from tokens (scheme, network location, path,
@@ -406,24 +366,26 @@  def encodeurl(decoded):
 
     if not type:
         raise MissingParameterError('type', "encoded from the data %s" % str(decoded))
-    url = ['%s://' % type]
+    uri = URI()
+    uri.scheme = type
     if user and type != "file":
-        url.append("%s" % user)
+        uri.username = user
         if pswd:
-            url.append(":%s" % pswd)
-        url.append("@")
+            uri.password = pswd
     if host and type != "file":
-        url.append("%s" % host)
+        uri.hostname = host
     if path:
         # Standardise path to ensure comparisons work
         while '//' in path:
             path = path.replace("//", "/")
-        url.append("%s" % urllib.parse.quote(path))
+        uri.path = path
+        if type == "file":
+            # Use old not IETF compliant style
+            uri.relative = False
     if p:
-        for parm in p:
-            url.append(";%s=%s" % (parm, p[parm]))
+        uri.params = p
 
-    return "".join(url)
+    return str(uri)
 
 def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None):
     if not ud.url or not uri_find or not uri_replace: