Message ID | 20250207124701.14787-5-stefan.herbrechtsmeier-oss@weidmueller.com |
---|---|
State | Accepted, archived |
Commit | a5d569c94700f04b8193c6bccae5af619931b00f |
Headers | show |
Series | [01/13] fetch2: do not decode user from file URI | expand |
> -----Original Message----- > From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org > Sent: den 7 februari 2025 13:47 > To: bitbake-devel@lists.openembedded.org > Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode > > From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > > Use the URI class to decode and encode an URL. Remove duplicate code and > unify the behavior. > > Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > --- > > lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------ > 1 file changed, 14 insertions(+), 52 deletions(-) > > diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py > index ab992b7ea..2a60e94ed 100644 > --- a/lib/bb/fetch2/__init__.py > +++ b/lib/bb/fetch2/__init__.py > @@ -353,49 +353,9 @@ def decodeurl(url): > user, password, parameters). > """ > > - m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url) > - if not m: > - raise MalformedUrl(url) > - > - type = m.group('type') > - location = m.group('location') > - if not location: > - raise MalformedUrl(url) > - user = m.group('user') > - parm = m.group('parm') > - > - locidx = location.find('/') > - if locidx != -1 and type.lower() != 'file': > - host = location[:locidx] > - path = location[locidx:] > - elif type.lower() == 'file': > - host = "" > - path = location > - if user: > - path = user + '@' + path > - user = "" > - else: > - host = location > - path = "/" > - if user: > - m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user) > - if m: > - user = m.group('user') > - pswd = m.group('pswd') > - else: > - user = '' > - pswd = '' > - > - p = collections.OrderedDict() > - if parm: > - for s in parm.split(';'): > - if s: > - if not '=' in s: > - raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s)) > - s1, s2 = s.split('=', 1) > - p[s1] = s2 > - > - return type, host, urllib.parse.unquote(path), user, pswd, p > + uri = URI(url) > + path = uri.path if uri.path else "/" > + return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a regular expression that contains a negative lookahead, which we do. We have the following in our PREMIRRORS: git://(?!our\.internal\.git\.server).*/.* http://downloads.yoctoproject.org/mirror/sources/ \n \ since there is never any point in trying to fetch our internal repositories from Yocto's server. After the above change this now fails with: Exception: re.error: missing ), unterminated subpattern at position 0 since decodeurl() has stopped parsing the host part once it sees the "?", which the old code did not do. And while regular expressions with negative lookups may be a little esoteric, this change also silently breaks all the mirrors in the default MIRRORS that use npm://.*/?.* and https?://... My recommendation is to revert the change to decodeurl() because the way the regular expression "URLs" in MIRRORS and PREMIRRORS are written do not match the expectations in urllib.parse.urlparse(). > > def encodeurl(decoded): > """Encodes a URL from tokens (scheme, network location, path, > @@ -406,24 +366,26 @@ def encodeurl(decoded): > > if not type: > raise MissingParameterError('type', "encoded from the data %s" % str(decoded)) > - url = ['%s://' % type] > + uri = URI() > + uri.scheme = type > if user and type != "file": > - url.append("%s" % user) > + uri.username = user > if pswd: > - url.append(":%s" % pswd) > - url.append("@") > + uri.password = pswd > if host and type != "file": > - url.append("%s" % host) > + uri.hostname = host > if path: > # Standardise path to ensure comparisons work > while '//' in path: > path = path.replace("//", "/") > - url.append("%s" % urllib.parse.quote(path)) > + uri.path = path > + if type == "file": > + # Use old not IETF compliant style > + uri.relative = False > if p: > - for parm in p: > - url.append(";%s=%s" % (parm, p[parm])) > + uri.params = p > > - return "".join(url) > + return str(uri) I believe using the URI class for encodeurl() is fine so this part of the change should be ok to keep. > > def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None): > if not ud.url or not uri_find or not uri_replace: > -- > 2.39.5 //Peter
Am 27.02.2025 um 03:58 schrieb Peter Kjellerstedt: >> -----Original Message----- >> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org >> Sent: den 7 februari 2025 13:47 >> To: bitbake-devel@lists.openembedded.org >> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> >> Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode >> >> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> >> >> Use the URI class to decode and encode an URL. Remove duplicate code and >> unify the behavior. >> >> Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> >> --- >> >> lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------ >> 1 file changed, 14 insertions(+), 52 deletions(-) >> >> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py >> index ab992b7ea..2a60e94ed 100644 >> --- a/lib/bb/fetch2/__init__.py >> +++ b/lib/bb/fetch2/__init__.py >> @@ -353,49 +353,9 @@ def decodeurl(url): >> user, password, parameters). >> """ >> >> - m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url) >> - if not m: >> - raise MalformedUrl(url) >> - >> - type = m.group('type') >> - location = m.group('location') >> - if not location: >> - raise MalformedUrl(url) >> - user = m.group('user') >> - parm = m.group('parm') >> - >> - locidx = location.find('/') >> - if locidx != -1 and type.lower() != 'file': >> - host = location[:locidx] >> - path = location[locidx:] >> - elif type.lower() == 'file': >> - host = "" >> - path = location >> - if user: >> - path = user + '@' + path >> - user = "" >> - else: >> - host = location >> - path = "/" >> - if user: >> - m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user) >> - if m: >> - user = m.group('user') >> - pswd = m.group('pswd') >> - else: >> - user = '' >> - pswd = '' >> - >> - p = collections.OrderedDict() >> - if parm: >> - for s in parm.split(';'): >> - if s: >> - if not '=' in s: >> - raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s)) >> - s1, s2 = s.split('=', 1) >> - p[s1] = s2 >> - >> - return type, host, urllib.parse.unquote(path), user, pswd, p >> + uri = URI(url) >> + path = uri.path if uri.path else "/" >> + return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params > Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a > regular expression that contains a negative lookahead, which we do. We > have the following in our PREMIRRORS: > > git://(?!our\.internal\.git\.server).*/.* http://downloads.yoctoproject.org/mirror/sources/ \n \ > > since there is never any point in trying to fetch our internal repositories > from Yocto's server. > > After the above change this now fails with: > > Exception: re.error: missing ), unterminated subpattern at position 0 > > since decodeurl() has stopped parsing the host part once it sees the "?", > which the old code did not do. > > And while regular expressions with negative lookups may be a little > esoteric, this change also silently breaks all the mirrors in the > default MIRRORS that use npm://.*/?.* and https?://... > > My recommendation is to revert the change to decodeurl() because the way > the regular expression "URLs" in MIRRORS and PREMIRRORS are written do > not match the expectations in urllib.parse.urlparse(). Please revert. The decodeurl, encodeurl and mirror code doesn't support query parameters. We have to rework the mirror code. It shouldn't interpret the regex as an url. Afterwards we could reapply the patch and add query support to the url function. Can you please add the missing use cases to the MirrorUriTest to detect the breakage earlier. >> def encodeurl(decoded): >> """Encodes a URL from tokens (scheme, network location, path, >> @@ -406,24 +366,26 @@ def encodeurl(decoded): >> >> if not type: >> raise MissingParameterError('type', "encoded from the data %s" % str(decoded)) >> - url = ['%s://' % type] >> + uri = URI() >> + uri.scheme = type >> if user and type != "file": >> - url.append("%s" % user) >> + uri.username = user >> if pswd: >> - url.append(":%s" % pswd) >> - url.append("@") >> + uri.password = pswd >> if host and type != "file": >> - url.append("%s" % host) >> + uri.hostname = host >> if path: >> # Standardise path to ensure comparisons work >> while '//' in path: >> path = path.replace("//", "/") >> - url.append("%s" % urllib.parse.quote(path)) >> + uri.path = path >> + if type == "file": >> + # Use old not IETF compliant style >> + uri.relative = False >> if p: >> - for parm in p: >> - url.append(";%s=%s" % (parm, p[parm])) >> + uri.params = p >> >> - return "".join(url) >> + return str(uri) > I believe using the URI class for encodeurl() is fine so this part > of the change should be ok to keep. > >> def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None): >> if not ud.url or not uri_find or not uri_replace: >> -- >> 2.39.5 > //Peter >
> -----Original Message----- > From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier-oss@weidmueller.com> > Sent: den 28 februari 2025 08:12 > To: Peter Kjellerstedt <peter.kjellerstedt@axis.com>; bitbake-devel@lists.openembedded.org > Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > Subject: Re: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode > > Am 27.02.2025 um 03:58 schrieb Peter Kjellerstedt: > >> -----Original Message----- > >> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Stefan Herbrechtsmeier via lists.openembedded.org > >> Sent: den 7 februari 2025 13:47 > >> To: bitbake-devel@lists.openembedded.org > >> Cc: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > >> Subject: [bitbake-devel] [PATCH 05/13] fetch2: remove duplicated code in url decode and encode > >> > >> From: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > >> > >> Use the URI class to decode and encode an URL. Remove duplicate code and > >> unify the behavior. > >> > >> Signed-off-by: Stefan Herbrechtsmeier <stefan.herbrechtsmeier@weidmueller.com> > >> --- > >> > >> lib/bb/fetch2/__init__.py | 66 +++++++++------------------------------ > >> 1 file changed, 14 insertions(+), 52 deletions(-) > >> > >> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py > >> index ab992b7ea..2a60e94ed 100644 > >> --- a/lib/bb/fetch2/__init__.py > >> +++ b/lib/bb/fetch2/__init__.py > >> @@ -353,49 +353,9 @@ def decodeurl(url): > >> user, password, parameters). > >> """ > >> > >> - m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url) > >> - if not m: > >> - raise MalformedUrl(url) > >> - > >> - type = m.group('type') > >> - location = m.group('location') > >> - if not location: > >> - raise MalformedUrl(url) > >> - user = m.group('user') > >> - parm = m.group('parm') > >> - > >> - locidx = location.find('/') > >> - if locidx != -1 and type.lower() != 'file': > >> - host = location[:locidx] > >> - path = location[locidx:] > >> - elif type.lower() == 'file': > >> - host = "" > >> - path = location > >> - if user: > >> - path = user + '@' + path > >> - user = "" > >> - else: > >> - host = location > >> - path = "/" > >> - if user: > >> - m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user) > >> - if m: > >> - user = m.group('user') > >> - pswd = m.group('pswd') > >> - else: > >> - user = '' > >> - pswd = '' > >> - > >> - p = collections.OrderedDict() > >> - if parm: > >> - for s in parm.split(';'): > >> - if s: > >> - if not '=' in s: > >> - raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s)) > >> - s1, s2 = s.split('=', 1) > >> - p[s1] = s2 > >> - > >> - return type, host, urllib.parse.unquote(path), user, pswd, p > >> + uri = URI(url) > >> + path = uri.path if uri.path else "/" > >> + return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params > > > > Unfortunately this does not work if you have a MIRRORS/PREMIRRORS with a > > regular expression that contains a negative lookahead, which we do. We > > have the following in our PREMIRRORS: > > > > git://(?!our\.internal\.git\.server).*/.* http://downloads.yoctoproject.org/mirror/sources/ \n \ > > > > since there is never any point in trying to fetch our internal repositories > > from Yocto's server. > > > > After the above change this now fails with: > > > > Exception: re.error: missing ), unterminated subpattern at position 0 > > > > since decodeurl() has stopped parsing the host part once it sees the "?", > > which the old code did not do. > > > > And while regular expressions with negative lookups may be a little > > esoteric, this change also silently breaks all the mirrors in the > > default MIRRORS that use npm://.*/?.* and https?://... > > > > My recommendation is to revert the change to decodeurl() because the way > > the regular expression "URLs" in MIRRORS and PREMIRRORS are written do > > not match the expectations in urllib.parse.urlparse(). > > Please revert. The decodeurl, encodeurl and mirror code doesn't support > query parameters. We have to rework the mirror code. It shouldn't > interpret the regex as an url. Afterwards we could reapply the patch and > add query support to the url function. Ok, I will send a revert for decodeurl(). > Can you please add the missing use cases to the MirrorUriTest to detect > the breakage earlier. Yes, that was my plan. > >> def encodeurl(decoded): > >> """Encodes a URL from tokens (scheme, network location, path, > >> @@ -406,24 +366,26 @@ def encodeurl(decoded): > >> > >> if not type: > >> raise MissingParameterError('type', "encoded from the data %s" % str(decoded)) > >> - url = ['%s://' % type] > >> + uri = URI() > >> + uri.scheme = type > >> if user and type != "file": > >> - url.append("%s" % user) > >> + uri.username = user > >> if pswd: > >> - url.append(":%s" % pswd) > >> - url.append("@") > >> + uri.password = pswd > >> if host and type != "file": > >> - url.append("%s" % host) > >> + uri.hostname = host > >> if path: > >> # Standardise path to ensure comparisons work > >> while '//' in path: > >> path = path.replace("//", "/") > >> - url.append("%s" % urllib.parse.quote(path)) > >> + uri.path = path > >> + if type == "file": > >> + # Use old not IETF compliant style > >> + uri.relative = False > >> if p: > >> - for parm in p: > >> - url.append(";%s=%s" % (parm, p[parm])) > >> + uri.params = p > >> > >> - return "".join(url) > >> + return str(uri) > > > > I believe using the URI class for encodeurl() is fine so this part > > of the change should be ok to keep. > > > >> def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None): > >> if not ud.url or not uri_find or not uri_replace: > >> -- > >> 2.39.5 > > > > //Peter //Peter
diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py index ab992b7ea..2a60e94ed 100644 --- a/lib/bb/fetch2/__init__.py +++ b/lib/bb/fetch2/__init__.py @@ -353,49 +353,9 @@ def decodeurl(url): user, password, parameters). """ - m = re.compile('(?P<type>[^:]*)://((?P<user>[^/;]+)@)?(?P<location>[^;]+)(;(?P<parm>.*))?').match(url) - if not m: - raise MalformedUrl(url) - - type = m.group('type') - location = m.group('location') - if not location: - raise MalformedUrl(url) - user = m.group('user') - parm = m.group('parm') - - locidx = location.find('/') - if locidx != -1 and type.lower() != 'file': - host = location[:locidx] - path = location[locidx:] - elif type.lower() == 'file': - host = "" - path = location - if user: - path = user + '@' + path - user = "" - else: - host = location - path = "/" - if user: - m = re.compile('(?P<user>[^:]+)(:?(?P<pswd>.*))').match(user) - if m: - user = m.group('user') - pswd = m.group('pswd') - else: - user = '' - pswd = '' - - p = collections.OrderedDict() - if parm: - for s in parm.split(';'): - if s: - if not '=' in s: - raise MalformedUrl(url, "The URL: '%s' is invalid: parameter %s does not specify a value (missing '=')" % (url, s)) - s1, s2 = s.split('=', 1) - p[s1] = s2 - - return type, host, urllib.parse.unquote(path), user, pswd, p + uri = URI(url) + path = uri.path if uri.path else "/" + return uri.scheme, uri.hostport, path, uri.username, uri.password, uri.params def encodeurl(decoded): """Encodes a URL from tokens (scheme, network location, path, @@ -406,24 +366,26 @@ def encodeurl(decoded): if not type: raise MissingParameterError('type', "encoded from the data %s" % str(decoded)) - url = ['%s://' % type] + uri = URI() + uri.scheme = type if user and type != "file": - url.append("%s" % user) + uri.username = user if pswd: - url.append(":%s" % pswd) - url.append("@") + uri.password = pswd if host and type != "file": - url.append("%s" % host) + uri.hostname = host if path: # Standardise path to ensure comparisons work while '//' in path: path = path.replace("//", "/") - url.append("%s" % urllib.parse.quote(path)) + uri.path = path + if type == "file": + # Use old not IETF compliant style + uri.relative = False if p: - for parm in p: - url.append(";%s=%s" % (parm, p[parm])) + uri.params = p - return "".join(url) + return str(uri) def uri_replace(ud, uri_find, uri_replace, replacements, d, mirrortarball=None): if not ud.url or not uri_find or not uri_replace: