[v3] fetch2/repo: Implement AUTOREV for repo fetcher

Message ID 20211115125435.1001033-1-jasper@fancydomain.eu
State New
Headers show
Series [v3] fetch2/repo: Implement AUTOREV for repo fetcher | expand

Commit Message

Jasper Orschulko Nov. 15, 2021, 12:54 p.m. UTC
From: Martin Koppehel <martin@mko.dev>

- Implement AUTOINC and submodule support for REPO provider
- Implement full srcrev support
- Add comments and fixup empty DL_DIR initialization
- Distinguish between artificial and plain rev
- Comments/documentation

The previous implementation of the repo fetcher could not handle updates
to the repo manifest file, nor deal with dynamic refspecs within this
manifest.

This patch fixes these shortcomings as follows:
During the recipe parsing phase, the repository containing the repo
manifest is cloned. This is done, as we need to parse the XML file
contained within, in order to discover all involved git repositories. A
combined hash is then calculated from the manifest repo, as well as any
git repo specified in the manifest. This hash is used for determining the
necessity of an update.

Additionally, the recipe will throw an error if the repo source is set to
a fixed revision but one or more repositories within the manifest
reference a dynamic refspec. This is done to ensure the reproducibility
of a version-pinned recipe.

Signed-off-by: Jasper Orschulko <Jasper.Orschulko@iris-sensing.com>
---
 lib/bb/fetch2/repo.py | 259 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 230 insertions(+), 29 deletions(-)

Patch

diff --git a/lib/bb/fetch2/repo.py b/lib/bb/fetch2/repo.py
index fa4cb814..706d37fa 100644
--- a/lib/bb/fetch2/repo.py
+++ b/lib/bb/fetch2/repo.py
@@ -3,6 +3,7 @@  BitBake "Fetch" repo (git) implementation
 
 """
 
+# Copyright (C) 2021 Martin Koppehel <martin@mko.dev>, iris-GmbH infrared & intelligent sensors
 # Copyright (C) 2009 Tom Rini <trini@embeddedalley.com>
 #
 # Based on git.py which is:
@@ -13,9 +14,11 @@  BitBake "Fetch" repo (git) implementation
 
 import os
 import bb
+import hashlib
+import xml.etree.ElementTree as ET
 from   bb.fetch2 import FetchMethod
 from   bb.fetch2 import runfetchcmd
-from   bb.fetch2 import logger
+
 
 class Repo(FetchMethod):
     """Class to fetch a module or modules from repo (git) repositories"""
@@ -27,46 +30,84 @@  class Repo(FetchMethod):
 
     def urldata_init(self, ud, d):
         """
-        We don"t care about the git rev of the manifests repository, but
-        we do care about the manifest to use.  The default is "default".
-        We also care about the branch or tag to be used.  The default is
-        "master".
+        We do care about the rev of the manifests repository, as well as the
+        manifest file. However, when SRCREV=AUTOINC, then we use the specified
+        branch in SRC_URI, with a fallback to master.
+        use sm=fetch to fetch possibly referenced submodules in repositories.
         """
 
         ud.basecmd = d.getVar("FETCHCMD_repo") or "/usr/bin/env repo"
+        ud.gitcmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0"
 
         ud.proto = ud.parm.get('protocol', 'git')
         ud.branch = ud.parm.get('branch', 'master')
+
+        ud.submodules = ud.parm.get('sm', 'fetch')
         ud.manifest = ud.parm.get('manifest', 'default.xml')
         if not ud.manifest.endswith('.xml'):
             ud.manifest += '.xml'
 
-        ud.localfile = d.expand("repo_%s%s_%s_%s.tar.gz" % (ud.host, ud.path.replace("/", "."), ud.manifest, ud.branch))
+        repodir = d.getVar("REPODIR") or (d.getVar("DL_DIR") + "/repo")
+        gitsrcname = "%s%s.%s" % (ud.host, ud.path.replace("/", "."), ud.manifest)
+        ud.codir = os.path.join(repodir, d.getVar("BPN"), gitsrcname)
+
+        if ud.user:
+            ud.username = ud.user + "@"
+        else:
+            ud.username = ""
+        ud.remoteRepo = "%s://%s%s%s" % (ud.proto, ud.username, ud.host, ud.path)
+
+        ud.repodir = os.path.join(ud.codir, "repo")
+        # a temporary directory to compute _latest_revision
+        ud.tempdir = os.path.join(ud.codir, "temp")
+        ud.stampfile = os.path.join(ud.codir, "__hash.txt")
+        ud.setup_revisions(d)
+
+        # ud.localfile is used to fill localpath, where the downloaded tarball is stored.
+        # in our case, we want something like repo_$GIT_URL_$MANIFEST_$SRCREV
+        # todo: do we want the packagename?
+        ud.localfile = "repo_%s%s_%s_%s.tar.gz" % (ud.host, ud.path.replace("/", "."), ud.manifest, ud.revision)
+
+    def need_update(self, ud, d):
+        if ud.revision.startswith("_"):
+          return True
+        return not os.path.exists(ud.localfile)
 
     def download(self, ud, d):
         """Fetch url"""
 
-        if os.access(os.path.join(d.getVar("DL_DIR"), ud.localfile), os.R_OK):
-            logger.debug("%s already exists (or was stashed). Skipping repo init / sync.", ud.localpath)
-            return
+        bb.utils.mkdirhier(ud.repodir)
 
-        repodir = d.getVar("REPODIR") or (d.getVar("DL_DIR") + "/repo")
-        gitsrcname = "%s%s" % (ud.host, ud.path.replace("/", "."))
-        codir = os.path.join(repodir, gitsrcname, ud.manifest)
-
-        if ud.user:
-            username = ud.user + "@"
+        # we want to run a repo init *always* in case the branch or manifest name changes.
+        # if not os.path.exists(os.path.join(repodir, ".repo")):
+        if ud.submodules == "fetch":
+            submodules = "--fetch-submodules"
         else:
-            username = ""
+            submodules = ""
 
-        repodir = os.path.join(codir, "repo")
-        bb.utils.mkdirhier(repodir)
-        if not os.path.exists(os.path.join(repodir, ".repo")):
-            bb.fetch2.check_network_access(d, "%s init -m %s -b %s -u %s://%s%s%s" % (ud.basecmd, ud.manifest, ud.branch, ud.proto, username, ud.host, ud.path), ud.url)
-            runfetchcmd("%s init -m %s -b %s -u %s://%s%s%s" % (ud.basecmd, ud.manifest, ud.branch, ud.proto, username, ud.host, ud.path), d, workdir=repodir)
+        # fixup the revision -> when it starts with underscore, it's an artificial one
+        # therefore we then use the specified remote branch used to generate
+        # the artificial revision in _latest_revision
+        realRevision = ud.revision
+        if ud.revision.startswith("_"):
+            realRevision = ud.branch
 
-        bb.fetch2.check_network_access(d, "%s sync %s" % (ud.basecmd, ud.url), ud.url)
-        runfetchcmd("%s sync" % ud.basecmd, d, workdir=repodir)
+        # always run repo init, because we might want to switch branch or manifests.
+        bb.fetch2.check_network_access(d, "%s init -m %s -b %s -u %s" % (ud.basecmd, ud.manifest, realRevision, ud.remoteRepo), ud.url)
+        runfetchcmd("%s init -m %s -b %s -u %s" % (ud.basecmd, ud.manifest, realRevision, ud.remoteRepo), d, workdir=ud.repodir)
+
+        bb.fetch2.check_network_access(d, "%s sync %s %s" % (ud.basecmd, submodules, ud.url), ud.url)
+        runfetchcmd("%s sync %s" % (ud.basecmd, submodules), d, workdir=ud.repodir)
+
+        # at this point, we have a checked out version, so make sure that revisions are pinned if the user requests that
+        if self._checkIsFixedRevision(ud, d, realRevision, ud.repodir):
+            # our manifest repo has a tag or a fixed sha revision => ensure all linked repos have the same.
+            def projectIterator(project, remote, revision):
+                if not self._checkIsFixedRevision(ud, d, revision, os.path.join(ud.repodir, project.get("path"))):
+                    raise bb.fetch2.FetchError("repo recipe is set to a fixed revision but project %s has variable revision %s.\nTo fix this, specify tags or SHA-hashes in your repo manifest file.\nThis is important to guarantee reproducibility." % (project.get('name'), revision), ud.url)
+
+            output = runfetchcmd("%s manifest" % ud.basecmd, d, workdir=ud.repodir)
+            self._scanRepoManifest(ud, d, output, projectIterator, content=True)
 
         scmdata = ud.parm.get("scmdata", "")
         if scmdata == "keep":
@@ -75,13 +116,173 @@  class Repo(FetchMethod):
             tar_flags = "--exclude='.repo' --exclude='.git'"
 
         # Create a cache
-        runfetchcmd("tar %s -czf %s %s" % (tar_flags, ud.localpath, os.path.join(".", "*") ), d, workdir=codir)
+        runfetchcmd("tar %s -czf %s %s" % (tar_flags, ud.localpath, os.path.join(".", "*")), d, workdir=ud.codir)
 
     def supports_srcrev(self):
-        return False
+        return True
+
+    def clean(self, ud, d):
+        """ clean the repo directory """
+
+        to_remove = [ud.localpath, ud.repodir, ud.tempdir, ud.stampfile]
+        # The localpath is a symlink to clonedir when it is cloned from a
+        # mirror, so remove both of them.
+        if os.path.islink(ud.localpath):
+            clonedir = os.path.realpath(ud.localpath)
+            to_remove.append(clonedir)
+
+        for r in to_remove:
+            if os.path.exists(r):
+                bb.utils.remove(r, True)
+
+    # this is taken from the git fetcher
+    def _lsremote(self, ud, d, search, repo):
+        """
+        Run git ls-remote with the specified search string
+        """
+        # Prevent recursion e.g. in OE if SRCPV is in PV, PV is in WORKDIR,
+        # and WORKDIR is in PATH (as a result of RSS), our call to
+        # runfetchcmd() exports PATH so this function will get called again (!)
+        # In this scenario the return call of the function isn't actually
+        # important - WORKDIR isn't needed in PATH to call git ls-remote
+        # anyway.
+        if d.getVar('_BB_REPO_IN_LSREMOTE', False):
+            return ''
+        d.setVar('_BB_REPO_IN_LSREMOTE', '1')
+        try:
+            cmd = "%s ls-remote \"%s\" %s" % \
+                (ud.gitcmd, repo, search)
+            if ud.proto.lower() != 'file':
+                bb.fetch2.check_network_access(d, cmd, ud.remoteRepo)
+            output = runfetchcmd(cmd, d, True)
+            if not output:
+                raise bb.fetch2.FetchError("The command %s gave empty output unexpectedly" % cmd, ud.url)
+        finally:
+            d.delVar('_BB_REPO_IN_LSREMOTE')
+        return output
+
+    # simple helper function to check if a specific object exists.
+    # used to check that things reference commits
+    def _revParse(self, ud, d, obj, repoDir, expect):
+        try:
+            cmd = "%s rev-parse -q %s" % (ud.gitcmd, obj)
+            runfetchcmd(cmd, d, workdir = repoDir)
+        except:
+            return not expect
+        return expect
+
+    # checks that the given revision is *not* a branch.
+    def _checkIsFixedRevision(self, ud, d, commithash, repodir):
+        if not self._revParse(ud, d, "refs/heads/%s" % commithash, repodir, False):
+            return False
+        return self._revParse(ud, d, "refs/tags/%s" % commithash, repodir, True) or self._revParse(ud, d, commithash, repodir, True)
+
+    def _resolveBranchToSHA1(self, ud, d, name, repo):
+        output = self._lsremote(ud, d, name, repo)
+        searchstring = "refs/heads/%s" % name
+        found = False
+        for line in output.strip().split('\n'):
+            sha1, ref = line.split()
+            if searchstring == ref:
+                return sha1
+                break
+
+        if not found:
+            raise bb.fetch2.FetchError("Could not determine remote ref!")
+
+    def _scanRepoManifest(self, ud, d, path, callback, content=False):
+        # parse the specified XML manifest
+        if content:
+          xml = ET.fromstring(path)
+        else:
+          xml = ET.parse(path)
+
+        # repo manifest *may* specify a <default> element, specifying fallback remotes and revisions
+        defaultObject = xml.find('default')
+
+        # parse all remotes and their corresponding default revisions
+        remotes = {}
+        remoteRevisions = {}
+        for remote in xml.findall('remote'):
+            remotes[remote.get('name')] = remote.get('fetch')
+            remoteRevisions[remote.get('name')] = remote.get('revision')
+
+
+        # iterate through the <project> elements, resolving the correct remote
+        # and revision
+        for project in xml.findall('project'):
+            # resolve the remote of the project
+            # when no remote is specified in the project take the one from <default>
+            # when both aren't specified, throw
+            remoteName = project.get('remote')
+            if remoteName is None and defaultObject is not None:
+                remoteName = defaultObject.get('remote')
+            if remoteName is None:
+                raise bb.fetch2.FetchError("repo manifest specifies no remote for %s" % project.get('name'))
+
+            # resolve the remoteName to a git remote URL and optionally
+            # the revision if it was specified in <remote>
+            if remotes[remoteName] is not None:
+                remoteRev = remoteRevisions[remoteName]
+                remote = remotes[remoteName]
+
+            # use revision in the project, when not specified use the one from <remote>
+            # when that is not specified use <default> and when we not have anything specified
+            # throw an exception
+            revision = project.get('revision') or remoteRev
+            if revision is None and defaultObject is not None:
+                revision = defaultObject.get('revision')
+            if revision is None:
+                raise bb.fetch2.FetchError("repo manifest specifies no revision for %s" % project.get('name'))
+            callback(project, remote, revision)
+
+    def _build_revision(self, ud, d, name):
+        return ud.revisions[name]
+
+    def _revision_key(self, ud, d, name):
+        return "%s-%s" % (d.getVar("BPN"), name)
+
+    def _latest_revision(self, ud, d, name):
+        """
+        Computes an artificial revision from the manifest repository and all
+        referenced repositories and their remote revisions.
+        name is ignored because we can only have a single branch/name
+        """
+        if d.getVar('_BB_REPO_IN_LATEST_REV', False):
+            return ''
+        d.setVar('_BB_REPO_IN_LATEST_REV', '1')
+
+        # we use a sha256 to mixup all the hashes we have
+        hashCalc = hashlib.sha256()
+
+        # first, add the hash of the repo itself
+        sha1 = self._resolveBranchToSHA1(ud, d, ud.branch, ud.remoteRepo)
+        hashUpdate = bytes(sha1, 'utf-8')
+        hashCalc.update(hashUpdate)
+
+        # Parse the repo XML files, remove things
+        try:
+            # create/cleanup temporary dir where to clone the repo-manifest URL
+            if os.path.isdir(ud.tempdir):
+                bb.utils.prunedir(ud.tempdir)
+            bb.utils.mkdirhier(ud.tempdir)
+
+            # clone the manifest repo to the temporary dir we just set up
+            bb.fetch2.check_network_access(d, "%s clone -b %s --depth 1 --single-branch %s ." % (ud.gitcmd, ud.branch, ud.remoteRepo), ud.url)
+            runfetchcmd("%s clone -b %s --depth 1 --single-branch %s %s" % (ud.gitcmd, ud.branch, ud.remoteRepo, ud.tempdir), d, workdir=ud.tempdir)
+
+            def projectIterator(project, remote, revision):
+                # perform an ls-remote on the branch, update the checksum with the commit hash
+                gitRemotePath = "%s/%s" % (remote, project.get('name'))
+
+                sha1 = self._resolveBranchToSHA1(ud, d, revision, gitRemotePath)
+                hashUpdate = bytes(sha1, 'utf-8')
+                hashCalc.update(hashUpdate)
+
+            self._scanRepoManifest(ud, d, "%s/%s" % (ud.tempdir, ud.manifest), projectIterator)
+        finally:
+            d.delVar('_BB_REPO_IN_LATEST_REV')
+        digest = "_" + hashCalc.hexdigest()
+        return digest
 
-    def _build_revision(self, ud, d):
-        return ud.manifest
 
-    def _want_sortable_revision(self, ud, d):
-        return False