diff mbox series

[1/3] fetch2/git: Add support for fast initial shallow fetch

Message ID 20250129112406.1660522-2-stefan-koch@siemens.com
State New
Headers show
Series fetch2/git: Improve shallow, lfs, and tag support | expand

Commit Message

Stefan Koch Jan. 29, 2025, 11:24 a.m. UTC
When `ud.shallow == 1`:
- Prefer an initial shallow clone over an initial bare clone,
  while still utilizing any already existing bare clones.

Benefits:
- Solves timeout issues during initial clones on slow internet connections
  by reducing the amount of data transferred.
- Eliminates the need to use an HTTPS tarball SRC_URI
  to reduce data transfer.
- Allows SSH-based authentication (e.g. cert and agent-based) when
  using non-public repos, so additional HTTPS tokens may not be required.

Signed-off-by: Stefan Koch <stefan-koch@siemens.com>
---
 lib/bb/fetch2/git.py | 92 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 21 deletions(-)

Comments

Alexander Kanavin Jan. 29, 2025, 11:35 a.m. UTC | #1
This patch and the others in the series have no tests in bitbake-selftest.
Can you look into adding some?

Alex

On Wed, 29 Jan 2025 at 12:24, Koch, Stefan via lists.openembedded.org
<stefan-koch=siemens.com@lists.openembedded.org> wrote:
>
> When `ud.shallow == 1`:
> - Prefer an initial shallow clone over an initial bare clone,
>   while still utilizing any already existing bare clones.
>
> This improves:
> - Solves timeout issues during initial clones on slow internet connections
>   by reducing the amount of data transferred.
> - Eliminates the need to use a HTTPS tarball SRC_URI
>   to reduce data transfer.
> - Allows SSH-based authentication (e.g. cert and agent-based) when
>   using non-public repos, so additional HTTPS tokens may not be required.
>
> Signed-off-by: Stefan Koch <stefan-koch@siemens.com>
> ---
>  lib/bb/fetch2/git.py | 92 ++++++++++++++++++++++++++++++++++----------
>  1 file changed, 71 insertions(+), 21 deletions(-)
>
> diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
> index 6badda597..6d87c2f18 100644
> --- a/lib/bb/fetch2/git.py
> +++ b/lib/bb/fetch2/git.py
> @@ -366,6 +366,33 @@ class Git(FetchMethod):
>      def tarball_need_update(self, ud):
>          return ud.write_tarballs and not os.path.exists(ud.fullmirror)
>
> +    # Helper method for fetching Git LFS data
> +    def lfs_fetch(self, ud, d, clonedir, revision, progresshandler, fetchall=False):
> +        try:
> +            if self._need_lfs(ud) and self._contains_lfs(ud, d, clonedir) and self._find_git_lfs(d) and len(revision):
> +                # Using worktree with the revision because .lfsconfig may exists
> +                worktree_add_cmd = "%s worktree add wt %s" % (ud.basecmd, revision)
> +                runfetchcmd(worktree_add_cmd, d, log=progresshandler, workdir=clonedir)
> +                lfs_fetch_cmd = "%s lfs fetch %s" % (ud.basecmd, "--all" if fetchall else "")
> +                runfetchcmd(lfs_fetch_cmd, d, log=progresshandler, workdir=(clonedir + "/wt"))
> +                worktree_rem_cmd = "%s worktree remove -f wt" % ud.basecmd
> +                runfetchcmd(worktree_rem_cmd, d, log=progresshandler, workdir=clonedir)
> +        except:
> +            logger.warning("Fetching LFS did not succeed.")
> +
> +    # Create as a temp file and move atomically into position to avoid races
> +    @contextmanager
> +    def create_atomic(self, filename):
> +        fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
> +        try:
> +            yield tfile
> +            umask = os.umask(0o666)
> +            os.umask(umask)
> +            os.chmod(tfile, (0o666 & ~umask))
> +            os.rename(tfile, filename)
> +        finally:
> +            os.close(fd)
> +
>      def try_premirror(self, ud, d):
>          # If we don't do this, updating an existing checkout with only premirrors
>          # is not possible
> @@ -446,7 +473,40 @@ class Git(FetchMethod):
>              if ud.proto.lower() != 'file':
>                  bb.fetch2.check_network_access(d, clone_cmd, ud.url)
>              progresshandler = GitProgressHandler(d)
> -            runfetchcmd(clone_cmd, d, log=progresshandler)
> +
> +            # When ud.shallow is enabled:
> +            # Try creating an initial shallow clone
> +            shallowstate = False
> +            if ud.shallow:
> +                tempdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
> +                shallowclone = os.path.join(tempdir, 'git')
> +                try:
> +                    self.clone_shallow_local(ud, shallowclone, d)
> +                    shallowstate = True
> +                except:
> +                    logger.warning("Creating initial shallow clone failed, try regular clone now.")
> +
> +                # When the shallow clone has succeeded:
> +                # Create shallow tarball
> +                if shallowstate:
> +                    logger.info("Creating tarball of git repository")
> +                    with self.create_atomic(ud.fullshallow) as tfile:
> +                        runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
> +                    runfetchcmd("touch %s.done" % ud.fullshallow, d)
> +
> +                # Always cleanup tempdir
> +                bb.utils.remove(tempdir, recurse=True)
> +
> +                # When the shallow clone has succeeded:
> +                # Use shallow tarball
> +                if shallowstate:
> +                    ud.localpath = ud.fullshallow
> +                    return
> +
> +            # When ud.shallow is disabled or the shallow clone failed:
> +            # Create an initial regular clone
> +            if not shallowstate:
> +                runfetchcmd(clone_cmd, d, log=progresshandler)
>
>          # Update the checkout if needed
>          if self.clonedir_need_update(ud, d):
> @@ -509,20 +569,6 @@ class Git(FetchMethod):
>                      runfetchcmd("tar -cf - lfs | tar -xf - -C %s" % ud.clonedir, d, workdir="%s/.git" % ud.destdir)
>
>      def build_mirror_data(self, ud, d):
> -
> -        # Create as a temp file and move atomically into position to avoid races
> -        @contextmanager
> -        def create_atomic(filename):
> -            fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
> -            try:
> -                yield tfile
> -                umask = os.umask(0o666)
> -                os.umask(umask)
> -                os.chmod(tfile, (0o666 & ~umask))
> -                os.rename(tfile, filename)
> -            finally:
> -                os.close(fd)
> -
>          if ud.shallow and ud.write_shallow_tarballs:
>              if not os.path.exists(ud.fullshallow):
>                  if os.path.islink(ud.fullshallow):
> @@ -533,7 +579,7 @@ class Git(FetchMethod):
>                      self.clone_shallow_local(ud, shallowclone, d)
>
>                      logger.info("Creating tarball of git repository")
> -                    with create_atomic(ud.fullshallow) as tfile:
> +                    with self.create_atomic(ud.fullshallow) as tfile:
>                          runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
>                      runfetchcmd("touch %s.done" % ud.fullshallow, d)
>                  finally:
> @@ -543,7 +589,7 @@ class Git(FetchMethod):
>                  os.unlink(ud.fullmirror)
>
>              logger.info("Creating tarball of git repository")
> -            with create_atomic(ud.fullmirror) as tfile:
> +            with self.create_atomic(ud.fullmirror) as tfile:
>                  mtime = runfetchcmd("{} log --all -1 --format=%cD".format(ud.basecmd), d,
>                          quiet=True, workdir=ud.clonedir)
>                  runfetchcmd("tar -czf %s --owner oe:0 --group oe:0 --mtime \"%s\" ."
> @@ -557,12 +603,15 @@ class Git(FetchMethod):
>          - For BB_GIT_SHALLOW_REVS: git fetch --shallow-exclude=<revs> rev
>          """
>
> +        progresshandler = GitProgressHandler(d)
> +        repourl = self._get_repo_url(ud)
>          bb.utils.mkdirhier(dest)
>          init_cmd = "%s init -q" % ud.basecmd
>          if ud.bareclone:
>              init_cmd += " --bare"
>          runfetchcmd(init_cmd, d, workdir=dest)
> -        runfetchcmd("%s remote add origin %s" % (ud.basecmd, ud.clonedir), d, workdir=dest)
> +        # Use repourl when creating the initial shallow clone
> +        runfetchcmd("%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl) if ud.shallow and not os.path.exists(ud.clonedir) else ud.clonedir), d, workdir=dest)
>
>          # Check the histories which should be excluded
>          shallow_exclude = ''
> @@ -600,10 +649,12 @@ class Git(FetchMethod):
>              # The ud.clonedir is a local temporary dir, will be removed when
>              # fetch is done, so we can do anything on it.
>              adv_cmd = 'git branch -f advertise-%s %s' % (revision, revision)
> -            runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
> +            if not ud.shallow:
> +                runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
>
> -            runfetchcmd(fetch_cmd, d, workdir=dest)
> +            runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=dest)
>              runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
> +            self.lfs_fetch(ud, d, dest, ud.revisions[ud.names[0]], progresshandler)
>
>          # Apply extra ref wildcards
>          all_refs_remote = runfetchcmd("%s ls-remote origin 'refs/*'" % ud.basecmd, \
> @@ -629,7 +680,6 @@ class Git(FetchMethod):
>              runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
>
>          # The url is local ud.clonedir, set it to upstream one
> -        repourl = self._get_repo_url(ud)
>          runfetchcmd("%s remote set-url origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=dest)
>
>      def unpack(self, ud, destdir, d):
> --
> 2.39.5
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#17109): https://lists.openembedded.org/g/bitbake-devel/message/17109
> Mute This Topic: https://lists.openembedded.org/mt/110876221/1686489
> Group Owner: bitbake-devel+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/bitbake-devel/unsub [alex.kanavin@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
diff mbox series

Patch

diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
index 6badda597..6d87c2f18 100644
--- a/lib/bb/fetch2/git.py
+++ b/lib/bb/fetch2/git.py
@@ -366,6 +366,33 @@  class Git(FetchMethod):
     def tarball_need_update(self, ud):
         return ud.write_tarballs and not os.path.exists(ud.fullmirror)
 
+    # Helper method for fetching Git LFS data
+    def lfs_fetch(self, ud, d, clonedir, revision, progresshandler, fetchall=False):
+        try:
+            if self._need_lfs(ud) and self._contains_lfs(ud, d, clonedir) and self._find_git_lfs(d) and len(revision):
+                # Use a worktree checked out at the revision because a .lfsconfig file may exist there
+                worktree_add_cmd = "%s worktree add wt %s" % (ud.basecmd, revision)
+                runfetchcmd(worktree_add_cmd, d, log=progresshandler, workdir=clonedir)
+                lfs_fetch_cmd = "%s lfs fetch %s" % (ud.basecmd, "--all" if fetchall else "")
+                runfetchcmd(lfs_fetch_cmd, d, log=progresshandler, workdir=(clonedir + "/wt"))
+                worktree_rem_cmd = "%s worktree remove -f wt" % ud.basecmd
+                runfetchcmd(worktree_rem_cmd, d, log=progresshandler, workdir=clonedir)
+        except:
+            logger.warning("Fetching LFS did not succeed.")
+
+    # Create as a temp file and move atomically into position to avoid races
+    @contextmanager
+    def create_atomic(self, filename):
+        fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
+        try:
+            yield tfile
+            umask = os.umask(0o666)
+            os.umask(umask)
+            os.chmod(tfile, (0o666 & ~umask))
+            os.rename(tfile, filename)
+        finally:
+            os.close(fd)
+
     def try_premirror(self, ud, d):
         # If we don't do this, updating an existing checkout with only premirrors
         # is not possible
@@ -446,7 +473,40 @@  class Git(FetchMethod):
             if ud.proto.lower() != 'file':
                 bb.fetch2.check_network_access(d, clone_cmd, ud.url)
             progresshandler = GitProgressHandler(d)
-            runfetchcmd(clone_cmd, d, log=progresshandler)
+
+            # When ud.shallow is enabled:
+            # Try creating an initial shallow clone
+            shallowstate = False
+            if ud.shallow:
+                tempdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
+                shallowclone = os.path.join(tempdir, 'git')
+                try:
+                    self.clone_shallow_local(ud, shallowclone, d)
+                    shallowstate = True
+                except:
+                    logger.warning("Creating initial shallow clone failed, try regular clone now.")
+
+                # When the shallow clone has succeeded:
+                # Create shallow tarball
+                if shallowstate:
+                    logger.info("Creating tarball of git repository")
+                    with self.create_atomic(ud.fullshallow) as tfile:
+                        runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
+                    runfetchcmd("touch %s.done" % ud.fullshallow, d)
+
+                # Always clean up tempdir
+                bb.utils.remove(tempdir, recurse=True)
+
+                # When the shallow clone has succeeded:
+                # Use shallow tarball
+                if shallowstate:
+                    ud.localpath = ud.fullshallow
+                    return
+
+            # When ud.shallow is disabled or the shallow clone failed:
+            # Create an initial regular clone
+            if not shallowstate:
+                runfetchcmd(clone_cmd, d, log=progresshandler)
 
         # Update the checkout if needed
         if self.clonedir_need_update(ud, d):
@@ -509,20 +569,6 @@  class Git(FetchMethod):
                     runfetchcmd("tar -cf - lfs | tar -xf - -C %s" % ud.clonedir, d, workdir="%s/.git" % ud.destdir)
 
     def build_mirror_data(self, ud, d):
-
-        # Create as a temp file and move atomically into position to avoid races
-        @contextmanager
-        def create_atomic(filename):
-            fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
-            try:
-                yield tfile
-                umask = os.umask(0o666)
-                os.umask(umask)
-                os.chmod(tfile, (0o666 & ~umask))
-                os.rename(tfile, filename)
-            finally:
-                os.close(fd)
-
         if ud.shallow and ud.write_shallow_tarballs:
             if not os.path.exists(ud.fullshallow):
                 if os.path.islink(ud.fullshallow):
@@ -533,7 +579,7 @@  class Git(FetchMethod):
                     self.clone_shallow_local(ud, shallowclone, d)
 
                     logger.info("Creating tarball of git repository")
-                    with create_atomic(ud.fullshallow) as tfile:
+                    with self.create_atomic(ud.fullshallow) as tfile:
                         runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
                     runfetchcmd("touch %s.done" % ud.fullshallow, d)
                 finally:
@@ -543,7 +589,7 @@  class Git(FetchMethod):
                 os.unlink(ud.fullmirror)
 
             logger.info("Creating tarball of git repository")
-            with create_atomic(ud.fullmirror) as tfile:
+            with self.create_atomic(ud.fullmirror) as tfile:
                 mtime = runfetchcmd("{} log --all -1 --format=%cD".format(ud.basecmd), d,
                         quiet=True, workdir=ud.clonedir)
                 runfetchcmd("tar -czf %s --owner oe:0 --group oe:0 --mtime \"%s\" ."
@@ -557,12 +603,15 @@  class Git(FetchMethod):
         - For BB_GIT_SHALLOW_REVS: git fetch --shallow-exclude=<revs> rev
         """
 
+        progresshandler = GitProgressHandler(d)
+        repourl = self._get_repo_url(ud)
         bb.utils.mkdirhier(dest)
         init_cmd = "%s init -q" % ud.basecmd
         if ud.bareclone:
             init_cmd += " --bare"
         runfetchcmd(init_cmd, d, workdir=dest)
-        runfetchcmd("%s remote add origin %s" % (ud.basecmd, ud.clonedir), d, workdir=dest)
+        # Use repourl when creating the initial shallow clone
+        runfetchcmd("%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl) if ud.shallow and not os.path.exists(ud.clonedir) else ud.clonedir), d, workdir=dest)
 
         # Check the histories which should be excluded
         shallow_exclude = ''
@@ -600,10 +649,12 @@  class Git(FetchMethod):
             # The ud.clonedir is a local temporary dir, will be removed when
             # fetch is done, so we can do anything on it.
             adv_cmd = 'git branch -f advertise-%s %s' % (revision, revision)
-            runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
+            if not ud.shallow:
+                runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
 
-            runfetchcmd(fetch_cmd, d, workdir=dest)
+            runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=dest)
             runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
+            self.lfs_fetch(ud, d, dest, ud.revisions[ud.names[0]], progresshandler)
 
         # Apply extra ref wildcards
         all_refs_remote = runfetchcmd("%s ls-remote origin 'refs/*'" % ud.basecmd, \
@@ -629,7 +680,6 @@  class Git(FetchMethod):
             runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
 
         # The url is local ud.clonedir, set it to upstream one
-        repourl = self._get_repo_url(ud)
         runfetchcmd("%s remote set-url origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=dest)
 
     def unpack(self, ud, destdir, d):