diff mbox series

[v4,1/4] fetch2/git: Add support for fast initial shallow fetch

Message ID 20250221172602.527924-1-stefan-koch@siemens.com
State Accepted, archived
Commit 457288b2fda86fd00cdcaefac616129b0029e1f9
Headers show
Series [v4,1/4] fetch2/git: Add support for fast initial shallow fetch | expand

Commit Message

Stefan Koch Feb. 21, 2025, 5:25 p.m. UTC
When `ud.shallow == 1`:
- Prefer an initial shallow clone over an initial full bare clone,
  while still utilizing any already existing full bare clones.
- This will be skipped when `ud.shallow_skip_fast == 1`

This improves:
- Resolve timeout issues during initial clones on slow internet connections
  by reducing the amount of data transferred.
- Eliminate the need to use an HTTPS tarball `SRC_URI`
  to reduce data transfer.
- Allow SSH-based authentication (e.g. cert and agent-based) when
  using non-public repos, so additional HTTPS tokens may not be required.

Signed-off-by: Stefan Koch <stefan-koch@siemens.com>
---
 lib/bb/fetch2/git.py | 100 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 20 deletions(-)

Comments

Ross Burton Feb. 27, 2025, 8:39 p.m. UTC | #1
On 21 Feb 2025, at 17:25, Koch, Stefan via lists.openembedded.org <stefan-koch=siemens.com@lists.openembedded.org> wrote:
> +        ud.shallow_skip_fast = d.getVar("BB_GIT_SHALLOW_SKIP_FAST") == "1"

Why would we need a way _from a recipe_ to say “even though I asked for a shallow clone, don’t actually do a shallow clone”?  If a recipe doesn’t want to do a shallow clone, it can not set BB_GIT_SHALLOW=1.

> +    @contextmanager
> +    def create_atomic(self, filename):
> +        """Create as a temp file and move atomically into position to avoid races"""
> +        fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
> +        try:
> +            yield tfile
> +            umask = os.umask(0o666)
> +            os.umask(umask)
> +            os.chmod(tfile, (0o666 & ~umask))
> +            os.rename(tfile, filename)
> +        finally:
> +            os.close(fd)

I realise you’re just moving code, but atomically creating an empty file seems pretty pointless.

> +            # When skipping fast initial shallow or the fast inital shallow clone failed:
> +            # Try again with an initial regular clone
> +            if not shallow_fast_state:
> +                ud.shallow_skip_fast = True
> +                runfetchcmd(clone_cmd, d, log=progresshandler)

At this point why can’t you just set ud.shallow=False instead of needing a separate skip variable?

Ross
Stefan Koch March 3, 2025, 1:24 p.m. UTC | #2
On Thu, 2025-02-27 at 20:39 +0000, Ross Burton wrote:
> On 21 Feb 2025, at 17:25, Koch, Stefan via lists.openembedded.org
> <stefan-koch=siemens.com@lists.openembedded.org> wrote:
> > +        ud.shallow_skip_fast =
> > d.getVar("BB_GIT_SHALLOW_SKIP_FAST") == "1"
> 
> Why would we need a way _from a recipe_ to say “even though I asked
> for a shallow clone, don’t actually do a shallow clone”?  If a recipe
> doesn’t want to do a shallow clone, it can not set BB_GIT_SHALLOW=1.
v5 patch set will not have `BB_GIT_SHALLOW_SKIP_FAST` anymore, but
still have skip fast functionality, see mail from 2025-02-26 at 08:58
+0000  sent by Richard.
> 
> > +    @contextmanager
> > +    def create_atomic(self, filename):
> > +        """Create as a temp file and move atomically into position
> > to avoid races"""
> > +        fd, tfile =
> > tempfile.mkstemp(dir=os.path.dirname(filename))
> > +        try:
> > +            yield tfile
> > +            umask = os.umask(0o666)
> > +            os.umask(umask)
> > +            os.chmod(tfile, (0o666 & ~umask))
> > +            os.rename(tfile, filename)
> > +        finally:
> > +            os.close(fd)
> 
> I realise you’re just moving code, but atomically creating an empty
> file seems pretty pointless.
This is the bug report for that code snippet:
https://bugzilla.yoctoproject.org/show_bug.cgi?id=14441

This was introduced by commit 3250bc950c56bd7dd2114df26e5a8e13b04ceac8:
fetch2/git: Avoid races over mirror tarball creation

Maybe, Richard can explain why this is needed...
> 
> > +            # When skipping fast initial shallow or the fast
> > inital shallow clone failed:
> > +            # Try again with an initial regular clone
> > +            if not shallow_fast_state:
> > +                ud.shallow_skip_fast = True
> > +                runfetchcmd(clone_cmd, d, log=progresshandler)
> 
> At this point why can’t you just set ud.shallow=False instead of
> needing a separate skip variable?
Thanks, v5 patchset will not need that skip variable
> 
> Ross

-- 
Stefan Koch
Siemens AG
www.siemens.com
Richard Purdie March 3, 2025, 1:30 p.m. UTC | #3
On Mon, 2025-03-03 at 13:24 +0000, Koch, Stefan wrote:
> On Thu, 2025-02-27 at 20:39 +0000, Ross Burton wrote:
> > On 21 Feb 2025, at 17:25, Koch, Stefan via lists.openembedded.org
> > <stefan-koch=siemens.com@lists.openembedded.org> wrote:
> > > +        ud.shallow_skip_fast =
> > > d.getVar("BB_GIT_SHALLOW_SKIP_FAST") == "1"
> > 
> > Why would we need a way _from a recipe_ to say “even though I asked
> > for a shallow clone, don’t actually do a shallow clone”?  If a recipe
> > doesn’t want to do a shallow clone, it can not set BB_GIT_SHALLOW=1.
> v5 patch set will not have `BB_GIT_SHALLOW_SKIP_FAST` anymore, but
> still have skip fast functionality, see mail from 2025-02-26 at 08:58
> +0000  sent by Richard.
> > 
> > > +    @contextmanager
> > > +    def create_atomic(self, filename):
> > > +        """Create as a temp file and move atomically into position
> > > to avoid races"""
> > > +        fd, tfile =
> > > tempfile.mkstemp(dir=os.path.dirname(filename))
> > > +        try:
> > > +            yield tfile
> > > +            umask = os.umask(0o666)
> > > +            os.umask(umask)
> > > +            os.chmod(tfile, (0o666 & ~umask))
> > > +            os.rename(tfile, filename)
> > > +        finally:
> > > +            os.close(fd)
> > 
> > I realise you’re just moving code, but atomically creating an empty
> > file seems pretty pointless.
> This is the bug report for that code snippet:
> https://bugzilla.yoctoproject.org/show_bug.cgi?id=14441
> 
> This was introduced by commit 3250bc950c56bd7dd2114df26e5a8e13b04ceac8:
> fetch2/git: Avoid races over mirror tarball creation
> 
> Maybe, Richard can explain why this is needed...

It took me a minute to realise but it isn't an empty file, it is passed
through the yield, content added, *then* moved into position.

Cheers,

Richard
diff mbox series

Patch

diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
index 6badda597..5ef2c2147 100644
--- a/lib/bb/fetch2/git.py
+++ b/lib/bb/fetch2/git.py
@@ -207,6 +207,7 @@  class Git(FetchMethod):
         if ud.bareclone:
             ud.cloneflags += " --mirror"
 
+        ud.shallow_skip_fast = d.getVar("BB_GIT_SHALLOW_SKIP_FAST") == "1"
         ud.shallow = d.getVar("BB_GIT_SHALLOW") == "1"
         ud.shallow_extra_refs = (d.getVar("BB_GIT_SHALLOW_EXTRA_REFS") or "").split()
 
@@ -253,6 +254,7 @@  class Git(FetchMethod):
                 all(ud.shallow_depths[n] == 0 for n in ud.names)):
             # Shallow disabled for this URL
             ud.shallow = False
+            ud.shallow_skip_fast = False
 
         if ud.usehead:
             # When usehead is set let's associate 'HEAD' with the unresolved
@@ -366,6 +368,33 @@  class Git(FetchMethod):
     def tarball_need_update(self, ud):
         return ud.write_tarballs and not os.path.exists(ud.fullmirror)
 
+    def lfs_fetch(self, ud, d, clonedir, revision, fetchall=False, progresshandler=None):
+        """Helper method for fetching Git LFS data"""
+        try:
+            if self._need_lfs(ud) and self._contains_lfs(ud, d, clonedir) and self._find_git_lfs(d) and len(revision):
+                # Using worktree with the revision because .lfsconfig may exists
+                worktree_add_cmd = "%s worktree add wt %s" % (ud.basecmd, revision)
+                runfetchcmd(worktree_add_cmd, d, log=progresshandler, workdir=clonedir)
+                lfs_fetch_cmd = "%s lfs fetch %s" % (ud.basecmd, "--all" if fetchall else "")
+                runfetchcmd(lfs_fetch_cmd, d, log=progresshandler, workdir=(clonedir + "/wt"))
+                worktree_rem_cmd = "%s worktree remove -f wt" % ud.basecmd
+                runfetchcmd(worktree_rem_cmd, d, log=progresshandler, workdir=clonedir)
+        except:
+            logger.warning("Fetching LFS did not succeed.")
+
+    @contextmanager
+    def create_atomic(self, filename):
+        """Create as a temp file and move atomically into position to avoid races"""
+        fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
+        try:
+            yield tfile
+            umask = os.umask(0o666)
+            os.umask(umask)
+            os.chmod(tfile, (0o666 & ~umask))
+            os.rename(tfile, filename)
+        finally:
+            os.close(fd)
+
     def try_premirror(self, ud, d):
         # If we don't do this, updating an existing checkout with only premirrors
         # is not possible
@@ -446,7 +475,41 @@  class Git(FetchMethod):
             if ud.proto.lower() != 'file':
                 bb.fetch2.check_network_access(d, clone_cmd, ud.url)
             progresshandler = GitProgressHandler(d)
-            runfetchcmd(clone_cmd, d, log=progresshandler)
+
+            # Try creating a fast initial shallow clone
+            # Enabling ud.shallow_skip_fast will skip this
+            shallow_fast_state = False
+            if ud.shallow and not ud.shallow_skip_fast:
+                tempdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
+                shallowclone = os.path.join(tempdir, 'git')
+                try:
+                    self.clone_shallow_local(ud, shallowclone, d)
+                    shallow_fast_state = True
+                except:
+                    logger.warning("Creating fast initial shallow clone failed, try initial regular clone now.")
+
+                # When the shallow clone has succeeded:
+                # Create shallow tarball
+                if shallow_fast_state:
+                    logger.info("Creating tarball of git repository")
+                    with self.create_atomic(ud.fullshallow) as tfile:
+                        runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
+                    runfetchcmd("touch %s.done" % ud.fullshallow, d)
+
+                # Always cleanup tempdir
+                bb.utils.remove(tempdir, recurse=True)
+
+                # When the shallow clone has succeeded:
+                # Use shallow tarball
+                if shallow_fast_state:
+                    ud.localpath = ud.fullshallow
+                    return
+
+            # When skipping fast initial shallow or the fast inital shallow clone failed:
+            # Try again with an initial regular clone
+            if not shallow_fast_state:
+                ud.shallow_skip_fast = True
+                runfetchcmd(clone_cmd, d, log=progresshandler)
 
         # Update the checkout if needed
         if self.clonedir_need_update(ud, d):
@@ -509,20 +572,6 @@  class Git(FetchMethod):
                     runfetchcmd("tar -cf - lfs | tar -xf - -C %s" % ud.clonedir, d, workdir="%s/.git" % ud.destdir)
 
     def build_mirror_data(self, ud, d):
-
-        # Create as a temp file and move atomically into position to avoid races
-        @contextmanager
-        def create_atomic(filename):
-            fd, tfile = tempfile.mkstemp(dir=os.path.dirname(filename))
-            try:
-                yield tfile
-                umask = os.umask(0o666)
-                os.umask(umask)
-                os.chmod(tfile, (0o666 & ~umask))
-                os.rename(tfile, filename)
-            finally:
-                os.close(fd)
-
         if ud.shallow and ud.write_shallow_tarballs:
             if not os.path.exists(ud.fullshallow):
                 if os.path.islink(ud.fullshallow):
@@ -533,7 +582,7 @@  class Git(FetchMethod):
                     self.clone_shallow_local(ud, shallowclone, d)
 
                     logger.info("Creating tarball of git repository")
-                    with create_atomic(ud.fullshallow) as tfile:
+                    with self.create_atomic(ud.fullshallow) as tfile:
                         runfetchcmd("tar -czf %s ." % tfile, d, workdir=shallowclone)
                     runfetchcmd("touch %s.done" % ud.fullshallow, d)
                 finally:
@@ -543,7 +592,7 @@  class Git(FetchMethod):
                 os.unlink(ud.fullmirror)
 
             logger.info("Creating tarball of git repository")
-            with create_atomic(ud.fullmirror) as tfile:
+            with self.create_atomic(ud.fullmirror) as tfile:
                 mtime = runfetchcmd("{} log --all -1 --format=%cD".format(ud.basecmd), d,
                         quiet=True, workdir=ud.clonedir)
                 runfetchcmd("tar -czf %s --owner oe:0 --group oe:0 --mtime \"%s\" ."
@@ -557,12 +606,20 @@  class Git(FetchMethod):
         - For BB_GIT_SHALLOW_REVS: git fetch --shallow-exclude=<revs> rev
         """
 
+        progresshandler = GitProgressHandler(d)
+        repourl = self._get_repo_url(ud)
         bb.utils.mkdirhier(dest)
         init_cmd = "%s init -q" % ud.basecmd
         if ud.bareclone:
             init_cmd += " --bare"
         runfetchcmd(init_cmd, d, workdir=dest)
-        runfetchcmd("%s remote add origin %s" % (ud.basecmd, ud.clonedir), d, workdir=dest)
+        # Use repourl when creating a fast initial shallow clone
+        # Prefer already existing full bare clones if available
+        if not ud.shallow_skip_fast and not os.path.exists(ud.clonedir):
+            remote = shlex.quote(repourl)
+        else:
+            remote = ud.clonedir
+        runfetchcmd("%s remote add origin %s" % (ud.basecmd, remote), d, workdir=dest)
 
         # Check the histories which should be excluded
         shallow_exclude = ''
@@ -600,10 +657,14 @@  class Git(FetchMethod):
             # The ud.clonedir is a local temporary dir, will be removed when
             # fetch is done, so we can do anything on it.
             adv_cmd = 'git branch -f advertise-%s %s' % (revision, revision)
-            runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
+            if ud.shallow_skip_fast:
+                runfetchcmd(adv_cmd, d, workdir=ud.clonedir)
 
             runfetchcmd(fetch_cmd, d, workdir=dest)
             runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
+            # Fetch Git LFS data for fast shallow clones
+            if not ud.shallow_skip_fast:
+                self.lfs_fetch(ud, d, dest, ud.revisions[ud.names[0]])
 
         # Apply extra ref wildcards
         all_refs_remote = runfetchcmd("%s ls-remote origin 'refs/*'" % ud.basecmd, \
@@ -629,7 +690,6 @@  class Git(FetchMethod):
             runfetchcmd("%s update-ref %s %s" % (ud.basecmd, ref, revision), d, workdir=dest)
 
         # The url is local ud.clonedir, set it to upstream one
-        repourl = self._get_repo_url(ud)
         runfetchcmd("%s remote set-url origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=dest)
 
     def unpack(self, ud, destdir, d):