diff mbox series

[RFC,V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions

Message ID 20220826131047.51373-1-liezhi.yang@windriver.com
State New
Headers show
Series [RFC,V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions | expand

Commit Message

Robert Yang Aug. 26, 2022, 1:10 p.m. UTC
* V2
  Fixed typos in commit message

"git clone --depth" only works for refs and does not support revisions, but
"git fetch --depth" does support revisions, so use it to do the shallow clone.
The idea comes from "git clone --recurse-submodules --shallow-submodules".

The workflow is (Only enabled when BB_GIT_SHALLOW = "1"):
$ git init --bare <clonedir>
$ git remote add origin <url>
$ git fetch origin --depth <depth> revision
$ git branch <branchname> FETCH_HEAD
$ git tag v<branchname> FETCH_HEAD

Here is the testing data based on poky; the testing server has very good
network bandwidth:

Add 'BB_GIT_SHALLOW = "1"' to conf/local.conf
$ rm -fr tmp downloads # Fresh download for each build
$ time bitbake world --runall=fetch
$ du -sh downloads/git2/

       Full        Shallow      Saved
--------------------------------------
Time:  15m59s       2m31s       84% (13m28s)
Size:  12G          1.2G        90% (10.8G)

* The Size is for downloads/git2/, the tarballs are not counted.

We can see that it saves a lot of download time and disk space, for example:

linux-yocto: 2.8G -> 228M
llvm: 2.5G -> 171M
cryptography: 1.5G -> 35M

And "$ bitbake world" works well.

This is an RFC patch, please feel free to give your comments.

Signed-off-by: Robert Yang <liezhi.yang@windriver.com>
---
 bitbake/lib/bb/fetch2/git.py | 83 ++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 17 deletions(-)

Comments

Peter Kjellerstedt Aug. 26, 2022, 2:21 p.m. UTC | #1
> -----Original Message-----
> From: bitbake-devel@lists.openembedded.org <bitbake-
> devel@lists.openembedded.org> On Behalf Of Robert Yang
> Sent: den 26 augusti 2022 15:11
> To: bitbake-devel@lists.openembedded.org
> Subject: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git
> fetch to shallow clone revisions
> 
> * V2
>   Fixed typos in commit message

Patch history should go after the --- below.

> The "git clone --depth" only works for refs, doesn't support revisions, but
> "git fetch --depth" supports revisions, so use it to do the shallow clone, the
> idea is from "git clone --recurse-submodules --shallow-submodules".
> 
> The workflow is (Only enabled when BB_GIT_SHALLOW = "1"):
> $ git init --bare <clonedir>
> $ git remote add origin <url>
> $ git fetch origin --depth <depth> revision
> $ git branch <branchname> FETCH_HEAD
> $ git tag v<branchname> FETCH_HEAD
> 
> Here is the testing data based on poky, the testing server has a very good
> network bandwidth:
> 
> Add 'BB_GIT_SHALLOW = "1"' conf/local.conf
> $ rm -fr tmp downloads # Fresh download for each build
> $ time bitbake world --runall=fetch
> $ du -sh downloads/git2/
> 
>        Full        Shallow      Saved
> --------------------------------------
> Time:  15m59s       2m31s       84% (13m28s)
> Size:  12G          1.2G        90% (10.8G)
> 
> * The Size is for downloads/git2/, the tarballs are not counted.
> 
> We can see that it saves a lot of download time and disk space, for
> example:
> 
> linux-yocto: 2.8G -> 228M
> llvm: 2.5G -> 171M
> cryptography: 1.5G -> 35M
> 
> And "$ bitbake world" works well.
> 
> This a RFC patch, please feel free to give you comments.
> 
> Signed-off-by: Robert Yang <liezhi.yang@windriver.com>
> ---
>  bitbake/lib/bb/fetch2/git.py | 83 ++++++++++++++++++++++++++++--------
>  1 file changed, 66 insertions(+), 17 deletions(-)
> 
> diff --git a/bitbake/lib/bb/fetch2/git.py b/bitbake/lib/bb/fetch2/git.py
> index 4534bd75800..57bb61d5ee1 100644
> --- a/bitbake/lib/bb/fetch2/git.py
> +++ b/bitbake/lib/bb/fetch2/git.py
> @@ -244,6 +244,7 @@ class Git(FetchMethod):
>                  ud.unresolvedrev[name] = 'HEAD'
> 
>          ud.basecmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0 -c gc.autoDetach=false -c core.pager=cat"
> +        ud.basecmd = "LANG=C %s" % ud.basecmd
> 
>          write_tarballs = d.getVar("BB_GENERATE_MIRROR_TARBALLS") or "0"
>          ud.write_tarballs = write_tarballs != "0" or ud.rebaseable
> @@ -344,6 +345,49 @@ class Git(FetchMethod):
>              return False
>          return True
> 
> +    def shallow_clone_by_fetch(self, ud, repourl, d):
> +        """
> +        Use "git fetch --depth <depth> revision" to implement shallow clone
> +        since git can't clone a revision, a better solution should be:
> +        "git fetch --depth <depth> revision:<branchname>" but it doesn't work
> +        when revision is a tag, e.g.:
> +        error: cannot update ref 'refs/heads/master': trying to write
> +                non-commit object <revision> to branch 'refs/heads/master'
> +        """
> +
> +        import datetime
> +
> +        depth = ud.shallow_depths[ud.names[0]]
> +        revision = ud.revisions[ud.names[0]]
> +        branchname = ud.branches[ud.names[0]]
> +        if not branchname:
> +            branchname = "master"
> +
> +        # Rename branchname if it exists which can:
> +        # - Avoid conflicts during update
> +        # - Keep the revision on a branch so that "git submodule update --recursive"
> +        #    can work since it requires the revision on a branch.
> +        branch_path = os.path.join(ud.clonedir, 'refs/heads/%s' % branchname)
> +        if os.path.exists(branch_path):
> +            os.rename(branch_path, '%s.%s' % (branch_path, datetime.datetime.now().strftime("%Y%m%d%H%M%S")))

Any reason this is done using os.rename() rather than `git branch -m`?

> +
> +        init_cmd = "%s init --bare -q" % ud.basecmd
> +        add_remote_cmd = "%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl))
> +        fetch_cmd = "%s fetch --progress origin --depth %s %s" % (ud.basecmd, depth, revision)
> +        # Create both branch and tag for the revision
> +        branch_cmd = "%s branch -f %s FETCH_HEAD" % (ud.basecmd, branchname)
> +        tag_cmd = "%s tag -f v%s FETCH_HEAD" % (ud.basecmd, branchname)

Why not define these as a list instead:

        cmds = [
            "%s init --bare -q" % ud.basecmd,
            "%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl)),
            "%s fetch --progress origin --depth %s %s" % (ud.basecmd, depth, revision),
            # Create both branch and tag for the revision
            "%s branch -f %s FETCH_HEAD" % (ud.basecmd, branchname),
            "%s tag -f v%s FETCH_HEAD" % (ud.basecmd, branchname),
        ]

> +
> +        if ud.proto.lower() != 'file':
> +            bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
> +
> +        if not os.path.exists(ud.clonedir):
> +            bb.utils.mkdirhier(ud.clonedir)
> +
> +        progresshandler = GitProgressHandler(d)
> +        for cmd in (init_cmd, add_remote_cmd, fetch_cmd, branch_cmd, tag_cmd):
> +            runfetchcmd(cmd, d, log=progresshandler, workdir=ud.clonedir)
> +
>      def download(self, ud, d):
>          """Fetch url"""
> 
> @@ -360,7 +404,7 @@ class Git(FetchMethod):
>              else:
>                  tmpdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
>                  runfetchcmd("tar -xzf %s" % ud.fullmirror, d, workdir=tmpdir)
> -                fetch_cmd = "LANG=C %s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
> +                fetch_cmd = "%s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
>                  runfetchcmd(fetch_cmd, d, workdir=ud.clonedir)
>          repourl = self._get_repo_url(ud)
> 
> @@ -369,27 +413,32 @@ class Git(FetchMethod):
>              # We do this since git will use a "-l" option automatically for local urls where possible
>              if repourl.startswith("file://"):
>                  repourl = repourl[7:]
> -            clone_cmd = "LANG=C %s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
> -            if ud.proto.lower() != 'file':
> -                bb.fetch2.check_network_access(d, clone_cmd, ud.url)
> -            progresshandler = GitProgressHandler(d)
> -            runfetchcmd(clone_cmd, d, log=progresshandler)
> +            if ud.shallow:
> +                self.shallow_clone_by_fetch(ud, repourl, d)
> +            else:
> +                clone_cmd = "%s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
> +                progresshandler = GitProgressHandler(d)
> +                if ud.proto.lower() != 'file':
> +                    bb.fetch2.check_network_access(d, clone_cmd, ud.url)
> +                runfetchcmd(clone_cmd, d, log=progresshandler)
> 
>          # Update the checkout if needed
>          if self.clonedir_need_update(ud, d):
>              output = runfetchcmd("%s remote" % ud.basecmd, d, quiet=True, workdir=ud.clonedir)
>              if "origin" in output:
> -              runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
> -
> -            runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
> -            fetch_cmd = "LANG=C %s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
> -            if ud.proto.lower() != 'file':
> -                bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
> -            progresshandler = GitProgressHandler(d)
> -            runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
> -            runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
> -            runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
> -            runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
> +                runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
> +            if ud.shallow:
> +                self.shallow_clone_by_fetch(ud, repourl, d)
> +            else:
> +                runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
> +                fetch_cmd = "%s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
> +                if ud.proto.lower() != 'file':
> +                    bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
> +                progresshandler = GitProgressHandler(d)
> +                runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
> +                runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
> +                runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
> +                runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
>              try:
>                  os.unlink(ud.fullmirror)
>              except OSError as exc:
> --
> 2.35.1

//Peter
Robert Yang Aug. 27, 2022, 3:36 a.m. UTC | #2
Hi Peter,

On 8/26/22 22:21, Peter Kjellerstedt wrote:
>> -----Original Message-----
>> From: bitbake-devel@lists.openembedded.org <bitbake-
>> devel@lists.openembedded.org> On Behalf Of Robert Yang
>> Sent: den 26 augusti 2022 15:11
>> To: bitbake-devel@lists.openembedded.org
>> Subject: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git
>> fetch to shallow clone revisions
>>
>> * V2
>>    Fixed typos in commit message
> 
> Patch history should go after the --- below.
> 
>> The "git clone --depth" only works for refs, doesn't support revisions, but
>> "git fetch --depth" supports revisions, so use it to do the shallow clone, the
>> idea is from "git clone --recurse-submodules --shallow-submodules".
>>
>> The workflow is (Only enabled when BB_GIT_SHALLOW = "1"):
>> $ git init --bare <clonedir>
>> $ git remote add origin <url>
>> $ git fetch origin --depth <depth> revision
>> $ git branch <branchname> FETCH_HEAD
>> $ git tag v<branchname> FETCH_HEAD
>>
>> Here is the testing data based on poky, the testing server has a very good
>> network bandwidth:
>>
>> Add 'BB_GIT_SHALLOW = "1"' conf/local.conf
>> $ rm -fr tmp downloads # Fresh download for each build
>> $ time bitbake world --runall=fetch
>> $ du -sh downloads/git2/
>>
>>         Full        Shallow      Saved
>> --------------------------------------
>> Time:  15m59s       2m31s       84% (13m28s)
>> Size:  12G          1.2G        90% (10.8G)
>>
>> * The Size is for downloads/git2/, the tarballs are not counted.
>>
>> We can see that it saves a lot of download time and disk space, for
>> example:
>>
>> linux-yocto: 2.8G -> 228M
>> llvm: 2.5G -> 171M
>> cryptography: 1.5G -> 35M
>>
>> And "$ bitbake world" works well.
>>
>> This a RFC patch, please feel free to give you comments.
>>
>> Signed-off-by: Robert Yang <liezhi.yang@windriver.com>
>> ---
>>   bitbake/lib/bb/fetch2/git.py | 83 ++++++++++++++++++++++++++++--------
>>   1 file changed, 66 insertions(+), 17 deletions(-)
>>
>> diff --git a/bitbake/lib/bb/fetch2/git.py b/bitbake/lib/bb/fetch2/git.py
>> index 4534bd75800..57bb61d5ee1 100644
>> --- a/bitbake/lib/bb/fetch2/git.py
>> +++ b/bitbake/lib/bb/fetch2/git.py
>> @@ -244,6 +244,7 @@ class Git(FetchMethod):
>>                   ud.unresolvedrev[name] = 'HEAD'
>>
>>           ud.basecmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0 -c gc.autoDetach=false -c core.pager=cat"
>> +        ud.basecmd = "LANG=C %s" % ud.basecmd
>>
>>           write_tarballs = d.getVar("BB_GENERATE_MIRROR_TARBALLS") or "0"
>>           ud.write_tarballs = write_tarballs != "0" or ud.rebaseable
>> @@ -344,6 +345,49 @@ class Git(FetchMethod):
>>               return False
>>           return True
>>
>> +    def shallow_clone_by_fetch(self, ud, repourl, d):
>> +        """
>> +        Use "git fetch --depth <depth> revision" to implement shallow clone
>> +        since git can't clone a revision, a better solution should be:
>> +        "git fetch --depth <depth> revision:<branchname>" but it doesn't work
>> +        when revision is a tag, e.g.:
>> +        error: cannot update ref 'refs/heads/master': trying to write
>> +                non-commit object <revision> to branch 'refs/heads/master'
>> +        """
>> +
>> +        import datetime
>> +
>> +        depth = ud.shallow_depths[ud.names[0]]
>> +        revision = ud.revisions[ud.names[0]]
>> +        branchname = ud.branches[ud.names[0]]
>> +        if not branchname:
>> +            branchname = "master"
>> +
>> +        # Rename branchname if it exists which can:
>> +        # - Avoid conflicts during update
>> +        # - Keep the revision on a branch so that "git submodule update --recursive"
>> +        #    can work since it requires the revision on a branch.
>> +        branch_path = os.path.join(ud.clonedir, 'refs/heads/%s' % branchname)
>> +        if os.path.exists(branch_path):
>> +            os.rename(branch_path, '%s.%s' % (branch_path, datetime.datetime.now().strftime("%Y%m%d%H%M%S")))
> 
> Any reason this is done using os.rename() rather than `git branch -m?

It is because this is simpler and keeps it aligned with branch_path; otherwise, we
would need to:
- git branch --list to get the branch list and split them by '\n', remove the star.
- Check branch in the list
- git branch -m to rename the branch

> 
>> +
>> +        init_cmd = "%s init --bare -q" % ud.basecmd
>> +        add_remote_cmd = "%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl))
>> +        fetch_cmd = "%s fetch --progress origin --depth %s %s" % (ud.basecmd, depth, revision)
>> +        # Create both branch and tag for the revision
>> +        branch_cmd = "%s branch -f %s FETCH_HEAD" % (ud.basecmd, branchname)
>> +        tag_cmd = "%s tag -f v%s FETCH_HEAD" % (ud.basecmd, branchname)
> 
> Why not define these as a list instead:
> 
>          cmds = [
>              "%s init --bare -q" % ud.basecmd,
>              "%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl))
>              "%s fetch --progress origin --depth %s %s" % (ud.basecmd, depth, revision),
>              # Create both branch and tag for the revision
>              "%s branch -f %s FETCH_HEAD" % (ud.basecmd, branchname),
>              "%s tag -f v%s FETCH_HEAD" % (ud.basecmd, branchname),
>          ]

Thanks, I will update it together with others' comments in the following days (if
there are any).

// Robert

> 
>> +
>> +        if ud.proto.lower() != 'file':
>> +            bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
>> +
>> +        if not os.path.exists(ud.clonedir):
>> +            bb.utils.mkdirhier(ud.clonedir)
>> +
>> +        progresshandler = GitProgressHandler(d)
>> +        for cmd in (init_cmd, add_remote_cmd, fetch_cmd, branch_cmd, tag_cmd):
>> +            runfetchcmd(cmd, d, log=progresshandler, workdir=ud.clonedir)
>> +
>>       def download(self, ud, d):
>>           """Fetch url"""
>>
>> @@ -360,7 +404,7 @@ class Git(FetchMethod):
>>               else:
>>                   tmpdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
>>                   runfetchcmd("tar -xzf %s" % ud.fullmirror, d, workdir=tmpdir)
>> -                fetch_cmd = "LANG=C %s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
>> +                fetch_cmd = "%s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
>>                   runfetchcmd(fetch_cmd, d, workdir=ud.clonedir)
>>           repourl = self._get_repo_url(ud)
>>
>> @@ -369,27 +413,32 @@ class Git(FetchMethod):
>>               # We do this since git will use a "-l" option automatically for local urls where possible
>>               if repourl.startswith("file://"):
>>                   repourl = repourl[7:]
>> -            clone_cmd = "LANG=C %s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
>> -            if ud.proto.lower() != 'file':
>> -                bb.fetch2.check_network_access(d, clone_cmd, ud.url)
>> -            progresshandler = GitProgressHandler(d)
>> -            runfetchcmd(clone_cmd, d, log=progresshandler)
>> +            if ud.shallow:
>> +                self.shallow_clone_by_fetch(ud, repourl, d)
>> +            else:
>> +                clone_cmd = "%s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
>> +                progresshandler = GitProgressHandler(d)
>> +                if ud.proto.lower() != 'file':
>> +                    bb.fetch2.check_network_access(d, clone_cmd, ud.url)
>> +                runfetchcmd(clone_cmd, d, log=progresshandler)
>>
>>           # Update the checkout if needed
>>           if self.clonedir_need_update(ud, d):
>>               output = runfetchcmd("%s remote" % ud.basecmd, d, quiet=True, workdir=ud.clonedir)
>>               if "origin" in output:
>> -              runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
>> -
>> -            runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
>> -            fetch_cmd = "LANG=C %s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
>> -            if ud.proto.lower() != 'file':
>> -                bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
>> -            progresshandler = GitProgressHandler(d)
>> -            runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
>> -            runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
>> -            runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
>> -            runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
>> +                runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
>> +            if ud.shallow:
>> +                self.shallow_clone_by_fetch(ud, repourl, d)
>> +            else:
>> +                runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
>> +                fetch_cmd = "%s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
>> +                if ud.proto.lower() != 'file':
>> +                    bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
>> +                progresshandler = GitProgressHandler(d)
>> +                runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
>> +                runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
>> +                runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
>> +                runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
>>               try:
>>                   os.unlink(ud.fullmirror)
>>               except OSError as exc:
>> --
>> 2.35.1
> 
> //Peter
>
Peter Kjellerstedt Aug. 29, 2022, 10:46 a.m. UTC | #3
> -----Original Message-----
> From: Robert Yang <liezhi.yang@windriver.com>
> Sent: den 27 augusti 2022 05:37
> To: Peter Kjellerstedt <peter.kjellerstedt@axis.com>; bitbake-devel@lists.openembedded.org
> Subject: Re: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions
> 
> Hi Peter,
> 
> On 8/26/22 22:21, Peter Kjellerstedt wrote:
> >> -----Original Message-----
> >> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Robert Yang
> >> Sent: den 26 augusti 2022 15:11
> >> To: bitbake-devel@lists.openembedded.org
> >> Subject: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions
> >>
> >> * V2
> >>    Fixed typos in commit message
> >
> > Patch history should go after the --- below.
> >
> >> The "git clone --depth" only works for refs, doesn't support revisions, but
> >> "git fetch --depth" supports revisions, so use it to do the shallow clone, the
> >> idea is from "git clone --recurse-submodules --shallow-submodules".
> >>
> >> The workflow is (Only enabled when BB_GIT_SHALLOW = "1"):
> >> $ git init --bare <clonedir>
> >> $ git remote add origin <url>
> >> $ git fetch origin --depth <depth> revision
> >> $ git branch <branchname> FETCH_HEAD
> >> $ git tag v<branchname> FETCH_HEAD
> >>
> >> Here is the testing data based on poky, the testing server has a very good
> >> network bandwidth:
> >>
> >> Add 'BB_GIT_SHALLOW = "1"' conf/local.conf
> >> $ rm -fr tmp downloads # Fresh download for each build
> >> $ time bitbake world --runall=fetch
> >> $ du -sh downloads/git2/
> >>
> >>         Full        Shallow      Saved
> >> --------------------------------------
> >> Time:  15m59s       2m31s       84% (13m28s)
> >> Size:  12G          1.2G        90% (10.8G)
> >>
> >> * The Size is for downloads/git2/, the tarballs are not counted.
> >>
> >> We can see that it saves a lot of download time and disk space, for
> >> example:
> >>
> >> linux-yocto: 2.8G -> 228M
> >> llvm: 2.5G -> 171M
> >> cryptography: 1.5G -> 35M
> >>
> >> And "$ bitbake world" works well.
> >>
> >> This a RFC patch, please feel free to give you comments.
> >>
> >> Signed-off-by: Robert Yang <liezhi.yang@windriver.com>
> >> ---
> >>   bitbake/lib/bb/fetch2/git.py | 83 ++++++++++++++++++++++++++++--------
> >>   1 file changed, 66 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/bitbake/lib/bb/fetch2/git.py b/bitbake/lib/bb/fetch2/git.py
> >> index 4534bd75800..57bb61d5ee1 100644
> >> --- a/bitbake/lib/bb/fetch2/git.py
> >> +++ b/bitbake/lib/bb/fetch2/git.py
> >> @@ -244,6 +244,7 @@ class Git(FetchMethod):
> >>                   ud.unresolvedrev[name] = 'HEAD'
> >>
> >>           ud.basecmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0 -c gc.autoDetach=false -c core.pager=cat"
> >> +        ud.basecmd = "LANG=C %s" % ud.basecmd
> >>
> >>           write_tarballs = d.getVar("BB_GENERATE_MIRROR_TARBALLS") or "0"
> >>           ud.write_tarballs = write_tarballs != "0" or ud.rebaseable
> >> @@ -344,6 +345,49 @@ class Git(FetchMethod):
> >>               return False
> >>           return True
> >>
> >> +    def shallow_clone_by_fetch(self, ud, repourl, d):
> >> +        """
> >> +        Use "git fetch --depth <depth> revision" to implement shallow clone
> >> +        since git can't clone a revision, a better solution should be:
> >> +        "git fetch --depth <depth> revision:<branchname>" but it doesn't work
> >> +        when revision is a tag, e.g.:
> >> +        error: cannot update ref 'refs/heads/master': trying to write
> >> +                non-commit object <revision> to branch 'refs/heads/master'
> >> +        """
> >> +
> >> +        import datetime
> >> +
> >> +        depth = ud.shallow_depths[ud.names[0]]
> >> +        revision = ud.revisions[ud.names[0]]
> >> +        branchname = ud.branches[ud.names[0]]
> >> +        if not branchname:
> >> +            branchname = "master"
> >> +
> >> +        # Rename branchname if it exists which can:
> >> +        # - Avoid conflicts during update
> >> +        # - Keep the revision on a branch so that "git submodule update --recursive"
> >> +        #    can work since it requires the revision on a branch.
> >> +        branch_path = os.path.join(ud.clonedir, 'refs/heads/%s' % branchname)
> >> +        if os.path.exists(branch_path):
> >> +            os.rename(branch_path, '%s.%s' % (branch_path, datetime.datetime.now().strftime("%Y%m%d%H%M%S")))
> >
> > Any reason this is done using os.rename() rather than `git branch -m?
> 
> It is because this is simpler and to keep align with branch_path, otherwise, we
> need:
> - git branch --list to get the branch list and split them by '\n', remove the star.
> - Check branch in the list
> - git branch -m to rename the branch

If you accept that the command can fail, then you do not need to list 
the branches. Just do the rename. If the branch exists, then the 
rename will succeed, otherwise it will fail, but that is expected and 
ignored. 

What I do not like about the use of os.rename() here is that it uses 
internal knowledge of how Git stores its data.

//Peter
Robert Yang Aug. 31, 2022, 3:10 a.m. UTC | #4
On 8/29/22 6:46 PM, Peter Kjellerstedt wrote:
>> -----Original Message-----
>> From: Robert Yang <liezhi.yang@windriver.com>
>> Sent: den 27 augusti 2022 05:37
>> To: Peter Kjellerstedt <peter.kjellerstedt@axis.com>; bitbake-devel@lists.openembedded.org
>> Subject: Re: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions
>>
>> Hi Peter,
>>
>> On 8/26/22 22:21, Peter Kjellerstedt wrote:
>>>> -----Original Message-----
>>>> From: bitbake-devel@lists.openembedded.org <bitbake-devel@lists.openembedded.org> On Behalf Of Robert Yang
>>>> Sent: den 26 augusti 2022 15:11
>>>> To: bitbake-devel@lists.openembedded.org
>>>> Subject: [bitbake-devel] [RFC][PATCH V2] bitbake: fetch2/git: Use git fetch to shallow clone revisions
>>>>
>>>> * V2
>>>>     Fixed typos in commit message
>>>
>>> Patch history should go after the --- below.
>>>
>>>> The "git clone --depth" only works for refs, doesn't support revisions, but
>>>> "git fetch --depth" supports revisions, so use it to do the shallow clone, the
>>>> idea is from "git clone --recurse-submodules --shallow-submodules".
>>>>
>>>> The workflow is (Only enabled when BB_GIT_SHALLOW = "1"):
>>>> $ git init --bare <clonedir>
>>>> $ git remote add origin <url>
>>>> $ git fetch origin --depth <depth> revision
>>>> $ git branch <branchname> FETCH_HEAD
>>>> $ git tag v<branchname> FETCH_HEAD
>>>>
>>>> Here is the testing data based on poky, the testing server has a very good
>>>> network bandwidth:
>>>>
>>>> Add 'BB_GIT_SHALLOW = "1"' conf/local.conf
>>>> $ rm -fr tmp downloads # Fresh download for each build
>>>> $ time bitbake world --runall=fetch
>>>> $ du -sh downloads/git2/
>>>>
>>>>          Full        Shallow      Saved
>>>> --------------------------------------
>>>> Time:  15m59s       2m31s       84% (13m28s)
>>>> Size:  12G          1.2G        90% (10.8G)
>>>>
>>>> * The Size is for downloads/git2/, the tarballs are not counted.
>>>>
>>>> We can see that it saves a lot of download time and disk space, for
>>>> example:
>>>>
>>>> linux-yocto: 2.8G -> 228M
>>>> llvm: 2.5G -> 171M
>>>> cryptography: 1.5G -> 35M
>>>>
>>>> And "$ bitbake world" works well.
>>>>
>>>> This a RFC patch, please feel free to give you comments.
>>>>
>>>> Signed-off-by: Robert Yang <liezhi.yang@windriver.com>
>>>> ---
>>>>    bitbake/lib/bb/fetch2/git.py | 83 ++++++++++++++++++++++++++++--------
>>>>    1 file changed, 66 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/bitbake/lib/bb/fetch2/git.py b/bitbake/lib/bb/fetch2/git.py
>>>> index 4534bd75800..57bb61d5ee1 100644
>>>> --- a/bitbake/lib/bb/fetch2/git.py
>>>> +++ b/bitbake/lib/bb/fetch2/git.py
>>>> @@ -244,6 +244,7 @@ class Git(FetchMethod):
>>>>                    ud.unresolvedrev[name] = 'HEAD'
>>>>
>>>>            ud.basecmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0 -c gc.autoDetach=false -c core.pager=cat"
>>>> +        ud.basecmd = "LANG=C %s" % ud.basecmd
>>>>
>>>>            write_tarballs = d.getVar("BB_GENERATE_MIRROR_TARBALLS") or "0"
>>>>            ud.write_tarballs = write_tarballs != "0" or ud.rebaseable
>>>> @@ -344,6 +345,49 @@ class Git(FetchMethod):
>>>>                return False
>>>>            return True
>>>>
>>>> +    def shallow_clone_by_fetch(self, ud, repourl, d):
>>>> +        """
>>>> +        Use "git fetch --depth <depth> revision" to implement shallow clone
>>>> +        since git can't clone a revision, a better solution should be:
>>>> +        "git fetch --depth <depth> revision:<branchname>" but it doesn't work
>>>> +        when revision is a tag, e.g.:
>>>> +        error: cannot update ref 'refs/heads/master': trying to write
>>>> +                non-commit object <revision> to branch 'refs/heads/master'
>>>> +        """
>>>> +
>>>> +        import datetime
>>>> +
>>>> +        depth = ud.shallow_depths[ud.names[0]]
>>>> +        revision = ud.revisions[ud.names[0]]
>>>> +        branchname = ud.branches[ud.names[0]]
>>>> +        if not branchname:
>>>> +            branchname = "master"
>>>> +
>>>> +        # Rename branchname if it exists which can:
>>>> +        # - Avoid conflicts during update
>>>> +        # - Keep the revision on a branch so that "git submodule update --recursive"
>>>> +        #    can work since it requires the revision on a branch.
>>>> +        branch_path = os.path.join(ud.clonedir, 'refs/heads/%s' % branchname)
>>>> +        if os.path.exists(branch_path):
>>>> +            os.rename(branch_path, '%s.%s' % (branch_path, datetime.datetime.now().strftime("%Y%m%d%H%M%S")))
>>>
>>> Any reason this is done using os.rename() rather than `git branch -m?
>>
>> It is because this is simpler and to keep align with branch_path, otherwise, we
>> need:
>> - git branch --list to get the branch list and split them by '\n', remove the star.
>> - Check branch in the list
>> - git branch -m to rename the branch
> 
> If you accept that the command can fail, then you do not need to list
> the branches. Just do the rename. If the branch exists, then the
> rename will succeed, otherwise it will fail, but that is expected and
> ignored.
> 
> What I do not like about the use of os.rename() here is that it uses
> internal knowledge of how Git stores its data.

Thanks, I will update it.

// Robert

> 
> //Peter
>
diff mbox series

Patch

diff --git a/bitbake/lib/bb/fetch2/git.py b/bitbake/lib/bb/fetch2/git.py
index 4534bd75800..57bb61d5ee1 100644
--- a/bitbake/lib/bb/fetch2/git.py
+++ b/bitbake/lib/bb/fetch2/git.py
@@ -244,6 +244,7 @@  class Git(FetchMethod):
                 ud.unresolvedrev[name] = 'HEAD'
 
         ud.basecmd = d.getVar("FETCHCMD_git") or "git -c core.fsyncobjectfiles=0 -c gc.autoDetach=false -c core.pager=cat"
+        ud.basecmd = "LANG=C %s" % ud.basecmd
 
         write_tarballs = d.getVar("BB_GENERATE_MIRROR_TARBALLS") or "0"
         ud.write_tarballs = write_tarballs != "0" or ud.rebaseable
@@ -344,6 +345,49 @@  class Git(FetchMethod):
             return False
         return True
 
+    def shallow_clone_by_fetch(self, ud, repourl, d):
+        """
+        Use "git fetch --depth <depth> revision" to implement shallow clone
+        since git can't clone a revision, a better solution should be:
+        "git fetch --depth <depth> revision:<branchname>" but it doesn't work
+        when revision is a tag, e.g.:
+        error: cannot update ref 'refs/heads/master': trying to write
+                non-commit object <revision> to branch 'refs/heads/master'
+        """
+
+        import datetime
+
+        depth = ud.shallow_depths[ud.names[0]]
+        revision = ud.revisions[ud.names[0]]
+        branchname = ud.branches[ud.names[0]]
+        if not branchname:
+            branchname = "master"
+
+        # Rename branchname if it exists which can:
+        # - Avoid conflicts during update
+        # - Keep the revision on a branch so that "git submodule update --recursive"
+        #    can work since it requires the revision on a branch.
+        branch_path = os.path.join(ud.clonedir, 'refs/heads/%s' % branchname)
+        if os.path.exists(branch_path):
+            os.rename(branch_path, '%s.%s' % (branch_path, datetime.datetime.now().strftime("%Y%m%d%H%M%S")))
+
+        init_cmd = "%s init --bare -q" % ud.basecmd
+        add_remote_cmd = "%s remote add origin %s" % (ud.basecmd, shlex.quote(repourl))
+        fetch_cmd = "%s fetch --progress origin --depth %s %s" % (ud.basecmd, depth, revision)
+        # Create both branch and tag for the revision
+        branch_cmd = "%s branch -f %s FETCH_HEAD" % (ud.basecmd, branchname)
+        tag_cmd = "%s tag -f v%s FETCH_HEAD" % (ud.basecmd, branchname)
+
+        if ud.proto.lower() != 'file':
+            bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
+
+        if not os.path.exists(ud.clonedir):
+            bb.utils.mkdirhier(ud.clonedir)
+
+        progresshandler = GitProgressHandler(d)
+        for cmd in (init_cmd, add_remote_cmd, fetch_cmd, branch_cmd, tag_cmd):
+            runfetchcmd(cmd, d, log=progresshandler, workdir=ud.clonedir)
+
     def download(self, ud, d):
         """Fetch url"""
 
@@ -360,7 +404,7 @@  class Git(FetchMethod):
             else:
                 tmpdir = tempfile.mkdtemp(dir=d.getVar('DL_DIR'))
                 runfetchcmd("tar -xzf %s" % ud.fullmirror, d, workdir=tmpdir)
-                fetch_cmd = "LANG=C %s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
+                fetch_cmd = "%s fetch -f --progress %s " % (ud.basecmd, shlex.quote(tmpdir))
                 runfetchcmd(fetch_cmd, d, workdir=ud.clonedir)
         repourl = self._get_repo_url(ud)
 
@@ -369,27 +413,32 @@  class Git(FetchMethod):
             # We do this since git will use a "-l" option automatically for local urls where possible
             if repourl.startswith("file://"):
                 repourl = repourl[7:]
-            clone_cmd = "LANG=C %s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
-            if ud.proto.lower() != 'file':
-                bb.fetch2.check_network_access(d, clone_cmd, ud.url)
-            progresshandler = GitProgressHandler(d)
-            runfetchcmd(clone_cmd, d, log=progresshandler)
+            if ud.shallow:
+                self.shallow_clone_by_fetch(ud, repourl, d)
+            else:
+                clone_cmd = "%s clone --bare --mirror %s %s --progress" % (ud.basecmd, shlex.quote(repourl), ud.clonedir)
+                progresshandler = GitProgressHandler(d)
+                if ud.proto.lower() != 'file':
+                    bb.fetch2.check_network_access(d, clone_cmd, ud.url)
+                runfetchcmd(clone_cmd, d, log=progresshandler)
 
         # Update the checkout if needed
         if self.clonedir_need_update(ud, d):
             output = runfetchcmd("%s remote" % ud.basecmd, d, quiet=True, workdir=ud.clonedir)
             if "origin" in output:
-              runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
-
-            runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
-            fetch_cmd = "LANG=C %s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
-            if ud.proto.lower() != 'file':
-                bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
-            progresshandler = GitProgressHandler(d)
-            runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
-            runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
-            runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
-            runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
+                runfetchcmd("%s remote rm origin" % ud.basecmd, d, workdir=ud.clonedir)
+            if ud.shallow:
+                self.shallow_clone_by_fetch(ud, repourl, d)
+            else:
+                runfetchcmd("%s remote add --mirror=fetch origin %s" % (ud.basecmd, shlex.quote(repourl)), d, workdir=ud.clonedir)
+                fetch_cmd = "%s fetch -f --progress %s refs/*:refs/*" % (ud.basecmd, shlex.quote(repourl))
+                if ud.proto.lower() != 'file':
+                    bb.fetch2.check_network_access(d, fetch_cmd, ud.url)
+                progresshandler = GitProgressHandler(d)
+                runfetchcmd(fetch_cmd, d, log=progresshandler, workdir=ud.clonedir)
+                runfetchcmd("%s prune-packed" % ud.basecmd, d, workdir=ud.clonedir)
+                runfetchcmd("%s pack-refs --all" % ud.basecmd, d, workdir=ud.clonedir)
+                runfetchcmd("%s pack-redundant --all | xargs -r rm" % ud.basecmd, d, workdir=ud.clonedir)
             try:
                 os.unlink(ud.fullmirror)
             except OSError as exc: