author     Alberto Pianon <alberto@pianon.eu>  2023-03-22 20:27:42 +0100
committer  Alberto Pianon <alberto@pianon.eu>  2023-03-22 20:27:42 +0100
commit     54251b9bb27241eab9368c9facc150f709ccc3c5 (patch)
tree       55523c91b77bfafab196793704cfe384eb278012
parent     4d9ec332d5bfc8b60b54f8ec2a17d34e35aa903a (diff)
download   bitbake-contrib-54251b9bb27241eab9368c9facc150f709ccc3c5.tar.gz
add upstream source tracing functionality in unpack
do_unpack currently unpacks all SRC_URI entries into WORKDIR, and can even mix files coming from multiple SRC_URI entries into the same subdir, making it hard to trace each source file found in WORKDIR back to its corresponding upstream source. Being able to trace source files back to their upstream source is fundamental for Software Composition Analysis (SCA), Software Bill of Materials (SBoM) generation (create-spdx.bbclass), license compliance checking and CVE checking.

To solve this issue, this patch implements a process that consists of:

1) unpacking each SRC_URI element into a temporary directory;

2) collecting relevant metadata for Software Composition Analysis (file sha1, upstream download location (in SPDX-compliant format), relative path in the upstream repo/package, etc.);

3) moving everything to WORKDIR, and iterating with the next SRC_URI element;

4) saving metadata in a json file after all SRC_URI elements have been processed.

By patching the relevant fetcher modules and adding a bb.trace module, this patch implements steps 1, 3 and 4, while it provides only a bare-bones implementation of step 2, in which all relevant raw metadata (file paths, url, urldata, real destination dir, npmsw dependency tree, git submodule revisions) are collected but neither processed nor saved. This should make it possible to develop a full implementation of step 2 (data collection) in a separate module, independently from the development of the rest of the bb code, i.e. without further patching of bb fetchers.

Signed-off-by: Alberto Pianon <alberto@pianon.eu>
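For illustration, a step-2 implementation could be plugged in roughly as follows (a minimal sketch, not part of this patch: lib/bb/trace/unpack.py is the module probed by bb/trace/__init__.py below, while the use of hashlib and the layout of self.td are assumptions):

# lib/bb/trace/unpack.py -- hypothetical sketch, not included in this patch
import os
import hashlib

from .unpack_base import TraceUnpackBase

class TraceUnpack(TraceUnpackBase):

    def _collect_data(self, u, ud, files, links, destdir, gitsm_revision):
        # record a sha1 and an upstream-relative path for every unpacked
        # file; the layout of self.td used here is purely illustrative
        entry = self.td.setdefault(
            u, {"files": [], "links": [], "gitsm_revision": gitsm_revision})
        for path in files:
            with open(path, "rb") as f:
                sha1 = hashlib.sha1(f.read()).hexdigest()
            entry["files"].append(
                {"path": os.path.relpath(path, destdir), "sha1": sha1})
        for path in links:
            entry["links"].append(os.path.relpath(path, destdir))

    def _process_data(self):
        # hook for grouping/optimizing self.td before write_data();
        # nothing to do in this sketch
        pass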
-rwxr-xr-x  bin/bitbake-selftest        |   1
-rw-r--r--  lib/bb/fetch2/__init__.py   |  56
-rw-r--r--  lib/bb/fetch2/crate.py      |   4
-rw-r--r--  lib/bb/fetch2/git.py        |   6
-rw-r--r--  lib/bb/fetch2/gitannex.py   |   2
-rw-r--r--  lib/bb/fetch2/gitsm.py      |  21
-rw-r--r--  lib/bb/fetch2/hg.py         |   3
-rw-r--r--  lib/bb/fetch2/npm.py        |   3
-rw-r--r--  lib/bb/fetch2/npmsw.py      |  17
-rw-r--r--  lib/bb/tests/trace_base.py  | 202
-rw-r--r--  lib/bb/trace/__init__.py    |  15
-rw-r--r--  lib/bb/trace/unpack_base.py | 283
12 files changed, 590 insertions(+), 23 deletions(-)
diff --git a/bin/bitbake-selftest b/bin/bitbake-selftest
index f25f23b1a..7be354f9e 100755
--- a/bin/bitbake-selftest
+++ b/bin/bitbake-selftest
@@ -26,6 +26,7 @@ tests = ["bb.tests.codeparser",
"bb.tests.data",
"bb.tests.event",
"bb.tests.fetch",
+ "bb.tests.trace_base",
"bb.tests.parse",
"bb.tests.persist_data",
"bb.tests.runqueue",
diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
index 718b9f295..0d0ba95ed 100644
--- a/lib/bb/fetch2/__init__.py
+++ b/lib/bb/fetch2/__init__.py
@@ -27,6 +27,7 @@ import bb.persist_data, bb.utils
import bb.checksum
import bb.process
import bb.event
+from bb.trace import TraceUnpack
__version__ = "2"
_checksum_cache = bb.checksum.FileChecksumCache()
@@ -1284,6 +1285,7 @@ class FetchData(object):
if not self.pswd and "pswd" in self.parm:
self.pswd = self.parm["pswd"]
self.setup = False
+ self.destdir = None
def configure_checksum(checksum_id):
if "name" in self.parm:
@@ -1468,7 +1470,7 @@ class FetchMethod(object):
"""
raise NoMethodError(urldata.url)
- def unpack(self, urldata, rootdir, data):
+ def unpack(self, urldata, rootdir, data, trace):
iterate = False
file = urldata.localpath
@@ -1559,6 +1561,8 @@ class FetchMethod(object):
bb.utils.mkdirhier(unpackdir)
else:
unpackdir = rootdir
+ urldata.destdir = unpackdir
+        urldata.is_unpacked_archive = bool(unpack and cmd)
if not unpack or not cmd:
# If file == dest, then avoid any copies, as we already put the file into dest!
@@ -1574,6 +1578,7 @@ class FetchMethod(object):
if urlpath.find("/") != -1:
destdir = urlpath.rsplit("/", 1)[0] + '/'
bb.utils.mkdirhier("%s/%s" % (unpackdir, destdir))
+ urldata.destdir = "%s/%s" % (unpackdir, destdir)
cmd = 'cp -fpPRH "%s" "%s"' % (file, destdir)
if not cmd:
@@ -1850,26 +1855,69 @@ class Fetch(object):
if not ret:
raise FetchError("URL %s doesn't work" % u, u)
- def unpack(self, root, urls=None):
+ def unpack(self, root, urls=None, trace=None):
"""
- Unpack urls to root
+ Unpack urls to a tmp dir, trace, and then move everything to root
"""
if not urls:
urls = self.urls
+ if trace:
+ # the unpack method is recursively called by gitsm and npmsw
+ # fetchers to unpack modules; in such case, we need to pass through
+ # the trace object and avoid committing changes and moving tmpdir
+ # contents to root
+ destdir = root
+ is_module = True
+ else:
+ trace = TraceUnpack(root, self.d)
+ destdir = trace.tmpdir
+ is_module = False
for u in urls:
ud = self.ud[u]
+ # absolute subdir, destsuffix and subpath params wouldn't work when
+ # unpacking in the tmp dir, convert them to relative paths
+ realroot = os.path.realpath(root)
+ params = [ 'subdir', 'destsuffix', 'subpath' ]
+ for p in params:
+ if not ud.parm.get(p):
+ continue
+ if os.path.isabs(ud.parm[p]):
+ realpath = os.path.realpath(ud.parm[p])
+ if realpath.startswith(realroot):
+ ud.parm[p] = os.path.relpath(realpath, realroot)
ud.setup_localpath(self.d)
if ud.lockfile:
lf = bb.utils.lockfile(ud.lockfile)
- ud.method.unpack(ud, root, self.d)
+ ud.method.unpack(ud, destdir, self.d, trace)
if ud.lockfile:
bb.utils.unlockfile(lf)
+ if ud.type in [ "npmsw", "gitsm" ]:
+ # changes already committed in ud.method.unpack, see
+ # bb.fetch2.npmsw.NpmShrinkWrap.unpack and
+ # bb.fetch2.gitsm.GitSM.unpack
+ if not is_module:
+ trace.move2root()
+ continue
+
+ if hasattr(ud, "nocheckout") and ud.nocheckout:
+ logger.warning(
+ "Can't trace sources for"
+ " %s because repo has not been checked out" % u)
+ continue
+
+ trace.commit(u, ud)
+ trace.move2root()
+
+ if not is_module:
+ trace.write_data()
+ trace.close()
+
def clean(self, urls=None):
"""
Clean files that the fetcher gets or places
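
For context, the tracing stays transparent to existing callers of bb.fetch2.Fetch.unpack(); a do_unpack-style call site would remain as below (a sketch, assuming a populated datastore d; unpack_sources is a hypothetical wrapper, not part of this patch):

# sketch: a do_unpack-style call site, unchanged by this patch
import bb.fetch2

def unpack_sources(d, workdir):
    src_uri = (d.getVar("SRC_URI") or "").split()
    fetcher = bb.fetch2.Fetch(src_uri, d)
    # unpack() now stages each SRC_URI element in a tmp dir under workdir,
    # calls trace.commit() on it, moves the contents to workdir, and finally
    # writes WORKDIR/temp/PN-PV.unpack.trace.json.zst
    fetcher.unpack(workdir)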
diff --git a/lib/bb/fetch2/crate.py b/lib/bb/fetch2/crate.py
index 2889e39c7..1967e79a3 100644
--- a/lib/bb/fetch2/crate.py
+++ b/lib/bb/fetch2/crate.py
@@ -76,7 +76,7 @@ class Crate(Wget):
logger.debug2("Fetching %s to %s" % (ud.url, ud.parm['downloadfilename']))
- def unpack(self, ud, rootdir, d):
+ def unpack(self, ud, rootdir, d, trace):
"""
Uses the crate to build the necessary paths for cargo to utilize it
"""
@@ -101,8 +101,10 @@ class Crate(Wget):
pn = d.getVar('BPN')
if pn == ud.parm.get('name'):
cmd = "tar -xz --no-same-owner -f %s" % thefile
+ ud.destdir = rootdir
else:
cargo_bitbake = self._cargo_bitbake_path(rootdir)
+ ud.destdir = cargo_bitbake
cmd = "tar -xz --no-same-owner -f %s -C %s" % (thefile, cargo_bitbake)
diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
index d0d68538e..3b0a87b36 100644
--- a/lib/bb/fetch2/git.py
+++ b/lib/bb/fetch2/git.py
@@ -419,7 +419,9 @@ class Git(FetchMethod):
# releases of Git LFS.
with tempfile.TemporaryDirectory(dir=d.getVar('DL_DIR')) as tmpdir:
# Do the checkout. This implicitly involves a Git LFS fetch.
- Git.unpack(self, ud, tmpdir, d)
+ Git.unpack(self, ud, tmpdir, d, trace={})
+ # (this is just a temporary unpack to force LFS blob fetching, don't
+ # need to trace anything at this point)
# Scoop up a copy of any stuff that Git LFS downloaded. Merge them into
# the bare clonedir.
@@ -534,7 +536,7 @@ class Git(FetchMethod):
shallow_cmd.extend(shallow_revisions)
runfetchcmd(subprocess.list2cmdline(shallow_cmd), d, workdir=dest)
- def unpack(self, ud, destdir, d):
+ def unpack(self, ud, destdir, d, trace):
""" unpack the downloaded src to destdir"""
subdir = ud.parm.get("subdir")
diff --git a/lib/bb/fetch2/gitannex.py b/lib/bb/fetch2/gitannex.py
index 80a808d88..b1aec94ef 100644
--- a/lib/bb/fetch2/gitannex.py
+++ b/lib/bb/fetch2/gitannex.py
@@ -62,7 +62,7 @@ class GitANNEX(Git):
runfetchcmd("%s annex get" % ud.basecmd, d, workdir=dest)
runfetchcmd("chmod u+w -R %s/.git/annex" % (dest), d, quiet=True, workdir=dest)
- def unpack(self, ud, destdir, d):
+ def unpack(self, ud, destdir, d, trace):
-        Git.unpack(self, ud, destdir, d)
+        Git.unpack(self, ud, destdir, d, trace)
try:
diff --git a/lib/bb/fetch2/gitsm.py b/lib/bb/fetch2/gitsm.py
index f8e239bc5..6fd36d180 100644
--- a/lib/bb/fetch2/gitsm.py
+++ b/lib/bb/fetch2/gitsm.py
@@ -44,6 +44,7 @@ class GitSM(Git):
paths = {}
revision = {}
uris = {}
+ urls = {}
subrevision = {}
def parse_gitmodules(gitmodules):
@@ -137,8 +138,9 @@ class GitSM(Git):
ld.setVar('SRCREV_FORMAT', module)
function(ud, url, module, paths[module], workdir, ld)
+ urls[module] = url
- return submodules != []
+ return submodules, paths, revision, uris, urls, subrevision
def need_update(self, ud, d):
if Git.need_update(self, ud, d):
@@ -199,7 +201,7 @@ class GitSM(Git):
else:
self.process_submodules(ud, ud.clonedir, download_submodule, d)
- def unpack(self, ud, destdir, d):
+ def unpack(self, ud, destdir, d, trace):
def unpack_submodules(ud, url, module, modpath, workdir, d):
url += ";bareclone=1;nobranch=1"
@@ -211,7 +213,7 @@ class GitSM(Git):
try:
newfetch = Fetch([url], d, cache=False)
- newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', module)))
+ newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', module)), trace=trace)
except Exception as e:
logger.error('gitsm: submodule unpack failed: %s %s' % (type(e).__name__, str(e)))
raise
@@ -231,15 +233,20 @@ class GitSM(Git):
logger.error("Unable to set git config core.bare to false for %s" % os.path.join(repo_conf, 'modules', module))
raise
- Git.unpack(self, ud, destdir, d)
+ Git.unpack(self, ud, destdir, d, trace)
- ret = self.process_submodules(ud, ud.destdir, unpack_submodules, d)
+ if not ud.nocheckout:
+ trace.commit(ud.url, ud)
- if not ud.bareclone and ret:
+        submodules, paths, revision, uris, urls, subrevision = self.process_submodules(ud, ud.destdir, unpack_submodules, d)
+
+ if not ud.bareclone and submodules:
# All submodules should already be downloaded and configured in the tree. This simply sets
# up the configuration and checks out the files. The main project config should remain
# unmodified, and no download from the internet should occur.
- runfetchcmd("%s submodule update --recursive --no-fetch" % (ud.basecmd), d, quiet=True, workdir=ud.destdir)
+ for m in submodules:
+ runfetchcmd("%s submodule update --recursive --no-fetch %s" % (ud.basecmd, paths[m]), d, quiet=True, workdir=ud.destdir)
+ trace.commit(urls[m], ud, subdir=paths[m], gitsm_revision=subrevision[m])
def implicit_urldata(self, ud, d):
import shutil, subprocess, tempfile
diff --git a/lib/bb/fetch2/hg.py b/lib/bb/fetch2/hg.py
index 063e13008..478b9d6ec 100644
--- a/lib/bb/fetch2/hg.py
+++ b/lib/bb/fetch2/hg.py
@@ -234,7 +234,7 @@ class Hg(FetchMethod):
def localpath(self, ud, d):
return ud.pkgdir
- def unpack(self, ud, destdir, d):
+ def unpack(self, ud, destdir, d, trace):
"""
Make a local clone or export for the url
"""
@@ -242,6 +242,7 @@ class Hg(FetchMethod):
revflag = "-r %s" % ud.revision
subdir = ud.parm.get("destsuffix", ud.module)
codir = "%s/%s" % (destdir, subdir)
+ ud.destdir = codir
scmdata = ud.parm.get("scmdata", "")
if scmdata != "nokeep":
diff --git a/lib/bb/fetch2/npm.py b/lib/bb/fetch2/npm.py
index 8a179a339..788ca364c 100644
--- a/lib/bb/fetch2/npm.py
+++ b/lib/bb/fetch2/npm.py
@@ -289,11 +289,12 @@ class Npm(FetchMethod):
self._setup_proxy(ud, d)
ud.proxy.download()
- def unpack(self, ud, rootdir, d):
+ def unpack(self, ud, rootdir, d, trace):
"""Unpack the downloaded archive"""
destsuffix = ud.parm.get("destsuffix", "npm")
destdir = os.path.join(rootdir, destsuffix)
npm_unpack(ud.localpath, destdir, d)
+ ud.destdir = destdir
def clean(self, ud, d):
"""Clean any existing full or partial download"""
diff --git a/lib/bb/fetch2/npmsw.py b/lib/bb/fetch2/npmsw.py
index 36fcbfba1..9c9ed75cf 100644
--- a/lib/bb/fetch2/npmsw.py
+++ b/lib/bb/fetch2/npmsw.py
@@ -185,6 +185,8 @@ class NpmShrinkWrap(FetchMethod):
"extrapaths": extrapaths,
"destsuffix": destsuffix,
"unpack": unpack,
+ "deptree": deptree,
+ "version": version,
})
try:
@@ -248,35 +250,38 @@ class NpmShrinkWrap(FetchMethod):
"""Fetch url"""
ud.proxy.download()
- def unpack(self, ud, rootdir, d):
+ def unpack(self, ud, rootdir, d, trace):
"""Unpack the downloaded dependencies"""
- destdir = d.getVar("S")
- destsuffix = ud.parm.get("destsuffix")
- if destsuffix:
- destdir = os.path.join(rootdir, destsuffix)
+ destsuffix = ud.parm.get("destsuffix") or os.path.relpath(d.getVar("S"), d.getVar("WORKDIR"))
+ destdir = os.path.join(rootdir, destsuffix)
+ ud.destdir = destdir
bb.utils.mkdirhier(destdir)
bb.utils.copyfile(ud.shrinkwrap_file,
os.path.join(destdir, "npm-shrinkwrap.json"))
+ trace.commit(ud.url, ud)
auto = [dep["url"] for dep in ud.deps if not dep["localpath"]]
manual = [dep for dep in ud.deps if dep["localpath"]]
if auto:
- ud.proxy.unpack(destdir, auto)
+ ud.proxy.unpack(destdir, auto, trace=trace)
for dep in manual:
depdestdir = os.path.join(destdir, dep["destsuffix"])
if dep["url"]:
npm_unpack(dep["localpath"], depdestdir, d)
+ u = dep["url"]
else:
depsrcdir= os.path.join(destdir, dep["localpath"])
+ u = dep["localpath"]
if dep["unpack"]:
npm_unpack(depsrcdir, depdestdir, d)
else:
bb.utils.mkdirhier(depdestdir)
cmd = 'cp -fpPRH "%s/." .' % (depsrcdir)
runfetchcmd(cmd, d, workdir=depdestdir)
+ trace.commit(u, ud, subdir=dep["destsuffix"])
def clean(self, ud, d):
"""Clean any existing full or partial download"""
diff --git a/lib/bb/tests/trace_base.py b/lib/bb/tests/trace_base.py
new file mode 100644
index 000000000..d750a4086
--- /dev/null
+++ b/lib/bb/tests/trace_base.py
@@ -0,0 +1,202 @@
+
+# Copyright (C) 2023 Alberto Pianon <pianon@array.eu>
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+
+import os
+import re
+import unittest
+import tempfile
+from pathlib import Path
+import subprocess
+
+import bb.trace.unpack_base
+
+def create_src_dst(tmpdir):
+ src_dir = os.path.join(tmpdir, "src/")
+ dst_dir = os.path.join(tmpdir, "dst/")
+ os.makedirs(src_dir)
+ os.makedirs(dst_dir)
+ return Path(src_dir), Path(dst_dir)
+
+def make_dirname(path):
+ dirname = os.path.dirname(path)
+ if dirname:
+ os.makedirs(dirname, exist_ok=True)
+
+def create_file(path, content):
+ make_dirname(path)
+ with open(path, "w") as f:
+ f.write(content)
+
+def create_link(path, target):
+ make_dirname(path)
+ os.symlink(target, path)
+
+def get_tree(path):
+ curdir = os.getcwd()
+ os.chdir(path)
+ tree = []
+ for root, dirs, files in os.walk("."):
+ for f in dirs + files:
+ tree.append(re.sub(r"^\.\/", "", os.path.join(root, f)))
+ os.chdir(curdir)
+ return sorted(tree)
+
+def read_file(path):
+ with open(path) as f:
+ return f.read()
+
+class MoveContentsTest(unittest.TestCase):
+
+ def test_dir_merge_and_file_overwrite(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "dir/subdir/file.txt", "new")
+ create_file(dst_dir / "dir/subdir/file.txt", "old")
+ create_file(dst_dir / "dir/subdir/file1.txt", "old")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ expected_dst_tree = [
+ "dir",
+ "dir/subdir",
+ "dir/subdir/file.txt",
+ "dir/subdir/file1.txt"
+ ]
+ self.assertEqual(get_tree(src_dir), [])
+ self.assertEqual(get_tree(dst_dir), expected_dst_tree)
+ self.assertEqual(read_file(dst_dir / "dir/subdir/file.txt"), "new")
+ self.assertEqual(read_file(dst_dir / "dir/subdir/file1.txt"), "old")
+
+ def test_file_vs_symlink_conflicts(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+
+ create_file(src_dir / "dir/subdir/fileA.txt", "new")
+ create_file(src_dir / "dir/fileB.txt", "new")
+ create_link(src_dir / "file.txt", "dir/subdir/fileA.txt")
+
+ create_file(dst_dir / "dir/subdir/fileA.txt", "old")
+ create_link(dst_dir / "dir/fileB.txt", "subdir/fileA.txt")
+ create_file(dst_dir / "file.txt", "old")
+
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertEqual(get_tree(src_dir), [])
+ self.assertTrue(os.path.islink(dst_dir / "file.txt"))
+ self.assertEqual(
+ os.readlink(dst_dir / "file.txt"),
+ "dir/subdir/fileA.txt"
+ )
+ self.assertFalse(os.path.islink(dst_dir / "dir/fileB.txt"))
+ self.assertEqual(read_file(dst_dir / "dir/fileB.txt"), "new")
+
+ def test_dir_vs_file_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item0/content.txt", "hello")
+ create_file(dst_dir / "items/item0", "there")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertEqual(get_tree(src_dir), [])
+ self.assertTrue(os.path.isdir(dst_dir / "items/item0"))
+ self.assertEqual(
+ read_file(dst_dir / "items/item0/content.txt"), "hello")
+
+ def test_dir_vs_symlink_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item0/content.txt", "hello")
+ create_file(dst_dir / "items/item1/content.txt", "there")
+ create_link(dst_dir / "items/item0", "item1")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertEqual(get_tree(src_dir), [])
+ self.assertFalse(os.path.islink(dst_dir / "items/item0"))
+ self.assertEqual(
+ read_file(dst_dir / "items/item0/content.txt"), "hello")
+ self.assertEqual(
+ read_file(dst_dir / "items/item1/content.txt"), "there")
+
+ def test_symlink_vs_empty_dir_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item1/content.txt", "there")
+ create_link(src_dir / "items/item0", "item1")
+ os.makedirs(dst_dir / "items/item0")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertEqual(get_tree(src_dir), [])
+ self.assertTrue(os.path.islink(dst_dir / "items/item0"))
+ self.assertEqual(read_file(dst_dir / "items/item0/content.txt"), "there")
+
+ def test_symlink_vs_nonempty_dir_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item1/content.txt", "there")
+ create_link(src_dir / "items/item0", "item1")
+ create_file(dst_dir / "items/item0/content.txt", "hello")
+            with self.assertRaises(bb.trace.TraceException):
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+
+ def test_file_vs_empty_dir_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item0", "test")
+ os.makedirs(dst_dir / "items/item0")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertEqual(get_tree(src_dir), [])
+            self.assertTrue(os.path.isfile(dst_dir / "items/item0"))
+
+ def test_file_vs_nonempty_dir_conflict(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ create_file(src_dir / "items/item0", "test")
+ create_file(dst_dir / "items/item0/content.txt", "test")
+            with self.assertRaises(bb.trace.TraceException):
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+
+ def test_git_dir(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ src_dir, dst_dir = create_src_dst(tmpdir)
+ git_repo = src_dir / "src/my_git_repo"
+ create_file(git_repo / "foo.txt", "hello")
+ subprocess.check_output(["git", "init"], cwd=git_repo)
+ create_file(dst_dir / "src/my_git_repo/content.txt", "there")
+ bb.trace.unpack_base.move_contents(src_dir, dst_dir)
+ self.assertFalse(
+ os.path.exists(dst_dir / "src/my_git_repo/content.txt"))
+ # git clone dir should be pruned if already existing
+ self.assertEqual(
+ read_file(dst_dir / "src/my_git_repo/foo.txt"), "hello")
+ self.assertTrue(os.path.isdir(dst_dir / "src/my_git_repo/.git"))
+
+
+class FindAllFilesAndLinksTest(unittest.TestCase):
+
+ def test_findall_files_and_links(self):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ tmpdir = Path(tmpdir)
+ files = {
+ str(tmpdir/"foo/example/example.txt"): "ciao",
+ str(tmpdir/"foo/foo.txt"): "foo",
+ str(tmpdir/"foo/foo2.txt"): "foo2",
+ str(tmpdir/"README"): "hello",
+ }
+ ignored = {
+ str(tmpdir/".git"): "fake",
+ str(tmpdir/"foo/.git/dummy"): "dummy"
+ }
+ allfiles = files.copy()
+ allfiles.update(ignored)
+ links = {
+ str(tmpdir/"example"): "foo/example", # link to dir
+ str(tmpdir/"example.txt"): "foo/example/example.txt", # link to file
+ }
+ for path, content in allfiles.items():
+ create_file(path, content)
+ for path, target in links.items():
+ create_link(path, target)
+ res_files, res_links = bb.trace.unpack_base.findall_files_and_links(tmpdir, exclude=['.git'])
+ self.assertEqual(res_files, sorted(list(files.keys())))
+ self.assertEqual(res_links, sorted(list(links.keys())))
+
+
+if __name__ == '__main__':
+ unittest.main()
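
Note: since the new test module only depends on bb.trace.unpack_base, it can also be run standalone with the stock unittest runner (e.g. python3 -m unittest bb.tests.trace_base from the bitbake lib/ directory), in addition to being picked up by bitbake-selftest.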
diff --git a/lib/bb/trace/__init__.py b/lib/bb/trace/__init__.py
new file mode 100644
index 000000000..efa35ba14
--- /dev/null
+++ b/lib/bb/trace/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (C) 2023 Alberto Pianon <pianon@array.eu>
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+
+class TraceException(Exception):
+ pass
+
+try:
+ from .unpack import TraceUnpack
+except ImportError:
+ # fallback to base class (which implements the process necessary to trace
+ # upstream data but does not actually collect any data)
+ from .unpack_base import TraceUnpackBase as TraceUnpack
+
diff --git a/lib/bb/trace/unpack_base.py b/lib/bb/trace/unpack_base.py
new file mode 100644
index 000000000..e073977ba
--- /dev/null
+++ b/lib/bb/trace/unpack_base.py
@@ -0,0 +1,283 @@
+"""Module implementing a base process for upstream source tracing
+
+The process consists of:
+
+- creating a temporary directory where each SRC_URI element is unpacked (if we
+  unpacked directly to WORKDIR, the latter could contain files coming from the
+  unpacking of other SRC_URI elements or from other tasks, making it much
+  harder to trace files for each SRC_URI element individually);
+
+- collecting relevant metadata for Software Composition Analysis (file sha1,
+ upstream download location (in SPDX-compliant format), path in the upstream
+ repo/package, etc.);
+
+- moving everything to WORKDIR, and iterating with the next SRC_URI element;
+
+- saving metadata in a json file after all elements have been processed.
+"""
+
+# Copyright (C) 2023 Alberto Pianon <pianon@array.eu>
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+
+import os
+import errno
+import json
+import tempfile
+
+import bb.utils
+import bb.process
+import bb.compress.zstd
+
+from bb.trace import TraceException
+
+def scandir(path):
+ with os.scandir(path) as scan:
+ return { e.name: e for e in scan }
+
+def is_real_dir(e):
+ return e.is_dir() and not e.is_symlink()
+
+def is_real_and_nonempty_dir(e):
+ return is_real_dir(e) and scandir(e.path)
+
+def is_file_or_symlink(e):
+ return e.is_file() or e.is_symlink()
+
+def is_git_dir(e):
+ path_scandir = scandir(e.path)
+ if ".git" in path_scandir and path_scandir[".git"].is_dir():
+ try:
+ bb.process.run(
+ ["git", "rev-parse", "--is-inside-work-tree"], cwd=e.path)
+ return True
+ except bb.process.ExecutionError:
+ return False
+ return False
+
+def check_is_real_dir(path, name):
+ if not os.path.exists(path) or os.path.islink(path) or os.path.isfile(path):
+ raise TraceException(
+ "%s path %s is not a directory" % (name, path))
+
+def move_contents(src_dir, dst_dir):
+ """Move and merge contents from src_dir to dst_dir
+
+ Conflict resolution criteria:
+
+ - if a file (or symlink) exists both in src_dir and in dst_dir, the
+ file/symlink in dst_dir will be overwritten;
+
+ - if a subdirectory exists both in src_dir and in dst_dir, their contents
+ will be merged, and in case of file/symlink conflicts, files/symlinks in
+ dst_dir will be overwritten - unless src_dir is a git repo; in such a
+ case, dst_dir will be pruned and src_dir will be moved to dst_dir, for
+ consistency with bb.fetch2.git.Git.unpack method's behavior (which prunes
+ clone dir if already existing, before cloning)
+
+ - if the same relative path exists both in src_dir and in dst_dir, but the
+ path in src_dir is a directory and the path in dst_dir is a file/symlink,
+ the latter will be overwritten;
+
+ - if instead the path in src_dir is a file and the path in dst_dir is a
+ directory, the latter will be overwritten only if it is empty, otherwise
+ an exception will be raised.
+
+ In order to reduce execution time, os.scandir is used instead of os.listdir,
+ and os.rename is used to move/overwrite files, as well as to move src dir
+ subdirectories that do not exist or are empty in dst_dir. To make os.rename
+ work as intended, both src_dir and dst_dir must reside in the same
+ filesystem.
+ """
+
+ check_is_real_dir(src_dir, "Source")
+ check_is_real_dir(dst_dir, "Destination")
+
+ if os.lstat(src_dir).st_dev != os.lstat(dst_dir).st_dev:
+ raise TraceException(
+ "Source %s and destination %s must be in the same filesystem" %
+ (src_dir, dst_dir)
+ )
+
+ src_scandir = scandir(src_dir)
+ dst_scandir = scandir(dst_dir)
+
+ for src_name, src in src_scandir.items():
+ dst = dst_scandir.get(src_name)
+ if dst:
+ # handle conflicts
+ if is_real_dir(src) and is_real_and_nonempty_dir(dst):
+ if is_git_dir(src):
+ bb.utils.prunedir(dst.path)
+ else:
+ move_contents(src.path, dst.path)
+ os.rmdir(src.path)
+ continue
+ elif is_real_dir(src) and is_file_or_symlink(dst):
+ os.remove(dst.path)
+ elif is_file_or_symlink(src) and is_real_dir(dst):
+ try:
+ os.rmdir(dst.path)
+ except OSError as e:
+                if e.errno == errno.ENOTEMPTY:
+ raise TraceException(
+ "Error while moving %s contents to %s, cannot move"
+ " %s to %s: source is a file or a symlink, while"
+ " destination is a non-empty directory."
+ % (src_dir, dst_dir, src.path, dst.path)
+ )
+ else:
+ raise e
+ dst_path = dst.path if dst else os.path.join(dst_dir, src_name)
+ os.rename(src.path, dst_path)
+
+def findall_files_and_links(path, exclude=()):
+    """recursively find all files and links in path, excluding dir and file names
+ in exclude.
+
+ Returns tuple of sorted lists of file and link paths. Sorting is for
+ reproducibility (order of files returned by os.scandir may randomly vary)
+
+ It uses os.scandir instead of os.walk or os.listdir because it's much faster
+ """
+ files = []
+ links = []
+ with os.scandir(path) as scan:
+ for e in scan:
+ if e.name in exclude:
+ continue
+ if e.is_symlink():
+ links.append(e.path)
+ elif e.is_file():
+ files.append(e.path)
+ elif e.is_dir():
+ _files, _links = findall_files_and_links(e.path, exclude)
+ files += _files
+ links += _links
+ return sorted(files), sorted(links)
+
+
+class TraceUnpackBase:
+ """base class for implementing a process for upstream source tracing
+ See module help for more details on the process.
+
+    This is just a base class: it implements the process but does not collect
+    any data. As such, it can be used to test whether the process correctly
+    integrates with all bb fetchers.
+
+    To be of actual use, it should be subclassed, implementing the
+    _collect_data() and _process_data() methods.
+
+ Method call order:
+ - __init__()
+ - commit()
+ - move2root()
+ - write_data()
+ - close()
+ """
+
+ def __init__(self, root, d):
+ """initialize properties and create temporary directory in root
+
+        The temporary unpack dir is created inside 'root' so that both are in
+        the same filesystem, which allows contents to be moved faster at the end.
+
+ If some basic variables are missing from datastore (WORKDIR, PN, PV,
+ BBLAYERS), it means that we are inside a fetcher test
+ (self.is_fetcher_test=True); in such case, some steps (commit and
+ write_data) should be skipped because they would miss required data.
+ """
+
+ self.root = root
+ self.d = d
+ self.td = {}
+ required_vars = [ "WORKDIR", "PN", "PV", "BBLAYERS" ]
+ for var in required_vars:
+ if not self.d.getVar(var):
+ self.is_fetcher_test = True
+ break
+ else:
+ self.is_fetcher_test = False
+ if not os.path.exists(root):
+ bb.utils.mkdirhier(root)
+ self.tmpdir = tempfile.mkdtemp(dir=root)
+
+ def commit(self, u, ud, subdir=None, gitsm_revision=None):
+ """collect and infer metadata by scanning self.tmpdir after unpack
+
+ This method is generally called by bb.fetch2.Fetch.unpack() (which is a
+ wrapper for fetcher-specific unpack methods).
+
+        However, in two cases (the gitsm and npmsw fetchers) it also needs to
+        be called by the fetcher-specific unpack method, because both gitsm and
+        npmsw generate multiple src uris (modules) from one single SRC_URI
+        element (main git repo or npm-shrinkwrap.json), unpack the "main"
+        SRC_URI element and then unpack the generated src uris. Each such
+        generated src uri corresponds to a separate upstream package (git
+        submodule or npm module) which needs to be separately traced.
+
+ Params are:
+
+ - u -> str: src uri of the upstream repo/package that is being processed
+ (eg.
+ git://github.com/containernetworking/cni.git;nobranch=1;name=cni;protocol=https)
+
+ - ud -> bb.fetch2.FetchData: src uri fetch data object. It usually
+ corresponds to the fetch data of u, but when called by gitsm and npmsw
+ fetchers u is the src uri of the (sub)module being processed, while ud
+ is the src uri fetch data of the "main" SRC_URI element (main git repo
+ or npm-shrinkwrap.json file). NOTE: ud.destdir is the destination
+ directory where the "main" SRC_URI element is unpacked; it should be
+ used to infer each file's path in the upstream repo/package
+
+ - subdir -> str: subdir of ud.destdir where the (sub)module has been
+ unpacked (only for gitsm and npmsw fetchers). It should be used to
+ infer each file's path in the upstream repo/package
+
+ - gitsm_revision -> str: revision of the git submodule that is being
+ processed
+ """
+ if self.is_fetcher_test:
+ return
+ destdir = os.path.join(ud.destdir, subdir) if subdir else ud.destdir
+ files, links = findall_files_and_links(
+ destdir, exclude=['.git', '.hg', '.svn', 'node_modules'])
+ self._collect_data(u, ud, files, links, destdir, gitsm_revision)
+
+ def _collect_data(self, u, ud, files, links, destdir, gitsm_revision):
+ """collect SCA metadata on the committed files. Not implemented"""
+ pass
+
+ def move2root(self):
+ """move all files from temporary directory to root (=WORKDIR, generally)
+
+ It needs to be a separate method from commit() because of the way gitsm
+ and npmsw fetchers work: with such fetchers, we cannot move anything to
+ root before all git|npm (sub)modules have been processed, but we need to
+ commit trace data for each (sub)module individually, so commit() and
+ move2root() need to be two separate methods.
+ """
+ move_contents(self.tmpdir, self.root)
+
+ def _process_data(self):
+ """post-process self.td - eg. to group data and optimize json output.
+ Not implemented"""
+ pass
+
+ def write_data(self):
+ if self.is_fetcher_test:
+ return
+ self._process_data()
+ path = "%s/temp/%s-%s.unpack.trace.json.zst" % (
+ self.d.getVar("WORKDIR"), self.d.getVar("PN"), self.d.getVar("PV"))
+ # FIXME find the right way and place to store this file so that it
+ # can be picked up by create-spdx even when do_unpack is not run
+ # because built component is in sstate-cache
+ with bb.compress.zstd.open(path, "wt", encoding="utf-8") as f:
+ json.dump(self.td, f)
+ f.flush()
+
+ def close(self):
+ os.rmdir(self.tmpdir)
+ del self.td
+
+
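
A downstream consumer (e.g. create-spdx) could load the resulting file roughly as follows (a sketch; load_unpack_trace is a hypothetical helper, and the structure of the decoded dict depends on what _collect_data()/_process_data() put into self.td, which this base class leaves empty):

# sketch: loading the trace data written by TraceUnpackBase.write_data()
import json

import bb.compress.zstd

def load_unpack_trace(workdir, pn, pv):
    path = "%s/temp/%s-%s.unpack.trace.json.zst" % (workdir, pn, pv)
    with bb.compress.zstd.open(path, "rt", encoding="utf-8") as f:
        return json.load(f)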