path: root/scripts
diff options
author Alex Kiernan 2023-12-23 14:31:18 +0000
committer Richard Purdie 2023-12-28 10:54:51 +0000
commit 2fa1b25d7485bfbb92bcc33067beb6751218b36a (patch)
tree 62c40fc09b89b5b21845c815dc46bfe10b95ae2b /scripts
parent 692dd762a0c817797c28381c6169205fbaeb2705 (diff)
sstate-cache-management: Rewrite in python
This (should be) a drop-in replacement for the original sstate-cache-management script.
Signed-off-by: Alex Kiernan
Signed-off-by: Alexandre Belloni
Diffstat (limited to 'scripts')
1 files changed, 329 insertions, 0 deletions
diff --git a/scripts/sstate-cache-management.py b/scripts/sstate-cache-management.py
new file mode 100755
index 0000000000..09b7aa2aef
--- /dev/null
+++ b/scripts/sstate-cache-management.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+# Copyright OpenEmbedded Contributors
+# SPDX-License-Identifier: MIT
+import argparse
+import os
+import re
+import sys
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+# Refuse to run on interpreters older than the version this script requires.
+if sys.version_info < (3, 8, 0):
+    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")
+# Every file name handled by this script starts with this prefix.
+SSTATE_PREFIX = "sstate:"
+# Extension of the sstate archive itself.
+SSTATE_EXTENSION = ".tar.zst"
+# File name endings that identify an sstate object: the archive and its
+# tracking files.
+# .siginfo.done files are mentioned in the original script?
+# NOTE(review): this tuple was rebuilt from its uses (str.endswith() and the
+# regex alternation below); confirm whether further suffixes belong here.
+SSTATE_SUFFIXES = (
+    SSTATE_EXTENSION,
+    f"{SSTATE_EXTENSION}.siginfo",
+)
+# Splits an sstate file name into named fields. re.X makes the whitespace and
+# line breaks inside the pattern insignificant.
+RE_SSTATE_PKGSPEC = re.compile(
+    rf"""sstate:(?P<pn>[^:]*):
+         (?P<package_target>[^:]*):
+         (?P<pv>[^:]*):
+         (?P<pr>[^:]*):
+         (?P<sstate_pkgarch>[^:]*):
+         (?P<sstate_version>[^_]*):
+         (?P<bb_unihash>[^_]*)_
+         (?P<bb_task>[^:]*)
+         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
+    re.X,
+)
+# Really we'd like something like a Path subclass which implements a stat
+# cache here, unfortunately there's no good way to do that transparently
+# (yet).
+@dataclass
+class SstateEntry:
+    """Class for keeping track of an entry in sstate-cache.
+
+    Any attribute that is not a real field is looked up as a named group of
+    ``match``, so the fields parsed from the file name (``bb_task``, ``ext``,
+    ...) read like ordinary attributes.
+    """
+
+    # Location of the file.
+    path: Path
+    # Regex match object for the file name; supplies the named groups.
+    match: re.Match
+    # Filled in after construction by whoever stats the file; None until then.
+    stat_result: os.stat_result = None
+
+    def __hash__(self):
+        """Hash on the path only, so entries can be stored in sets."""
+        return self.path.__hash__()
+
+    def __getattr__(self, name):
+        """Return the named regex group ``name``.
+
+        Only invoked when normal attribute lookup fails. Raises
+        AttributeError (not IndexError) for unknown names so that hasattr()
+        and similar protocols keep working.
+        """
+        # Never delegate the lookup of "match" itself or of dunder names:
+        # doing so would recurse when the instance is not fully initialised.
+        if name == "match" or name.startswith("__"):
+            raise AttributeError(name)
+        try:
+            return self.match.group(name)
+        except IndexError:
+            raise AttributeError(name) from None
+# this is what's in the original script; as far as I can tell, it's an
+# implementation artefact which we don't need?
+def find_archs():
+    """Print the set of architecture names discovered under ``../..``.
+
+    The set is built from the AVAILTUNES values in the machine include files,
+    the machine configuration file names, "allarch" and the machine name
+    reported by os.uname(); dashes are replaced by underscores.
+    """
+    # all_archs
+    builder_arch = os.uname().machine
+    layer_paths = [Path("../..")]
+    tune_archs = set()
+    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
+    for path in layer_paths:
+        for tunefile in path.glob("meta*/conf/machine/include/**/*"):
+            if not tunefile.is_file():
+                continue
+            with open(tunefile) as f:
+                for line in f:
+                    m = re_tune.match(line)
+                    if m:
+                        # The quoted value is a whitespace separated list.
+                        tune_archs.update(m.group(1).split())
+    # all_machines
+    machine_archs = set()
+    for path in layer_paths:
+        for machine_file in path.glob("meta*/conf/machine/*.conf"):
+            # Drop the trailing ".conf" to get the machine name.
+            machine_archs.add(machine_file.name[:-5])
+    extra_archs = set()
+    all_archs = {
+        arch.replace("-", "_")
+        for arch in machine_archs | tune_archs | {"allarch", builder_arch}
+    } | extra_archs
+    print(all_archs)
+# again, not needed?
+def find_tasks(paths=()):
+    """Print the set of ``bb_task`` values found in ``paths``.
+
+    ``paths`` is an iterable of objects exposing a ``bb_task`` attribute; it
+    defaults to empty so the function can still be called without arguments.
+    """
+    print({p.bb_task for p in paths})
+def collect_sstate_paths(args):
+    """Collect every sstate file below ``args.cache_dir``.
+
+    Returns a set of SstateEntry objects, one per file whose name starts with
+    SSTATE_PREFIX and ends with one of SSTATE_SUFFIXES. When
+    ``args.remove_duplicated`` is set, each entry's ``stat_result`` is filled
+    in using ``args.jobs`` worker threads.
+
+    Raises ValueError if such a file name cannot be parsed by
+    RE_SSTATE_PKGSPEC.
+    """
+
+    def scandir(path, paths):
+        # Assume everything is a directory; by not checking we avoid needing an
+        # additional stat which is potentially a synchronous roundtrip over NFS
+        try:
+            for p in path.iterdir():
+                filename = p.name
+                if filename.startswith(SSTATE_PREFIX):
+                    if filename.endswith(SSTATE_SUFFIXES):
+                        m = RE_SSTATE_PKGSPEC.match(filename)
+                        # Explicit check instead of assert, which would be
+                        # stripped when running with -O.
+                        if m is None:
+                            raise ValueError(f"unrecognised sstate file name: {p}")
+                        paths.add(SstateEntry(p, m))
+                    # ignore other things (includes things like lockfiles)
+                else:
+                    scandir(p, paths)
+        except NotADirectoryError:
+            pass
+
+    paths = set()
+    # TODO: parallelise scandir
+    scandir(Path(args.cache_dir), paths)
+
+    def path_stat(p):
+        p.stat_result = p.path.lstat()
+
+    if args.remove_duplicated:
+        # This is probably slightly performance negative on a local filesystem
+        # when we interact with the GIL; over NFS it's a massive win.
+        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
+            # Drain the iterator so an exception raised by lstat() in a worker
+            # is re-raised here instead of being silently discarded.
+            for _ in executor.map(path_stat, paths):
+                pass
+    return paths
+def remove_by_stamps(args, paths):
+    """Return the entries of ``paths`` not referenced by any stamps directory.
+
+    The hash that follows ".sigdata." or "_setscene." in each stamp file name
+    under ``args.stamps_dir`` is collected; an entry is returned (i.e. marked
+    for removal) when its ``bb_unihash`` is not among those hashes.
+
+    Raises NotADirectoryError if a given stamps directory does not exist.
+    """
+    # (glob pattern, regex extracting the hash from the matching file name)
+    stamp_patterns = (
+        ("*/*/*.do_*.sigdata.*", re.compile(r"do_.*.sigdata\.([^.]*)")),
+        ("*/*/*.do_*_setscene.*", re.compile(r"do_.*_setscene\.([^.]*)")),
+    )
+    all_sums = set()
+    for stamps_dir in args.stamps_dir:
+        stamps_path = Path(stamps_dir)
+        # Explicit check rather than assert: with -O an assert disappears, the
+        # glob on a missing directory yields nothing, and every cache entry
+        # would then be reported as removable.
+        if not stamps_path.is_dir():
+            raise NotADirectoryError(f"stamps directory not found: {stamps_dir}")
+        for glob_pattern, re_hash in stamp_patterns:
+            for x in stamps_path.glob(glob_pattern):
+                m = re_hash.search(x.name)
+                if m:
+                    all_sums.add(m.group(1))
+    return [p for p in paths if p.bb_unihash not in all_sums]
+def remove_duplicated(args, paths):
+    """Return the entries of ``paths`` that are superseded by a newer duplicate.
+
+    Entries sharing pn, sstate_pkgarch, bb_task and ext are duplicates of each
+    other; only the one with the greatest ``stat_result.st_mtime`` is kept and
+    all others are returned. Every entry must already have ``stat_result``
+    populated. ``args`` is unused.
+    """
+    keep = {}
+    remove = []
+    for p in paths:
+        # Skip populate_lic as it produces duplicates in a normal build
+        #
+        # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
+        if p.bb_task == "populate_lic":
+            continue
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
+        current = keep.get(sstate_sig)
+        if current is None:
+            keep[sstate_sig] = p
+        elif p.stat_result.st_mtime > current.stat_result.st_mtime:
+            # p is newer: the previously kept entry becomes the duplicate.
+            remove.append(current)
+            keep[sstate_sig] = p
+        else:
+            remove.append(p)
+    return remove
+def remove_orphans(args, paths):
+    """Return entries whose group has no archive file.
+
+    Entries are grouped by pn, sstate_pkgarch and bb_task; when no member of a
+    group has the extension SSTATE_EXTENSION, the whole group is returned.
+    ``args`` is unused.
+    """
+    pathsigs = defaultdict(list)
+    for p in paths:
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
+        pathsigs[sstate_sig].append(p)
+    remove = []
+    for entries in pathsigs.values():
+        if not any(p.ext == SSTATE_EXTENSION for p in entries):
+            remove.extend(entries)
+    return remove
+def parse_arguments():
+    """Parse and validate the command line.
+
+    Returns the argparse namespace. Prints the usage and exits with status 1
+    when no cache directory is known or no action was requested; reports a
+    parser error when --remove-duplicated and --stamps-dir are combined.
+    """
+    parser = argparse.ArgumentParser(description="sstate cache management utility.")
+    parser.add_argument(
+        "--cache-dir",
+        default=os.environ.get("SSTATE_CACHE_DIR"),
+        help="""Specify sstate cache directory, will use the environment
+            variable SSTATE_CACHE_DIR if it is not specified.""",
+    )
+    # parser.add_argument(
+    #     "--extra-archs",
+    #     help="""Specify list of architectures which should be tested, this list
+    #         will be extended with native arch, allarch and empty arch. The
+    #         script won't be trying to generate list of available archs from
+    #         AVAILTUNES in tune files.""",
+    # )
+    # parser.add_argument(
+    #     "--extra-layer",
+    #     help="""Specify the layer which will be used for searching the archs,
+    #         it will search the meta and meta-* layers in the top dir by
+    #         default, and will search meta, meta-*, <layer1>, <layer2>,
+    #         ...<layern> when specified. Use "," as the separator.
+    #
+    #         This is useless for --stamps-dir or when --extra-archs is used.""",
+    # )
+    parser.add_argument(
+        "-d",
+        "--remove-duplicated",
+        action="store_true",
+        help="""Remove the duplicated sstate cache files of one package, only
+            the newest one will be kept. The duplicated sstate cache files
+            of one package must have the same arch, which means sstate cache
+            files with multiple archs are not considered duplicate.
+            Conflicts with --stamps-dir.""",
+    )
+    parser.add_argument(
+        "--remove-orphans",
+        action="store_true",
+        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
+            where there is no {SSTATE_EXTENSION} file but there are associated
+            tracking files.""",
+    )
+    parser.add_argument(
+        "--stamps-dir",
+        action="append",
+        help="""Specify the build directory's stamps directories, the sstate
+            cache file which IS USED by these build directories will be KEPT,
+            other sstate cache files in cache-dir will be removed. Can be
+            specified multiple times for several directories.
+            Conflicts with --remove-duplicated.""",
+    )
+    parser.add_argument(
+        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
+    )
+    # parser.add_argument(
+    #     "-L",
+    #     "--follow-symlink",
+    #     action="store_true",
+    #     help="Remove both the symbol link and the destination file, default: no.",
+    # )
+    parser.add_argument(
+        "-y",
+        "--yes",
+        action="store_true",
+        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
+            and run non-interactively.""",
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Explain what is being done."
+    )
+    parser.add_argument(
+        "-D",
+        "--debug",
+        action="count",
+        default=0,
+        help="Show debug info, repeat for more debug info.",
+    )
+    args = parser.parse_args()
+    if args.cache_dir is None or (
+        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
+    ):
+        parser.print_usage()
+        sys.exit(1)
+    # The help text documents these two options as conflicting; reject the
+    # combination instead of silently ignoring --stamps-dir.
+    if args.remove_duplicated and args.stamps_dir:
+        parser.error("--remove-duplicated conflicts with --stamps-dir")
+    return args
+def main():
+    """Entry point: select the files to remove, ask for confirmation, delete.
+
+    --remove-duplicated takes precedence over --stamps-dir; the result of
+    --remove-orphans is merged into whichever selection was made. Unless
+    --yes is given the user must answer "y" or "Y" before anything is removed.
+    """
+    args = parse_arguments()
+    paths = collect_sstate_paths(args)
+    if args.remove_duplicated:
+        remove = remove_duplicated(args, paths)
+    elif args.stamps_dir:
+        remove = remove_by_stamps(args, paths)
+    else:
+        remove = []
+    if args.remove_orphans:
+        # Union as sets so an entry selected twice is only removed once.
+        remove = set(remove) | set(remove_orphans(args, paths))
+    if args.debug >= 1:
+        print("\n".join([str(p.path) for p in remove]))
+    print(f"{len(remove)} out of {len(paths)} files will be removed!")
+    if args.yes:
+        confirm = True
+    else:
+        print("Do you want to continue (y/n)?")
+        try:
+            confirm = input() in ("y", "Y")
+        except EOFError:
+            # No answer available (stdin closed): treat it as "no".
+            confirm = False
+    if confirm:
+        # TODO: parallelise remove
+        for p in remove:
+            # A file that has already vanished since the scan is not an error.
+            p.path.unlink(missing_ok=True)
+if __name__ == "__main__":
+    main()