diff mbox series

sstate-cache-cleaner.py: Add a script for sstate cache cleaning

Message ID 20221121111102.5556-1-tomasz.dziendzielski@gmail.com
State New
Headers show
Series sstate-cache-cleaner.py: Add a script for sstate cache cleaning | expand

Commit Message

Tomasz Dziendzielski Nov. 21, 2022, 11:11 a.m. UTC
From: Mikolaj Lasota <mikolaj.lasota@protonmail.com>

The bash script used at the moment takes too much time to calculate obsolete
sstate cache files. Let's try to rewrite necessary logic in python and
store intermediate data in memory rather than temporary files.

Signed-off-by: Mikolaj Lasota <mikolaj.lasota@protonmail.com>
Signed-off-by: Tomasz Dziendzielski <tomasz.dziendzielski@gmail.com>
---
 scripts/sstate-cache-cleaner.py | 166 ++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100755 scripts/sstate-cache-cleaner.py
diff mbox series

Patch

diff --git a/scripts/sstate-cache-cleaner.py b/scripts/sstate-cache-cleaner.py
new file mode 100755
index 0000000000..f01db35775
--- /dev/null
+++ b/scripts/sstate-cache-cleaner.py
@@ -0,0 +1,166 @@ 
+#!/usr/bin/env python3
+
+"""
+This script is a python rewrite of poky based scripts/sstate-cache-management.sh
+It has a subset of original script features - namely the ability to filter cache files by stamp files references.
+The output is a list of unreferenced sstate-cache files - which are obsolete and can be removed.
+
+To test the script against the original one (shell) one might create a small test environment:
+ - create a local sstate-cache directory
+ - run two or more separate builds (different hashes/machines) using above dir (SSTATE_DIR)
+ - run original shell script using stamp dir from one of the above builds and the common cache dir
+ - run this script with the same arguments (same stamp & cache dirs)
+"""
+
+import argparse
+import fnmatch
+import logging
+import os
+import re
+import time
+from functools import reduce
+
# Logging: everything goes to the log file; INFO and above is mirrored on the console.
formatter = logging.Formatter('%(asctime)s - %(funcName)s - %(levelname)s - %(message)s')
logger = logging.getLogger('sstate-cache-cleaner')
logger.setLevel(logging.DEBUG)

fh = logging.FileHandler('sstate-cache-cleaner.log', 'w')
ch = logging.StreamHandler()
for _handler, _level in ((fh, logging.DEBUG), (ch, logging.INFO)):
    _handler.setLevel(_level)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)

# Script start time snapshot; cache files younger than one day are never touched.
TIME = time.time()
ONE_DAY_IN_SECONDS = 86400
+
def collect_sstate_cache_files(cache_dir):
    """Collect all sstate-cache files from cache_dir, grouped by hash.

    Walks cache_dir recursively and parses every matching cache file name
    into its (hash, task) components.  Files younger than one day are not
    listed as removal candidates (they may belong to an ongoing build), but
    their task names are still recorded so the matching stamps get scanned.

    Returns a tuple (cache_files, sstate_tasks): cache_files maps each hash
    to the list of cache file paths carrying it, sstate_tasks is the set of
    task names seen in the cache.
    """

    logger.info('Collecting sstate-cache files...')

    sstate_tasks = set()
    cache_files = dict()
    # Group 1: the hash (text after the last ':' up to the first '_'),
    # group 2: the task name (up to the '.tgz' suffix).
    cache_file_regex = re.compile(r'sstate.*:([^_]*)_(.*)\.tgz.*')
    # Loop-invariant age threshold, hoisted out of the walk.
    cutoff = TIME - ONE_DAY_IN_SECONDS
    for root, _dirs, files in os.walk(cache_dir):
        for filename in files:
            # The regex itself requires the 'sstate' prefix, so the former
            # fnmatch('sstate*') pre-filter was redundant and is dropped.
            match = cache_file_regex.match(filename)
            if not match:
                continue
            _hash = match.group(1)
            _task = match.group(2)
            sstate_tasks.add(_task)
            f = os.path.join(root, filename)
            try:
                # Only files older than one day are removal candidates.
                if os.stat(f).st_ctime < cutoff:
                    cache_files.setdefault(_hash, []).append(f)
            except FileNotFoundError as err:
                # A file may vanish between walk() and stat(); skip it.
                logger.error(err)

    num_of_files = sum(len(paths) for paths in cache_files.values())
    num_of_hashes = len(cache_files)
    logger.info(f'Found {num_of_files} sstate files ({num_of_hashes} hashes)')
    return cache_files, sstate_tasks
+
def collect_stamps(stamps_dirs_list, tasks):
    """Collect signature hashes from stamp files for the given tasks.

    Two stamp flavours are recognised per task:
      * ``*.do_<task>_setscene.<hash>...`` -- setscene stamps,
      * ``*.do_<task>[.sigdata].<hash>...`` -- regular stamps.

    Returns the set of all hashes referenced by any stamp file.
    """

    logger.info('Collecting stamps...')

    # Compile every pattern exactly once instead of re-building the glob and
    # regex for each (file, task) pair inside the walk.  fnmatch patterns are
    # translated to regexes; on POSIX this matches fnmatch.fnmatch() exactly
    # (os.path.normcase is the identity there).
    matchers = []
    for task in tasks:
        matchers.append((
            re.compile(fnmatch.translate(f'*.do_{task}_setscene.*')),
            re.compile(rf'.*\.do_{task}_setscene\.([^\.]*).*'),
            re.compile(fnmatch.translate(f'*.do_{task}.*')),
            re.compile(rf'.*do_{task}(\.sigdata)?\.([^\.]*).*'),
        ))

    stamps = set()
    for stamps_dir in stamps_dirs_list:
        logger.debug(f'Looking for stamps in {stamps_dir}')
        for root, _dirs, files in os.walk(stamps_dir):
            for filename in files:
                for setscene_glob, setscene_re, plain_glob, plain_re in matchers:
                    if setscene_glob.match(filename):
                        match = setscene_re.match(filename)
                        if match:
                            stamps.add(match.group(1))
                    elif plain_glob.match(filename):
                        match = plain_re.match(filename)
                        if match:
                            stamps.add(match.group(2))
                    # NOTE: the original loop body ended with a no-op
                    # 'continue'; it has been removed.

    logger.info(f'Found {len(stamps)} stamps')
    return stamps
+
def compute_obsolete_sstate_cache_files(stamps, cache):
    """Figure out which cache files are obsolete.

    Every hash referenced by a stamp file is still needed and is removed
    from the ``cache`` dict (mutated in place).  Whatever remains maps
    obsolete hashes to the cache files that can safely be deleted; the
    (mutated) dict is also returned for convenience.
    """

    logger.info('Filtering sstate-cache list for unreferenced (obsolete) files...')

    num_stamps = len(stamps) - 1
    # Guard the progress division: with exactly one stamp num_stamps is 0
    # and the original code raised ZeroDivisionError.
    denominator = max(num_stamps, 1)
    progress = -1
    for i, stamp in enumerate(stamps):
        _progress = int(i / denominator * 100)
        # Log at most once per 5% step to keep the debug log readable.
        if _progress % 5 == 0 and _progress > progress:
            progress = _progress
            logger.debug(f'[{progress:3d}%] Cleaning stamp {i}/{num_stamps}')
        if stamp in cache:
            del cache[stamp]

    num_of_files = sum(len(paths) for paths in cache.values())
    logger.info(f'Found {num_of_files} sstate files to be removed')
    return cache
+
def parse_arguments():
    """Build the command line parser and return the parsed arguments.

    All three options are required: the sstate-cache directory, one or more
    stamps directories, and the output file for the obsolete-files list.
    """

    parser = argparse.ArgumentParser(
                        description='Sstate cache cleanup script. \
                                     Cache files which are not referenced by stamp files will be listed for removal.',
                        epilog='This is a python re-write of poky provided sstate-cache-management.sh script. \
                                Only stamp based cleaning is implemented.')
    # Declare the options data-driven: (flags, keyword arguments) pairs.
    for flags, kwargs in (
            (('--cache-dir',),
             dict(required=True, help='Specify sstate-cache directory')),
            (('--stamps-dir',),
             dict(required=True, nargs='+', help='Specify stamps directories')),
            (('--output-file', '-f'),
             dict(required=True, help='Specify a file for script output - a list of obsolete sstate-cache files.')),
    ):
        parser.add_argument(*flags, **kwargs)

    logger.debug('Parsing arguments...')
    return parser.parse_args()
+
def main():
    """Entry point: validate paths, collect data, write the obsolete list.

    Raises ValueError when a given stamps or cache directory does not exist.
    """
    args = parse_arguments()

    # Normalize every stamps directory to an absolute path and validate it.
    stamps_dirs_list = args.stamps_dir
    for i, path in enumerate(stamps_dirs_list):
        abs_path = os.path.abspath(path)
        if not os.path.isdir(abs_path):
            raise ValueError(f'Stamps directory doesn\'t exist: {abs_path} !')
        stamps_dirs_list[i] = abs_path

    cache_dir = os.path.abspath(args.cache_dir)
    if not os.path.isdir(cache_dir):
        raise ValueError(f'Cache directory doesn\'t exist: {cache_dir} !')

    output_file_path = os.path.abspath(args.output_file)

    cache, tasks = collect_sstate_cache_files(cache_dir)
    stamps = collect_stamps(stamps_dirs_list, tasks)

    obsolete_sstate = compute_obsolete_sstate_cache_files(stamps, cache)
    # Flatten the {hash: [files]} mapping into one flat list of file paths.
    obsolete_sstate_files = [item for sublist in obsolete_sstate.values() for item in sublist]

    output_dir = os.path.dirname(output_file_path)
    if not os.path.isdir(output_dir):
        # Fix: the warning previously printed the output *file* path while
        # claiming a directory would be created; report the directory itself.
        # exist_ok=True tolerates the directory appearing concurrently.
        logger.warning(f'Output directory doesn\'t exist and will be created: {output_dir}')
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w') as out:
        out.write('\n'.join(obsolete_sstate_files))

    logger.info(f'List of obsolete sstate-cache files saved: {output_file_path}')

if __name__ == "__main__":
    main()