"""Work out which old releases can be expunged from releases.nixos.org.

Releases are the main garbage collector roots for cache.nixos.org, so
pruning them is the first step in doing garbage collection of the cache.
The releases themselves also take up a non-trivial amount of storage
(~32 TiB at the time this script was written).

This script does not delete or move anything.  It reads a listing of the
releases bucket and then, per parent directory (like nixpkgs/23.11-darwin/):

* keeps every release whose major release (e.g. "23.11") is at least
  ``minimum_release_to_keep``;
* for each older major release, keeps only the newest release;
* writes the keys of all files belonging to the remaining releases to
  stdout, one per line.

Progress messages and statistics are written to stderr, so stdout can be
redirected to a file or piped into whatever performs the actual removal.

To run:

    $ aws s3 ls --recursive s3://nix-releases > bucket-contents
    $ python3 ./expunge-releases.py

A different listing file may be given as the first command-line argument.
"""

from datetime import datetime
import os
import re
import sys

release_dirs = ["nixos", "nixpkgs"]

# Raw strings so that "\d" and "\." reach the regex engine untouched; the dot
# between the two digit pairs is escaped so that only "NN.NN" is accepted.
name_to_release = re.compile(r'(nixos|nixpkgs)-(darwin-)?(\d\d\.\d\d)')
version_split = re.compile(r'-|\.|alpha|beta|pre')
is_prerelease = re.compile(r'alpha|beta|pre')

# Major releases that compare >= this value are kept in their entirety.
minimum_release_to_keep = "23.05"

# One entry of the bucket listing: a 19-character timestamp, the object size
# (right-aligned, so preceded by a variable amount of whitespace), a single
# space, and the key.  Matching the fields instead of slicing fixed columns
# keeps sizes wider than the padded column from corrupting size and key.
listing_line = re.compile(r'(\d{4}-\d\d-\d\d \d\d:\d\d:\d\d)\s+(\d+) (.*)')


def log(*args, **kwargs):
    """Print a diagnostic message to stderr (stdout is reserved for keys)."""
    print(*args, file=sys.stderr, **kwargs)


class FileEntry:
    """A single object from the bucket listing."""

    def __init__(self, date, size, key):
        self.date = date  # datetime of the listing entry, or None if not parsed
        self.size = size  # object size in bytes (int)
        self.key = key    # full object key within the bucket

    def __repr__(self):
        return "FileEntry({!r}, {!r}, {!r})".format(self.date, self.size, self.key)


def parse_listing(lines, parse_dates=False):
    """Parse bucket listing lines into a list of FileEntry objects.

    lines:       iterable of text lines (trailing newlines allowed).
    parse_dates: when true, parse the timestamp into a datetime; otherwise
                 the date is left as None because the script does not use it.

    Blank lines are ignored.  Raises ValueError, naming the offending line,
    if a non-blank line does not look like a listing entry.
    """
    entries = []
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        m = listing_line.match(line)
        if m is None:
            raise ValueError(
                "line {}: cannot parse bucket listing entry: {!r}".format(lineno, line))
        date = datetime.strptime(m[1], "%Y-%m-%d %H:%M:%S") if parse_dates else None
        # "." does not match "\n", so the key comes back without the newline.
        entries.append(FileEntry(date, int(m[2]), m[3]))
    return entries


def parse_file(filename, parse_dates=False):
    """Read the bucket listing stored in `filename`; see parse_listing()."""
    with open(filename, 'r') as file:
        return parse_listing(file, parse_dates=parse_dates)


class Release:
    """A release directory together with the files found directly in it."""

    def __init__(self, name, date):
        self.name = name  # key prefix (directory) of the release
        self.date = date  # date of the release's src-url entry (may be None)
        self.files = []   # FileEntry objects whose directory is `name`

    def major_release(self):
        """Return the major release (e.g. "23.11") or None if unrecognised."""
        basename = os.path.basename(self.name)
        m = name_to_release.match(basename)
        return m[3] if m else None

    def sort_key(self):
        """Return a key that orders releases of one group from old to new."""
        # Prereleases get sorted before non-prereleases.
        # Pad integer components to compare them numerically.
        name = os.path.basename(self.name)
        return [not is_prerelease.search(name)] + [maybe_pad_int(s) for s in version_split.split(name)]


def maybe_pad_int(s):
    """Zero-pad all-digit strings so that string order equals numeric order."""
    return s.zfill(10) if s.isdigit() else s


def gather_releases(file_entries):
    """Return a dict mapping release directory -> Release.

    A directory is a release if it contains a file named exactly "src-url".
    Every entry located directly in a release directory is attached to that
    release's `files` list (entries in subdirectories are not).
    """
    releases = dict()

    # Gather all releases. Compare the basename rather than the key suffix so
    # that e.g. "x/not-src-url" does not turn "x" into a release.
    for entry in file_entries:
        if os.path.basename(entry.key) == "src-url":
            name = os.path.dirname(entry.key)
            releases[name] = Release(name, entry.date)

    # Get the files for each release.
    for entry in file_entries:
        rel = releases.get(os.path.dirname(entry.key))
        if rel is not None:
            rel.files.append(entry)

    return releases


def main(argv=None):
    """Print the keys of all files that belong to expungeable releases.

    argv: command-line arguments without the program name (defaults to
          sys.argv[1:]).  The optional first argument is the bucket listing
          to read; it defaults to "bucket-contents".
    """
    args = sys.argv[1:] if argv is None else argv
    filename = args[0] if args else "bucket-contents"

    releases = gather_releases(parse_file(filename))
    log("Found {} releases.".format(len(releases)))

    # Group the releases by parent directory (like nixpkgs/23.11-darwin/).
    release_parents = dict()
    for rel in releases.values():
        release_parents.setdefault(os.path.dirname(rel.name), []).append(rel)

    expunged_releases = 0
    expunged_files = 0
    expunged_size = 0

    # For each parent directory, group the releases by major release
    # (e.g. 23.11).
    for release_parent_name, parent_rels in release_parents.items():
        major_releases = dict()
        for rel in parent_rels:
            major_release = rel.major_release()
            if major_release is None:
                log("Skipping release '{}'.".format(rel.name))
            else:
                major_releases.setdefault(major_release, []).append(rel)

        for major_release_name, group in major_releases.items():
            # Plain string comparison is enough here: name_to_release only
            # yields values of the fixed shape "NN.NN".
            if major_release_name >= minimum_release_to_keep:
                log("Keeping all {} releases in major release group '{}/{}'.".format(len(group), release_parent_name, major_release_name))
                continue

            # Sort the releases by release name (lexicographically after
            # splitting into version components), keep the newest one.
            sorted_rels = sorted(group, key=Release.sort_key)
            log("Major release group '{}/{}' ({} releases).".format(release_parent_name, major_release_name, len(group)))

            # Keep the most recent release in the group.
            most_recent_rel = sorted_rels.pop()

            # Expunge the other releases.
            for rel in sorted_rels:
                rel_size = sum(f.size for f in rel.files)
                log(" Expunging release '{}' ({} files, {:.2f} MiB).".format(rel.name, len(rel.files), rel_size / 1024**2))
                for f in rel.files:
                    print(f.key)
                expunged_releases += 1
                expunged_files += len(rel.files)
                expunged_size += rel_size

            log(" Keeping release '{}'.".format(most_recent_rel.name))

    log("Expunged {} releases, {} files, {:.2f} GiB.".format(expunged_releases, expunged_files, expunged_size / 1024**3))


if __name__ == "__main__":
    main()