author     Nat Goodspeed <nat@lindenlab.com>   2023-08-29 17:55:53 -0400
committer  Nat Goodspeed <nat@lindenlab.com>   2023-08-29 17:55:53 -0400
commit     7779cebdcd1aecbce92c660072c00064185a95f1 (patch)
tree       2a1702ef77ee8c179cfc566cc1c547a61df1ca6e /.github/workflows/flatten_files.py
parent     25efba151f98308a0e2d9af52a76173be3f8aa04 (diff)
SL-18837: Introduce flatten_files.py and use to post release assets
Diffstat (limited to '.github/workflows/flatten_files.py')
-rwxr-xr-x  .github/workflows/flatten_files.py  180
1 file changed, 180 insertions, 0 deletions
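
Before the diff itself, a sketch of intended use in Python. This is an illustration, not part of the commit, and the directory names are invented: after downloading each platform's build artifacts into one tree, a single call populates a flat directory suitable for posting as release assets.

    # Hypothetical driver (names invented): flatten downloaded artifact
    # trees under 'artifacts/' into a flat 'assets/' directory.
    from flatten_files import flatten, Error

    try:
        flatten('assets', 'artifacts', dry_run=True)   # preview the moves
        flatten('assets', 'artifacts')                 # then really move
    except Error as err:
        print(err)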
diff --git a/.github/workflows/flatten_files.py b/.github/workflows/flatten_files.py
new file mode 100755
index 0000000000..542fa0206b
--- /dev/null
+++ b/.github/workflows/flatten_files.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""\
+@file   flatten_files.py
+@author Nat Goodspeed
+@date   2023-08-18
+@brief  From an input directory tree, populate a single flat output directory.
+
+$LicenseInfo:firstyear=2023&license=viewerlgpl$
+Copyright (c) 2023, Linden Research, Inc.
+$/LicenseInfo$
+"""
+
+DESCRIPTION = """\
+From an input directory tree, populate a single flat output directory.
+
+For files with colliding names, rename them to unambiguous names derived from
+their relative pathname within the input tree.
+
+This is useful when downloading GitHub build artifacts from multiple platforms
+to post them all as release assets without collisions.
+"""
+
+from collections import defaultdict
+from contextlib import suppress
+import filecmp
+import os
+from pathlib import Path
+import sys
+
+class Error(Exception):
+    pass
+
+def flatten(output, input='.', dry_run=False):
+    try:
+        in_stat = os.stat(input)
+    except FileNotFoundError as err:
+        raise Error(f'{input} does not exist') from err
+
+    try:
+        out_stat = os.stat(output)
+    except FileNotFoundError:
+        # output doesn't yet exist - at this point that's okay
+        out_stat = None
+
+    # use samestat() to avoid being fooled by different ways of expressing the
+    # same path
+    if out_stat and os.path.samestat(out_stat, in_stat):
+        # output directory same as input: in this case, don't prune output
+        # directory from input tree walk because we'd prune everything
+        out_stat = None
+    elif out_stat:
+        # distinct existing output directory (potentially containing, or
+        # contained by, the input directory)
+        outfiles = [f for f in Path(output).rglob('*') if f.is_file()]
+        if outfiles:
+            print(f'Warning: {output} already contains {len(outfiles)} files:',
+                  file=sys.stderr)
+            for f in sorted(outfiles):
+                print(' ', f.relative_to(output), file=sys.stderr)
+
+    # Use os.walk() instead of Path.rglob() so we can prune unwanted
+    # directories.
+    infiles = []
+    for parent, dirs, files in os.walk(input):
+        infiles.extend(Path(parent, f) for f in files)
+        # Prune directories: because we must modify the dirs list in-place,
+        # and because we're using indexes, traverse backwards so deletion
+        # won't affect subsequent iterations. Yes we really must subtract 1
+        # that many times.
+        for idx in range(len(dirs)-1, -1, -1):
+            if dirs[idx].startswith('.'):
+                # skip dot-named directories
+                print(f'ignoring {dirs[idx]}', file=sys.stderr)
+                del dirs[idx]
+            elif out_stat and os.path.samestat(os.stat(os.path.join(parent, dirs[idx])), out_stat):
+                # output directory lives under input directory: ignore any
+                # previous contents
+                print(f'ignoring nested output directory {os.path.join(parent, dirs[idx])}',
+                      file=sys.stderr)
+                del dirs[idx]
+
+    # Now that we've traversed the input tree, create the output directory if
+    # needed.
+    output = Path(output)
+    output.mkdir(parents=True, exist_ok=True)
+
+    # group files by basename to identify collisions
+    basenames = defaultdict(list)
+    for f in infiles:
+        basenames[f.name].append(f)
+
+    # output names: populate it right away with unique basenames
+    outnames = { name: files[0] for name, files in basenames.items()
+                 if len(files) == 1 }
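
As an aside, not part of the diff: the grouping step above, run standalone with invented paths, shows how collisions surface.

    # Sketch of the basename grouping (paths invented for illustration):
    from collections import defaultdict
    from pathlib import Path

    infiles = [Path('Windows/newview/viewer_version.txt'),
               Path('macOS/newview/viewer_version.txt'),
               Path('Windows/setup.exe')]
    basenames = defaultdict(list)
    for f in infiles:
        basenames[f.name].append(f)
    # basenames['viewer_version.txt'] now holds two paths (a collision);
    # basenames['setup.exe'] holds one and keeps its simple name.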
+
+    # now focus on the collisions
+    for name, files in basenames.items():
+        if len(files) <= 1:
+            continue
+
+        # Special case: are these colliding files equal? e.g. viewer_version.txt
+        # Pass shallow=False so we actually read the files in question. Even
+        # if they're identical, they've been downloaded from different
+        # artifacts and have different timestamps (which would fool the
+        # default shallow=True). This could be time-consuming if we were
+        # comparing two of our very large files, but (a) our very large files
+        # already have distinct names and so don't reach this call and (b) if
+        # we somehow do wind up in that situation, it would be even more
+        # important to post only a single copy.
+        if all(filecmp.cmp(files[0], f, shallow=False) for f in files[1:]):
+            # pick only one of them and use its simple basename
+            outnames[name] = files[0]
+            continue
+
+        # Because of our intended use for GitHub Actions build artifacts, we
+        # assume the top-level artifact names are descriptive. We'd still like
+        # to eliminate mid-level directory names that don't help disambiguate,
+        # so for instance, given:
+        #   Windows metadata/newview/viewer_version.txt
+        #   macOS   metadata/newview/viewer_version.txt
+        # we see no reason to retain the 'newview' pathname component. Try
+        # longer and longer prefixes of the pathname parents. (But don't
+        # forget to trim off the original input directory pathname.)
+        filepairs = [(f, f.relative_to(input)) for f in files]
+        partslen = max(len(rel.parts) for f, rel in filepairs)
+        # skip the basename itself, we'll append that explicitly
+        for prefixlen in range(partslen - 1):
+            # Consider these relative names (shouldn't occur, but...):
+            #   parent/autobuild-package.xml
+            #   parent/newview/autobuild-package.xml
+            # Unless these are in fact identical, they'll collide, meaning
+            # we'll see them here. But beware their unequal numbers of parts.
+            # partslen will be 3, so prefixlen will be 0, 1 -- but unless we
+            # constrain it with min(), for prefixlen == 1 we'd construct:
+            #   ('parent', 'autobuild-package.xml', 'autobuild-package.xml')
+            #   ('parent', 'newview', 'autobuild-package.xml')
+            # whereas of course the correct answer would be:
+            #   ('parent', 'autobuild-package.xml')
+            #   ('parent', 'newview', 'autobuild-package.xml')
+            # Since we already know the basename is identical for every f in
+            # files, though, we can omit it from our uniqueness testing.
+            trynames = { rel.parts[:min(prefixlen+1, len(rel.parts)-1)]: f
+                         for f, rel in filepairs }
+            if len(trynames) == len(files):
+                # Found a prefix without collisions -- note that we're
+                # guaranteed to get here eventually since the full paths are
+                # distinct in the filesystem, we just want to try to shorten.
+                # Path.parts is specifically documented to be a tuple. Join
+                # the key tuple with some delimiter acceptable to the
+                # filesystem.
+                outnames.update(('-'.join(nametuple + (name,)), f)
+                                for nametuple, f in trynames.items())
+                # stop considering longer prefixlens
+                break
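
Again as an aside: tracing the prefix search above on the example paths from the comments (with the input directory already trimmed), prefixlen == 0 already yields distinct one-part prefixes, so the loop stops immediately.

    # Trace of the prefix search (paths taken from the comments above):
    from pathlib import Path

    name = 'viewer_version.txt'
    rels = [Path('Windows/metadata/newview/viewer_version.txt'),
            Path('macOS/metadata/newview/viewer_version.txt')]
    partslen = max(len(rel.parts) for rel in rels)
    for prefixlen in range(partslen - 1):
        trynames = { rel.parts[:min(prefixlen+1, len(rel.parts)-1)]: rel
                     for rel in rels }
        if len(trynames) == len(rels):
            print(['-'.join(t + (name,)) for t in trynames])
            # prints ['Windows-viewer_version.txt', 'macOS-viewer_version.txt']
            break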
+
+    # at this point outnames should have distinct keys -- move to the output
+    # directory
+    for name, f in outnames.items():
+        newname = output / name
+        if (not dry_run) and newname != f:
+            newname = f.rename(newname)
+        print(f'{f} => {newname}')
+
+def main(*raw_args):
+    from argparse import ArgumentParser
+    parser = ArgumentParser(description=DESCRIPTION)
+    parser.add_argument('-n', '--dry-run', action='store_true', default=False,
+                        help="""show what would happen without moving files""")
+    parser.add_argument('output', metavar='OUTDIR',
+                        help="""populate OUTDIR with (possibly renamed) files""")
+    parser.add_argument('input', metavar='INDIR', nargs='?', default='.',
+                        help="""recursively read files under INDIR tree""")
+
+    args = parser.parse_args(raw_args)
+    flatten(args.output, args.input, dry_run=args.dry_run)
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main(*sys.argv[1:]))
+    except Error as err:
+        sys.exit(str(err))
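
One closing note, again an illustration rather than part of the commit: because main() takes its argument vector explicitly, the CLI can also be exercised from Python, which is handy for testing.

    # Driving the CLI entry point directly (paths invented):
    import flatten_files

    # equivalent to: flatten_files.py --dry-run assets artifacts
    flatten_files.main('--dry-run', 'assets', 'artifacts')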