From 39eff4e53aaa12dc041f6e44c50fa5af56a752c6 Mon Sep 17 00:00:00 2001 From: Reid 'arrdem' McKenzie Date: Sun, 5 Dec 2021 11:35:19 -0700 Subject: [PATCH] Spinner while copying; handle dirt files better --- projects/archiver/BUILD | 1 + projects/archiver/org_photos.py | 200 ++++++++++++++++++++------------ 2 files changed, 128 insertions(+), 73 deletions(-) diff --git a/projects/archiver/BUILD b/projects/archiver/BUILD index d60c6a2..667dc8e 100644 --- a/projects/archiver/BUILD +++ b/projects/archiver/BUILD @@ -14,5 +14,6 @@ zapp_binary( ], deps = [ py_requirement("ExifRead"), + py_requirement("yaspin"), ] ) diff --git a/projects/archiver/org_photos.py b/projects/archiver/org_photos.py index 0d372bd..95aeabd 100644 --- a/projects/archiver/org_photos.py +++ b/projects/archiver/org_photos.py @@ -29,6 +29,14 @@ from .util import * # FIXME: use piexif, which supports writeback not exifread. import exifread +from yaspin import Spinner, yaspin + + +_print = print + + +def print(*strs, **kwargs): + _print("\r", *strs, **kwargs) parser = argparse.ArgumentParser() @@ -38,6 +46,17 @@ parser.add_argument("destructive", action="store_true", default=False) MODIFIED_ISO_DATE = "%Y:%m:%dT%H:%M:%SF%f" +SPINNER = Spinner(["|", "/", "-", "\\"], 200) +KNOWN_IMG_TYPES = { + ".jpg": ".jpeg", + ".jpeg": ".jpeg", + ".png": ".png", + ".mov": ".mov", + ".gif": ".gif", + ".mp4": ".mp4", + ".m4a": ".m4a", + ".oga": ".oga", # How the hell do I have ogg files kicking around +} def exif_tags(p: Path) -> object: @@ -60,14 +79,29 @@ def safe_strptime(date, format): try: return datetime.strptime(date, format) except ValueError: - return None + pass -def date_from_name(p: Path): - """Try to munge a datestamp out of a path.""" +def safe_ymdhmms(date): + fmt = ( + r"(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})" + r" " + r"(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?P<millisecond>\d{3})" + ) + m = re.match(fmt, date) + if m: + return datetime( + year=int(m.group("year")), + month=int(m.group("month")), + day=int(m.group("day")), 
hour=int(m.group("hour")), + minute=int(m.group("minute")), + second=int(m.group("second")), + microsecond=int(m.group("millisecond")) * 1000, + ) - fname = ".".join(p.name.split(".")[:-1]) +def date_from_name(fname: str): # Discard common prefixes fname = fname.replace("IMG_", "") fname = fname.replace("PXL_", "") @@ -93,54 +127,59 @@ def date_from_name(p: Path): # Try to guess the date # File date formats: - for fmt in [ + for unfmt in [ # Our date format - MODIFIED_ISO_DATE, + lambda d: safe_strptime(d, MODIFIED_ISO_DATE), # A bug # 2014:08:21T19:4640F1408672000 # 2015:12:14T23:0933F1450159773 - "%Y:%m:%dT%H:%M%SF%f", + lambda d: safe_strptime(d, "%Y:%m:%dT%H:%M%SF%f"), # 2020-12-21 17.15.09.0 - "%Y-%m-%d %H.%M.%S.%f", + lambda d: safe_strptime(d, "%Y-%m-%d %H.%M.%S.%f"), # 2020-12-21 17.15.09 - "%Y-%m-%d %H.%M.%S", + lambda d: safe_strptime(d, "%Y-%m-%d %H.%M.%S"), # 2019-02-09 12.45.32-6 # 2019-01-13 13.43.45-16 - "%Y-%m-%d %H.%M.%S-%f", + lambda d: safe_strptime(d, "%Y-%m-%d %H.%M.%S-%f"), # Note the _1 or such may not be millis, but we assume it is. # 20171113_130826_1 # 20171113 130826 1 - "%Y%m%d %H%M%S %f", + lambda d: safe_strptime(d, "%Y%m%d %H%M%S %f"), # 20180404_114639 # 20180404 114639 - "%Y%m%d %H%M%S", + lambda d: safe_strptime(d, "%Y%m%d %H%M%S"), # 2017-11-05_15:15:55 # 2017-11-05 15:15:55 - "%Y-%m-%d %H:%M:%S", + lambda d: safe_strptime(d, "%Y-%m-%d %H:%M:%S"), + lambda d: safe_strptime(d, "%Y%m%d %h%m%s%f"), + # HACK: + # Python doesn't support %s as milliseconds; these don't quite work. + # So use a custom matcher. 
+ # 20210526 002327780 # 20210417_220753284 # 20210417 220753284 # 20210304 204755545 - "%Y%m%d %h%m%s%f", + # 20211111 224304117 + safe_ymdhmms, ]: - try: - return datetime.strptime(fname, fmt) - except ValueError: - continue - else: + val = unfmt(fname) + if val is not None: + return val + + +def date_from_path(p: Path): + """Try to munge a datestamp out of a path.""" + + fname = ".".join(p.name.split(".")[:-1]) + + date = date_from_name(fname) + if not date: print(f"Warning: Unable to infer datetime from {fname!r}", file=sys.stderr) + return date def normalize_ext(p: Path): - renaming = { - ".jpg": ".jpeg", - ".jpeg": ".jpeg", - ".png": ".png", - ".mov": ".mov", - ".gif": ".gif", - ".mp4": ".mp4", - ".m4a": ".m4a", - ".oga": ".oga", # How the hell do I have ogg files kicking around - } + renaming = KNOWN_IMG_TYPES exts = [e.lower() for e in p.suffixes] # Guess an ext out of potentially many, allowing only for folding of effective dupes exts = set(renaming[e] for e in exts if e in renaming) @@ -264,7 +303,7 @@ def img_info(p: Path) -> ImgInfo: ) if date and (date := safe_strptime(date, "%Y:%m:%d %H:%M:%S")): pass - elif date := date_from_name(p): + elif date := date_from_path(p): dirty |= True else: # The oldest of the mtime and the ctime @@ -285,6 +324,9 @@ def img_info(p: Path) -> ImgInfo: date = date.replace(microsecond=subsec) + if not (2015 <= date.year <= datetime.now().year): + raise ValueError(f"{p}'s inferred date ({date!r}) is beyond the sanity-check range!") + return ImgInfo( p, tags, @@ -310,66 +352,78 @@ def main(): raise OSError() src.rename(target) # Execute the rename - except OSError: # cross-device move - copyfile(src, target) - if opts.destructive: - src.chmod(0o644) - src.unlink() - print(" unlink: ok") + except OSError: # cross-device move + with yaspin(SPINNER): + copyfile(src, target) + + if opts.destructive: + src.unlink() + print(" unlink: ok") print("---") sequence_name = None sequence = 0 - for src in list(opts.src_dir.glob("**/*")): + 
for src in opts.src_dir.glob("**/*"): + print(f"{src}:") + ext = "." + src.name.lower().split(".")[-1] + print(f" msg: ext inferred as {ext}") + if src.is_dir(): continue - elif src.name.startswith("."): + elif ext in ["thm", "lrv", "ico", "sav"] or src.name.startswith("._"): + if opts.destructive: + src.unlink() continue - print(f"{src}:") - - info = img_info(src) - year_dir = Path(opts.dest_dir / str(info.date.year)) - year_dir.mkdir(exist_ok=True) # Ignore existing and continue - # Figure out a stable file name - stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}" - - # De-conflict using a sequence number added to the sub-seconds field - if sequence_name == stable_name: - sequence += 1 - info = info.incr(sequence) - print(f" warning: de-conflicting filenames with sequence {sequence}") + elif ext in KNOWN_IMG_TYPES: + info = img_info(src) + year_dir = Path(opts.dest_dir / str(info.date.year)) + year_dir.mkdir(exist_ok=True) # Ignore existing and continue + # Figure out a stable file name stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}" - else: - sequence = 0 - sequence_name = stable_name + # De-conflict using a sequence number added to the sub-seconds field + if sequence_name == stable_name: + sequence += 1 + info = info.incr(sequence) + print(f" warning: de-conflicting filenames with sequence {sequence}") + stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}" - try: - ext = normalize_ext(src) - except AssertionError: - continue # Just skip fucked up files - target = Path(year_dir / f"{stable_name}{ext}") + else: + sequence = 0 + sequence_name = stable_name + + try: + ext = normalize_ext(src) + except AssertionError: + continue # Just skip fucked up files + target = Path(year_dir / 
f"{stable_name}{ext}") + + if not target.exists(): + # src & !target => copy + _copy(src, target) + + elif src == target: + # src == target; skip DO NOT DELETE SRC + pass + + elif checksum_path_blocks(src) == checksum_path_blocks(target): + print(f" ok: {target}") + # src != target && id(src) == id(target); delete src + if opts.destructive: + src.unlink() + + else: + # src != target && id(src) != id(target); replace target with src? + print(f" warning: {target} is a content-id collision with a different checksum; skipping") - if not target.exists(): - # src & !target => copy - _copy(src, target) - elif src == target: - # src == target; skip DO NOT DELETE SRC - pass - elif checksum_path_blocks(src) == checksum_path_blocks(target): - print(f" ok: {target}") - # src != target && id(src) == id(target); delete src - if opts.destructive: - src.chmod(0o644) - src.unlink() else: - # src != target && id(src) != id(target); replace target with src? - print(f" warning: {target} is a content-id collision with a different checksum; skipping") + print(f" msg: unknown filetype {ext}") + if __name__ == "__main__": main()