De-conflict sequential source files that map to the same path

This commit is contained in:
Reid 'arrdem' McKenzie 2021-10-04 13:31:00 -06:00
parent 9aae534971
commit b37d999796

View file

@ -17,7 +17,7 @@ Inspired by https://github.com/herval/org_photos/blob/main/org_photos.rb
""" """
import argparse import argparse
from datetime import datetime from datetime import datetime, timedelta
from hashlib import sha256, sha512 from hashlib import sha256, sha512
from pathlib import Path from pathlib import Path
import re import re
@ -34,7 +34,7 @@ import exifread
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("src_dir", type=Path) parser.add_argument("src_dir", type=Path)
parser.add_argument("dest_dir", type=Path) parser.add_argument("dest_dir", type=Path)
parser.add_option("destructive", action="store_true", default=False) parser.add_argument("destructive", action="store_true", default=False)
MODIFIED_ISO_DATE = "%Y:%m:%dT%H:%M:%SF%f" MODIFIED_ISO_DATE = "%Y:%m:%dT%H:%M:%SF%f"
@ -198,6 +198,20 @@ class ImgInfo(t.NamedTuple):
def file_sha512sum(self): def file_sha512sum(self):
return checksum_path(self.file_path, sha512) return checksum_path(self.file_path, sha512)
def incr(self, offset: int) -> "ImgInfo":
return ImgInfo(
self.file_path,
self.tags,
self.camera_make,
self.camera_model,
self.camera_sn,
self.lens_make,
self.lens_model,
self.lens_sn,
self.software,
self.date + timedelta(microseconds=offset),
True,
)
def img_info(p: Path) -> ImgInfo: def img_info(p: Path) -> ImgInfo:
"""Figure out everything we know from the image info.""" """Figure out everything we know from the image info."""
@ -289,7 +303,7 @@ def img_info(p: Path) -> ImgInfo:
def main(): def main():
opts, args = parser.parse_known_args() opts, args = parser.parse_known_args()
def _copy(): def _copy(src, target):
print(f" rename: {target}") print(f" rename: {target}")
try: try:
if not opts.destructive: if not opts.destructive:
@ -302,12 +316,20 @@ def main():
if opts.destructive: if opts.destructive:
src.chmod(0o644) src.chmod(0o644)
src.unlink() src.unlink()
print(" unlink: ok")
print("---") print("---")
sequence_name = None
sequence = 0
for src in list(opts.src_dir.glob("**/*")): for src in list(opts.src_dir.glob("**/*")):
if src.is_dir(): if src.is_dir():
continue continue
elif src.name.startswith("."):
continue
print(f"{src}:") print(f"{src}:")
info = img_info(src) info = img_info(src)
@ -315,6 +337,18 @@ def main():
year_dir.mkdir(exist_ok=True) # Ignore existing and continue year_dir.mkdir(exist_ok=True) # Ignore existing and continue
# Figure out a stable file name # Figure out a stable file name
stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}" stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}"
# De-conflict using a sequence number added to the sub-seconds field
if sequence_name == stable_name:
sequence += 1
info = info.incr(sequence)
print(f" warning: de-conflicting filenames with sequence {sequence}")
stable_name = f"v1_{info.date.strftime(MODIFIED_ISO_DATE)}_{sanitize(info.camera_make)}_{sanitize(info.camera_model)}_{info.device_fingerprint()}"
else:
sequence = 0
sequence_name = stable_name
try: try:
ext = normalize_ext(src) ext = normalize_ext(src)
except AssertionError: except AssertionError:
@ -323,19 +357,19 @@ def main():
if not target.exists(): if not target.exists():
# src & !target => copy # src & !target => copy
_copy() _copy(src, target)
elif src == target: elif src == target:
# src == target; skip DO NOT DELETE SRC # src == target; skip DO NOT DELETE SRC
pass pass
elif checksum_path_blocks(src) == checksum_path_blocks(target): elif checksum_path_blocks(src) == checksum_path_blocks(target):
print(f" ok: {target}")
# src != target && id(src) == id(target); delete src # src != target && id(src) == id(target); delete src
if opts.destructive: if opts.destructive:
src.chmod(0o644) src.chmod(0o644)
src.unlink() src.unlink()
else: else:
# src != target && id(src) != id(target); replace target with src? # src != target && id(src) != id(target); replace target with src?
print(f" warning: {target} is a content-id collision with a different checksum") print(f" warning: {target} is a content-id collision with a different checksum; skipping")
if __name__ == "__main__": if __name__ == "__main__":
main() main()