3 changed files with 70 additions and 59 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -102,4 +102,4 @@ ENV/
 *.swp
 *.swo
-mcdata/
+test/
--- a/main.py
+++ b/main.py
@@ -1,18 +1,18 @@
 import glob
 import os
 import hashlib
 import time
 import traceback
 from multiprocessing import Pool
 import queue
 from PIL import Image, UnidentifiedImageError
 import acoustid
 import chromaprint
 from imagehash import average_hash
 from videohash import VideoHash
 from videohash.exceptions import FFmpegFailedToExtractFrames
 hashes = set()
 audio_hashes = []
 delete = set()
 hash_lookup = {}
 filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@@ -21,11 +21,11 @@ print('ENTER to continue, ctrl+c to cancel.')
 try:
    input()
 except KeyboardInterrupt:
-    print('Cancelled.')
+    print('\nCancelled.')
    os._exit(0)
-print('Sorting file list by smallest size...')
+print('Sorting file list by size...')
-filenames.sort(key=os.path.getsize, reverse=False)
+filenames.sort(key=os.path.getsize, reverse=True)
 print('Deduplicating by md5 hash...')
@@ -39,82 +39,87 @@ for filename in filenames:
        hasher.update(buf)
        digest = hasher.hexdigest()
        if digest in hashes:
            print('Found digest', digest, 'collision for', filename)
            delete.add(filename)
        else:
            hashes.add(digest)
 print('Found', len(delete), 'duplicates by md5 hash.')
 time.sleep(4)
 print('Deduplicating by media fingerprint...')
 def get_image_hash(filename):
    basename = os.path.basename(os.path.dirname(filename))
    try:
        image = Image.open(filename)
-        return basename + str(average_hash(image)) + '_image'
+        return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
-def get_video_hash(filename):
+def get_audio_hash(filename):
    basename = os.path.basename(os.path.dirname(filename))
    try:
-        v = VideoHash(path=filename)
+        fp = acoustid.fingerprint_file(filename)[1]
-        digest = str(v.hash_hex)
+        values, _ = chromaprint.decode_fingerprint(fp)
-        v.delete_storage_path()
+        return chromaprint.hash_fingerprint(values)
    except acoustid.FingerprintGenerationError:
        return None
-        return basename + digest + '_video'
+def get_video_hash(filename):
    try:
        return str(VideoHash(path=filename).hash_hex) + '_video'
    except FFmpegFailedToExtractFrames:
        return None
 count = 0
 total = len(filenames)
-def hasher(filename):
+for filename in filenames:
-    if filename in delete: return None
+    count += 1
    print('Hashing file', count, '/', total, end='\r')
-    print('Hashing file:', filename)
+    if filename in delete: continue
    try:
-        digest = get_image_hash(filename)#  or get_video_hash(filename)
+        digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
-        print('Cancelled.')
+        print('Skipping media hashing.')
-        os._exit(0)
+        break
    except BaseException as e:
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
-        print(traceback.format_exc())
+        continue
        return None
-    if not digest: return None
+    if not digest: continue
-    return (filename, digest)
+    hash_lookup[digest] = filename
-with Pool() as pool:
+    if type(digest) == int:
-    results = pool.map(hasher, filenames)
+        for h in audio_hashes:
-
+            if bin(digest ^ h).count('1') <= 5:   # TODO adjust?
-print('Finished hashing.')
+                delete.add(filename)
-print()
+                print()
-print('Checking digests:')
+                print(digest, filename, 'close to', h, hash_lookup[h])
-print()
+                break
-
+        else:  # for
-for result in results:
+            audio_hashes.append(digest)
    if not result: continue
    print(result)
    filename, digest = result
    if digest in hashes:
        print('Found digest', digest, 'collision for', filename)
        delete.add(filename)
    else:
-        hashes.add(digest)
+        if digest in hashes:
            delete.add(filename)
        else:
            hashes.add(digest)
 print()
-print('Found', len(delete), 'total duplicate files.')
+print()
 for dupe in delete:
    print(dupe)
-for dupe in sorted(list(delete)):
+print()
-    print('Deleting:', dupe)
+print('Found', len(delete), 'total duplicate files. Delete them?')
 print('ENTER to continue, ctrl+c to cancel.')
 try:
    input()
 except KeyboardInterrupt:
    print('\nCancelled.')
    os._exit(0)
 print('Deleting...')
 for dupe in delete:
    os.remove(dupe)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,19 @@
-Brotli==1.1.0
+audioread==3.0.0
-certifi==2023.7.22
+Brotli==1.0.9
 certifi==2022.12.7
 charset-normalizer==3.1.0
 idna==3.4
 imagedominantcolor==1.0.1
 ImageHash==4.3.1
-mutagen==1.47.0
+mutagen==1.46.0
-numpy==1.26.1
+numpy==1.24.2
-Pillow==10.1.0
+Pillow==9.5.0
-pycryptodomex==3.19.0
+pyacoustid==1.2.2
 pycryptodomex==3.17
 PyWavelets==1.4.1
-scipy==1.11.3
+requests==2.28.2
 scipy==1.10.1
 urllib3==1.26.15
 videohash==3.0.1
-websockets==11.0.3
+websockets==11.0
-yt-dlp==2023.10.13
+yt-dlp==2023.3.4