Compare bits on audio hashes

2023-04-03 22:11:34 -06:00
parent d678c8fd0f
commit dddfbb0724
1 changed files with 19 additions and 4 deletions
--- a/main.py
+++ b/main.py
@@ -4,11 +4,13 @@ import hashlib
 from PIL import Image, UnidentifiedImageError

 import acoustid
+import chromaprint
 from imagehash import average_hash
 from videohash import VideoHash
 from videohash.exceptions import FFmpegFailedToExtractFrames

 hashes = set()
+audio_hashes = []  # int digests seen so far; scanned linearly because matching is by bit distance, not equality
 delete = set()

 filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@@ -52,7 +54,9 @@ def get_image_hash(filename):

 def get_audio_hash(filename):
    try:
-        return str(acoustid.fingerprint_file(filename)[1].decode()) + '_audio'
+        fp = acoustid.fingerprint_file(filename)[1]  # keep element 1 of the result; presumably the encoded fingerprint -- confirm against pyacoustid docs
+        values, _ = chromaprint.decode_fingerprint(fp)  # second element of the decode result is discarded
+        return chromaprint.hash_fingerprint(values)  # presumably the int digest the main loop XOR-compares -- confirm
    except acoustid.FingerprintGenerationError:
        return None

@@ -84,13 +88,24 @@ for filename in filenames:

    if not digest: continue

-    if digest in hashes:
-        delete.add(filename)
+    if type(digest) == int:  # int digests use bit-distance matching. NOTE(review): prefer isinstance(digest, int); the `if not digest` guard above also skips an int digest of 0
+        for h in audio_hashes:  # compare against every int digest kept so far
+            if bin(digest ^ h).count('1') <= 5:   # set bits of the XOR = differing bits; 5 or fewer counts as a duplicate. TODO: tune this threshold
+                delete.add(filename)
+                break
+        else:  # for-else: runs only when the loop ended without break, i.e. no close match was found
+            audio_hashes.append(digest)
    else:
-        hashes.add(digest)
+        if digest in hashes:  # non-int digests keep the exact-match check
+            delete.add(filename)
+        else:
+            hashes.add(digest)

+print()  # two empty lines ahead of the duplicate listing
+print()
 for dupe in delete:
    print(dupe)
+
 print()
 print('Found', len(delete), 'total duplicate files. Delete them?')
 print('ENTER to continue, ctrl+c to cancel.')