From d678c8fd0ff14bee4d3de4addd3456282d704c78 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Mon, 3 Apr 2023 21:04:37 -0600 Subject: [PATCH 1/4] Add audio hashing --- main.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index f949256..6055557 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,7 @@ import os import hashlib from PIL import Image, UnidentifiedImageError +import acoustid from imagehash import average_hash from videohash import VideoHash from videohash.exceptions import FFmpegFailedToExtractFrames @@ -49,6 +50,12 @@ def get_image_hash(filename): except UnidentifiedImageError: return None +def get_audio_hash(filename): + try: + return str(acoustid.fingerprint_file(filename)[1].decode()) + '_audio' + except acoustid.FingerprintGenerationError: + return None + def get_video_hash(filename): try: return str(VideoHash(path=filename).hash_hex) + '_video' @@ -65,7 +72,10 @@ for filename in filenames: if filename in delete: continue try: - digest = get_image_hash(filename) or get_video_hash(filename) + digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename) + except KeyboardInterrupt: + print('Skipping media hashing.') + break except BaseException as e: print() print('Exception', e.__class__.__name__, str(e), 'while hashing:') @@ -79,8 +89,16 @@ for filename in filenames: else: hashes.add(digest) +for dupe in delete: + print(dupe) print() -print('Found', len(delete), 'total duplicate files.') +print('Found', len(delete), 'total duplicate files. Delete them?') +print('ENTER to continue, ctrl+c to cancel.') +try: + input() +except KeyboardInterrupt: + print('\nCancelled.') + os._exit(0) print('Deleting...') for dupe in delete: From dddfbb07247801b4bbcead7446362ef37131cf12 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Mon, 3 Apr 2023 22:11:34 -0600 Subject: [PATCH 2/4] Compare bits on audio hashes --- main.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6055557..0ec2daa 100644 --- a/main.py +++ b/main.py @@ -4,11 +4,13 @@ import hashlib from PIL import Image, UnidentifiedImageError import acoustid +import chromaprint from imagehash import average_hash from videohash import VideoHash from videohash.exceptions import FFmpegFailedToExtractFrames hashes = set() +audio_hashes = [] delete = set() filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)] @@ -52,7 +54,9 @@ def get_image_hash(filename): def get_audio_hash(filename): try: - return str(acoustid.fingerprint_file(filename)[1].decode()) + '_audio' + fp = acoustid.fingerprint_file(filename)[1] + values, _ = chromaprint.decode_fingerprint(fp) + return chromaprint.hash_fingerprint(values) except acoustid.FingerprintGenerationError: return None @@ -84,13 +88,24 @@ for filename in filenames: if not digest: continue - if digest in hashes: - delete.add(filename) + if type(digest) == int: + for h in audio_hashes: + if bin(digest ^ h).count('1') <= 5: # TODO adjust? + delete.add(filename) + break + else: # for + audio_hashes.append(digest) else: - hashes.add(digest) + if digest in hashes: + delete.add(filename) + else: + hashes.add(digest) +print() +print() for dupe in delete: print(dupe) + print() print('Found', len(delete), 'total duplicate files. Delete them?') print('ENTER to continue, ctrl+c to cancel.') From 230460c1539a5964047f97403b3ff915b9974fe2 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Mon, 3 Apr 2023 22:11:53 -0600 Subject: [PATCH 3/4] Freeze requirements --- .gitignore | 2 +- requirements.txt | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 28d346b..afef025 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,4 @@ ENV/ *.swp *.swo -mcdata/ +test/ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5192096 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +audioread==3.0.0 +Brotli==1.0.9 +certifi==2022.12.7 +charset-normalizer==3.1.0 +idna==3.4 +imagedominantcolor==1.0.1 +ImageHash==4.3.1 +mutagen==1.46.0 +numpy==1.24.2 +Pillow==9.5.0 +pyacoustid==1.2.2 +pycryptodomex==3.17 +PyWavelets==1.4.1 +requests==2.28.2 +scipy==1.10.1 +urllib3==1.26.15 +videohash==3.0.1 +websockets==11.0 +yt-dlp==2023.3.4 From c226c3807ad069a750375ffb34e2d2465e459f5b Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Tue, 17 Oct 2023 13:49:24 -0600 Subject: [PATCH 4/4] IDK --- main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.py b/main.py index 0ec2daa..5e75a8d 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,7 @@ from videohash.exceptions import FFmpegFailedToExtractFrames hashes = set() audio_hashes = [] delete = set() +hash_lookup = {} filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)] @@ -88,10 +89,14 @@ for filename in filenames: if not digest: continue + hash_lookup[digest] = filename + if type(digest) == int: for h in audio_hashes: if bin(digest ^ h).count('1') <= 5: # TODO adjust? delete.add(filename) + print() + print(digest, filename, 'close to', h, hash_lookup[h]) break else: # for audio_hashes.append(digest)