Compare commits

..

4 Commits

Author SHA1 Message Date
c226c3807a IDK 2023-10-17 13:49:24 -06:00
230460c153 Freeze requirements 2023-04-03 22:11:53 -06:00
dddfbb0724 Compare bits on audio hashes 2023-04-03 22:11:34 -06:00
d678c8fd0f Add audio hashing 2023-04-03 21:04:37 -06:00
3 changed files with 64 additions and 13 deletions

2
.gitignore vendored
View File

@@ -102,4 +102,4 @@ ENV/
*.swp *.swp
*.swo *.swo
mcdata/ test/

50
main.py
View File

@@ -1,15 +1,18 @@
import glob import glob
import os import os
import hashlib import hashlib
import time
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
import acoustid
import chromaprint
from imagehash import average_hash from imagehash import average_hash
from videohash import VideoHash from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames from videohash.exceptions import FFmpegFailedToExtractFrames
hashes = set() hashes = set()
audio_hashes = []
delete = set() delete = set()
hash_lookup = {}
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)] filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@@ -21,8 +24,8 @@ except KeyboardInterrupt:
print('\nCancelled.') print('\nCancelled.')
os._exit(0) os._exit(0)
print('Sorting file list by smallest size...') print('Sorting file list by size...')
filenames.sort(key=os.path.getsize, reverse=False) filenames.sort(key=os.path.getsize, reverse=True)
print('Deduplicating by md5 hash...') print('Deduplicating by md5 hash...')
@@ -50,6 +53,14 @@ def get_image_hash(filename):
except UnidentifiedImageError: except UnidentifiedImageError:
return None return None
def get_audio_hash(filename):
    """Return a compact hash of the audio fingerprint of *filename*.

    The file is fingerprinted with ``acoustid.fingerprint_file``; the second
    element of its result is decoded with ``chromaprint.decode_fingerprint``
    and the decoded values are reduced with ``chromaprint.hash_fingerprint``.

    Returns ``None`` when ``acoustid.FingerprintGenerationError`` is raised,
    so callers can fall through to another hashing strategy. Any other
    exception propagates to the caller.
    """
    audio_digest = None
    try:
        # Only the fingerprint (index 1) is needed; index 0 is ignored.
        encoded = acoustid.fingerprint_file(filename)[1]
        samples, _algorithm = chromaprint.decode_fingerprint(encoded)
        # presumably an int: the main loop tests type(digest) == int to
        # recognise audio digests -- confirm against the chromaprint docs.
        audio_digest = chromaprint.hash_fingerprint(samples)
    except acoustid.FingerprintGenerationError:
        # Leave audio_digest as None: no fingerprint could be generated.
        pass
    return audio_digest
def get_video_hash(filename): def get_video_hash(filename):
try: try:
return str(VideoHash(path=filename).hash_hex) + '_video' return str(VideoHash(path=filename).hash_hex) + '_video'
@@ -66,27 +77,48 @@ for filename in filenames:
if filename in delete: continue if filename in delete: continue
try: try:
digest = get_image_hash(filename) or get_video_hash(filename) digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
except KeyboardInterrupt: except KeyboardInterrupt:
print('\nCancelled.') print('Skipping media hashing.')
os._exit(0) break
except BaseException as e: except BaseException as e:
print() print()
print('Exception', e.__class__.__name__, str(e), 'while hashing:') print('Exception', e.__class__.__name__, str(e), 'while hashing:')
print(filename) print(filename)
continue continue
time.sleep(5)
if not digest: continue if not digest: continue
hash_lookup[digest] = filename
if type(digest) == int:
for h in audio_hashes:
if bin(digest ^ h).count('1') <= 5: # TODO adjust?
delete.add(filename)
print()
print(digest, filename, 'close to', h, hash_lookup[h])
break
else: # for
audio_hashes.append(digest)
else:
if digest in hashes: if digest in hashes:
delete.add(filename) delete.add(filename)
else: else:
hashes.add(digest) hashes.add(digest)
print() print()
print('Found', len(delete), 'total duplicate files.') print()
for dupe in delete:
print(dupe)
print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
input()
except KeyboardInterrupt:
print('\nCancelled.')
os._exit(0)
print('Deleting...') print('Deleting...')
for dupe in delete: for dupe in delete:

19
requirements.txt Normal file
View File

@@ -0,0 +1,19 @@
audioread==3.0.0
Brotli==1.0.9
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
imagedominantcolor==1.0.1
ImageHash==4.3.1
mutagen==1.46.0
numpy==1.24.2
Pillow==9.5.0
pyacoustid==1.2.2
pycryptodomex==3.17
PyWavelets==1.4.1
requests==2.28.2
scipy==1.10.1
urllib3==1.26.15
videohash==3.0.1
websockets==11.0
yt-dlp==2023.3.4