Compare commits

...

3 Commits

Author SHA1 Message Date
230460c153 Freeze requirements 2023-04-03 22:11:53 -06:00
dddfbb0724 Compare bits on audio hashes 2023-04-03 22:11:34 -06:00
d678c8fd0f Add audio hashing 2023-04-03 21:04:37 -06:00
3 changed files with 58 additions and 6 deletions

2
.gitignore vendored
View File

@ -102,4 +102,4 @@ ENV/
*.swp
*.swo
mcdata/
test/

43
main.py
View File

@ -3,11 +3,14 @@ import os
import hashlib
from PIL import Image, UnidentifiedImageError
import acoustid
import chromaprint
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
hashes = set()
audio_hashes = []
delete = set()
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@ -49,6 +52,14 @@ def get_image_hash(filename):
except UnidentifiedImageError:
return None
def get_audio_hash(filename):
    """Return a hash of the audio fingerprint of *filename*, or None.

    The file is fingerprinted with acoustid, the fingerprint is decoded with
    chromaprint, and the decoded values are collapsed into a single hash by
    chromaprint.hash_fingerprint.  None is returned when acoustid raises
    FingerprintGenerationError for the file.
    """
    try:
        # Only the element at index 1 of fingerprint_file's result is used.
        encoded = acoustid.fingerprint_file(filename)[1]
        # decode_fingerprint yields a pair; the second item is not needed.
        raw_values, _unused = chromaprint.decode_fingerprint(encoded)
        audio_hash = chromaprint.hash_fingerprint(raw_values)
    except acoustid.FingerprintGenerationError:
        return None
    return audio_hash
def get_video_hash(filename):
try:
return str(VideoHash(path=filename).hash_hex) + '_video'
@ -65,7 +76,10 @@ for filename in filenames:
if filename in delete: continue
try:
digest = get_image_hash(filename) or get_video_hash(filename)
digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
except KeyboardInterrupt:
print('Skipping media hashing.')
break
except BaseException as e:
print()
print('Exception', e.__class__.__name__, str(e), 'while hashing:')
@ -74,13 +88,32 @@ for filename in filenames:
if not digest: continue
if digest in hashes:
delete.add(filename)
if type(digest) == int:
for h in audio_hashes:
if bin(digest ^ h).count('1') <= 5: # TODO adjust?
delete.add(filename)
break
else: # for
audio_hashes.append(digest)
else:
hashes.add(digest)
if digest in hashes:
delete.add(filename)
else:
hashes.add(digest)
print()
print('Found', len(delete), 'total duplicate files.')
print()
for dupe in delete:
print(dupe)
print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
input()
except KeyboardInterrupt:
print('\nCancelled.')
os._exit(0)
print('Deleting...')
for dupe in delete:

19
requirements.txt Normal file
View File

@ -0,0 +1,19 @@
audioread==3.0.0
Brotli==1.0.9
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
imagedominantcolor==1.0.1
ImageHash==4.3.1
mutagen==1.46.0
numpy==1.24.2
Pillow==9.5.0
pyacoustid==1.2.2
pycryptodomex==3.17
PyWavelets==1.4.1
requests==2.28.2
scipy==1.10.1
urllib3==1.26.15
videohash==3.0.1
websockets==11.0
yt-dlp==2023.3.4