Compare commits

..

5 Commits

Author SHA1 Message Date
f15a631983 Merge branch 'master' into no_audio 2023-10-18 22:37:30 +00:00
ef79eb5cad Parallelize hashing, also remove video 2023-10-18 22:36:42 +00:00
d4ee0f1f98 Freeze requirements 2023-10-18 16:33:56 +00:00
5e840ae8bb Support cancelling 2023-10-18 10:28:24 -06:00
c226c3807a IDK 2023-10-17 13:49:24 -06:00
3 changed files with 60 additions and 66 deletions

2
.gitignore vendored
View File

@@ -102,4 +102,4 @@ ENV/
*.swp
*.swo
test/
mcdata/

100
main.py
View File

@@ -1,16 +1,17 @@
import glob
import os
import hashlib
import time
import traceback
from multiprocessing import Pool
import queue
from PIL import Image, UnidentifiedImageError
import acoustid
import chromaprint
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
hashes = set()
audio_hashes = []
delete = set()
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@@ -20,11 +21,11 @@ print('ENTER to continue, ctrl+c to cancel.')
try:
input()
except KeyboardInterrupt:
print('\nCancelled.')
print('Cancelled.')
os._exit(0)
print('Sorting file list by size...')
filenames.sort(key=os.path.getsize, reverse=True)
print('Sorting file list by smallest size...')
filenames.sort(key=os.path.getsize, reverse=False)
print('Deduplicating by md5 hash...')
@@ -38,83 +39,82 @@ for filename in filenames:
hasher.update(buf)
digest = hasher.hexdigest()
if digest in hashes:
print('Found digest', digest, 'collision for', filename)
delete.add(filename)
else:
hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.')
time.sleep(4)
print('Deduplicating by media fingerprint...')
def get_image_hash(filename):
basename = os.path.basename(os.path.dirname(filename))
try:
image = Image.open(filename)
return str(average_hash(image)) + '_image'
return basename + str(average_hash(image)) + '_image'
except UnidentifiedImageError:
return None
def get_audio_hash(filename):
try:
fp = acoustid.fingerprint_file(filename)[1]
values, _ = chromaprint.decode_fingerprint(fp)
return chromaprint.hash_fingerprint(values)
except acoustid.FingerprintGenerationError:
return None
def get_video_hash(filename):
basename = os.path.basename(os.path.dirname(filename))
try:
return str(VideoHash(path=filename).hash_hex) + '_video'
v = VideoHash(path=filename)
digest = str(v.hash_hex)
v.delete_storage_path()
return basename + digest + '_video'
except FFmpegFailedToExtractFrames:
return None
count = 0
total = len(filenames)
for filename in filenames:
count += 1
print('Hashing file', count, '/', total, end='\r')
def hasher(filename):
if filename in delete: return None
if filename in delete: continue
print('Hashing file:', filename)
try:
digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
digest = get_image_hash(filename)# or get_video_hash(filename)
except KeyboardInterrupt:
print('Skipping media hashing.')
break
print('Cancelled.')
os._exit(0)
except BaseException as e:
print()
print('Exception', e.__class__.__name__, str(e), 'while hashing:')
print(filename)
continue
print(traceback.format_exc())
return None
if not digest: continue
if not digest: return None
if type(digest) == int:
for h in audio_hashes:
if bin(digest ^ h).count('1') <= 5: # TODO adjust?
delete.add(filename)
break
else: # for
audio_hashes.append(digest)
return (filename, digest)
with Pool() as pool:
results = pool.map(hasher, filenames)
print('Finished hashing.')
print()
print('Checking digests:')
print()
for result in results:
if not result: continue
print(result)
filename, digest = result
if digest in hashes:
print('Found digest', digest, 'collision for', filename)
delete.add(filename)
else:
if digest in hashes:
delete.add(filename)
else:
hashes.add(digest)
hashes.add(digest)
print()
print()
for dupe in delete:
print(dupe)
print('Found', len(delete), 'total duplicate files.')
print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
input()
except KeyboardInterrupt:
print('\nCancelled.')
os._exit(0)
print('Deleting...')
for dupe in delete:
for dupe in sorted(list(delete)):
print('Deleting:', dupe)
os.remove(dupe)

View File

@@ -1,19 +1,13 @@
audioread==3.0.0
Brotli==1.0.9
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
Brotli==1.1.0
certifi==2023.7.22
imagedominantcolor==1.0.1
ImageHash==4.3.1
mutagen==1.46.0
numpy==1.24.2
Pillow==9.5.0
pyacoustid==1.2.2
pycryptodomex==3.17
mutagen==1.47.0
numpy==1.26.1
Pillow==10.1.0
pycryptodomex==3.19.0
PyWavelets==1.4.1
requests==2.28.2
scipy==1.10.1
urllib3==1.26.15
scipy==1.11.3
videohash==3.0.1
websockets==11.0
yt-dlp==2023.3.4
websockets==11.0.3
yt-dlp==2023.10.13