Compare commits

..

8 Commits

SHA1  Message  Date
f15a631983 Merge branch 'master' into no_audio 2023-10-18 22:37:30 +00:00
ef79eb5cad Parallelize hashing, also remove video 2023-10-18 22:36:42 +00:00
d4ee0f1f98 Freeze requirements 2023-10-18 16:33:56 +00:00
5e840ae8bb Support cancelling 2023-10-18 10:28:24 -06:00
c226c3807a IDK 2023-10-17 13:49:24 -06:00
230460c153 Freeze requirements 2023-04-03 22:11:53 -06:00
dddfbb0724 Compare bits on audio hashes 2023-04-03 22:11:34 -06:00
d678c8fd0f Add audio hashing 2023-04-03 21:04:37 -06:00
2 changed files with 60 additions and 14 deletions

main.py (61 changed lines)

@@ -1,6 +1,10 @@
 import glob
 import os
 import hashlib
+import time
+import traceback
+from multiprocessing import Pool
+import queue
 from PIL import Image, UnidentifiedImageError
 from imagehash import average_hash
@@ -17,11 +21,11 @@ print('ENTER to continue, ctrl+c to cancel.')
 try:
     input()
 except KeyboardInterrupt:
-    print('\nCancelled.')
+    print('Cancelled.')
     os._exit(0)
-print('Sorting file list by size...')
-filenames.sort(key=os.path.getsize, reverse=True)
+print('Sorting file list by smallest size...')
+filenames.sort(key=os.path.getsize, reverse=False)
 print('Deduplicating by md5 hash...')
@@ -35,46 +39,75 @@ for filename in filenames:
         hasher.update(buf)
     digest = hasher.hexdigest()
     if digest in hashes:
+        print('Found digest', digest, 'collision for', filename)
         delete.add(filename)
     else:
         hashes.add(digest)
 print('Found', len(delete), 'duplicates by md5 hash.')
+time.sleep(4)
 print('Deduplicating by media fingerprint...')
 def get_image_hash(filename):
+    basename = os.path.basename(os.path.dirname(filename))
     try:
         image = Image.open(filename)
-        return str(average_hash(image)) + '_image'
+        return basename + str(average_hash(image)) + '_image'
     except UnidentifiedImageError:
         return None
 def get_video_hash(filename):
+    basename = os.path.basename(os.path.dirname(filename))
     try:
-        return str(VideoHash(path=filename).hash_hex) + '_video'
+        v = VideoHash(path=filename)
+        digest = str(v.hash_hex)
+        v.delete_storage_path()
+        return basename + digest + '_video'
     except FFmpegFailedToExtractFrames:
         return None
 count = 0
 total = len(filenames)
-for filename in filenames:
-    count += 1
-    print('Hashing file', count, '/', total, end='\r')
-    if filename in delete: continue
+def hasher(filename):
+    if filename in delete: return None
+    print('Hashing file:', filename)
     try:
-        digest = get_image_hash(filename) or get_video_hash(filename)
+        digest = get_image_hash(filename)# or get_video_hash(filename)
-    except KeyboardInterrupt:
-        print('Cancelled.')
-        os._exit(0)
     except BaseException as e:
         print()
         print('Exception', e.__class__.__name__, str(e), 'while hashing:')
         print(filename)
-        continue
+        print(traceback.format_exc())
+        return None
-    if not digest: continue
+    if not digest: return None
+    return (filename, digest)
+with Pool() as pool:
+    results = pool.map(hasher, filenames)
+print('Finished hashing.')
+print()
+print('Checking digests:')
+print()
+for result in results:
+    if not result: continue
+    print(result)
+    filename, digest = result
     if digest in hashes:
+        print('Found digest', digest, 'collision for', filename)
        delete.add(filename)
     else:
         hashes.add(digest)
@@ -82,6 +115,6 @@ for filename in filenames:
 print()
 print('Found', len(delete), 'total duplicate files.')
-print('Deleting...')
-for dupe in delete:
+for dupe in sorted(list(delete)):
+    print('Deleting:', dupe)
     os.remove(dupe)
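
Side note on the parallelization above: the per-file work (decoding plus average_hash) is largely CPU-bound, so the diff fans it out over a multiprocessing.Pool and keeps the digest bookkeeping in the parent process. Below is a minimal standalone sketch of that pattern, not repository code; the names find_duplicates and perceptual_digest are illustrative only.

# Minimal sketch (assumed names, not from the repo): perceptual-hash each file
# in a worker pool, then resolve collisions serially in the parent process.
import glob
import os
from multiprocessing import Pool

from PIL import Image, UnidentifiedImageError
from imagehash import average_hash


def perceptual_digest(filename):
    # Runs in a worker process; returns (filename, digest), or None for
    # files Pillow cannot open as images.
    try:
        with Image.open(filename) as image:
            return filename, str(average_hash(image))
    except UnidentifiedImageError:
        return None


def find_duplicates(directory):
    filenames = [f for f in glob.glob(os.path.join(directory, '**', '*'), recursive=True)
                 if os.path.isfile(f)]
    # Smallest files first, mirroring the reverse=False sort in the diff,
    # so the first copy seen (and kept) is the smallest one.
    filenames.sort(key=os.path.getsize)

    with Pool() as pool:  # defaults to one worker per CPU core
        results = pool.map(perceptual_digest, filenames)

    seen = set()
    duplicates = []
    for result in results:  # bookkeeping stays single-process, so no locking is needed
        if not result:
            continue
        filename, digest = result
        if digest in seen:
            duplicates.append(filename)
        else:
            seen.add(digest)
    return duplicates


if __name__ == '__main__':
    for dupe in find_duplicates('.'):
        print('Duplicate:', dupe)

Keeping the collision check outside the pool is the simpler design: workers only compute digests, so no shared state or locks are involved, at the cost of holding all (filename, digest) results in memory before comparing them.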

requirements.txt (new file, 13 additions)

@@ -0,0 +1,13 @@
+Brotli==1.1.0
+certifi==2023.7.22
+imagedominantcolor==1.0.1
+ImageHash==4.3.1
+mutagen==1.47.0
+numpy==1.26.1
+Pillow==10.1.0
+pycryptodomex==3.19.0
+PyWavelets==1.4.1
+scipy==1.11.3
+videohash==3.0.1
+websockets==11.0.3
+yt-dlp==2023.10.13