From ef79eb5cad4585af295257adeca969edadde95d7 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Wed, 18 Oct 2023 22:36:42 +0000 Subject: [PATCH] Parallelize hashing, also remove video --- main.py | 55 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index ce77271..ac7bea4 100644 --- a/main.py +++ b/main.py @@ -2,6 +2,9 @@ import glob import os import hashlib import time +import traceback +from multiprocessing import Pool +import queue from PIL import Image, UnidentifiedImageError from imagehash import average_hash @@ -18,7 +21,7 @@ print('ENTER to continue, ctrl+c to cancel.') try: input() except KeyboardInterrupt: - print('\nCancelled.') + print('Cancelled.') os._exit(0) print('Sorting file list by smallest size...') @@ -36,51 +39,75 @@ for filename in filenames: hasher.update(buf) digest = hasher.hexdigest() if digest in hashes: + print('Found digest', digest, 'collision for', filename) delete.add(filename) else: hashes.add(digest) print('Found', len(delete), 'duplicates by md5 hash.') + +time.sleep(4) print('Deduplicating by media fingerprint...') def get_image_hash(filename): + basename = os.path.basename(os.path.dirname(filename)) try: image = Image.open(filename) - return str(average_hash(image)) + '_image' + return basename + str(average_hash(image)) + '_image' except UnidentifiedImageError: return None def get_video_hash(filename): + basename = os.path.basename(os.path.dirname(filename)) try: - return str(VideoHash(path=filename).hash_hex) + '_video' + v = VideoHash(path=filename) + digest = str(v.hash_hex) + v.delete_storage_path() + + return basename + digest + '_video' except FFmpegFailedToExtractFrames: return None count = 0 total = len(filenames) -for filename in filenames: - count += 1 - print('Hashing file', count, '/', total, end='\r') +def hasher(filename): + if filename in delete: return None - if filename in delete: continue + print('Hashing file:', filename) try: - digest = get_image_hash(filename) or get_video_hash(filename) + digest = get_image_hash(filename)# or get_video_hash(filename) except KeyboardInterrupt: - print('\nCancelled.') + print('Cancelled.') os._exit(0) except BaseException as e: print() print('Exception', e.__class__.__name__, str(e), 'while hashing:') print(filename) - continue + print(traceback.format_exc()) + return None + + if not digest: return None + + return (filename, digest) + +with Pool() as pool: + results = pool.map(hasher, filenames) + +print('Finished hashing.') +print() +print('Checking digests:') +print() - time.sleep(5) +for result in results: + if not result: continue + print(result) - if not digest: continue + filename, digest = result if digest in hashes: + print('Found digest', digest, 'collision for', filename) delete.add(filename) else: hashes.add(digest) @@ -88,6 +115,6 @@ for filename in filenames: print() print('Found', len(delete), 'total duplicate files.') -print('Deleting...') -for dupe in delete: +for dupe in sorted(list(delete)): + print('Deleting:', dupe) os.remove(dupe)