diff --git a/main.py b/main.py
index ce77271..ac7bea4 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,9 @@
 import glob
 import os
 import hashlib
 import time
+import traceback
+from multiprocessing import Pool
+import queue
 from PIL import Image, UnidentifiedImageError
 from imagehash import average_hash
@@ -18,7 +21,7 @@ print('ENTER to continue, ctrl+c to cancel.')
 try:
     input()
 except KeyboardInterrupt:
-    print('\nCancelled.')
+    print('Cancelled.')
     os._exit(0)
 
 print('Sorting file list by smallest size...')
@@ -36,51 +39,75 @@ for filename in filenames:
         hasher.update(buf)
     digest = hasher.hexdigest()
     if digest in hashes:
+        print('Found digest', digest, 'collision for', filename)
         delete.add(filename)
     else:
         hashes.add(digest)
 
 print('Found', len(delete), 'duplicates by md5 hash.')
+
+time.sleep(4)
 print('Deduplicating by media fingerprint...')
 
 def get_image_hash(filename):
+    basename = os.path.basename(os.path.dirname(filename))
     try:
         image = Image.open(filename)
-        return str(average_hash(image)) + '_image'
+        return basename + str(average_hash(image)) + '_image'
     except UnidentifiedImageError:
         return None
 
 def get_video_hash(filename):
+    basename = os.path.basename(os.path.dirname(filename))
     try:
-        return str(VideoHash(path=filename).hash_hex) + '_video'
+        v = VideoHash(path=filename)
+        digest = str(v.hash_hex)
+        v.delete_storage_path()
+
+        return basename + digest + '_video'
     except FFmpegFailedToExtractFrames:
         return None
 
 count = 0
 total = len(filenames)
 
-for filename in filenames:
-    count += 1
-    print('Hashing file', count, '/', total, end='\r')
+def hasher(filename):
+    if filename in delete: return None
 
-    if filename in delete: continue
+    print('Hashing file:', filename)
 
     try:
-        digest = get_image_hash(filename) or get_video_hash(filename)
+        digest = get_image_hash(filename)# or get_video_hash(filename)
     except KeyboardInterrupt:
-        print('\nCancelled.')
+        print('Cancelled.')
         os._exit(0)
     except BaseException as e:
         print()
         print('Exception', e.__class__.__name__, str(e), 'while hashing:')
         print(filename)
-        continue
+        print(traceback.format_exc())
+        return None
+
+    if not digest: return None
+
+    return (filename, digest)
+
+with Pool() as pool:
+    results = pool.map(hasher, filenames)
+
+print('Finished hashing.')
+print()
+print('Checking digests:')
+print()
 
-    time.sleep(5)
+for result in results:
+    if not result: continue
+    print(result)
 
-    if not digest: continue
+    filename, digest = result
 
     if digest in hashes:
+        print('Found digest', digest, 'collision for', filename)
         delete.add(filename)
     else:
         hashes.add(digest)
@@ -88,6 +115,6 @@ for filename in filenames:
 print()
 print('Found', len(delete), 'total duplicate files.')
 
-print('Deleting...')
-for dupe in delete:
+for dupe in sorted(list(delete)):
+    print('Deleting:', dupe)
     os.remove(dupe)