"""Deduplicate files under the current working directory.

Pass 1 flags byte-identical duplicates by the MD5 of each file's first
64 KiB.  Pass 2 flags visually identical media by perceptual hash
(imagehash for images, videohash for videos).  Flagged files are deleted.
"""

import glob
import hashlib
import os
import time
import traceback
from multiprocessing import Pool

from PIL import Image, UnidentifiedImageError
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames

# Only the first 64 KiB of each file is fingerprinted in pass 1, so files
# that share their first 64 KiB are treated as identical.
CHUNK_SIZE = 65536

# Defined at module level so Pool workers can see them: populated copies
# when the OS forks, empty copies under the spawn start method (in which
# case already-flagged files are merely re-hashed, not mis-handled).
hashes = set()
delete = set()


def get_image_hash(filename):
    # Prefix the perceptual hash with the parent directory's name, so a
    # match only counts as a duplicate within the same directory.
    basename = os.path.basename(os.path.dirname(filename))
    try:
        image = Image.open(filename)
        return basename + str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None


def get_video_hash(filename):
    basename = os.path.basename(os.path.dirname(filename))
    try:
        v = VideoHash(path=filename)
        digest = str(v.hash_hex)
        v.delete_storage_path()  # remove videohash's temporary frames
        return basename + digest + '_video'
    except FFmpegFailedToExtractFrames:
        return None


def media_hasher(filename):
    """Pool worker: return (filename, perceptual digest) or None."""
    if filename in delete:  # already flagged by the MD5 pass
        return None
    print('Hashing file:', filename)
    try:
        digest = get_image_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
        print('Cancelled.')
        os._exit(0)
    except BaseException as e:
        # The media libraries can fail in many ways on malformed files;
        # log the exception and skip the file rather than abort the run.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        print(traceback.format_exc())
        return None
    if not digest:
        return None
    return filename, digest


if __name__ == '__main__':
    filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]

    print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
    print('ENTER to continue, ctrl+c to cancel.')
    try:
        input()
    except KeyboardInterrupt:
        print('Cancelled.')
        os._exit(0)

    # Smallest files first: the first (smallest) copy of each duplicate
    # set is the one kept, and the larger copies are deleted.
    print('Sorting file list by smallest size...')
    filenames.sort(key=os.path.getsize)

    print('Deduplicating by md5 hash...')
    for filename in filenames:
        md5 = hashlib.md5()
        with open(filename, 'rb') as file:
            md5.update(file.read(CHUNK_SIZE))
        digest = md5.hexdigest()
        if digest in hashes:
            print('Found digest', digest, 'collision for', filename)
            delete.add(filename)
        else:
            hashes.add(digest)

    print('Found', len(delete), 'duplicates by md5 hash.')
    time.sleep(4)  # pause so the pass-1 summary is readable

    print('Deduplicating by media fingerprint...')
    with Pool() as pool:
        results = pool.map(media_hasher, filenames)

    print('Finished hashing.')
    print()
    print('Checking digests:')
    print()
    for result in results:
        if not result:
            continue
        print(result)
        filename, digest = result
        if digest in hashes:
            print('Found digest', digest, 'collision for', filename)
            delete.add(filename)
        else:
            hashes.add(digest)

    print()
    print('Found', len(delete), 'total duplicate files.')
    for dupe in sorted(delete):
        print('Deleting:', dupe)
        os.remove(dupe)
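
# Example invocation, assuming the script above is saved as "dedupe.py"
# (the filename is illustrative). Run it from the directory you want to
# clean, since it scans os.getcwd() recursively:
#
#   cd ~/Pictures
#   python3 dedupe.py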