"""Deduplicate the files under the current working directory.

The script runs in two passes over every regular file found recursively
below the current directory:

1. Exact duplicates are detected by the MD5 digest of the full file content.
2. Remaining files are fingerprinted as an image (``average_hash``) or, when
   that fails, as a video (``VideoHash``); files whose fingerprint equals one
   already seen are also treated as duplicates.

Files are processed from smallest to largest, so within a group of
duplicates the smallest file is the one that is kept.  All duplicates are
deleted at the end of the run.
"""
import glob
import hashlib
import os
import sys
import time

from PIL import Image, UnidentifiedImageError
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames

# Number of bytes read per iteration while hashing file content.
CHUNK_SIZE = 65536

# Seconds to pause after reporting a hashing error, so the message can be
# read before the progress output continues.
ERROR_PAUSE_SECONDS = 5


def hash_file(filename):
    """Return the hex MD5 digest of the entire content of *filename*.

    The whole file is hashed (in ``CHUNK_SIZE`` pieces) rather than only a
    prefix: files that merely share their first bytes are not duplicates and
    must never be scheduled for deletion.
    """
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for chunk in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def get_image_hash(filename):
    """Return an image fingerprint for *filename*, or None if it is not an image.

    The fingerprint is the string form of ``average_hash`` with an
    ``'_image'`` suffix, which keeps it distinct from video fingerprints.
    """
    try:
        # The context manager closes the underlying file handle promptly.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None


def get_video_hash(filename):
    """Return a video fingerprint for *filename*, or None if no frames can be extracted.

    The fingerprint is ``VideoHash(...).hash_hex`` with a ``'_video'``
    suffix, which keeps it distinct from image fingerprints.
    """
    try:
        return str(VideoHash(path=filename).hash_hex) + '_video'
    except FFmpegFailedToExtractFrames:
        return None


def main():
    """Find duplicate files below the current directory and delete them.

    Prompts once for confirmation before doing any work.  A
    ``KeyboardInterrupt`` raised at any point propagates to the caller.
    """
    filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
    print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
    print('ENTER to continue, ctrl+c to cancel.')
    input()

    print('Sorting file list by smallest size...')
    filenames.sort(key=os.path.getsize)

    hashes = set()
    delete = set()

    print('Deduplicating by md5 hash...')
    for filename in filenames:
        digest = hash_file(filename)
        if digest in hashes:
            delete.add(filename)
        else:
            hashes.add(digest)
    print('Found', len(delete), 'duplicates by md5 hash.')

    print('Deduplicating by media fingerprint...')
    total = len(filenames)
    for count, filename in enumerate(filenames, start=1):
        print('Hashing file', count, '/', total, end='\r')
        if filename in delete:
            continue
        try:
            digest = get_image_hash(filename) or get_video_hash(filename)
        except Exception as e:
            # Best effort: a file that cannot be fingerprinted is reported
            # and left untouched.  KeyboardInterrupt is deliberately not
            # caught here so that ctrl+c still cancels the run.
            print()
            print('Exception', e.__class__.__name__, str(e), 'while hashing:')
            print(filename)
            time.sleep(ERROR_PAUSE_SECONDS)
            continue
        if not digest:
            continue
        if digest in hashes:
            delete.add(filename)
        else:
            hashes.add(digest)
    print()
    print('Found', len(delete), 'total duplicate files.')

    print('Deleting...')
    for dupe in delete:
        try:
            os.remove(dupe)
        except OSError as e:
            # One undeletable file must not stop the remaining deletions.
            print('Could not delete', dupe + ':', str(e))


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print('\nCancelled.')
        # sys.exit (unlike os._exit) flushes stdout and runs cleanup handlers.
        sys.exit(0)