"""Interactively find and delete duplicate files under the current directory.

The script runs in two passes over every regular file found recursively
below the working directory:

1. Exact duplicates: files with the same size and the same md5 digest of
   their full contents.
2. Media duplicates: files whose image average-hash or video hash equals
   that of an earlier file, or whose audio fingerprint hash differs from an
   earlier one in at most ``AUDIO_MAX_BIT_DISTANCE`` bits.

Files are processed largest first, so within a group of duplicates the
largest file is the one that is kept.  Nothing is deleted until the user
confirms the final list.
"""
import glob
import hashlib
import os
import sys
from collections import Counter

import acoustid
import chromaprint
from imagehash import average_hash
from PIL import Image, UnidentifiedImageError
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames

# Number of bytes read per call while streaming a file through md5.
CHUNK_SIZE = 65536

# Two audio hashes that differ in at most this many bits are treated as
# duplicates.  TODO adjust?
AUDIO_MAX_BIT_DISTANCE = 5


def confirm():
    """Wait for ENTER; exit the program if the user cancels.

    ctrl+c and end-of-input on stdin both count as a cancel, so the script
    never proceeds without an explicit confirmation.
    """
    print('ENTER to continue, ctrl+c to cancel.')
    try:
        input()
    except (KeyboardInterrupt, EOFError):
        print('\nCancelled.')
        # sys.exit (unlike os._exit) flushes stdout, so the message above
        # is not lost when output is buffered.
        sys.exit(0)


def get_content_hash(filename):
    """Return the md5 hex digest of the complete contents of *filename*.

    The file is streamed in ``CHUNK_SIZE`` pieces so large files are not
    loaded into memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for chunk in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def get_image_hash(filename):
    """Return the average-hash of an image as a string tagged ``_image``.

    Returns None when PIL cannot identify the file as an image.
    """
    try:
        # The context manager closes the underlying file handle; without it
        # one handle per scanned file stays open.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None


def get_audio_hash(filename):
    """Return the chromaprint hash of the file's audio fingerprint.

    Returns None when acoustid cannot generate a fingerprint for the file.
    """
    try:
        fp = acoustid.fingerprint_file(filename)[1]
        values, _ = chromaprint.decode_fingerprint(fp)
        return chromaprint.hash_fingerprint(values)
    except acoustid.FingerprintGenerationError:
        return None


def get_video_hash(filename):
    """Return the VideoHash hex digest as a string tagged ``_video``.

    Returns None when FFmpeg fails to extract frames from the file.
    """
    try:
        return str(VideoHash(path=filename).hash_hex) + '_video'
    except FFmpegFailedToExtractFrames:
        return None


def get_media_hash(filename):
    """Return the first available image, audio or video hash, else None.

    The hashers are tried in that order.  Results are compared against None
    rather than by truthiness so a hash value of 0 is still used.
    """
    for hasher in (get_image_hash, get_audio_hash, get_video_hash):
        digest = hasher(filename)
        if digest is not None:
            return digest
    return None


def is_near_duplicate_audio(digest, known_hashes):
    """Return True if *digest* is within the bit-distance limit of any known hash."""
    return any(
        bin(digest ^ known).count('1') <= AUDIO_MAX_BIT_DISTANCE
        for known in known_hashes
    )


def find_exact_duplicates(filenames, sizes):
    """Return the set of files whose contents equal those of an earlier file.

    Args:
        filenames: paths in the order they should be considered; the first
            file of each identical group is kept.
        sizes: mapping of each path to its size in bytes.

    Files that cannot be read are reported and left out of the result.
    """
    size_counts = Counter(sizes[name] for name in filenames)
    seen = set()
    duplicates = set()
    for filename in filenames:
        size = sizes[filename]
        # A file with a unique size cannot be byte-identical to another
        # file, so skip the cost of reading it.
        if size_counts[size] < 2:
            continue
        try:
            key = (size, get_content_hash(filename))
        except OSError as e:
            print('Could not read', filename + ':', e)
            continue
        if key in seen:
            duplicates.add(filename)
        else:
            seen.add(key)
    return duplicates


def find_media_duplicates(filenames, delete):
    """Add media duplicates among *filenames* to the *delete* set in place.

    Files already in *delete* are skipped.  ctrl+c stops this pass early
    and keeps the results gathered so far; any other error while hashing a
    file is reported and that file is skipped.
    """
    seen_hashes = set()
    audio_hashes = []
    total = len(filenames)
    for count, filename in enumerate(filenames, start=1):
        print('Hashing file', count, '/', total, end='\r')
        if filename in delete:
            continue
        try:
            digest = get_media_hash(filename)
        except KeyboardInterrupt:
            print('Skipping media hashing.')
            break
        except Exception as e:
            print()
            print('Exception', e.__class__.__name__, str(e), 'while hashing:')
            print(filename)
            continue
        if digest is None:
            continue
        if isinstance(digest, int):
            # Audio hashes are integers and are matched approximately.
            if is_near_duplicate_audio(digest, audio_hashes):
                delete.add(filename)
            else:
                audio_hashes.append(digest)
        elif digest in seen_hashes:
            delete.add(filename)
        else:
            seen_hashes.add(digest)
    print()


def main():
    """Scan the working directory, list duplicates and delete them on confirmation."""
    filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
    print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
    confirm()

    print('Sorting file list by size...')
    sizes = {name: os.path.getsize(name) for name in filenames}
    # Largest first, so the largest file of each duplicate group is kept.
    filenames.sort(key=sizes.__getitem__, reverse=True)

    print('Deduplicating by md5 hash...')
    delete = find_exact_duplicates(filenames, sizes)
    print('Found', len(delete), 'duplicates by md5 hash.')

    print('Deduplicating by media fingerprint...')
    find_media_duplicates(filenames, delete)

    print()
    doomed = sorted(delete)
    for dupe in doomed:
        print(dupe)
    print()
    print('Found', len(delete), 'total duplicate files. Delete them?')
    confirm()

    print('Deleting...')
    for dupe in doomed:
        try:
            os.remove(dupe)
        except OSError as e:
            # Keep going so one failure does not leave the rest undeleted.
            print('Could not delete', dupe + ':', e)


if __name__ == '__main__':
    main()