"""Find and delete duplicate files below the current working directory.

Every regular file found recursively is checked in two passes:

1. Exact duplicates: files of the same size whose *full* contents have the
   same MD5 digest.
2. Media duplicates: files whose image fingerprint
   (``imagehash.average_hash``) or video fingerprint
   (``videohash.VideoHash``) equals one that was already seen.

Files are processed largest first, so for each digest/fingerprint the first
(largest) file is kept and later matches are deleted.
"""

import glob
import hashlib
import os
import sys
from collections import defaultdict

# Number of bytes read per call while hashing a file.
CHUNK_SIZE = 65536

# NOTE: Pillow, imagehash and videohash are imported inside the functions
# that use them so this module can be imported without those packages
# installed. main() imports them up front, so a missing dependency is still
# reported before any scanning starts.


def file_md5(filename, chunk_size=CHUNK_SIZE):
    """Return the hex MD5 digest of the complete contents of *filename*.

    The file is read in ``chunk_size``-byte pieces so large files are never
    loaded into memory at once.
    """
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for chunk in iter(lambda: file.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def find_exact_duplicates(filenames):
    """Return the set of files that are byte-for-byte copies of an earlier file.

    *filenames* is scanned in order; the first file with a given content is
    kept and every later file with the same size and MD5 digest is reported.
    """
    # Only files of equal size can be identical, so files with a unique size
    # never need to be read at all.
    by_size = defaultdict(list)
    for filename in filenames:
        by_size[os.path.getsize(filename)].append(filename)

    duplicates = set()
    for same_size in by_size.values():
        if len(same_size) < 2:
            continue
        seen = set()
        for filename in same_size:
            # Hash the whole file: comparing only a leading chunk would flag
            # (and later delete) different files that share a common prefix.
            digest = file_md5(filename)
            if digest in seen:
                duplicates.add(filename)
            else:
                seen.add(digest)
    return duplicates


def get_image_hash(filename):
    """Return the average-hash fingerprint of an image, suffixed ``_image``.

    Returns ``None`` when Pillow cannot identify the file as an image.
    """
    from PIL import Image, UnidentifiedImageError
    from imagehash import average_hash

    try:
        # The context manager closes the underlying file handle.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None


def get_video_hash(filename):
    """Return the videohash fingerprint of a video, suffixed ``_video``.

    Returns ``None`` when FFmpeg fails to extract frames from the file.
    """
    from videohash import VideoHash
    from videohash.exceptions import FFmpegFailedToExtractFrames

    try:
        return str(VideoHash(path=filename).hash_hex) + '_video'
    except FFmpegFailedToExtractFrames:
        return None


def find_media_duplicates(filenames, skip=()):
    """Return the set of files whose media fingerprint was already seen.

    Files contained in *skip* are not fingerprinted. Files that are neither
    a recognised image nor video, and files whose hashing raises, are left
    alone.
    """
    seen = set()
    duplicates = set()
    total = len(filenames)

    for count, filename in enumerate(filenames, start=1):
        print('Hashing file', count, '/', total, end='\r')

        if filename in skip:
            continue

        try:
            digest = get_image_hash(filename) or get_video_hash(filename)
        except Exception as e:
            # Deliberately not BaseException: ctrl+c must abort the run
            # instead of being reported here and followed by deletion.
            print()
            print('Exception', e.__class__.__name__, str(e), 'while hashing:')
            print(filename)
            continue

        if not digest:
            continue

        if digest in seen:
            duplicates.add(filename)
        else:
            seen.add(digest)

    return duplicates


def delete_files(paths):
    """Remove every file in *paths* and return how many were removed.

    A file that cannot be removed is reported and skipped so one failure
    does not leave the remaining duplicates behind.
    """
    removed = 0
    for dupe in paths:
        try:
            os.remove(dupe)
        except OSError as e:
            print('Could not delete', dupe + ':', e)
        else:
            removed += 1
    return removed


def main():
    """Interactively deduplicate the files below the current directory."""
    # Fail fast, before any scanning, if a media library is missing.
    import imagehash  # noqa: F401
    import PIL  # noqa: F401
    import videohash  # noqa: F401

    filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]

    print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
    print('ENTER to continue, ctrl+c to cancel.')
    try:
        input()
    except KeyboardInterrupt:
        print('\nCancelled.')
        # sys.exit (unlike os._exit) flushes stdout, so the message is shown.
        sys.exit(0)

    print('Sorting file list by size...')
    filenames.sort(key=os.path.getsize, reverse=True)

    print('Deduplicating by md5 hash...')
    delete = find_exact_duplicates(filenames)
    print('Found', len(delete), 'duplicates by md5 hash.')

    print('Deduplicating by media fingerprint...')
    delete |= find_media_duplicates(filenames, skip=delete)

    print()
    print('Found', len(delete), 'total duplicate files.')

    print('Deleting...')
    delete_files(delete)


if __name__ == '__main__':
    main()