You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

125 lines
3.1 KiB

import glob
import os
import hashlib
from PIL import Image, UnidentifiedImageError
import acoustid
import chromaprint
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
# --- Shared state for the whole script -------------------------------------
# Exact-match digests seen so far (MD5 prefix digests in this pass; the media
# pass further down in the file adds image/video digests to the same set).
hashes = set()
# Audio hashes that have been kept; the media pass compares them bit by bit.
audio_hashes = []
# Paths judged to be duplicates and queued for deletion.
delete = set()
# Media digest -> path, used by the media pass when reporting near matches.
hash_lookup = {}

# Number of leading bytes of each file that are fed to MD5.
CHUNK_SIZE = 65536

# Every regular file below the current working directory.
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # EOFError: stdin is closed or exhausted, so nobody can confirm -> cancel.
    print('\nCancelled.')
    # SystemExit, unlike os._exit, lets the interpreter flush stdout first,
    # so the message above is not lost when output is buffered.
    raise SystemExit(0) from None

print('Sorting file list by size...')
# Largest first: of several matching files, the biggest one is met first and
# is therefore the one that is kept.
filenames.sort(key=os.path.getsize, reverse=True)

print('Deduplicating by md5 hash...')
# NOTE(review): only the first CHUNK_SIZE bytes are hashed, so files that
# share that prefix but differ later are also flagged as duplicates. This
# looks like a deliberate speed/heuristic trade-off; confirm it is intended.
for filename in filenames:
    try:
        with open(filename, 'rb') as file:
            digest = hashlib.md5(file.read(CHUNK_SIZE)).hexdigest()
    except OSError as e:
        # An unreadable file should not abort the whole run; report and skip.
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.')
print('Deduplicating by media fingerprint...')
def get_image_hash(filename):
    """Return the average hash of an image file, tagged with '_image'.

    The file is opened with PIL and hashed with imagehash's average_hash.
    The '_image' suffix keeps these digests distinct from the other kinds
    of digest stored in the same set.

    Returns None when PIL cannot identify the file as an image.
    """
    try:
        # Use the image as a context manager so the underlying file handle is
        # closed right away instead of leaking one handle per scanned file.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
def get_audio_hash(filename):
    """Return a hash of the file's acoustic fingerprint.

    The file is fingerprinted with acoustid, the encoded fingerprint is
    decoded with chromaprint, and the decoded values are reduced to a single
    hash (presumably an int; the main loop recognises audio digests by that
    type).

    Returns None when acoustid cannot generate a fingerprint for the file.
    """
    try:
        fingerprint_result = acoustid.fingerprint_file(filename)
        # Element 1 is the encoded fingerprint.
        encoded = fingerprint_result[1]
        raw_values, _ = chromaprint.decode_fingerprint(encoded)
        audio_hash = chromaprint.hash_fingerprint(raw_values)
        return audio_hash
    except acoustid.FingerprintGenerationError:
        return None
def get_video_hash(filename):
    """Return the VideoHash hex digest of a video file, tagged with '_video'.

    Returns None when FFmpeg fails to extract frames from the file.
    """
    try:
        video = VideoHash(path=filename)
        hex_digest = str(video.hash_hex)
    except FFmpegFailedToExtractFrames:
        return None
    return hex_digest + '_video'
# Two audio hashes that differ in at most this many bits count as the same
# recording.  TODO adjust?
AUDIO_MAX_BIT_DISTANCE = 5

count = 0
total = len(filenames)
for count, filename in enumerate(filenames, start=1):
    # '\r' keeps the progress counter on a single terminal line.
    print('Hashing file', count, '/', total, end='\r')
    if filename in delete:
        continue
    try:
        # Try each media hasher in turn; the first one that recognises the
        # file wins.  Compare against None explicitly so that a legitimate
        # integer hash of 0 is not mistaken for "no hash".
        digest = None
        for hasher in (get_image_hash, get_audio_hash, get_video_hash):
            digest = hasher(filename)
            if digest is not None:
                break
    except KeyboardInterrupt:
        # Ctrl+C here only abandons the media pass, not the whole script.
        print('Skipping media hashing.')
        break
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        # (Exception rather than BaseException so SystemExit still works.)
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue
    if digest is None:
        continue
    if isinstance(digest, int):
        # Integer digests are audio hashes: compare by Hamming distance
        # against every hash kept so far.
        for h in audio_hashes:
            if bin(digest ^ h).count('1') <= AUDIO_MAX_BIT_DISTANCE:
                delete.add(filename)
                print()
                print(digest, filename, 'close to', h, hash_lookup[h])
                break
        else:  # no close match found: keep this file
            audio_hashes.append(digest)
            # Record the owner only for kept hashes; recording it before the
            # comparison would overwrite the original owner on an exact
            # match and report the file as a duplicate of itself.
            hash_lookup[digest] = filename
    elif digest in hashes:
        # String digests (image/video) must match exactly.
        delete.add(filename)
    else:
        hashes.add(digest)
        hash_lookup[digest] = filename

print()
print()
# Sorted so the list the user has to review is stable and easy to scan.
for dupe in sorted(delete):
    print(dupe)
print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # EOFError: stdin is closed, so nobody confirmed -> never delete.
    print('\nCancelled.')
    # SystemExit, unlike os._exit, flushes buffered output before exiting.
    raise SystemExit(0) from None

print('Deleting...')
for dupe in delete:
    try:
        os.remove(dupe)
    except OSError as e:
        # One undeletable file should not stop the remaining deletions.
        print('Could not delete', dupe + ':', e.__class__.__name__, str(e))