121 lines
3.0 KiB
Python
121 lines
3.0 KiB
Python
import glob
|
|
import os
|
|
import hashlib
|
|
from PIL import Image, UnidentifiedImageError
|
|
|
|
import acoustid
|
|
import chromaprint
|
|
from imagehash import average_hash
|
|
from videohash import VideoHash
|
|
from videohash.exceptions import FFmpegFailedToExtractFrames
|
|
|
|
# Digests already seen. The md5 pass stores hex digests here and the media
# pass stores image/video hash strings in the same set.
hashes = set()
# Integer audio hashes kept so far; compared bit-wise, so a list rather than
# a set lookup.
audio_hashes = []
# Paths flagged as duplicates; these are what gets removed at the end.
delete = set()

# Every regular file below the current working directory (directories are
# filtered out by the isfile check).
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]

print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except KeyboardInterrupt:
    print('\nCancelled.')
    # SystemExit (unlike os._exit) lets the interpreter flush stdout and run
    # normal cleanup, so the message above is not lost when output is piped.
    raise SystemExit(0)

print('Sorting file list by size...')
# Largest first: the later passes keep the first file seen for each digest,
# so the larger file of a matched pair is the one retained.
filenames.sort(key=os.path.getsize, reverse=True)
|
|
|
|
print('Deduplicating by md5 hash...')

# Number of bytes read per step while hashing a file.
CHUNK_SIZE = 65536

for filename in filenames:
    # Hash the complete file contents, one chunk at a time so memory use
    # stays bounded. Hashing only a leading prefix would flag different files
    # that merely start with the same bytes, and flagged files get deleted.
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for buf in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(buf)
    digest = hasher.hexdigest()

    # The first file with a given digest is kept; later ones are duplicates.
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)

print('Found', len(delete), 'duplicates by md5 hash.')
print('Deduplicating by media fingerprint...')
|
|
|
|
def get_image_hash(filename):
    """Return an image hash string for *filename*, or None if it is not an image.

    The result is ``str(average_hash(image))`` with ``'_image'`` appended.
    None is returned when Pillow raises ``UnidentifiedImageError``; any other
    exception propagates to the caller.
    """
    try:
        # The context manager closes the underlying file handle once the
        # hash has been computed, instead of leaving it to garbage collection.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
|
|
|
|
def get_audio_hash(filename):
    """Return a hash of the audio fingerprint of *filename*, or None.

    The file is fingerprinted with ``acoustid.fingerprint_file``; element 1
    of its result is decoded with ``chromaprint.decode_fingerprint`` and the
    decoded values are reduced with ``chromaprint.hash_fingerprint``.
    None is returned when ``acoustid.FingerprintGenerationError`` is raised;
    any other exception propagates to the caller.
    """
    try:
        fingerprint_result = acoustid.fingerprint_file(filename)
        encoded = fingerprint_result[1]
        # The second item returned by the decoder is not needed here.
        raw_values, _ = chromaprint.decode_fingerprint(encoded)
        return chromaprint.hash_fingerprint(raw_values)
    except acoustid.FingerprintGenerationError:
        return None
|
|
|
|
def get_video_hash(filename):
    """Return a video hash string for *filename*, or None.

    The result is the ``hash_hex`` attribute of ``VideoHash(path=filename)``
    converted to ``str`` with ``'_video'`` appended. None is returned when
    ``FFmpegFailedToExtractFrames`` is raised; any other exception
    propagates to the caller.
    """
    try:
        video = VideoHash(path=filename)
        hex_digest = str(video.hash_hex)
        return hex_digest + '_video'
    except FFmpegFailedToExtractFrames:
        return None
|
|
|
|
# Two audio hashes that differ in at most this many bits are treated as the
# same recording.  TODO adjust?
AUDIO_MAX_BIT_DIFFERENCE = 5

total = len(filenames)

for count, filename in enumerate(filenames, start=1):
    # '\r' keeps the progress counter on a single terminal line.
    print('Hashing file', count, '/', total, end='\r')

    # Already flagged by an earlier pass; no need to fingerprint it.
    if filename in delete:
        continue

    try:
        # Try each media type in turn; the first hasher that returns a
        # truthy value wins.
        digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
        # ctrl+c abandons the rest of this pass but keeps results so far.
        print('Skipping media hashing.')
        break
    except Exception as e:
        # Report and skip files a hasher chokes on. Exception (rather than
        # BaseException) lets SystemExit and similar signals propagate.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue

    # No hasher recognised the file (or produced a falsy hash): leave it be.
    if not digest:
        continue

    if isinstance(digest, int):
        # Audio hashes are integers and are matched approximately: count the
        # differing bits against every hash kept so far.
        is_near_duplicate = any(
            bin(digest ^ h).count('1') <= AUDIO_MAX_BIT_DIFFERENCE
            for h in audio_hashes
        )
        if is_near_duplicate:
            delete.add(filename)
        else:
            audio_hashes.append(digest)
    elif digest in hashes:
        # Image and video hashes are strings and must match exactly.
        delete.add(filename)
    else:
        hashes.add(digest)
|
|
|
|
# First print() ends the '\r' progress line, second leaves a blank line.
print()
print()
# Sorted so the listing is stable and easy to review before confirming.
for dupe in sorted(delete):
    print(dupe)

print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except KeyboardInterrupt:
    print('\nCancelled.')
    # SystemExit (unlike os._exit) lets the interpreter flush stdout and run
    # normal cleanup, so the message above is not lost when output is piped.
    raise SystemExit(0)

print('Deleting...')
for dupe in delete:
    try:
        os.remove(dupe)
    except OSError as e:
        # One file that vanished or is not removable should not stop the
        # remaining deletions; report it and carry on.
        print('Could not delete', dupe + ':', e)
|