You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
120 lines
3.0 KiB
120 lines
3.0 KiB
import glob |
|
import os |
|
import hashlib |
|
from PIL import Image, UnidentifiedImageError |
|
|
|
import acoustid |
|
import chromaprint |
|
from imagehash import average_hash |
|
from videohash import VideoHash |
|
from videohash.exceptions import FFmpegFailedToExtractFrames |
|
|
|
# Digests seen so far: MD5 hex digests plus the suffixed image/video keys.
hashes = set()
# Integer audio fingerprint hashes; compared bit-by-bit, not by equality.
audio_hashes = []
# Paths judged to be duplicates and scheduled for deletion.
delete = set()

# Every regular file below the current working directory.
filenames = [path for path in glob.glob('**', recursive=True) if os.path.isfile(path)]

print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # A closed stdin (EOF) is treated like ctrl+c: never proceed without an
    # explicit confirmation.
    print('\nCancelled.')
    # SystemExit (unlike os._exit) flushes buffered output and runs normal
    # interpreter cleanup before the process ends.
    raise SystemExit(0)

print('Sorting file list by size...')
# Largest first, so the first-seen (kept) copy of a duplicate is the biggest.
filenames.sort(key=os.path.getsize, reverse=True)
|
|
|
print('Deduplicating by md5 hash...')

# Read size used while streaming each file through the hasher.
CHUNK_SIZE = 65536

for filename in filenames:
    # Hash the WHOLE file, streamed in chunks. Hashing only the first chunk
    # would flag distinct files that merely share a 64 KiB prefix (e.g. the
    # same embedded header or cover art) as duplicates, and they would then be
    # deleted.
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for buf in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(buf)
    digest = hasher.hexdigest()

    if digest in hashes:
        # Same content as an earlier (larger-or-equal) file: mark for deletion.
        delete.add(filename)
    else:
        hashes.add(digest)

print('Found', len(delete), 'duplicates by md5 hash.')
|
print('Deduplicating by media fingerprint...') |
|
|
|
def get_image_hash(filename):
    """Return an image fingerprint key for *filename*, or None.

    The key is the string form of ``average_hash`` for the image with an
    ``'_image'`` suffix appended, so it cannot be confused with the other
    kinds of digest kept in the same set.

    Returns None when Pillow raises ``UnidentifiedImageError`` for the file.
    Any other exception propagates to the caller.
    """
    try:
        # The context manager closes the file handle that Image.open keeps
        # open; without it one handle leaks per image processed.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
|
|
|
def get_audio_hash(filename):
    """Return the chromaprint hash of *filename*'s audio fingerprint, or None.

    The file is fingerprinted with ``acoustid.fingerprint_file``, the encoded
    fingerprint is decoded, and the decoded values are reduced with
    ``chromaprint.hash_fingerprint``.

    Returns None when ``acoustid.FingerprintGenerationError`` is raised.
    """
    try:
        fingerprint_result = acoustid.fingerprint_file(filename)
        encoded = fingerprint_result[1]
        raw_values, _algorithm = chromaprint.decode_fingerprint(encoded)
        audio_hash = chromaprint.hash_fingerprint(raw_values)
    except acoustid.FingerprintGenerationError:
        return None
    return audio_hash
|
|
|
def get_video_hash(filename):
    """Return a video fingerprint key for *filename*, or None.

    The key is ``VideoHash(...).hash_hex`` as a string with a ``'_video'``
    suffix appended, which keeps it distinct from the other digest kinds.

    Returns None when ``FFmpegFailedToExtractFrames`` is raised.
    """
    try:
        video = VideoHash(path=filename)
        hex_digest = str(video.hash_hex)
    except FFmpegFailedToExtractFrames:
        return None
    return hex_digest + '_video'
|
|
|
# Two audio hashes differing in at most this many bits count as the same
# recording.  TODO adjust?
AUDIO_HAMMING_THRESHOLD = 5

count = 0
total = len(filenames)

for count, filename in enumerate(filenames, start=1):
    print('Hashing file', count, '/', total, end='\r')

    # Already flagged as a duplicate earlier; no need to fingerprint it.
    if filename in delete:
        continue

    try:
        # Try each media type in turn; the first truthy result wins.
        digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
        # ctrl+c aborts only this pass; results gathered so far are kept.
        print('Skipping media hashing.')
        break
    except Exception as e:
        # Report and skip files the hashers choke on. Exception (not
        # BaseException) so SystemExit and similar still propagate.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue

    # No hasher recognised the file. Note a falsy digest (including an audio
    # hash of 0) is skipped here as well.
    if not digest:
        continue

    if isinstance(digest, int):
        # Audio hashes are ints and are matched approximately: compare the
        # number of differing bits against every hash kept so far.
        if any(bin(digest ^ h).count('1') <= AUDIO_HAMMING_THRESHOLD for h in audio_hashes):
            delete.add(filename)
        else:
            audio_hashes.append(digest)
    elif digest in hashes:
        # Image/video keys are strings and are matched exactly.
        delete.add(filename)
    else:
        hashes.add(digest)
|
|
|
# First print() ends the '\r' progress line; the second leaves a blank line.
print()
print()
for dupe in delete:
    print(dupe)

print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # A closed stdin (EOF) is treated like ctrl+c: never delete without an
    # explicit confirmation.
    print('\nCancelled.')
    # SystemExit (unlike os._exit) flushes buffered output and runs normal
    # interpreter cleanup before the process ends.
    raise SystemExit(0)

print('Deleting...')
for dupe in delete:
    try:
        os.remove(dupe)
    except OSError as e:
        # Keep going: one undeletable file should not abort the rest.
        print('Could not delete', dupe + ':', e)
|
|
|