dedu/main.py

106 lines
2.6 KiB
Python
Raw Normal View History

2022-09-04 05:08:40 +00:00
import glob
import os
import hashlib
from PIL import Image, UnidentifiedImageError
2023-04-04 03:04:37 +00:00
import acoustid
2022-09-04 05:08:40 +00:00
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
# Digests seen so far (also used by the media-fingerprint stage further down)
# and the files judged to be duplicates of an earlier file.
hashes = set()
delete = set()
# Every regular file below the current working directory.
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # EOF (closed/redirected stdin) is treated as "cancel": nothing has been
    # touched yet, so bailing out is the safe choice.
    print('\nCancelled.')
    # SystemExit, unlike os._exit, flushes stdio buffers so the message above
    # is not lost when stdout is piped.
    raise SystemExit(0)
print('Sorting file list by size...')
# Largest first: the first file seen for a digest is the one that is kept.
filenames.sort(key=os.path.getsize, reverse=True)
print('Deduplicating by md5 hash...')
# hash first 65536 bytes of each file only
CHUNK_SIZE = 65536
for filename in filenames:
    try:
        size = os.path.getsize(filename)
        with open(filename, 'rb') as file:
            head = file.read(CHUNK_SIZE)
    except OSError as e:
        # An unreadable or vanished file is never marked as a duplicate.
        print('Skipping unreadable file:', filename, '(' + str(e) + ')')
        continue
    # Identical files always have identical sizes, so folding the size into
    # the key loses no true duplicates while preventing files that merely
    # share their first CHUNK_SIZE bytes from being treated as the same file.
    # NOTE(review): two same-sized files that differ only after the first
    # CHUNK_SIZE bytes are still considered duplicates.
    digest = str(size) + '_' + hashlib.md5(head).hexdigest()
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.')
print('Deduplicating by media fingerprint...')
print('Deduplicating by media fingerprint...')
def get_image_hash(filename):
    """Return a perceptual fingerprint for an image file.

    The fingerprint is the string form of imagehash's ``average_hash`` with
    an ``'_image'`` suffix, so it cannot collide with digests produced for
    other media kinds.

    Returns None when PIL cannot identify the file as an image. Any other
    exception raised by PIL or imagehash propagates to the caller.
    """
    try:
        # Pillow opens images lazily and may keep the underlying file handle
        # open; the context manager guarantees it is closed before returning.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
2023-04-04 03:04:37 +00:00
def get_audio_hash(filename):
    """Return an acoustic fingerprint for an audio file.

    The fingerprint is the decoded second element of the value returned by
    ``acoustid.fingerprint_file`` with an ``'_audio'`` suffix appended.

    Returns None when acoustid raises ``FingerprintGenerationError`` for the
    file; any other exception propagates to the caller.
    """
    try:
        result = acoustid.fingerprint_file(filename)
    except acoustid.FingerprintGenerationError:
        return None
    # Index 1 is presumably the fingerprint (pyacoustid documents the return
    # value as (duration, fingerprint)) — verify against the installed version.
    fingerprint = result[1]
    return str(fingerprint.decode()) + '_audio'
2022-09-04 05:08:40 +00:00
def get_video_hash(filename):
    """Return a perceptual fingerprint for a video file.

    The fingerprint is the ``hash_hex`` of a ``VideoHash`` built from the
    file, with a ``'_video'`` suffix appended.

    Returns None when ffmpeg fails to extract frames from the file
    (``FFmpegFailedToExtractFrames``); any other exception propagates to the
    caller.
    """
    try:
        video = VideoHash(path=filename)
    except FFmpegFailedToExtractFrames:
        return None
    try:
        return str(video.hash_hex) + '_video'
    finally:
        # VideoHash keeps extracted frames and a collage in a temporary
        # storage directory; remove it so hashing many files does not fill
        # the temp directory.
        video.delete_storage_path()
# Second pass: fingerprint every file not already marked as a duplicate and
# mark it for deletion when an earlier file produced the same fingerprint.
count = 0
total = len(filenames)
for count, filename in enumerate(filenames, start=1):
    # '\r' keeps the progress counter on a single, continuously updated line.
    print('Hashing file', count, '/', total, end='\r')
    if filename in delete:
        continue
    try:
        # The first hasher that recognises the file wins: image, audio, video.
        digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
        # ctrl+c abandons this stage only; duplicates found so far are kept.
        print('Skipping media hashing.')
        break
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        # Exception (not BaseException) so SystemExit is never swallowed.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue
    if not digest:
        # Not a recognised media file; nothing to compare.
        continue
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)
2023-04-04 03:04:37 +00:00
# List every file that is about to be removed, in a stable order, so the
# user can review it before confirming.
for dupe in sorted(delete):
    print(dupe)
print()
print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except (KeyboardInterrupt, EOFError):
    # EOF (closed/redirected stdin) is treated as "cancel" so nothing is
    # deleted without an explicit confirmation.
    print('\nCancelled.')
    # SystemExit, unlike os._exit, flushes stdio buffers so the message above
    # is not lost when stdout is piped.
    raise SystemExit(0)
print('Deleting...')
for dupe in delete:
    try:
        os.remove(dupe)
    except OSError as e:
        # One failure (file already gone, permission denied, ...) must not
        # prevent the remaining duplicates from being removed.
        print('Could not delete', dupe + ':', e)