# dedu/main.py
#
# 121 lines
# 2.9 KiB
# Python

import glob
import os
import hashlib
import time
import traceback
from multiprocessing import Pool
import queue
from PIL import Image, UnidentifiedImageError
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
# Shared state for the whole run: `hashes` holds every digest seen so far,
# `delete` holds the paths of files judged to be duplicates.
hashes, delete = set(), set()

# Every regular file below the current working directory (paths are relative
# to it, since the glob pattern is relative).
filenames = list(filter(os.path.isfile, glob.glob('**', recursive=True)))

print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    # Block until the user confirms; the typed text itself is ignored.
    input()
except KeyboardInterrupt:
    print('Cancelled.')
    # Leave immediately with status 0, without running normal interpreter
    # shutdown.
    os._exit(0)

print('Sorting file list by smallest size...')
# Ascending by size on disk, so smaller files are examined first.
filenames.sort(key=os.path.getsize)
print('Deduplicating by md5 hash...')
# Files are read in fixed-size chunks so large files never have to fit in
# memory at once.
CHUNK_SIZE = 65536
for filename in filenames:
    # Hash the complete file contents, not just the first chunk. Files in
    # `delete` are removed at the end of the script, so two different files
    # that merely share their first CHUNK_SIZE bytes must not be treated as
    # duplicates.
    md5 = hashlib.md5()
    with open(filename, 'rb') as file:
        for chunk in iter(lambda: file.read(CHUNK_SIZE), b''):
            md5.update(chunk)
    digest = md5.hexdigest()
    if digest in hashes:
        # Same bytes as a file seen earlier: keep the earlier one, queue
        # this one for deletion.
        print('Found digest', digest, 'collision for', filename)
        delete.add(filename)
    else:
        hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.')
# Short pause before the next phase (presumably so the summary above can be
# read before more output scrolls past).
time.sleep(4)
print('Deduplicating by media fingerprint...')
def get_image_hash(filename):
    """Return a fingerprint string for an image file, or None.

    The fingerprint is the name of the file's parent directory, followed by
    the string form of the image's average hash, followed by '_image'.
    Because the directory name is part of the fingerprint, images that live
    in differently named directories never get the same fingerprint.

    Returns None when Pillow cannot identify the file as an image.
    """
    parent_dir = os.path.basename(os.path.dirname(filename))
    try:
        # Open the image as a context manager so its file handle is closed
        # as soon as the hash is computed, rather than left open for every
        # file this function is called on.
        with Image.open(filename) as image:
            return parent_dir + str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
def get_video_hash(filename):
    """Return a fingerprint string for a video file, or None.

    The fingerprint is the name of the file's parent directory, followed by
    the VideoHash hex digest, followed by '_video'.

    Returns None when FFmpegFailedToExtractFrames is raised while hashing.
    """
    parent_dir = os.path.basename(os.path.dirname(filename))
    try:
        video = VideoHash(path=filename)
        hex_digest = str(video.hash_hex)
        # Presumably removes the working files VideoHash created for this
        # video -- confirm against the videohash documentation.
        video.delete_storage_path()
    except FFmpegFailedToExtractFrames:
        return None
    return ''.join((parent_dir, hex_digest, '_video'))
# A zeroed counter and the number of discovered files. Neither name is read
# anywhere else in the visible part of this script.
count, total = 0, len(filenames)
def hasher(filename):
    """Compute the media fingerprint of one file for the worker pool.

    Returns a (filename, fingerprint) tuple, or None when the file is already
    marked for deletion, when no fingerprint could be produced, or when
    hashing raised an exception (which is printed with its traceback).
    """
    # Files already flagged as duplicates do not need a fingerprint.
    if filename in delete:
        return None

    print('Hashing file:', filename)
    try:
        # NOTE: only image hashing is active; the get_video_hash fallback is
        # disabled in the original code.
        fingerprint = get_image_hash(filename)
    except KeyboardInterrupt:
        print('Cancelled.')
        os._exit(0)
    except BaseException as error:
        # Best effort: report the failure and carry on with the other files.
        print()
        print('Exception', type(error).__name__, str(error), 'while hashing:')
        print(filename)
        print(traceback.format_exc())
        return None

    return (filename, fingerprint) if fingerprint else None
# Fingerprint every file in parallel; `results` keeps the order of
# `filenames`, with None for files that produced no fingerprint.
# NOTE(review): this runs at module import time without an
# `if __name__ == '__main__':` guard -- confirm the script is only used
# where multiprocessing forks rather than spawns its workers.
with Pool() as pool:
    results = pool.map(hasher, filenames)
print('Finished hashing.')
print()
print('Checking digests:')
print()
# filter(None, ...) skips the entries for which hasher returned nothing.
for entry in filter(None, results):
    print(entry)
    filename, digest = entry
    if digest not in hashes:
        # First file with this fingerprint: remember it and keep the file.
        hashes.add(digest)
        continue
    print('Found digest', digest, 'collision for', filename)
    delete.add(filename)
print()
print('Found', len(delete), 'total duplicate files.')
# Remove the duplicates in sorted path order so the log is deterministic.
for path in sorted(delete):
    print('Deleting:', path)
    os.remove(path)