You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

120 lines
2.9 KiB

import glob
import os
import hashlib
import time
import traceback
from multiprocessing import Pool
import queue
from PIL import Image, UnidentifiedImageError
from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames
# State shared by both deduplication passes: every fingerprint seen so far,
# and the set of files scheduled for removal at the end of the script.
hashes = set()
delete = set()

# Collect every regular file under the current directory, recursively.
filenames = [entry for entry in glob.glob('**', recursive=True) if os.path.isfile(entry)]

print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except KeyboardInterrupt:
    print('Cancelled.')
    os._exit(0)

print('Sorting file list by smallest size...')
# Ascending size order: the first (smallest) copy of any duplicate group is
# the one that survives, since later collisions are the ones deleted.
filenames.sort(key=os.path.getsize)
print('Deduplicating by md5 hash...')
# Fast first pass: fingerprint each file by the md5 of its first 64 KiB.
# Hashing only a prefix keeps this pass cheap on very large files, but a
# prefix alone is NOT sufficient to call two files identical: two distinct
# files can share their first 64 KiB (e.g. a truncated download next to the
# complete file).  Appending the file size to the key prevents deleting such
# non-identical files while keeping the prefix-only read.
CHUNK_SIZE = 65536
for filename in filenames:
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        hasher.update(file.read(CHUNK_SIZE))
    digest = hasher.hexdigest() + '_' + str(os.path.getsize(filename))
    if digest in hashes:
        print('Found digest', digest, 'collision for', filename)
        delete.add(filename)
    else:
        hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.')
# Brief pause so the summary is readable before the next phase scrolls.
time.sleep(4)
print('Deduplicating by media fingerprint...')
def get_image_hash(filename):
    """Return a perceptual fingerprint for an image file, or None if the
    file is not a recognisable image.

    The fingerprint is the name of the file's parent directory plus the
    perceptual average-hash of the pixels, tagged '_image'.  Including the
    directory name means identical images in *different* directories are
    NOT considered duplicates of each other.
    """
    basename = os.path.basename(os.path.dirname(filename))
    try:
        # Use the context manager so the underlying file handle is closed
        # promptly; a bare Image.open() keeps it open until garbage
        # collection, which can exhaust file descriptors on large scans.
        with Image.open(filename) as image:
            return basename + str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        # Not an image at all — let the caller try other fingerprinters.
        return None
def get_video_hash(filename):
    """Return a perceptual fingerprint for a video file, or None when
    ffmpeg cannot extract frames (i.e. the file is not a decodable video).

    Mirrors get_image_hash(): parent-directory name + hash + '_video' tag,
    so identical videos in different directories are both kept.
    """
    parent = os.path.basename(os.path.dirname(filename))
    try:
        video = VideoHash(path=filename)
    except FFmpegFailedToExtractFrames:
        return None
    fingerprint = str(video.hash_hex)
    # VideoHash extracts frames into a temporary directory; clean it up.
    video.delete_storage_path()
    return parent + fingerprint + '_video'
# Progress bookkeeping for the parallel hashing pass.
# NOTE(review): neither name is referenced in the visible remainder of the
# script — these look like leftovers from a removed progress counter;
# confirm against the full file before deleting them.
count = 0
total = len(filenames)
def hasher(filename):
    """Pool worker: compute a media fingerprint for one file.

    Returns (filename, fingerprint) on success, or None when the file is
    already scheduled for deletion, is not a recognisable image, or any
    error occurs while hashing.
    """
    # `delete` was populated by the md5 pass before the pool started, so
    # each worker's copy lets it skip files already marked as duplicates.
    if filename in delete:
        return None
    print('Hashing file:', filename)
    try:
        # Video fingerprinting is currently disabled; re-enable by adding
        # `or get_video_hash(filename)` to the expression below.
        fingerprint = get_image_hash(filename)
    except KeyboardInterrupt:
        # Ctrl+c in a worker: exit the process immediately.
        print('Cancelled.')
        os._exit(0)
    except BaseException as e:
        # Best effort: report the failure and keep the file.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        print(traceback.format_exc())
        return None
    if fingerprint:
        return (filename, fingerprint)
    return None
# NOTE(review): there is no `if __name__ == '__main__':` guard, so Pool()
# here relies on the fork start method (the Linux default); on spawn-based
# platforms (Windows, macOS default) every worker would re-execute the whole
# script, including the input() prompt.  Confirm the target platform.
with Pool() as pool:
    # One worker per CPU; each returns (filename, fingerprint) or None.
    results = pool.map(hasher, filenames)
print('Finished hashing.')
print()
print('Checking digests:')
print()
# Compare fingerprints sequentially so the first file seen with any given
# fingerprint is kept and every later match is scheduled for deletion.
for result in results:
    if not result:
        continue
    print(result)
    filename, digest = result
    if digest in hashes:
        print('Found digest', digest, 'collision for', filename)
        delete.add(filename)
    else:
        hashes.add(digest)
print()
print('Found', len(delete), 'total duplicate files.')
# sorted() accepts the set directly (the intermediate list() was redundant);
# deterministic order also makes the deletion log easier to audit.
for dupe in sorted(delete):
    print('Deleting:', dupe)
    os.remove(dupe)