|
|
|
@@ -2,6 +2,9 @@ import glob |
|
|
|
|
import os |
|
|
|
|
import hashlib |
|
|
|
|
import time |
|
|
|
|
import traceback |
|
|
|
|
from multiprocessing import Pool |
|
|
|
|
import queue |
|
|
|
|
from PIL import Image, UnidentifiedImageError |
|
|
|
|
|
|
|
|
|
from imagehash import average_hash |
|
|
|
@@ -18,7 +21,7 @@ print('ENTER to continue, ctrl+c to cancel.') |
|
|
|
|
try: |
|
|
|
|
input() |
|
|
|
|
except KeyboardInterrupt: |
|
|
|
|
print('\nCancelled.') |
|
|
|
|
print('Cancelled.') |
|
|
|
|
os._exit(0) |
|
|
|
|
|
|
|
|
|
print('Sorting file list by smallest size...') |
|
|
|
@@ -36,51 +39,75 @@ for filename in filenames: |
|
|
|
|
hasher.update(buf) |
|
|
|
|
digest = hasher.hexdigest() |
|
|
|
|
if digest in hashes: |
|
|
|
|
print('Found digest', digest, 'collision for', filename) |
|
|
|
|
delete.add(filename) |
|
|
|
|
else: |
|
|
|
|
hashes.add(digest) |
|
|
|
|
|
|
|
|
|
print('Found', len(delete), 'duplicates by md5 hash.') |
|
|
|
|
|
|
|
|
|
time.sleep(4) |
|
|
|
|
print('Deduplicating by media fingerprint...') |
|
|
|
|
|
|
|
|
|
def get_image_hash(filename): |
|
|
|
|
basename = os.path.basename(os.path.dirname(filename)) |
|
|
|
|
try: |
|
|
|
|
image = Image.open(filename) |
|
|
|
|
return str(average_hash(image)) + '_image' |
|
|
|
|
return basename + str(average_hash(image)) + '_image' |
|
|
|
|
except UnidentifiedImageError: |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
def get_video_hash(filename): |
|
|
|
|
basename = os.path.basename(os.path.dirname(filename)) |
|
|
|
|
try: |
|
|
|
|
return str(VideoHash(path=filename).hash_hex) + '_video' |
|
|
|
|
v = VideoHash(path=filename) |
|
|
|
|
digest = str(v.hash_hex) |
|
|
|
|
v.delete_storage_path() |
|
|
|
|
|
|
|
|
|
return basename + digest + '_video' |
|
|
|
|
except FFmpegFailedToExtractFrames: |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
count = 0 |
|
|
|
|
total = len(filenames) |
|
|
|
|
|
|
|
|
|
for filename in filenames: |
|
|
|
|
count += 1 |
|
|
|
|
print('Hashing file', count, '/', total, end='\r') |
|
|
|
|
def hasher(filename): |
|
|
|
|
if filename in delete: return None |
|
|
|
|
|
|
|
|
|
if filename in delete: continue |
|
|
|
|
print('Hashing file:', filename) |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
digest = get_image_hash(filename) or get_video_hash(filename) |
|
|
|
|
digest = get_image_hash(filename)# or get_video_hash(filename) |
|
|
|
|
except KeyboardInterrupt: |
|
|
|
|
print('\nCancelled.') |
|
|
|
|
print('Cancelled.') |
|
|
|
|
os._exit(0) |
|
|
|
|
except BaseException as e: |
|
|
|
|
print() |
|
|
|
|
print('Exception', e.__class__.__name__, str(e), 'while hashing:') |
|
|
|
|
print(filename) |
|
|
|
|
continue |
|
|
|
|
print(traceback.format_exc()) |
|
|
|
|
return None |
|
|
|
|
|
|
|
|
|
if not digest: return None |
|
|
|
|
|
|
|
|
|
return (filename, digest) |
|
|
|
|
|
|
|
|
|
with Pool() as pool: |
|
|
|
|
results = pool.map(hasher, filenames) |
|
|
|
|
|
|
|
|
|
print('Finished hashing.') |
|
|
|
|
print() |
|
|
|
|
print('Checking digests:') |
|
|
|
|
print() |
|
|
|
|
|
|
|
|
|
time.sleep(5) |
|
|
|
|
for result in results: |
|
|
|
|
if not result: continue |
|
|
|
|
print(result) |
|
|
|
|
|
|
|
|
|
if not digest: continue |
|
|
|
|
filename, digest = result |
|
|
|
|
|
|
|
|
|
if digest in hashes: |
|
|
|
|
print('Found digest', digest, 'collision for', filename) |
|
|
|
|
delete.add(filename) |
|
|
|
|
else: |
|
|
|
|
hashes.add(digest) |
|
|
|
@@ -88,6 +115,6 @@ for filename in filenames: |
|
|
|
|
print() |
|
|
|
|
print('Found', len(delete), 'total duplicate files.') |
|
|
|
|
|
|
|
|
|
print('Deleting...') |
|
|
|
|
for dupe in delete: |
|
|
|
|
for dupe in sorted(list(delete)): |
|
|
|
|
print('Deleting:', dupe) |
|
|
|
|
os.remove(dupe) |
|
|
|
|