parent
758b11e45b
commit
6730c67f7f
1 changed files with 88 additions and 0 deletions
@ -0,0 +1,88 @@ |
||||
import glob |
||||
import os |
||||
import hashlib |
||||
from PIL import Image, UnidentifiedImageError |
||||
|
||||
from imagehash import average_hash |
||||
from videohash import VideoHash |
||||
from videohash.exceptions import FFmpegFailedToExtractFrames |
||||
|
||||
# Shared state for the whole script: every digest seen so far (raw-content
# hashes and media fingerprints alike) and the paths scheduled for removal.
hashes = set()
delete = set()

# Every regular file below the current working directory, recursively.
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]

# Deleting files is destructive, so ask for explicit confirmation first.
print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except KeyboardInterrupt:
    print('\nCancelled.')
    # Exit through the normal interpreter shutdown path. os._exit() would
    # terminate the process immediately without flushing stdio buffers, so
    # the message above could be lost when stdout is redirected.
    raise SystemExit(0)

# Largest files first: the passes below keep the first file seen for each
# digest, so among matching files the biggest one is the one that survives.
print('Sorting file list by size...')
filenames.sort(key=os.path.getsize, reverse=True)
||||
|
||||
print('Deduplicating by md5 hash...')

# Number of bytes read per call while streaming a file through the hasher.
CHUNK_SIZE = 65536

for filename in filenames:
    # Hash the ENTIRE file, one chunk at a time. Hashing only the first
    # chunk would flag any two files sharing a 64 KiB prefix as duplicates,
    # and files flagged here end up deleted.
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        for buf in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(buf)
    digest = hasher.hexdigest()

    # The first file with a given digest is kept; later ones are duplicates.
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)

print('Found', len(delete), 'duplicates by md5 hash.')
print('Deduplicating by media fingerprint...')
||||
|
||||
def get_image_hash(filename):
    """Return an average-hash fingerprint for the image at *filename*.

    The result is the string form of ``average_hash(image)`` with the
    suffix ``'_image'`` appended, so it cannot be confused with other kinds
    of digest stored in the same set. Returns ``None`` when Pillow cannot
    identify the file as an image. Any other exception propagates to the
    caller.
    """
    try:
        # The context manager closes the underlying file handle as soon as
        # the hash is computed; otherwise one handle per image stays open
        # until garbage collection.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None
||||
|
||||
def get_video_hash(filename):
    """Return a video fingerprint for *filename*, or ``None`` on failure.

    The fingerprint is the ``hash_hex`` value of a ``VideoHash`` built from
    the path, converted to a string and suffixed with ``'_video'``.
    ``None`` is returned when frame extraction fails with
    ``FFmpegFailedToExtractFrames``; other exceptions propagate.
    """
    try:
        fingerprint = VideoHash(path=filename).hash_hex
    except FFmpegFailedToExtractFrames:
        return None
    return str(fingerprint) + '_video'
||||
|
||||
total = len(filenames)

# Second pass: compare files by media fingerprint (image first, then video).
for count, filename in enumerate(filenames, start=1):
    # 1-based progress, rewritten in place on a single terminal line.
    print('Hashing file', count, '/', total, end='\r')

    # Already flagged as a byte-level duplicate; no need to fingerprint it.
    if filename in delete:
        continue

    try:
        digest = get_image_hash(filename) or get_video_hash(filename)
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        # Only Exception is caught, so KeyboardInterrupt and SystemExit
        # still abort the script BEFORE anything is deleted.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue

    # Neither hasher recognised the file as media; leave it alone.
    if not digest:
        continue

    # The first file with a given fingerprint is kept; later ones are dupes.
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)

print()
print('Found', len(delete), 'total duplicate files.')

# Remove every file flagged by either pass.
print('Deleting...')
for dupe in delete:
    os.remove(dupe)
||||
|
Loading…
Reference in new issue