dedu/main.py

import glob
import os
import hashlib
import time
from PIL import Image, UnidentifiedImageError

from imagehash import average_hash
from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames

# Fingerprints seen so far (content digests in this stage, media fingerprints
# in the next one) and the paths scheduled for deletion at the end.
hashes = set()
delete = set()

# Collect every file below the current directory. A path that resolves to an
# already-listed file (a symlink, or an entry reached through a symlinked
# directory) is skipped: it would hash identically to its target, and removing
# such a "duplicate" could delete the only real copy.
filenames = []
_seen_targets = set()
for _path in glob.glob('**', recursive=True):
    if not os.path.isfile(_path):
        continue
    _target = os.path.realpath(_path)
    if _target in _seen_targets:
        continue
    _seen_targets.add(_target)
    filenames.append(_path)

print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
    input()
except KeyboardInterrupt:
    print('\nCancelled.')
    # SystemExit (unlike os._exit) lets the interpreter flush stdout first,
    # so the message above is not lost when output is buffered.
    raise SystemExit(0)

# Smallest files first: when several files share a fingerprint, the first one
# encountered (the smallest) is kept and the later ones are marked.
print('Sorting file list by smallest size...')
filenames.sort(key=os.path.getsize)

print('Deduplicating by md5 hash...')

# Read size used while streaming each file through the hasher.
CHUNK_SIZE = 65536

for filename in filenames:
    hasher = hashlib.md5()
    with open(filename, 'rb') as file:
        # Hash the complete content. Hashing only a leading chunk would mark
        # different files that share a common prefix as duplicates, and
        # everything marked here is deleted at the end of the run.
        for buf in iter(lambda: file.read(CHUNK_SIZE), b''):
            hasher.update(buf)
    digest = hasher.hexdigest()
    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)

print('Found', len(delete), 'duplicates by md5 hash.')
print('Deduplicating by media fingerprint...')

def get_image_hash(filename):
    """Return a perceptual fingerprint for an image file.

    The fingerprint is the string form of the image's average hash with an
    '_image' suffix, so it cannot be confused with other fingerprint kinds
    stored in the same set.

    Returns None when Pillow cannot identify the file as an image. Any other
    error raised while opening or hashing propagates to the caller.
    """
    try:
        # Image.open keeps the underlying file open; the context manager
        # guarantees it is closed once the hash has been computed.
        with Image.open(filename) as image:
            return str(average_hash(image)) + '_image'
    except UnidentifiedImageError:
        return None

def get_video_hash(filename):
    """Return a perceptual fingerprint for a video file.

    The fingerprint is the hexadecimal video hash with a '_video' suffix, so
    it cannot be confused with other fingerprint kinds stored in the same set.

    Returns None when FFmpeg fails to extract frames from the file. Any other
    error propagates to the caller.
    """
    try:
        video = VideoHash(path=filename)
        try:
            return str(video.hash_hex) + '_video'
        finally:
            # VideoHash keeps its working files (frames, collage, ...) in a
            # storage directory; remove it so that hashing many files does
            # not fill up the disk.
            # NOTE(review): if the constructor itself raises, there is no
            # object to clean up through -- confirm whether videohash removes
            # its working directory in that case.
            video.delete_storage_path()
    except FFmpegFailedToExtractFrames:
        return None

total = len(filenames)

# Second pass: fingerprint every file not already marked as a duplicate and
# mark it when an earlier file produced the same fingerprint.
for count, filename in enumerate(filenames, start=1):
    # '\r' keeps the progress counter on a single terminal line.
    print('Hashing file', count, '/', total, end='\r')

    if filename in delete:
        continue

    try:
        digest = get_image_hash(filename) or get_video_hash(filename)
    except KeyboardInterrupt:
        print('\nCancelled.')
        # SystemExit (unlike os._exit) lets the interpreter flush stdout.
        raise SystemExit(0)
    except Exception as e:
        # Best effort: report the file that could not be hashed and move on.
        print()
        print('Exception', e.__class__.__name__, str(e), 'while hashing:')
        print(filename)
        continue

    # No fingerprint was produced for this file; leave it alone.
    if not digest:
        continue

    if digest in hashes:
        delete.add(filename)
    else:
        hashes.add(digest)

print()
print('Found', len(delete), 'total duplicate files.')

print('Deleting...')
for dupe in delete:
    try:
        os.remove(dupe)
    except OSError as e:
        # One file that cannot be removed must not abort the remaining
        # deletions.
        print('Could not delete', dupe + ':', e)
Add deduplication script 2022-09-04 05:08:40 +00:00			`import glob`
			`import os`
			`import hashlib`
Support cancelling 2023-10-18 16:28:24 +00:00			`import time`
Add deduplication script 2022-09-04 05:08:40 +00:00			`from PIL import Image, UnidentifiedImageError`

			`from imagehash import average_hash`
			`from videohash import VideoHash`
			`from videohash.exceptions import FFmpegFailedToExtractFrames`

			`hashes = set()`
			`delete = set()`

			`filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]`

			`print('Found', len(filenames), 'files in', os.getcwd() + '. Deduplicate them?')`
			`print('ENTER to continue, ctrl+c to cancel.')`
			`try:`
			`input()`
			`except KeyboardInterrupt:`
			`print('\nCancelled.')`
			`os._exit(0)`

Support cancelling 2023-10-18 16:28:24 +00:00			`print('Sorting file list by smallest size...')`
			`filenames.sort(key=os.path.getsize, reverse=False)`
Add deduplication script 2022-09-04 05:08:40 +00:00
			`print('Deduplicating by md5 hash...')`

			`for filename in filenames:`
Fixes 2022-09-04 05:54:45 +00:00			`# hash first 65536 bytes of each file only`
Add deduplication script 2022-09-04 05:08:40 +00:00			`CHUNK_SIZE = 65536`

			`hasher = hashlib.md5()`
			`with open(filename, 'rb') as file:`
			`buf = file.read(CHUNK_SIZE)`
			`hasher.update(buf)`
			`digest = hasher.hexdigest()`
			`if digest in hashes:`
			`delete.add(filename)`
			`else:`
			`hashes.add(digest)`

			`print('Found', len(delete), 'duplicates by md5 hash.')`
			`print('Deduplicating by media fingerprint...')`

			`def get_image_hash(filename):`
			`try:`
			`image = Image.open(filename)`
			`return str(average_hash(image)) + '_image'`
			`except UnidentifiedImageError:`
			`return None`

			`def get_video_hash(filename):`
			`try:`
			`return str(VideoHash(path=filename).hash_hex) + '_video'`
			`except FFmpegFailedToExtractFrames:`
			`return None`

			`count = 0`
			`total = len(filenames)`

			`for filename in filenames:`
			`count += 1`
Fixes 2022-09-04 05:54:45 +00:00			`print('Hashing file', count, '/', total, end='\r')`
Add deduplication script 2022-09-04 05:08:40 +00:00
			`if filename in delete: continue`

			`try:`
			`digest = get_image_hash(filename) or get_video_hash(filename)`
Support cancelling 2023-10-18 16:28:24 +00:00			`except KeyboardInterrupt:`
			`print('\nCancelled.')`
			`os._exit(0)`
Add deduplication script 2022-09-04 05:08:40 +00:00			`except BaseException as e:`
			`print()`
			`print('Exception', e.__class__.__name__, str(e), 'while hashing:')`
			`print(filename)`
			`continue`

Support cancelling 2023-10-18 16:28:24 +00:00			`time.sleep(5)`

Add deduplication script 2022-09-04 05:08:40 +00:00			`if not digest: continue`

			`if digest in hashes:`
			`delete.add(filename)`
			`else:`
			`hashes.add(digest)`

			`print()`
			`print('Found', len(delete), 'total duplicate files.')`

			`print('Deleting...')`
			`for dupe in delete:`
			`os.remove(dupe)`