Compare commits

..

No commits in common. "f15a6319830d55d9745bdeb8dd9820db8569e7ce" and "c226c3807ad069a750375ffb34e2d2465e459f5b" have entirely different histories.

3 changed files with 70 additions and 59 deletions

2
.gitignore vendored
View File

@ -102,4 +102,4 @@ ENV/
*.swp *.swp
*.swo *.swo
mcdata/ test/

103
main.py
View File

@ -1,18 +1,18 @@
import glob import glob
import os import os
import hashlib import hashlib
import time
import traceback
from multiprocessing import Pool
import queue
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
import acoustid
import chromaprint
from imagehash import average_hash from imagehash import average_hash
from videohash import VideoHash from videohash import VideoHash
from videohash.exceptions import FFmpegFailedToExtractFrames from videohash.exceptions import FFmpegFailedToExtractFrames
hashes = set() hashes = set()
audio_hashes = []
delete = set() delete = set()
hash_lookup = {}
filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)] filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)]
@ -21,11 +21,11 @@ print('ENTER to continue, ctrl+c to cancel.')
try: try:
input() input()
except KeyboardInterrupt: except KeyboardInterrupt:
print('Cancelled.') print('\nCancelled.')
os._exit(0) os._exit(0)
print('Sorting file list by smallest size...') print('Sorting file list by size...')
filenames.sort(key=os.path.getsize, reverse=False) filenames.sort(key=os.path.getsize, reverse=True)
print('Deduplicating by md5 hash...') print('Deduplicating by md5 hash...')
@ -39,82 +39,87 @@ for filename in filenames:
hasher.update(buf) hasher.update(buf)
digest = hasher.hexdigest() digest = hasher.hexdigest()
if digest in hashes: if digest in hashes:
print('Found digest', digest, 'collision for', filename)
delete.add(filename) delete.add(filename)
else: else:
hashes.add(digest) hashes.add(digest)
print('Found', len(delete), 'duplicates by md5 hash.') print('Found', len(delete), 'duplicates by md5 hash.')
time.sleep(4)
print('Deduplicating by media fingerprint...') print('Deduplicating by media fingerprint...')
def get_image_hash(filename): def get_image_hash(filename):
basename = os.path.basename(os.path.dirname(filename))
try: try:
image = Image.open(filename) image = Image.open(filename)
return basename + str(average_hash(image)) + '_image' return str(average_hash(image)) + '_image'
except UnidentifiedImageError: except UnidentifiedImageError:
return None return None
def get_video_hash(filename): def get_audio_hash(filename):
basename = os.path.basename(os.path.dirname(filename))
try: try:
v = VideoHash(path=filename) fp = acoustid.fingerprint_file(filename)[1]
digest = str(v.hash_hex) values, _ = chromaprint.decode_fingerprint(fp)
v.delete_storage_path() return chromaprint.hash_fingerprint(values)
except acoustid.FingerprintGenerationError:
return None
return basename + digest + '_video' def get_video_hash(filename):
try:
return str(VideoHash(path=filename).hash_hex) + '_video'
except FFmpegFailedToExtractFrames: except FFmpegFailedToExtractFrames:
return None return None
count = 0 count = 0
total = len(filenames) total = len(filenames)
def hasher(filename): for filename in filenames:
if filename in delete: return None count += 1
print('Hashing file', count, '/', total, end='\r')
print('Hashing file:', filename) if filename in delete: continue
try: try:
digest = get_image_hash(filename)# or get_video_hash(filename) digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename)
except KeyboardInterrupt: except KeyboardInterrupt:
print('Cancelled.') print('Skipping media hashing.')
os._exit(0) break
except BaseException as e: except BaseException as e:
print() print()
print('Exception', e.__class__.__name__, str(e), 'while hashing:') print('Exception', e.__class__.__name__, str(e), 'while hashing:')
print(filename) print(filename)
print(traceback.format_exc()) continue
return None
if not digest: return None if not digest: continue
return (filename, digest) hash_lookup[digest] = filename
with Pool() as pool: if type(digest) == int:
results = pool.map(hasher, filenames) for h in audio_hashes:
if bin(digest ^ h).count('1') <= 5: # TODO adjust?
print('Finished hashing.') delete.add(filename)
print() print()
print('Checking digests:') print(digest, filename, 'close to', h, hash_lookup[h])
print() break
else: # for
for result in results: audio_hashes.append(digest)
if not result: continue
print(result)
filename, digest = result
if digest in hashes:
print('Found digest', digest, 'collision for', filename)
delete.add(filename)
else: else:
hashes.add(digest) if digest in hashes:
delete.add(filename)
else:
hashes.add(digest)
print() print()
print('Found', len(delete), 'total duplicate files.') print()
for dupe in delete:
print(dupe)
for dupe in sorted(list(delete)): print()
print('Deleting:', dupe) print('Found', len(delete), 'total duplicate files. Delete them?')
print('ENTER to continue, ctrl+c to cancel.')
try:
input()
except KeyboardInterrupt:
print('\nCancelled.')
os._exit(0)
print('Deleting...')
for dupe in delete:
os.remove(dupe) os.remove(dupe)

View File

@ -1,13 +1,19 @@
Brotli==1.1.0 audioread==3.0.0
certifi==2023.7.22 Brotli==1.0.9
certifi==2022.12.7
charset-normalizer==3.1.0
idna==3.4
imagedominantcolor==1.0.1 imagedominantcolor==1.0.1
ImageHash==4.3.1 ImageHash==4.3.1
mutagen==1.47.0 mutagen==1.46.0
numpy==1.26.1 numpy==1.24.2
Pillow==10.1.0 Pillow==9.5.0
pycryptodomex==3.19.0 pyacoustid==1.2.2
pycryptodomex==3.17
PyWavelets==1.4.1 PyWavelets==1.4.1
scipy==1.11.3 requests==2.28.2
scipy==1.10.1
urllib3==1.26.15
videohash==3.0.1 videohash==3.0.1
websockets==11.0.3 websockets==11.0
yt-dlp==2023.10.13 yt-dlp==2023.3.4