Compare commits
	
		
			4 Commits
		
	
	
		
			c226c3807a...f15a631983
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
| | f15a631983 | | |
| | ef79eb5cad | | |
| | d4ee0f1f98 | | |
| | 5e840ae8bb | | |
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -102,4 +102,4 @@ ENV/ | ||||
| *.swp | ||||
| *.swo | ||||
|  | ||||
| test/ | ||||
| mcdata/ | ||||
|   | ||||
							
								
								
									
										103
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										103
									
								
								main.py
									
									
									
									
									
								
							| @@ -1,18 +1,18 @@ | ||||
| import glob | ||||
| import os | ||||
| import hashlib | ||||
| import time | ||||
| import traceback | ||||
| from multiprocessing import Pool | ||||
| import queue | ||||
| from PIL import Image, UnidentifiedImageError | ||||
|  | ||||
| import acoustid | ||||
| import chromaprint | ||||
| from imagehash import average_hash | ||||
| from videohash import VideoHash | ||||
| from videohash.exceptions import FFmpegFailedToExtractFrames | ||||
|  | ||||
| hashes = set() | ||||
| audio_hashes = [] | ||||
| delete = set() | ||||
| hash_lookup = {} | ||||
|  | ||||
| filenames = [x for x in glob.glob('**', recursive=True) if os.path.isfile(x)] | ||||
|  | ||||
| @@ -21,11 +21,11 @@ print('ENTER to continue, ctrl+c to cancel.') | ||||
| try: | ||||
|     input() | ||||
| except KeyboardInterrupt: | ||||
|     print('\nCancelled.') | ||||
|     print('Cancelled.') | ||||
|     os._exit(0) | ||||
|  | ||||
| print('Sorting file list by size...') | ||||
| filenames.sort(key=os.path.getsize, reverse=True) | ||||
| print('Sorting file list by smallest size...') | ||||
| filenames.sort(key=os.path.getsize, reverse=False) | ||||
|  | ||||
| print('Deduplicating by md5 hash...') | ||||
|  | ||||
| @@ -39,87 +39,82 @@ for filename in filenames: | ||||
|         hasher.update(buf) | ||||
|         digest = hasher.hexdigest() | ||||
|         if digest in hashes: | ||||
|             print('Found digest', digest, 'collision for', filename) | ||||
|             delete.add(filename) | ||||
|         else: | ||||
|             hashes.add(digest) | ||||
|  | ||||
| print('Found', len(delete), 'duplicates by md5 hash.') | ||||
|  | ||||
| time.sleep(4) | ||||
| print('Deduplicating by media fingerprint...') | ||||
|  | ||||
| def get_image_hash(filename): | ||||
|     basename = os.path.basename(os.path.dirname(filename)) | ||||
|     try: | ||||
|         image = Image.open(filename) | ||||
|         return str(average_hash(image)) + '_image' | ||||
|         return basename + str(average_hash(image)) + '_image' | ||||
|     except UnidentifiedImageError: | ||||
|         return None | ||||
|  | ||||
| def get_audio_hash(filename): | ||||
|     try: | ||||
|         fp = acoustid.fingerprint_file(filename)[1] | ||||
|         values, _ = chromaprint.decode_fingerprint(fp) | ||||
|         return chromaprint.hash_fingerprint(values) | ||||
|     except acoustid.FingerprintGenerationError: | ||||
|         return None | ||||
|  | ||||
| def get_video_hash(filename): | ||||
|     basename = os.path.basename(os.path.dirname(filename)) | ||||
|     try: | ||||
|         return str(VideoHash(path=filename).hash_hex) + '_video' | ||||
|         v = VideoHash(path=filename) | ||||
|         digest = str(v.hash_hex) | ||||
|         v.delete_storage_path() | ||||
|  | ||||
|         return basename + digest + '_video' | ||||
|     except FFmpegFailedToExtractFrames: | ||||
|         return None | ||||
|  | ||||
| count = 0 | ||||
| total = len(filenames) | ||||
|  | ||||
| for filename in filenames: | ||||
|     count += 1 | ||||
|     print('Hashing file', count, '/', total, end='\r') | ||||
| def hasher(filename): | ||||
|     if filename in delete: return None | ||||
|  | ||||
|     if filename in delete: continue | ||||
|     print('Hashing file:', filename) | ||||
|  | ||||
|     try: | ||||
|         digest = get_image_hash(filename) or get_audio_hash(filename) or get_video_hash(filename) | ||||
|         digest = get_image_hash(filename)#  or get_video_hash(filename) | ||||
|     except KeyboardInterrupt: | ||||
|         print('Skipping media hashing.') | ||||
|         break | ||||
|         print('Cancelled.') | ||||
|         os._exit(0) | ||||
|     except BaseException as e: | ||||
|         print() | ||||
|         print('Exception', e.__class__.__name__, str(e), 'while hashing:') | ||||
|         print(filename) | ||||
|         continue | ||||
|         print(traceback.format_exc()) | ||||
|         return None | ||||
|  | ||||
|     if not digest: continue | ||||
|     if not digest: return None | ||||
|  | ||||
|     hash_lookup[digest] = filename | ||||
|     return (filename, digest) | ||||
|  | ||||
|     if type(digest) == int: | ||||
|         for h in audio_hashes: | ||||
|             if bin(digest ^ h).count('1') <= 5:   # TODO adjust? | ||||
|                 delete.add(filename) | ||||
|                 print() | ||||
|                 print(digest, filename, 'close to', h, hash_lookup[h]) | ||||
|                 break | ||||
|         else:  # for | ||||
|             audio_hashes.append(digest) | ||||
| with Pool() as pool: | ||||
|     results = pool.map(hasher, filenames) | ||||
|  | ||||
| print('Finished hashing.') | ||||
| print() | ||||
| print('Checking digests:') | ||||
| print() | ||||
|  | ||||
| for result in results: | ||||
|     if not result: continue | ||||
|     print(result) | ||||
|  | ||||
|     filename, digest = result | ||||
|  | ||||
|     if digest in hashes: | ||||
|         print('Found digest', digest, 'collision for', filename) | ||||
|         delete.add(filename) | ||||
|     else: | ||||
|         if digest in hashes: | ||||
|             delete.add(filename) | ||||
|         else: | ||||
|             hashes.add(digest) | ||||
|         hashes.add(digest) | ||||
|  | ||||
| print() | ||||
| print() | ||||
| for dupe in delete: | ||||
|     print(dupe) | ||||
| print('Found', len(delete), 'total duplicate files.') | ||||
|  | ||||
| print() | ||||
| print('Found', len(delete), 'total duplicate files. Delete them?') | ||||
| print('ENTER to continue, ctrl+c to cancel.') | ||||
| try: | ||||
|     input() | ||||
| except KeyboardInterrupt: | ||||
|     print('\nCancelled.') | ||||
|     os._exit(0) | ||||
|  | ||||
| print('Deleting...') | ||||
| for dupe in delete: | ||||
| for dupe in sorted(list(delete)): | ||||
|     print('Deleting:', dupe) | ||||
|     os.remove(dupe) | ||||
|   | ||||
| @@ -1,19 +1,13 @@ | ||||
| audioread==3.0.0 | ||||
| Brotli==1.0.9 | ||||
| certifi==2022.12.7 | ||||
| charset-normalizer==3.1.0 | ||||
| idna==3.4 | ||||
| Brotli==1.1.0 | ||||
| certifi==2023.7.22 | ||||
| imagedominantcolor==1.0.1 | ||||
| ImageHash==4.3.1 | ||||
| mutagen==1.46.0 | ||||
| numpy==1.24.2 | ||||
| Pillow==9.5.0 | ||||
| pyacoustid==1.2.2 | ||||
| pycryptodomex==3.17 | ||||
| mutagen==1.47.0 | ||||
| numpy==1.26.1 | ||||
| Pillow==10.1.0 | ||||
| pycryptodomex==3.19.0 | ||||
| PyWavelets==1.4.1 | ||||
| requests==2.28.2 | ||||
| scipy==1.10.1 | ||||
| urllib3==1.26.15 | ||||
| scipy==1.11.3 | ||||
| videohash==3.0.1 | ||||
| websockets==11.0 | ||||
| yt-dlp==2023.3.4 | ||||
| websockets==11.0.3 | ||||
| yt-dlp==2023.10.13 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user