Compare commits
	
		
			7 Commits
		
	
	
		
			no_audio
			...
			f15a631983
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
| | f15a631983 | | |
| | ef79eb5cad | | |
| | d4ee0f1f98 | | |
| | c226c3807a | | |
| | 230460c153 | | |
| | dddfbb0724 | | |
| | d678c8fd0f | | |
							
								
								
									
										55
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										55
									
								
								main.py
									
									
									
									
									
								
							| @@ -2,6 +2,9 @@ import glob | ||||
| import os | ||||
| import hashlib | ||||
| import time | ||||
| import traceback | ||||
| from multiprocessing import Pool | ||||
| import queue | ||||
| from PIL import Image, UnidentifiedImageError | ||||
|  | ||||
| from imagehash import average_hash | ||||
| @@ -18,7 +21,7 @@ print('ENTER to continue, ctrl+c to cancel.') | ||||
| try: | ||||
|     input() | ||||
| except KeyboardInterrupt: | ||||
|     print('\nCancelled.') | ||||
|     print('Cancelled.') | ||||
|     os._exit(0) | ||||
|  | ||||
| print('Sorting file list by smallest size...') | ||||
| @@ -36,51 +39,75 @@ for filename in filenames: | ||||
|         hasher.update(buf) | ||||
|         digest = hasher.hexdigest() | ||||
|         if digest in hashes: | ||||
|             print('Found digest', digest, 'collision for', filename) | ||||
|             delete.add(filename) | ||||
|         else: | ||||
|             hashes.add(digest) | ||||
|  | ||||
| print('Found', len(delete), 'duplicates by md5 hash.') | ||||
|  | ||||
| time.sleep(4) | ||||
| print('Deduplicating by media fingerprint...') | ||||
|  | ||||
| def get_image_hash(filename): | ||||
|     basename = os.path.basename(os.path.dirname(filename)) | ||||
|     try: | ||||
|         image = Image.open(filename) | ||||
|         return str(average_hash(image)) + '_image' | ||||
|         return basename + str(average_hash(image)) + '_image' | ||||
|     except UnidentifiedImageError: | ||||
|         return None | ||||
|  | ||||
| def get_video_hash(filename): | ||||
|     basename = os.path.basename(os.path.dirname(filename)) | ||||
|     try: | ||||
|         return str(VideoHash(path=filename).hash_hex) + '_video' | ||||
|         v = VideoHash(path=filename) | ||||
|         digest = str(v.hash_hex) | ||||
|         v.delete_storage_path() | ||||
|  | ||||
|         return basename + digest + '_video' | ||||
|     except FFmpegFailedToExtractFrames: | ||||
|         return None | ||||
|  | ||||
| count = 0 | ||||
| total = len(filenames) | ||||
|  | ||||
| for filename in filenames: | ||||
|     count += 1 | ||||
|     print('Hashing file', count, '/', total, end='\r') | ||||
| def hasher(filename): | ||||
|     if filename in delete: return None | ||||
|  | ||||
|     if filename in delete: continue | ||||
|     print('Hashing file:', filename) | ||||
|  | ||||
|     try: | ||||
|         digest = get_image_hash(filename) or get_video_hash(filename) | ||||
|         digest = get_image_hash(filename)#  or get_video_hash(filename) | ||||
|     except KeyboardInterrupt: | ||||
|         print('\nCancelled.') | ||||
|         print('Cancelled.') | ||||
|         os._exit(0) | ||||
|     except BaseException as e: | ||||
|         print() | ||||
|         print('Exception', e.__class__.__name__, str(e), 'while hashing:') | ||||
|         print(filename) | ||||
|         continue | ||||
|         print(traceback.format_exc()) | ||||
|         return None | ||||
|  | ||||
|     time.sleep(5) | ||||
|     if not digest: return None | ||||
|  | ||||
|     if not digest: continue | ||||
|     return (filename, digest) | ||||
|  | ||||
| with Pool() as pool: | ||||
|     results = pool.map(hasher, filenames) | ||||
|  | ||||
| print('Finished hashing.') | ||||
| print() | ||||
| print('Checking digests:') | ||||
| print() | ||||
|  | ||||
| for result in results: | ||||
|     if not result: continue | ||||
|     print(result) | ||||
|  | ||||
|     filename, digest = result | ||||
|  | ||||
|     if digest in hashes: | ||||
|         print('Found digest', digest, 'collision for', filename) | ||||
|         delete.add(filename) | ||||
|     else: | ||||
|         hashes.add(digest) | ||||
| @@ -88,6 +115,6 @@ for filename in filenames: | ||||
| print() | ||||
| print('Found', len(delete), 'total duplicate files.') | ||||
|  | ||||
| print('Deleting...') | ||||
| for dupe in delete: | ||||
| for dupe in sorted(list(delete)): | ||||
|     print('Deleting:', dupe) | ||||
|     os.remove(dupe) | ||||
|   | ||||
							
								
								
									
										13
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| Brotli==1.1.0 | ||||
| certifi==2023.7.22 | ||||
| imagedominantcolor==1.0.1 | ||||
| ImageHash==4.3.1 | ||||
| mutagen==1.47.0 | ||||
| numpy==1.26.1 | ||||
| Pillow==10.1.0 | ||||
| pycryptodomex==3.19.0 | ||||
| PyWavelets==1.4.1 | ||||
| scipy==1.11.3 | ||||
| videohash==3.0.1 | ||||
| websockets==11.0.3 | ||||
| yt-dlp==2023.10.13 | ||||
		Reference in New Issue
	
	Block a user