Begin script to fix bad gzip text
This commit is contained in:
		| @@ -101,6 +101,20 @@ def del_ref(ref): | ||||
|     finally: | ||||
|         session.close() | ||||
|  | ||||
def count_stories():
    """Return ``session.query(Story).count()`` from a fresh session.

    The session is closed before returning, whether or not the query
    succeeds.
    """
    # Create the session before entering the try block: if Session() itself
    # raises there is nothing to close, and the finally clause must not fail
    # on an unbound `session`, which would mask the original error.
    session = Session()
    try:
        return session.query(Story).count()
    finally:
        session.close()
|  | ||||
def get_story_list():
    """Return all rows of a query selecting only ``Story.sid``.

    The result comes from ``session.query(Story.sid).all()``; the session
    is closed before returning, whether or not the query succeeds.
    """
    # Create the session before entering the try block: if Session() itself
    # raises there is nothing to close, and the finally clause must not fail
    # on an unbound `session`, which would mask the original error.
    session = Session()
    try:
        return session.query(Story.sid).all()
    finally:
        session.close()
|  | ||||
| if __name__ == '__main__': | ||||
|     init() | ||||
|  | ||||
|   | ||||
							
								
								
									
										40
									
								
								apiserver/fix-stories.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								apiserver/fix-stories.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| import logging | ||||
| import database | ||||
|  | ||||
| import json | ||||
|  | ||||
| database.init() | ||||
|  | ||||
def fix_gzip_bug(story_list):
    """Report stories whose text contains Unicode replacement characters.

    For every entry in *story_list* the story is loaded with
    ``database.get_story``, its ``full_json`` is decoded as JSON and the
    ``'text'`` field is scanned for U+FFFD. Each affected story is printed
    with its sid, the number of replacement characters, and that number per
    1000 characters of text. Nothing is modified or written back.

    Args:
        story_list: iterable of rows whose first element is a story id
            (each entry is indexed with ``[0]``).

    Any ``Exception`` raised while handling one story is logged and then
    drops into the debugger; ``KeyboardInterrupt`` and ``SystemExit``
    propagate to the caller.
    """
    # NOTE(review): not used yet -- presumably the ratio above which a story
    # is meant to be repaired once the fixing step exists; confirm.
    FIX_THRESHOLD = 150

    # The literal previously held the text '<EFBFBD>' (the hex spelling of
    # U+FFFD's UTF-8 bytes), which never matches an actual replacement
    # character in a decoded string.
    replacement_char = '\ufffd'

    for row in story_list:
        try:
            sid = row[0]
            story = database.get_story(sid)
            full_json = json.loads(story.full_json)
            # 'text' may be absent or null; treat both as empty text.
            text = full_json.get('text') or ''

            num_bad = text.count(replacement_char)
            if not num_bad:
                continue

            # num_bad > 0 implies text is non-empty, so no division by zero.
            ratio = num_bad / len(text) * 1000
            print('Bad story:', sid, 'Num ?:', num_bad, 'Ratio:', ratio)
        except Exception as e:
            # Exception (not BaseException) so ctrl-c and sys.exit() are
            # never swallowed here.
            logging.exception(e)
            # Deliberate: stop in the debugger to inspect the failing story.
            breakpoint()
|  | ||||
if __name__ == '__main__':
    # Show the story count and wait for confirmation before scanning.
    print('Fix {} stories?'.format(database.count_stories()))
    print('Press ENTER to continue, ctrl-c to cancel')
    input()

    # Fetch the sid rows only after the user confirmed, then scan them.
    fix_gzip_bug(database.get_story_list())
|  | ||||
| @@ -23,28 +23,14 @@ def put_stories(stories): | ||||
def get_update(update_id):
    """Return ``search.meili_api``'s result for a GET on ``tasks/<update_id>``."""
    endpoint = 'tasks/{}'.format(update_id)
    return search.meili_api(requests.get, endpoint)
|  | ||||
def count_stories():
    """Return ``session.query(database.Story).count()`` from a fresh session.

    The session is closed before returning, whether or not the query
    succeeds.
    """
    # Create the session outside the try block so a failing Session() call
    # is not masked by the finally clause touching an unbound `session`.
    session = database.Session()
    try:
        return session.query(database.Story).count()
    finally:
        session.close()
|  | ||||
def get_story_list():
    """Return all rows of a query selecting only ``database.Story.sid``.

    The session is closed before returning, whether or not the query
    succeeds.
    """
    # Create the session outside the try block so a failing Session() call
    # is not masked by the finally clause touching an unbound `session`.
    session = database.Session()
    try:
        return session.query(database.Story.sid).all()
    finally:
        session.close()
|  | ||||
| if __name__ == '__main__': | ||||
|     num_stories = count_stories() | ||||
|     num_stories = database.count_stories() | ||||
|  | ||||
|     print('Reindex {} stories?'.format(num_stories)) | ||||
|     print('Press ENTER to continue, ctrl-c to cancel') | ||||
|     input() | ||||
|  | ||||
|     story_list = get_story_list() | ||||
|     story_list = database.get_story_list() | ||||
|  | ||||
|     count = 1 | ||||
|     while len(story_list): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user