41 lines
		
	
	
		
			936 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			41 lines
		
	
	
		
			936 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| import logging
 | ||
| import database
 | ||
| 
 | ||
| import json
 | ||
| 
 | ||
# Runs at import time, so the imported ``database`` module is initialised
# before any of its query functions are called further down in this script.
# NOTE(review): what init() actually sets up is not visible here — confirm.
database.init()
 | ||
| 
 | ||
def fix_gzip_bug(story_list):
    """Report stories whose text contains Unicode replacement characters.

    For each entry of *story_list* the story is loaded with
    ``database.get_story``, its ``full_json`` attribute is parsed as JSON and
    the ``'text'`` field is scanned for U+FFFD (the replacement character a
    lossy decode leaves behind). Every story with at least one hit is printed
    together with the hit count and the number of hits per 1000 characters.

    Nothing is written back to the database: despite its name, this function
    currently only reports the affected stories.

    Args:
        story_list: Iterable of indexable rows whose first element is the
            story id passed to ``database.get_story``.

    Returns:
        None.
    """
    # NOTE(review): presumably the per-1000-characters ratio above which a
    # story should be repaired; it is not applied anywhere yet — confirm.
    FIX_THRESHOLD = 150

    for row in story_list:
        try:
            sid = row[0]
            story = database.get_story(sid)
            full_json = json.loads(story.full_json)
            # A missing key or a JSON null both count as "no text".
            text = full_json.get('text') or ''

            # Written as an escape so the literal survives editors and
            # viewers that mangle the raw U+FFFD character.
            count = text.count('\ufffd')
            if not count:
                continue

            # count > 0 guarantees len(text) > 0, so the division is safe.
            ratio = count / len(text) * 1000
            print('Bad story:', sid, 'Num ?:', count, 'Ratio:', ratio)
        except Exception as e:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they propagate and stop the run. Anything else is logged and
            # the operator is dropped into the debugger to inspect the story.
            logging.exception(e)
            breakpoint()
 | ||
| 
 | ||
if __name__ == '__main__':
    # Tell the operator how many stories are stored and wait for an explicit
    # confirmation; pressing Ctrl-C at the prompt aborts before any story is
    # loaded.
    prompt_lines = (
        'Fix {} stories?'.format(database.count_stories()),
        'Press ENTER to continue, ctrl-c to cancel',
    )
    print(*prompt_lines, sep='\n')
    input()

    # The story list is only fetched after the operator has confirmed.
    fix_gzip_bug(database.get_story_list())
 | ||
| 
 |