2022-08-10 04:06:39 +00:00
|
|
|
|
import time
|
2022-07-05 00:57:56 +00:00
|
|
|
|
import json
|
2022-07-04 20:32:01 +00:00
|
|
|
|
import logging
|
|
|
|
|
|
2022-07-05 00:57:56 +00:00
|
|
|
|
import feed
|
|
|
|
|
import database
|
|
|
|
|
import search
|
2022-07-04 20:32:01 +00:00
|
|
|
|
|
|
|
|
|
# Call the project database module's init() before any story is read or written.
# NOTE(review): this runs at import time, not only when executed as a script --
# confirm that importing this module is meant to have that side effect.
database.init()
|
|
|
|
|
|
|
|
|
|
def fix_gzip_bug(story_list):
    """Re-fetch stories whose stored text is mostly U+FFFD replacement characters.

    For every entry of ``story_list`` the story is loaded from ``database``,
    the number of U+FFFD characters in its ``text`` field is counted, and,
    when they make up at least ``FIX_THRESHOLD`` per thousand characters,
    the story is refreshed through ``feed.update_story`` and written back to
    both ``database`` and ``search``.

    NOTE(review): the function name suggests the bad characters come from
    gzip-compressed responses that were decoded as text -- confirm.

    Args:
        story_list: Iterable of indexable rows whose first element is the
            story id (presumably the rows returned by
            ``database.get_story_list()`` -- verify against the caller).

    Returns:
        None. Progress is reported on stdout.

    Raises:
        KeyboardInterrupt: propagated untouched so ctrl-c stops the run.
        Any other ``Exception`` is logged and drops into the debugger
        instead of aborting the whole batch.
    """
    # Replacement characters per 1000 characters of text above which a
    # story is considered corrupted enough to be worth re-fetching.
    FIX_THRESHOLD = 150

    # U+FFFD, written as an escape so the literal cannot be mangled by tools
    # that fail to render it (it had degraded into the text '<EFBFBD>',
    # which is the byte dump of its UTF-8 encoding and never matches).
    REPLACEMENT_CHAR = '\ufffd'

    for row in story_list:
        try:
            sid = row[0]
            story = database.get_story(sid)
            full_json = json.loads(story.full_json)
            meta_json = json.loads(story.meta_json)
            # A JSON null for 'text' would otherwise break .count() below.
            text = full_json.get('text') or ''

            count = text.count(REPLACEMENT_CHAR)
            if not count:
                continue

            # count > 0 guarantees len(text) > 0, so no division by zero.
            ratio = count / len(text) * 1000
            print('Bad story:', sid, 'Num ?:', count, 'Ratio:', ratio)
            if ratio < FIX_THRESHOLD:
                continue

            print('Attempting to fix...')

            # presumably update_story refreshes meta_json in place, since the
            # same object is stored afterwards -- TODO confirm in feed module
            valid = feed.update_story(meta_json, is_manual=True)
            if valid:
                database.put_story(meta_json)
                search.put_story(meta_json)
                print('Success')
            else:
                print('Story was not valid')

            # Pause between fix attempts (presumably to rate-limit re-fetches).
            time.sleep(3)
        except Exception as e:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they propagate without needing an explicit re-raise.
            logging.exception(e)
            # Deliberate: stop here so the operator can inspect the failure.
            breakpoint()
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Interactive entry point: report how many stories the database holds,
    # wait for the operator to confirm with ENTER (ctrl-c aborts), then run
    # the fixer over the full story list.
    total = database.count_stories()
    banner = (
        'Fix {} stories?'.format(total),
        'Press ENTER to continue, ctrl-c to cancel',
    )
    for line in banner:
        print(line)
    input()

    # The list is fetched only after confirmation.
    fix_gzip_bug(database.get_story_list())
|
|
|
|
|
|