Begin script to fix bad gzip text

This commit is contained in:
Tanner Collin 2022-07-04 20:32:01 +00:00
parent 2e2c9ae837
commit dc83a70887
3 changed files with 56 additions and 16 deletions

View File

@@ -101,6 +101,20 @@ def del_ref(ref):
finally: finally:
session.close() session.close()
def count_stories():
    """Return the number of Story records.

    A new Session is opened for the query and is always closed before
    the function returns.
    """
    # Create the session before entering the try block: if Session() itself
    # raises, the finally clause would otherwise reference an unbound
    # `session` and mask the real error with an UnboundLocalError.
    session = Session()
    try:
        return session.query(Story).count()
    finally:
        session.close()
def get_story_list():
    """Return the sid column of every Story.

    The result is whatever `query(Story.sid).all()` yields: one row per
    story, with the sid as the row's only column.  The Session used for
    the query is always closed before the function returns.
    """
    # Create the session before entering the try block: if Session() itself
    # raises, the finally clause would otherwise reference an unbound
    # `session` and mask the real error with an UnboundLocalError.
    session = Session()
    try:
        return session.query(Story.sid).all()
    finally:
        session.close()
if __name__ == '__main__': if __name__ == '__main__':
init() init()

40
apiserver/fix-stories.py Normal file
View File

@@ -0,0 +1,40 @@
import logging
import database
import json
# Initialise the database module at import time; this runs whenever the
# file is loaded, not only when it is executed as a script.
database.init()
def fix_gzip_bug(story_list):
    """Scan stories for replacement characters in their text and report them.

    For each story, the number of U+FFFD replacement characters in the
    'text' field of its stored JSON is counted.  Stories containing at
    least one are printed together with that count and the count per
    1000 characters of text.

    Args:
        story_list: iterable of row-like sequences whose first element is
            a story sid (the shape produced by database.get_story_list()).

    Returns:
        None.  Results are printed; per-story errors are logged.
    """
    # U+FFFD is what a decoder leaves behind for bytes it could not decode.
    # Written as an escape so the literal cannot itself be mangled by an
    # encoding round-trip of this source file.
    REPLACEMENT_CHAR = '\ufffd'
    # NOTE(review): presumably the ratio above which a story should be
    # fixed; nothing consults it yet, the function only reports so far.
    FIX_THRESHOLD = 150

    for row in story_list:
        try:
            sid = row[0]
            story = database.get_story(sid)
            full_json = json.loads(story.full_json)
            # A missing or null 'text' is treated as empty text.
            text = full_json.get('text') or ''

            bad_chars = text.count(REPLACEMENT_CHAR)
            # Clean stories are skipped; this also keeps empty text away
            # from the division below.
            if not bad_chars:
                continue

            ratio = bad_chars / len(text) * 1000
            print('Bad story:', sid, 'Num ?:', bad_chars, 'Ratio:', ratio)
        except Exception as e:
            # Exception rather than BaseException, so KeyboardInterrupt and
            # SystemExit propagate without needing an explicit re-raise.
            logging.exception(e)
            # Drop into the debugger to inspect the failing story.
            breakpoint()
if __name__ == '__main__':
    # Report how many stories exist and wait for the operator to confirm.
    total = database.count_stories()
    print(f'Fix {total} stories?')
    print('Press ENTER to continue, ctrl-c to cancel')
    input()

    # Fetch every sid and run the scan over the full list.
    fix_gzip_bug(database.get_story_list())

View File

@@ -23,28 +23,14 @@ def put_stories(stories):
def get_update(update_id):
    """Look up a task by id through search.meili_api.

    Args:
        update_id: identifier substituted into the 'tasks/<id>' path.

    Returns:
        Whatever search.meili_api returns for a requests.get call on
        that path.
    """
    return search.meili_api(requests.get, 'tasks/{}'.format(update_id))
def count_stories():
    """Return the number of database.Story records.

    A new database.Session is opened for the query and is always closed
    before the function returns.
    """
    # Create the session before entering the try block: if Session() itself
    # raises, the finally clause would otherwise reference an unbound
    # `session` and mask the real error with an UnboundLocalError.
    session = database.Session()
    try:
        return session.query(database.Story).count()
    finally:
        session.close()
def get_story_list():
    """Return the sid column of every database.Story.

    The result is whatever `query(database.Story.sid).all()` yields: one
    row per story, with the sid as the row's only column.  The Session
    used for the query is always closed before the function returns.
    """
    # Create the session before entering the try block: if Session() itself
    # raises, the finally clause would otherwise reference an unbound
    # `session` and mask the real error with an UnboundLocalError.
    session = database.Session()
    try:
        return session.query(database.Story.sid).all()
    finally:
        session.close()
if __name__ == '__main__': if __name__ == '__main__':
num_stories = count_stories() num_stories = database.count_stories()
print('Reindex {} stories?'.format(num_stories)) print('Reindex {} stories?'.format(num_stories))
print('Press ENTER to continue, ctrl-c to cancel') print('Press ENTER to continue, ctrl-c to cancel')
input() input()
story_list = get_story_list() story_list = database.get_story_list()
count = 1 count = 1
while len(story_list): while len(story_list):