From dc83a70887c55e961bbd21b4e6fd26860a246686 Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Mon, 4 Jul 2022 20:32:01 +0000
Subject: [PATCH] Begin script to fix bad gzip text

---
 apiserver/database.py    | 14 ++++++++++++++
 apiserver/fix-stories.py | 40 ++++++++++++++++++++++++++++++++++++++++
 apiserver/reindex.py     | 18 ++----------------
 3 files changed, 56 insertions(+), 16 deletions(-)
 create mode 100644 apiserver/fix-stories.py

diff --git a/apiserver/database.py b/apiserver/database.py
index c98c7af..a01756c 100644
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -101,6 +101,20 @@ def del_ref(ref):
     finally:
         session.close()
 
+def count_stories():
+    try:
+        session = Session()
+        return session.query(Story).count()
+    finally:
+        session.close()
+
+def get_story_list():
+    try:
+        session = Session()
+        return session.query(Story.sid).all()
+    finally:
+        session.close()
+
 if __name__ == '__main__':
     init()
 
diff --git a/apiserver/fix-stories.py b/apiserver/fix-stories.py
new file mode 100644
index 0000000..cb01b13
--- /dev/null
+++ b/apiserver/fix-stories.py
@@ -0,0 +1,40 @@
+import logging
+import database
+
+import json
+
+database.init()
+
+def fix_gzip_bug(story_list):
+    FIX_THRESHOLD = 150
+
+    count = 1
+    for sid in story_list:
+        try:
+            sid = sid[0]
+            story = database.get_story(sid)
+            full_json = json.loads(story.full_json)
+            text = full_json.get('text', '')
+
+            count = text.count('�')
+            if not count: continue
+
+            ratio = count / len(text) * 1000
+            print('Bad story:', sid, 'Num ?:', count, 'Ratio:', ratio)
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.exception(e)
+            breakpoint()
+
+if __name__ == '__main__':
+    num_stories = database.count_stories()
+
+    print('Fix {} stories?'.format(num_stories))
+    print('Press ENTER to continue, ctrl-c to cancel')
+    input()
+
+    story_list = database.get_story_list()
+
+    fix_gzip_bug(story_list)
+
diff --git a/apiserver/reindex.py b/apiserver/reindex.py
index 7a29bf9..ee1334c 100644
--- a/apiserver/reindex.py
+++ b/apiserver/reindex.py
@@ -23,28 +23,14 @@ def put_stories(stories):
 def get_update(update_id):
     return search.meili_api(requests.get, 'tasks/{}'.format(update_id))
 
-def count_stories():
-    try:
-        session = database.Session()
-        return session.query(database.Story).count()
-    finally:
-        session.close()
-
-def get_story_list():
-    try:
-        session = database.Session()
-        return session.query(database.Story.sid).all()
-    finally:
-        session.close()
-
 if __name__ == '__main__':
-    num_stories = count_stories()
+    num_stories = database.count_stories()
 
     print('Reindex {} stories?'.format(num_stories))
     print('Press ENTER to continue, ctrl-c to cancel')
     input()
 
-    story_list = get_story_list()
+    story_list = database.get_story_list()
 
     count = 1
     while len(story_list):