Begin script to fix bad gzip text
This commit is contained in:
parent
2e2c9ae837
commit
dc83a70887
|
@ -101,6 +101,20 @@ def del_ref(ref):
|
||||||
finally:
|
finally:
|
||||||
session.close()
|
session.close()
|
||||||
|
|
||||||
|
def count_stories():
    """Return the result of ``count()`` on a query over ``Story``.

    The session is created before the ``try`` block so that a failure in
    ``Session()`` propagates unchanged instead of being masked by an
    ``UnboundLocalError`` raised from ``session.close()`` in ``finally``.
    """
    session = Session()
    try:
        return session.query(Story).count()
    finally:
        # Always release the session, even when the query raises.
        session.close()
|
||||||
|
|
||||||
|
def get_story_list():
    """Return all rows of a query selecting ``Story.sid``.

    Callers in this change index each returned row with ``[0]`` to obtain
    the story id.

    The session is created before the ``try`` block so that a failure in
    ``Session()`` propagates unchanged instead of being masked by an
    ``UnboundLocalError`` raised from ``session.close()`` in ``finally``.
    """
    session = Session()
    try:
        return session.query(Story.sid).all()
    finally:
        # Always release the session, even when the query raises.
        session.close()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
init()
|
init()
|
||||||
|
|
||||||
|
|
40
apiserver/fix-stories.py
Normal file
40
apiserver/fix-stories.py
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
import logging
|
||||||
|
import database
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Runs when this module is executed/imported, before anything below touches
# the database (init() is defined in database.py, not shown in this file).
database.init()
|
||||||
|
|
||||||
|
def fix_gzip_bug(story_list):
    """Report stories whose text contains Unicode replacement characters.

    For each row in ``story_list`` the story is loaded with
    ``database.get_story``, its ``full_json`` is decoded, and the U+FFFD
    characters in the ``text`` field are counted. Every story with at least
    one such character is printed with the count and the count's per-mille
    ratio to the text length. Nothing is written back to the database.

    Args:
        story_list: iterable of rows whose first item is a story id, as
            returned by ``database.get_story_list()``.

    Any ``Exception`` raised while processing a story is logged with its
    traceback and then drops into the debugger; ``KeyboardInterrupt`` and
    ``SystemExit`` are not caught and propagate to the caller.
    """
    # Not applied yet. NOTE(review): presumably the per-mille ratio above
    # which a story should be repaired -- confirm before wiring it in.
    FIX_THRESHOLD = 150

    for row in story_list:
        try:
            sid = row[0]
            story = database.get_story(sid)
            full_json = json.loads(story.full_json)
            # A missing or null 'text' is treated as empty text.
            text = full_json.get('text') or ''

            # U+FFFD (UTF-8 bytes EF BF BD) is the replacement character
            # produced when bytes cannot be decoded as text.
            bad_chars = text.count('\ufffd')
            if not bad_chars:
                continue

            # bad_chars > 0 implies len(text) > 0, so this cannot divide
            # by zero.
            ratio = bad_chars / len(text) * 1000
            print('Bad story:', sid, 'Num ?:', bad_chars, 'Ratio:', ratio)
        except Exception as e:
            # Best-effort scan: log the failure, allow inspection in the
            # debugger, then carry on with the next story.
            logging.exception(e)
            breakpoint()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Show how many stories exist and wait for the operator to confirm.
    total = database.count_stories()
    print('Fix {} stories?'.format(total))
    print('Press ENTER to continue, ctrl-c to cancel')
    input()

    # Only fetch the id list once the operator has confirmed.
    fix_gzip_bug(database.get_story_list())
|
||||||
|
|
|
@ -23,28 +23,14 @@ def put_stories(stories):
|
||||||
def get_update(update_id):
    """Return what ``search.meili_api`` yields for a GET on ``tasks/<update_id>``."""
    endpoint = 'tasks/{}'.format(update_id)
    return search.meili_api(requests.get, endpoint)
|
||||||
|
|
||||||
def count_stories():
    """Return the result of ``count()`` on a query over ``database.Story``.

    The session is created before the ``try`` block so that a failure in
    ``database.Session()`` propagates unchanged instead of being masked by
    an ``UnboundLocalError`` raised from ``session.close()`` in ``finally``.
    """
    session = database.Session()
    try:
        return session.query(database.Story).count()
    finally:
        # Always release the session, even when the query raises.
        session.close()
|
|
||||||
|
|
||||||
def get_story_list():
    """Return all rows of a query selecting ``database.Story.sid``.

    The session is created before the ``try`` block so that a failure in
    ``database.Session()`` propagates unchanged instead of being masked by
    an ``UnboundLocalError`` raised from ``session.close()`` in ``finally``.
    """
    session = database.Session()
    try:
        return session.query(database.Story.sid).all()
    finally:
        # Always release the session, even when the query raises.
        session.close()
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
num_stories = count_stories()
|
num_stories = database.count_stories()
|
||||||
|
|
||||||
print('Reindex {} stories?'.format(num_stories))
|
print('Reindex {} stories?'.format(num_stories))
|
||||||
print('Press ENTER to continue, ctrl-c to cancel')
|
print('Press ENTER to continue, ctrl-c to cancel')
|
||||||
input()
|
input()
|
||||||
|
|
||||||
story_list = get_story_list()
|
story_list = database.get_story_list()
|
||||||
|
|
||||||
count = 1
|
count = 1
|
||||||
while len(story_list):
|
while len(story_list):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user