From e59acefda9df00a3ddc34dcf6ddb255002dca620 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Mon, 2 Nov 2020 00:22:40 +0000 Subject: [PATCH] Remove Whoosh --- apiserver/archive.py | 52 ------------------- apiserver/data/archive/.gitkeep | 0 apiserver/migrate-shelve-to-whoosh.py | 26 ---------- apiserver/migrate-whoosh-to-sqlite.py | 74 --------------------------- apiserver/requirements.txt | 1 - 5 files changed, 153 deletions(-) delete mode 100644 apiserver/archive.py delete mode 100644 apiserver/data/archive/.gitkeep delete mode 100644 apiserver/migrate-shelve-to-whoosh.py delete mode 100644 apiserver/migrate-whoosh-to-sqlite.py diff --git a/apiserver/archive.py b/apiserver/archive.py deleted file mode 100644 index a3aa848..0000000 --- a/apiserver/archive.py +++ /dev/null @@ -1,52 +0,0 @@ -from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter -from whoosh.index import create_in, open_dir, exists_in -from whoosh.fields import * -from whoosh.qparser import QueryParser -from whoosh.support.charset import accent_map - -analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3) - -title_field = TEXT(analyzer=analyzer, stored=True) -id_field = ID(unique=True, stored=True) - -schema = Schema( - id=id_field, - title=title_field, - story=STORED, - ) - -ARCHIVE_LOCATION = 'data/archive' - -ix = None - -def init(): - global ix - - if exists_in(ARCHIVE_LOCATION): - ix = open_dir(ARCHIVE_LOCATION) - else: - ix = create_in(ARCHIVE_LOCATION, schema) - -def update(story): - writer = ix.writer() - writer.update_document( - id=story['id'], - title=story['title'], - story=story, - ) - writer.commit() - -def get_story(sid): - with ix.searcher() as searcher: - result = searcher.document(id=sid) - return result['story'] if result else None - -def search(search): - with ix.searcher() as searcher: - query = QueryParser('title', ix.schema).parse(search) - results = searcher.search(query) - stories = [r['story'] for r in results] - for s in stories: - s.pop('text', '') - s.pop('comments', '') - return stories diff --git a/apiserver/data/archive/.gitkeep b/apiserver/data/archive/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/apiserver/migrate-shelve-to-whoosh.py b/apiserver/migrate-shelve-to-whoosh.py deleted file mode 100644 index 4bfae32..0000000 --- a/apiserver/migrate-shelve-to-whoosh.py +++ /dev/null @@ -1,26 +0,0 @@ -import shelve - -import archive - -archive.init() - -#with shelve.open('data/data') as db: -# to_delete = [] -# -# for s in db.values(): -# if 'title' in s: -# archive.update(s) -# if 'id' in s: -# to_delete.append(s['id']) -# -# for id in to_delete: -# del db[id] -# -# for s in db['news_cache'].values(): -# if 'title' in s: -# archive.update(s) - -#with shelve.open('data/whoosh') as db: -# for s in db['news_cache'].values(): -# if 'title' in s and not archive.get_story(s['id']): -# archive.update(s) diff --git a/apiserver/migrate-whoosh-to-sqlite.py b/apiserver/migrate-whoosh-to-sqlite.py deleted file mode 100644 index c96ab13..0000000 --- a/apiserver/migrate-whoosh-to-sqlite.py +++ /dev/null @@ -1,74 +0,0 @@ -import archive -import database -import search - -import json -import requests - -database.init() -archive.init() -search.init() - -count = 0 - -def database_del_story_by_ref(ref): - try: - session = database.Session() - session.query(database.Story).filter(database.Story.ref==ref).delete() - session.commit() - except: - session.rollback() - raise - finally: - session.close() - -def search_del_story(sid): - try: - r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2) - if r.status_code != 202: - raise Exception('Bad response code ' + str(r.status_code)) - return r.json() - except KeyboardInterrupt: - raise - except BaseException as e: - logging.error('Problem deleting MeiliSearch story: {}'.format(str(e))) - return False - -with archive.ix.searcher() as searcher: - print('count all', searcher.doc_count_all()) - print('count', searcher.doc_count()) - - for doc in searcher.documents(): - try: - print('num', count, 'id', doc['id']) - count += 1 - - story = doc['story'] - story.pop('img', None) - - if 'reddit.com/r/technology' in story['link']: - print('skipping r/technology') - continue - - try: - database.put_story(story) - except database.IntegrityError: - print('collision!') - old_story = database.get_story_by_ref(story['ref']) - old_story = json.loads(old_story.full_json) - if story['num_comments'] > old_story['num_comments']: - print('more comments, replacing') - database_del_story_by_ref(story['ref']) - database.put_story(story) - search_del_story(old_story['id']) - else: - print('fewer comments, skipping') - continue - - search.put_story(story) - print() - except KeyboardInterrupt: - break - except BaseException as e: - print('skipping', doc['id']) - print('reason:', e) diff --git a/apiserver/requirements.txt b/apiserver/requirements.txt index 427e1c4..c34a469 100644 --- a/apiserver/requirements.txt +++ b/apiserver/requirements.txt @@ -25,6 +25,5 @@ urllib3==1.25.9 webencodings==0.5.1 websocket-client==0.57.0 Werkzeug==1.0.1 -Whoosh==2.7.4 zope.event==4.4 zope.interface==5.1.0