Remove Whoosh
@@ -1,52 +0,0 @@
from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter
from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.support.charset import accent_map

analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3)

title_field = TEXT(analyzer=analyzer, stored=True)
id_field = ID(unique=True, stored=True)

schema = Schema(
        id=id_field,
        title=title_field,
        story=STORED,
        )

ARCHIVE_LOCATION = 'data/archive'

ix = None

def init():
    global ix

    if exists_in(ARCHIVE_LOCATION):
        ix = open_dir(ARCHIVE_LOCATION)
    else:
        ix = create_in(ARCHIVE_LOCATION, schema)

def update(story):
    writer = ix.writer()
    writer.update_document(
            id=story['id'],
            title=story['title'],
            story=story,
            )
    writer.commit()

def get_story(sid):
    with ix.searcher() as searcher:
        result = searcher.document(id=sid)
        return result['story'] if result else None

def search(search):
    with ix.searcher() as searcher:
        query = QueryParser('title', ix.schema).parse(search)
        results = searcher.search(query)
        stories = [r['story'] for r in results]
        for s in stories:
            s.pop('text', '')
            s.pop('comments', '')
        return stories
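The deleted module above was the Whoosh-backed archive: a local index under data/archive exposing get_story and a title search. For reference, a minimal sketch of how those two lookups could instead be served by the MeiliSearch instance that the migration script further down targets. The MEILI_URL value, the default port, and the qotnews index name are assumptions lifted from that script; the project's actual search module is not part of this diff.

import requests

MEILI_URL = 'http://127.0.0.1:7700/'  # assumed MeiliSearch address; the real value lives in search.py

def get_story(sid):
    # GET /indexes/qotnews/documents/<id> returns the stored document, 404 if missing
    r = requests.get(MEILI_URL + 'indexes/qotnews/documents/' + sid, timeout=2)
    return r.json() if r.status_code == 200 else None

def search(q):
    # POST /indexes/qotnews/search runs a full-text query and returns matching hits
    r = requests.post(MEILI_URL + 'indexes/qotnews/search', json={'q': q}, timeout=2)
    return r.json().get('hits', []) if r.status_code == 200 else []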
@@ -1,26 +0,0 @@
import shelve

import archive

archive.init()

#with shelve.open('data/data') as db:
#    to_delete = []
#
#    for s in db.values():
#        if 'title' in s:
#            archive.update(s)
#        if 'id' in s:
#            to_delete.append(s['id'])
#
#    for id in to_delete:
#        del db[id]
#
#    for s in db['news_cache'].values():
#        if 'title' in s:
#            archive.update(s)

#with shelve.open('data/whoosh') as db:
#    for s in db['news_cache'].values():
#        if 'title' in s and not archive.get_story(s['id']):
#            archive.update(s)
@@ -1,74 +0,0 @@
import archive
import database
import search

import json
import logging
import requests

database.init()
archive.init()
search.init()

count = 0

def database_del_story_by_ref(ref):
    session = database.Session()
    try:
        session.query(database.Story).filter(database.Story.ref==ref).delete()
        session.commit()
    except:
        session.rollback()
        raise
    finally:
        session.close()

def search_del_story(sid):
    try:
        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
        return False

with archive.ix.searcher() as searcher:
    print('count all', searcher.doc_count_all())
    print('count', searcher.doc_count())

    for doc in searcher.documents():
        try:
            print('num', count, 'id', doc['id'])
            count += 1

            story = doc['story']
            story.pop('img', None)

            if 'reddit.com/r/technology' in story['link']:
                print('skipping r/technology')
                continue

            try:
                database.put_story(story)
            except database.IntegrityError:
                print('collision!')
                old_story = database.get_story_by_ref(story['ref'])
                old_story = json.loads(old_story.full_json)
                if story['num_comments'] > old_story['num_comments']:
                    print('more comments, replacing')
                    database_del_story_by_ref(story['ref'])
                    database.put_story(story)
                    search_del_story(old_story['id'])
                else:
                    print('fewer comments, skipping')
                    continue

            search.put_story(story)
            print()
        except KeyboardInterrupt:
            break
        except BaseException as e:
            print('skipping', doc['id'])
            print('reason:', e)
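The migration script above calls search.put_story, whose definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming it mirrors search_del_story and posts to the same qotnews index; the MEILI_URL default and the error handling shown here are assumptions, not the project's actual code.

import logging
import requests

MEILI_URL = 'http://127.0.0.1:7700/'  # assumed; the real value lives in search.py

def put_story(story):
    # POST /indexes/qotnews/documents adds or replaces documents by primary key
    try:
        r = requests.post(MEILI_URL + 'indexes/qotnews/documents', json=[story], timeout=2)
        if r.status_code != 202:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem putting MeiliSearch story: {}'.format(str(e)))
        return False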
@@ -25,6 +25,5 @@ urllib3==1.25.9
webencodings==0.5.1
websocket-client==0.57.0
Werkzeug==1.0.1
Whoosh==2.7.4
zope.event==4.4
zope.interface==5.1.0