Compare commits

...

3 Commits

SHA1        Message                     Date
e59acefda9  Remove Whoosh               2020-11-02 00:22:40 +00:00
cbc802b7e9  Try Hackernews API twice    2020-11-02 00:17:22 +00:00
4579dfce00  Improve logging             2020-11-02 00:13:43 +00:00
8 changed files with 13 additions and 157 deletions

View File

@@ -1,52 +0,0 @@
-from whoosh.analysis import StemmingAnalyzer, CharsetFilter, NgramFilter
-from whoosh.index import create_in, open_dir, exists_in
-from whoosh.fields import *
-from whoosh.qparser import QueryParser
-from whoosh.support.charset import accent_map
-
-analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(minsize=3)
-
-title_field = TEXT(analyzer=analyzer, stored=True)
-id_field = ID(unique=True, stored=True)
-
-schema = Schema(
-    id=id_field,
-    title=title_field,
-    story=STORED,
-)
-
-ARCHIVE_LOCATION = 'data/archive'
-
-ix = None
-
-def init():
-    global ix
-    if exists_in(ARCHIVE_LOCATION):
-        ix = open_dir(ARCHIVE_LOCATION)
-    else:
-        ix = create_in(ARCHIVE_LOCATION, schema)
-
-def update(story):
-    writer = ix.writer()
-    writer.update_document(
-        id=story['id'],
-        title=story['title'],
-        story=story,
-    )
-    writer.commit()
-
-def get_story(sid):
-    with ix.searcher() as searcher:
-        result = searcher.document(id=sid)
-        return result['story'] if result else None
-
-def search(search):
-    with ix.searcher() as searcher:
-        query = QueryParser('title', ix.schema).parse(search)
-        results = searcher.search(query)
-        stories = [r['story'] for r in results]
-        for s in stories:
-            s.pop('text', '')
-            s.pop('comments', '')
-        return stories

View File

@@ -10,7 +10,6 @@ from bs4 import BeautifulSoup
 from feeds import hackernews, reddit, tildes, manual
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-ARCHIVE_API = 'https://archive.fo/submit/'
 READ_API = 'http://127.0.0.1:33843'
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -72,8 +71,6 @@ def get_content_type(url):
 def update_story(story, is_manual=False):
     res = {}
 
-    logging.info('Updating story ' + str(story['ref']))
-
     if story['source'] == 'hackernews':
         res = hackernews.story(story['ref'])
     elif story['source'] == 'reddit':

View File

@@ -25,6 +25,16 @@ def api(route, ref=None):
         return r.json()
     except KeyboardInterrupt:
         raise
+    except BaseException as e:
+        logging.error('Problem hitting hackernews API: {}, trying again'.format(str(e)))
+
+    try:
+        r = requests.get(route(ref), timeout=15)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
     except BaseException as e:
         logging.error('Problem hitting hackernews API: {}'.format(str(e)))
         return False

View File

@@ -1,26 +0,0 @@
-import shelve
-
-import archive
-
-archive.init()
-
-#with shelve.open('data/data') as db:
-#    to_delete = []
-#
-#    for s in db.values():
-#        if 'title' in s:
-#            archive.update(s)
-#        if 'id' in s:
-#            to_delete.append(s['id'])
-#
-#    for id in to_delete:
-#        del db[id]
-#
-#    for s in db['news_cache'].values():
-#        if 'title' in s:
-#            archive.update(s)
-
-#with shelve.open('data/whoosh') as db:
-#    for s in db['news_cache'].values():
-#        if 'title' in s and not archive.get_story(s['id']):
-#            archive.update(s)

View File

@@ -1,74 +0,0 @@
-import archive
-import database
-import search
-import json
-import requests
-
-database.init()
-archive.init()
-search.init()
-
-count = 0
-
-def database_del_story_by_ref(ref):
-    try:
-        session = database.Session()
-        session.query(database.Story).filter(database.Story.ref==ref).delete()
-        session.commit()
-    except:
-        session.rollback()
-        raise
-    finally:
-        session.close()
-
-def search_del_story(sid):
-    try:
-        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
-        if r.status_code != 202:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
-        return False
-
-with archive.ix.searcher() as searcher:
-    print('count all', searcher.doc_count_all())
-    print('count', searcher.doc_count())
-
-    for doc in searcher.documents():
-        try:
-            print('num', count, 'id', doc['id'])
-            count += 1
-            story = doc['story']
-            story.pop('img', None)
-
-            if 'reddit.com/r/technology' in story['link']:
-                print('skipping r/technology')
-                continue
-
-            try:
-                database.put_story(story)
-            except database.IntegrityError:
-                print('collision!')
-                old_story = database.get_story_by_ref(story['ref'])
-                old_story = json.loads(old_story.full_json)
-
-                if story['num_comments'] > old_story['num_comments']:
-                    print('more comments, replacing')
-                    database_del_story_by_ref(story['ref'])
-                    database.put_story(story)
-                    search_del_story(old_story['id'])
-                else:
-                    print('fewer comments, skipping')
-                    continue
-
-            search.put_story(story)
-            print()
-
-        except KeyboardInterrupt:
-            break
-        except BaseException as e:
-            print('skipping', doc['id'])
-            print('reason:', e)

View File

@@ -25,6 +25,5 @@ urllib3==1.25.9
 webencodings==0.5.1
 websocket-client==0.57.0
 Werkzeug==1.0.1
-Whoosh==2.7.4
 zope.event==4.4
 zope.interface==5.1.0

View File

@@ -175,6 +175,8 @@ def feed_thread():
                 except AttributeError:
                     story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
 
+                logging.info('Updating story: ' + str(story['ref']) + ', index: ' + str(news_index))
+
                 valid = feed.update_story(story)
                 if valid:
                     database.put_story(story)
@@ -183,7 +185,7 @@ def feed_thread():
                     database.del_ref(item['ref'])
                     logging.info('Removed ref {}'.format(item['ref']))
             else:
-                logging.info('Skipping index')
+                logging.info('Skipping index: ' + str(news_index))
 
             gevent.sleep(6)