Integrate with external MeiliSearch server

2020-06-27 22:53:39 +00:00
parent f46cafdc90
commit d614ad0743
4 changed files with 156 additions and 41 deletions
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -14,10 +14,10 @@ class Story(Base):
    __tablename__ = 'stories'

    sid = Column(String(16), primary_key=True)
+    ref = Column(String(16), unique=True)
    meta_json = Column(String)
    full_json = Column(String)
    title = Column(String)
-    date = Column(Integer)

 class Reflist(Base):
    __tablename__ = 'reflist'
@@ -25,6 +25,7 @@ class Reflist(Base):
    rid = Column(Integer, primary_key=True)
    ref = Column(String(16), unique=True)
    sid = Column(String, ForeignKey('stories.sid'), unique=True)
+    source = Column(String(16))

 def init():
    Base.metadata.create_all(engine)
@@ -34,6 +35,7 @@ def get_story(sid):
    return session.query(Story).get(sid)

 def put_story(story):
+    story = story.copy()
    full_json = json.dumps(story)

    story.pop('text', None)
@@ -44,10 +46,10 @@ def put_story(story):
        session = Session()
        s = Story(
            sid=story['id'],
+            ref=story['ref'],
            full_json=full_json,
            meta_json=meta_json,
            title=story.get('title', None),
-            date=story.get('date', None),
        )
        session.merge(s)
        session.commit()
@@ -57,14 +59,14 @@ def put_story(story):
    finally:
        session.close() 

-def search(q):
+def get_story_by_ref(ref):  # look up a single story by its ref (replaces the old title search)
    session = Session()
-    return session.query(Story).filter(Story.title.contains(q))
+    return session.query(Story).filter(Story.ref==ref).first()  # first matching Story row, or None

 def get_reflist(amount):
    session = Session()
    q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
-    return [dict(ref=x.ref, sid=x.sid) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]  # highest rid first, as plain dicts

 def get_stories(amount):
    session = Session()
@@ -75,10 +77,10 @@ def get_stories(amount):
            limit(amount)
    return [x[1] for x in q]

-def put_ref(ref, sid):
+def put_ref(ref, sid, source):
    try:
        session = Session()
-        r = Reflist(ref=ref, sid=sid)
+        r = Reflist(ref=ref, sid=sid, source=source)
        session.add(r)
        session.commit()
    except:
@@ -101,4 +103,4 @@ def del_ref(ref):
 if __name__ == '__main__':
    init()

-    print(get_stories(5))
+    print(get_story_by_ref('hgi3sy'))  # ad-hoc manual check of the by-ref lookup
--- a/apiserver/migrate-whoosh-to-sqlite.py
+++ b/apiserver/migrate-whoosh-to-sqlite.py
@@ -1,21 +1,67 @@
 import archive
 import database
+import search
+
 import json
+import requests

 database.init()
 archive.init()
+search.init()
+
+count = 0
+
+def database_del_story_by_ref(ref):  # delete every Story row whose ref equals `ref`; errors roll back and re-raise
+    session = database.Session()  # created before `try` so rollback()/close() never touch an unbound name
+    try:
+        session.query(database.Story).filter(database.Story.ref==ref).delete()
+        session.commit()
+    except:
+        session.rollback()
+        raise
+    finally:
+        session.close()
+
+def search_del_story(sid):
+    """Delete document `sid` from the 'qotnews' MeiliSearch index; return response JSON, or False on failure."""
+    import logging  # this script has no module-level `import logging`; without it the handler raised NameError
+    try:
+        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
+        if r.status_code != 202:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except Exception as e:  # not BaseException: KeyboardInterrupt/SystemExit propagate on their own
+        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
+        return False

 with archive.ix.searcher() as searcher:
-    for docnum in searcher.document_numbers():
-        try:
-            #if docnum > 500:
-            #    break
+    print('count all', searcher.doc_count_all())
+    print('count', searcher.doc_count())

-            print('docnum', docnum)
-            res = searcher.stored_fields(docnum)
-            print('id', res['id'])
-            database.put_story(res['story'])
+    for doc in searcher.documents():
+        try:
+            print('num', count, 'id', doc['id'])
+            count += 1
+
+            try:
+                database.put_story(doc['story'])
+            except database.IntegrityError:  # presumably: this ref is already stored under another id (Story.ref is unique)
+                print('collision!')
+                old_story = database.get_story_by_ref(doc['story']['ref'])
+                story = json.loads(old_story.full_json)
+                if doc['story']['num_comments'] > story['num_comments']:  # keep the copy with more comments
+                    print('more comments, replacing')
+                    database_del_story_by_ref(doc['story']['ref'])
+                    database.put_story(doc['story'])
+                    search_del_story(story['id'])  # remove the replaced story's search document (keyed by its old id)
+                else:
+                    print('fewer comments, skipping')
+                    continue  # stored copy wins; do not index this one
+
+            search.put_story(doc['story'])
            print()
+        except KeyboardInterrupt:
+            break
        except BaseException as e:
-            print('skipping', docnum)
+            print('skipping', doc['id'])
            print('reason:', e)
--- a/apiserver/search.py
+++ b/apiserver/search.py
@@ -0,0 +1,57 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+MEILI_URL = 'http://127.0.0.1:7700/'  # MeiliSearch base URL; paths are appended directly, so keep the trailing slash
+
+def create_index():
+    """Create the 'qotnews' index; return the response JSON, or False (after logging) on any failure."""
+    try:
+        payload = dict(name='qotnews', uid='qotnews')  # not named `json`, which reads like the stdlib module
+        r = requests.post(MEILI_URL + 'indexes', json=payload, timeout=2)
+        if r.status_code != 201:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    # Exception rather than BaseException: KeyboardInterrupt, SystemExit and the like propagate untouched.
+    except Exception as e:
+        logging.error('Problem creating MeiliSearch index: {}'.format(str(e)))
+        return False
+
+def init():  # start-up hook: attempts to create the 'qotnews' index (failures are only logged)
+    create_index()
+
+def put_story(story):
+    """Send `story` (minus 'text' and 'comments') to the 'qotnews' index; return response JSON, or False on failure."""
+    story = story.copy()  # shallow copy so the caller's dict keeps its 'text' and 'comments'
+    story.pop('text', None)
+    story.pop('comments', None)
+    try:
+        r = requests.post(MEILI_URL + 'indexes/qotnews/documents', json=[story], timeout=2)
+        if r.status_code != 202:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    # Exception rather than BaseException: KeyboardInterrupt, SystemExit and the like propagate untouched.
+    except Exception as e:
+        logging.error('Problem putting MeiliSearch story: {}'.format(str(e)))
+        return False
+
+def search(q):
+    """Query the 'qotnews' index for `q` (limit 250); return the list of hits, or False (after logging) on failure."""
+    try:
+        params = dict(q=q, limit=250)
+        r = requests.get(MEILI_URL + 'indexes/qotnews/search', params=params, timeout=2)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()['hits']
+    # Exception rather than BaseException: KeyboardInterrupt, SystemExit and the like propagate untouched.
+    except Exception as e:
+        logging.error('Problem searching MeiliSearch: {}'.format(str(e)))
+        return False
+    
+if __name__ == '__main__':  # manual smoke test: create the index, then print hits for a sample query
+    create_index()
+
+    print(search('the'))
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -11,6 +11,7 @@ import time
 from urllib.parse import urlparse, parse_qs

 import database
+import search
 import feed
 from utils import gen_rand_id

@@ -25,6 +26,7 @@ from gevent.pywsgi import WSGIServer
 monkey.patch_all()

 database.init()
+search.init()

 FEED_LENGTH = 75
 news_index = 0
@@ -48,16 +50,13 @@ def api():
    return res

@flask_app.route('/api/search', strict_slashes=False)
-def search():
+def apisearch():  # renamed so the view no longer shadows the imported `search` module
    q = request.args.get('q', '')
    if len(q) >= 3:
-        results = [x.meta_json for x in database.search(q)]
+        results = search.search(q) or []  # search() returns False on failure; keep 'results' a list
    else:
        results = []
-    # hacky nested json
-    res = Response('{"results":[' + ','.join(results) + ']}')
-    res.headers['content-type'] = 'application/json'
-    return res
+    return dict(results=results)

@flask_app.route('/api/submit', methods=['POST'], strict_slashes=False)
 def submit():
@@ -75,19 +74,24 @@ def submit():
        elif 'reddit.com' in parse.hostname and 'comments' in url:
            source = 'reddit'
            ref = parse.path.split('/')[4]
+        elif 'news.t0.vc' in parse.hostname:
+            raise Exception('Invalid article')
        else:
            source = 'manual'
            ref = url

-        # TODO: return existing refs
-
-        story = dict(id=nid, ref=ref, source=source)
-        valid = feed.update_story(story, is_manual=True)
-        if valid:
-            database.put_story(story)
-            return {'nid': nid}
+        existing = database.get_story_by_ref(ref)
+        if existing:
+            return {'nid': existing.sid}
        else:
-            raise Exception('Invalid article')
+            story = dict(id=nid, ref=ref, source=source)
+            valid = feed.update_story(story, is_manual=True)
+            if valid:
+                database.put_story(story)
+                search.put_story(story)
+                return {'nid': nid}
+            else:
+                raise Exception('Invalid article')

    except BaseException as e:
        logging.error('Problem with article submission: {} - {}'.format(e.__class__.__name__, str(e)))
@@ -148,31 +152,37 @@ def feed_thread():

    try:
        while True:
-            ref_list = database.get_reflist(FEED_LENGTH)
-
            # onboard new stories
            if news_index == 0:
                for ref, source in feed.list():
+                    if database.get_story_by_ref(ref):
+                        continue
                    try:
                        nid = new_id()
-                        database.put_ref(ref, nid)
-                        database.put_story(dict(id=nid, ref=ref, source=source))
+                        database.put_ref(ref, nid, source)
                        logging.info('Added ref ' + ref)
                    except database.IntegrityError:
                        continue

+            ref_list = database.get_reflist(FEED_LENGTH)
+
            # update current stories
            if news_index < len(ref_list):
-                update_ref = ref_list[news_index]['ref']
-                update_sid = ref_list[news_index]['sid']
-                story_json = database.get_story(update_sid).full_json
-                story = json.loads(story_json)
+                item = ref_list[news_index]
+
+                try:
+                    story_json = database.get_story(item['sid']).full_json
+                    story = json.loads(story_json)
+                except AttributeError:
+                    story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
+
                valid = feed.update_story(story)
                if valid:
                    database.put_story(story)
+                    search.put_story(story)
                else:
-                    database.del_ref(update_ref)
-                    logging.info('Removed ref {}'.format(update_ref))
+                    database.del_ref(item['ref'])
+                    logging.info('Removed ref {}'.format(item['ref']))

            gevent.sleep(6)