Add MeiliSearch-backed search and look up stories by ref.

Summary of what this patch does (text before the first "diff --git" line
is ignored by patch tools):

  * database.py: Story gains a unique `ref` column and drops `date`;
    Reflist gains `source`; the title-substring `search()` is replaced by
    `get_story_by_ref()`; `put_story()` copies its argument before popping
    keys; `put_ref()` takes a `source` argument.
  * search.py (new): wrappers around the MeiliSearch HTTP API at
    http://127.0.0.1:7700/ (create index, put story, search). Each wrapper
    logs the problem and returns False on failure.
  * migrate-whoosh-to-sqlite.py: also pushes every story to MeiliSearch; on
    a database IntegrityError it keeps whichever story has more comments.
  * server.py: /api/search queries MeiliSearch; /api/submit returns the
    existing story when the ref is already stored; the feed thread skips
    refs that already have a story and builds a story from the reflist
    entry when no story row exists yet.

Review fix: migrate-whoosh-to-sqlite.py now imports `logging`, because
search_del_story() calls logging.error(); without the import a failed
delete would raise NameError inside the exception handler.

diff --git a/apiserver/database.py b/apiserver/database.py
index 826baa4..24582c6 100644
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -14,10 +14,10 @@ class Story(Base):
     __tablename__ = 'stories'
 
     sid = Column(String(16), primary_key=True)
+    ref = Column(String(16), unique=True)
     meta_json = Column(String)
     full_json = Column(String)
     title = Column(String)
-    date = Column(Integer)
 
 class Reflist(Base):
     __tablename__ = 'reflist'
@@ -25,6 +25,7 @@ class Reflist(Base):
     rid = Column(Integer, primary_key=True)
     ref = Column(String(16), unique=True)
     sid = Column(String, ForeignKey('stories.sid'), unique=True)
+    source = Column(String(16))
 
 def init():
     Base.metadata.create_all(engine)
@@ -34,6 +35,7 @@ def get_story(sid):
     return session.query(Story).get(sid)
 
 def put_story(story):
+    story = story.copy()
     full_json = json.dumps(story)
 
     story.pop('text', None)
@@ -44,10 +46,10 @@ def put_story(story):
         session = Session()
         s = Story(
             sid=story['id'],
+            ref=story['ref'],
             full_json=full_json,
             meta_json=meta_json,
             title=story.get('title', None),
-            date=story.get('date', None),
         )
         session.merge(s)
         session.commit()
@@ -57,14 +59,14 @@ def put_story(story):
     finally:
         session.close()
 
-def search(q):
+def get_story_by_ref(ref):
     session = Session()
-    return session.query(Story).filter(Story.title.contains(q))
+    return session.query(Story).filter(Story.ref==ref).first()
 
 def get_reflist(amount):
     session = Session()
     q = session.query(Reflist).order_by(Reflist.rid.desc()).limit(amount)
-    return [dict(ref=x.ref, sid=x.sid) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
 
 def get_stories(amount):
     session = Session()
@@ -75,10 +77,10 @@ def get_stories(amount):
             limit(amount)
     return [x[1] for x in q]
 
-def put_ref(ref, sid):
+def put_ref(ref, sid, source):
     try:
         session = Session()
-        r = Reflist(ref=ref, sid=sid)
+        r = Reflist(ref=ref, sid=sid, source=source)
         session.add(r)
         session.commit()
     except:
@@ -101,4 +103,4 @@ def del_ref(ref):
 
 if __name__ == '__main__':
     init()
-    print(get_stories(5))
+    print(get_story_by_ref('hgi3sy'))
diff --git a/apiserver/migrate-whoosh-to-sqlite.py b/apiserver/migrate-whoosh-to-sqlite.py
index 430d62b..410e043 100644
--- a/apiserver/migrate-whoosh-to-sqlite.py
+++ b/apiserver/migrate-whoosh-to-sqlite.py
@@ -1,21 +1,68 @@
 import archive
 import database
+import search
+
 import json
+import logging
+import requests
 
 database.init()
 archive.init()
+search.init()
+
+count = 0
+
+def database_del_story_by_ref(ref):
+    try:
+        session = database.Session()
+        session.query(database.Story).filter(database.Story.ref==ref).delete()
+        session.commit()
+    except:
+        session.rollback()
+        raise
+    finally:
+        session.close()
+
+def search_del_story(sid):
+    try:
+        r = requests.delete(search.MEILI_URL + 'indexes/qotnews/documents/'+sid, timeout=2)
+        if r.status_code != 202:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem deleting MeiliSearch story: {}'.format(str(e)))
+        return False
 
 with archive.ix.searcher() as searcher:
-    for docnum in searcher.document_numbers():
-        try:
-            #if docnum > 500:
-            #    break
+    print('count all', searcher.doc_count_all())
+    print('count', searcher.doc_count())
 
-            print('docnum', docnum)
-            res = searcher.stored_fields(docnum)
-            print('id', res['id'])
-            database.put_story(res['story'])
+    for doc in searcher.documents():
+        try:
+            print('num', count, 'id', doc['id'])
+            count += 1
+
+            try:
+                database.put_story(doc['story'])
+            except database.IntegrityError:
+                print('collision!')
+                old_story = database.get_story_by_ref(doc['story']['ref'])
+                story = json.loads(old_story.full_json)
+                if doc['story']['num_comments'] > story['num_comments']:
+                    print('more comments, replacing')
+                    database_del_story_by_ref(doc['story']['ref'])
+                    database.put_story(doc['story'])
+                    search_del_story(story['id'])
+                else:
+                    print('fewer comments, skipping')
+                    continue
+
+            search.put_story(doc['story'])
             print()
+        except KeyboardInterrupt:
+            break
         except BaseException as e:
-            print('skipping', docnum)
+            print('skipping', doc['id'])
             print('reason:', e)
diff --git a/apiserver/search.py b/apiserver/search.py
new file mode 100644
index 0000000..95341aa
--- /dev/null
+++ b/apiserver/search.py
@@ -0,0 +1,57 @@
+import logging
+logging.basicConfig(
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    level=logging.DEBUG)
+
+import requests
+
+MEILI_URL = 'http://127.0.0.1:7700/'
+
+def create_index():
+    try:
+        json = dict(name='qotnews', uid='qotnews')
+        r = requests.post(MEILI_URL + 'indexes', json=json, timeout=2)
+        if r.status_code != 201:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem creating MeiliSearch index: {}'.format(str(e)))
+        return False
+
+def init():
+    create_index()
+
+def put_story(story):
+    story = story.copy()
+    story.pop('text', None)
+    story.pop('comments', None)
+    try:
+        r = requests.post(MEILI_URL + 'indexes/qotnews/documents', json=[story], timeout=2)
+        if r.status_code != 202:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem putting MeiliSearch story: {}'.format(str(e)))
+        return False
+
+def search(q):
+    try:
+        params = dict(q=q, limit=250)
+        r = requests.get(MEILI_URL + 'indexes/qotnews/search', params=params, timeout=2)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()['hits']
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem searching MeiliSearch: {}'.format(str(e)))
+        return False
+
+if __name__ == '__main__':
+    create_index()
+
+    print(search('the'))
diff --git a/apiserver/server.py b/apiserver/server.py
index 64efa2a..659ced9 100644
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -11,6 +11,7 @@ import time
 from urllib.parse import urlparse, parse_qs
 
 import database
+import search
 import feed
 from utils import gen_rand_id
 
@@ -25,6 +26,7 @@ from gevent.pywsgi import WSGIServer
 monkey.patch_all()
 
 database.init()
+search.init()
 
 FEED_LENGTH = 75
 news_index = 0
@@ -48,16 +50,13 @@ def api():
     return res
 
 @flask_app.route('/api/search', strict_slashes=False)
-def search():
+def apisearch():
     q = request.args.get('q', '')
     if len(q) >= 3:
-        results = [x.meta_json for x in database.search(q)]
+        results = search.search(q)
     else:
         results = []
-    # hacky nested json
-    res = Response('{"results":[' + ','.join(results) + ']}')
-    res.headers['content-type'] = 'application/json'
-    return res
+    return dict(results=results)
 
 @flask_app.route('/api/submit', methods=['POST'], strict_slashes=False)
 def submit():
@@ -75,19 +74,24 @@ def submit():
         elif 'reddit.com' in parse.hostname and 'comments' in url:
             source = 'reddit'
             ref = parse.path.split('/')[4]
+        elif 'news.t0.vc' in parse.hostname:
+            raise Exception('Invalid article')
         else:
             source = 'manual'
             ref = url
 
-        # TODO: return existing refs
-
-        story = dict(id=nid, ref=ref, source=source)
-        valid = feed.update_story(story, is_manual=True)
-        if valid:
-            database.put_story(story)
-            return {'nid': nid}
+        existing = database.get_story_by_ref(ref)
+        if existing:
+            return {'nid': existing.sid}
         else:
-            raise Exception('Invalid article')
+            story = dict(id=nid, ref=ref, source=source)
+            valid = feed.update_story(story, is_manual=True)
+            if valid:
+                database.put_story(story)
+                search.put_story(story)
+                return {'nid': nid}
+            else:
+                raise Exception('Invalid article')
 
     except BaseException as e:
         logging.error('Problem with article submission: {} - {}'.format(e.__class__.__name__, str(e)))
@@ -148,31 +152,37 @@ def feed_thread():
 
     try:
         while True:
-            ref_list = database.get_reflist(FEED_LENGTH)
-
             # onboard new stories
             if news_index == 0:
                 for ref, source in feed.list():
+                    if database.get_story_by_ref(ref):
+                        continue
                     try:
                         nid = new_id()
-                        database.put_ref(ref, nid)
-                        database.put_story(dict(id=nid, ref=ref, source=source))
+                        database.put_ref(ref, nid, source)
                         logging.info('Added ref ' + ref)
                     except database.IntegrityError:
                         continue
 
+            ref_list = database.get_reflist(FEED_LENGTH)
+
             # update current stories
             if news_index < len(ref_list):
-                update_ref = ref_list[news_index]['ref']
-                update_sid = ref_list[news_index]['sid']
-                story_json = database.get_story(update_sid).full_json
-                story = json.loads(story_json)
+                item = ref_list[news_index]
+
+                try:
+                    story_json = database.get_story(item['sid']).full_json
+                    story = json.loads(story_json)
+                except AttributeError:
+                    story = dict(id=item['sid'], ref=item['ref'], source=item['source'])
+
                 valid = feed.update_story(story)
                 if valid:
                     database.put_story(story)
+                    search.put_story(story)
                 else:
-                    database.del_ref(update_ref)
-                    logging.info('Removed ref {}'.format(update_ref))
+                    database.del_ref(item['ref'])
+                    logging.info('Removed ref {}'.format(item['ref']))
 
             gevent.sleep(6)
 