diff --git a/apiserver/data/data.db b/apiserver/data/data.db new file mode 100644 index 0000000..2199644 Binary files /dev/null and b/apiserver/data/data.db differ diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py index b8fc8a7..691533a 100644 --- a/apiserver/feeds/hackernews.py +++ b/apiserver/feeds/hackernews.py @@ -1,3 +1,8 @@ +import logging +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + level=logging.INFO) + import requests API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json' @@ -7,11 +12,17 @@ SITE_LINK = lambda x : 'https://news.ycombinator.com/item?id={}'.format(x) SITE_AUTHOR_LINK = lambda x : 'https://news.ycombinator.com/user?id={}'.format(x) def api(route, ref=None): - r = requests.get(route(ref), timeout=5) - return r.json() + try: + r = requests.get(route(ref), timeout=5) + if r.status_code != 200: + raise + return r.json() + except BaseException as e: + logging.error('Problem hitting hackernews API: {}'.format(str(e))) + return False def feed(): - return api(API_TOPSTORIES)[:30] + return api(API_TOPSTORIES)[:30] or [] def comment(i): c = {} @@ -29,6 +40,7 @@ def comment_count(i): def story(ref): r = api(API_ITEM, ref) + if not r: return False if 'deleted' in r: return False diff --git a/apiserver/server.py b/apiserver/server.py index 1f43635..cd25afe 100644 --- a/apiserver/server.py +++ b/apiserver/server.py @@ -8,6 +8,7 @@ import threading import time import random import requests +import shelve import string from feeds import hackernews @@ -15,12 +16,16 @@ from flask import abort, Flask, request from flask_cors import CORS CACHE_LENGTH = 300 +DATA_FILE = 'data/data' READ_API = 'http://127.0.0.1:33843' news_index = 0 -news_list = [] -news_ref_to_id = {} -news_cache = {} + +with shelve.open(DATA_FILE) as db: + logging.info('Reading caches from disk...') + news_list = db.get('news_list', []) + news_ref_to_id = db.get('news_ref_to_id', {}) + news_cache = db.get('news_cache', {}) flask_app = Flask(__name__) cors = CORS(flask_app) @@ -38,58 +43,82 @@ def index(): def comments(id): if id in news_cache: return {'story': news_cache[id]} - else: - abort(404) + + with shelve.open(DATA_FILE) as db: + if id in db: + return {'story': db[id]} + + abort(404) print('Starting Flask...') web_thread = threading.Thread(target=flask_app.run, kwargs={'port': 33842}) web_thread.setDaemon(True) web_thread.start() -def new_id(): +def gen_rand_id(): return ''.join(random.choice(string.ascii_uppercase) for _ in range(4)) +def new_id(): + nid = gen_rand_id() + with shelve.open(DATA_FILE) as db: + while nid in news_cache or nid in db: + nid = gen_rand_id() + return nid + def get_article(url): try: r = requests.post(READ_API, data=dict(url=url), timeout=10) - if r.status_code != 200: raise - return r.text - except: - return '
Problem parsing article :(
' + except BaseException as e: + logging.error('Problem getting article: {}'.format(str(e))) + return '' -while True: - if news_index == 0: - feed = hackernews.feed() - new_refs = [ref for ref in feed if ref not in news_list] - for ref in new_refs: - news_list.insert(0, ref) - nid = new_id() - news_ref_to_id[ref] = nid - news_cache[nid] = dict(id=nid, ref=ref) +try: + while True: + if news_index == 0: + feed = hackernews.feed() + new_refs = [ref for ref in feed if ref not in news_list] + for ref in new_refs: + news_list.insert(0, ref) + nid = new_id() + news_ref_to_id[ref] = nid + news_cache[nid] = dict(id=nid, ref=ref) - if len(new_refs): - logging.info('Added {} new refs.'.format(len(new_refs))) + if len(new_refs): + logging.info('Added {} new refs.'.format(len(new_refs))) - while len(news_list) > CACHE_LENGTH: - old_ref = news_list.pop() - del news_cache[news_ref_to_id[old_ref]] - del news_ref_to_id[old_ref] - logging.info('Removed ref {}.'.format(old_ref)) + while len(news_list) > CACHE_LENGTH: + old_ref = news_list.pop() + old_story = news_cache.pop(news_ref_to_id[old_ref]) + old_id = news_ref_to_id.pop(old_ref) + logging.info('Removed ref {} id {}.'.format(old_ref, old_id)) + if old_story and old_id: + with shelve.open(DATA_FILE) as db: + db[old_id] = old_story - if news_index < len(news_list): - update_ref = news_list[news_index] - update_id = news_ref_to_id[update_ref] - news_story = news_cache[update_id] - story = hackernews.story(update_ref) - if story: - news_story.update(story) - if news_story.get('url', '') and not news_story.get('text', ''): - news_story['text'] = get_article(news_story['url']) + if news_index < len(news_list): + update_ref = news_list[news_index] + update_id = news_ref_to_id[update_ref] + news_story = news_cache[update_id] + story = hackernews.story(update_ref) + if story: + news_story.update(story) + if news_story.get('url', '') and not news_story.get('text', ''): + if not news_story['url'].endswith('.pdf'): + news_story['text'] = get_article(news_story['url']) + else: + news_story['text'] = 'Unsupported article type.
' - time.sleep(1) + time.sleep(1) - news_index += 1 - if news_index == CACHE_LENGTH: news_index = 0 + news_index += 1 + if news_index == CACHE_LENGTH: news_index = 0 + +finally: + with shelve.open(DATA_FILE) as db: + logging.info('Writing caches to disk...') + db['news_list'] = news_list + db['news_ref_to_id'] = news_ref_to_id + db['news_cache'] = news_cache