Write news stories to disk

2019-08-24 05:07:16 +00:00
parent dde6ac4566
commit c1a81a4d8c
3 changed files with 82 additions and 41 deletions
@@ -1,3 +1,8 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.INFO)
+
 import requests

 API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
@@ -7,11 +12,17 @@ SITE_LINK = lambda x : 'https://news.ycombinator.com/item?id={}'.format(x)
 SITE_AUTHOR_LINK = lambda x : 'https://news.ycombinator.com/user?id={}'.format(x)

 def api(route, ref=None):
+    try:
        r = requests.get(route(ref), timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code {}'.format(r.status_code))  # bare `raise` has no active exception here
        return r.json()
+    except Exception as e:  # not BaseException: KeyboardInterrupt/SystemExit must propagate
+        logging.error('Problem hitting hackernews API: {}'.format(str(e)))
+        return False

 def feed():
-    return api(API_TOPSTORIES)[:30]
+    return (api(API_TOPSTORIES) or [])[:30]  # api() returns False on failure; slicing False raises TypeError

 def comment(i):
    c = {}
@@ -29,6 +40,7 @@ def comment_count(i):

 def story(ref):
    r = api(API_ITEM, ref)
+    if not r: return False

    if 'deleted' in r:
        return False
@@ -8,6 +8,7 @@ import threading
 import time
 import random
 import requests
+import shelve
 import string

 from feeds import hackernews
@@ -15,12 +16,16 @@ from flask import abort, Flask, request
 from flask_cors import CORS

 CACHE_LENGTH = 300
+DATA_FILE = 'data/data'
 READ_API = 'http://127.0.0.1:33843'

 news_index = 0
-news_list = []
-news_ref_to_id = {}
-news_cache = {}
+
+with shelve.open(DATA_FILE) as db:
+    logging.info('Reading caches from disk...')
+    news_list = db.get('news_list', [])
+    news_ref_to_id = db.get('news_ref_to_id', {})
+    news_cache = db.get('news_cache', {})

 flask_app = Flask(__name__)
 cors = CORS(flask_app)
@@ -38,7 +43,11 @@ def index():
 def comments(id):
    if id in news_cache:
        return {'story': news_cache[id]}
-    else:
+
+    with shelve.open(DATA_FILE) as db:
+        if id not in ('news_list', 'news_ref_to_id', 'news_cache') and id in db:  # those keys hold cache snapshots in this shelf, not stories
+            return {'story': db[id]}
+
    abort(404)

 print('Starting Flask...')
@@ -46,20 +55,27 @@ web_thread = threading.Thread(target=flask_app.run, kwargs={'port': 33842})
 web_thread.setDaemon(True)
 web_thread.start()

-def new_id():
+def gen_rand_id():
    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))

+def new_id():
+    nid = gen_rand_id()
+    with shelve.open(DATA_FILE) as db:
+        while nid in news_cache or nid in db:
+            nid = gen_rand_id()
+    return nid
+
 def get_article(url):
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=10)
-
        if r.status_code != 200:
-            raise
+            raise Exception('Bad response code {}'.format(r.status_code))  # bare `raise` has no active exception here
-
        return r.text
-    except:
-        return '<p>Problem parsing article :(</p>'
+    except Exception as e:  # not BaseException: KeyboardInterrupt/SystemExit must propagate
+        logging.error('Problem getting article: {}'.format(str(e)))
+        return ''

+try:
    while True:
        if news_index == 0:
            feed = hackernews.feed()
@@ -75,9 +91,12 @@ while True:

            while len(news_list) > CACHE_LENGTH:
                old_ref = news_list.pop()
-            del news_cache[news_ref_to_id[old_ref]]
-            del news_ref_to_id[old_ref]
-            logging.info('Removed ref {}.'.format(old_ref))
+                old_story = news_cache.pop(news_ref_to_id[old_ref])
+                old_id = news_ref_to_id.pop(old_ref)
+                logging.info('Removed ref {} id {}.'.format(old_ref, old_id))
+                if old_story and old_id:
+                    with shelve.open(DATA_FILE) as db:
+                        db[old_id] = old_story

        if news_index < len(news_list):
            update_ref = news_list[news_index]
@@ -87,9 +106,19 @@ while True:
            if story:
                news_story.update(story)
            if news_story.get('url', '') and not news_story.get('text', ''):
+                if not news_story['url'].endswith('.pdf'):
                    news_story['text'] = get_article(news_story['url'])
+                else:
+                    news_story['text'] = '<p>Unsupported article type.</p>'

        time.sleep(1)

        news_index += 1
        if news_index == CACHE_LENGTH: news_index = 0
+
+finally:
+    with shelve.open(DATA_FILE) as db:
+        logging.info('Writing caches to disk...')
+        db['news_list'] = news_list
+        db['news_ref_to_id'] = news_ref_to_id
+        db['news_cache'] = news_cache