From 00531472266fc4835052df5069c510a3fbfbd2eb Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Tue, 24 Sep 2019 08:22:06 +0000 Subject: [PATCH] Ignore certain files and domains, remove refs --- apiserver/feed.py | 35 +++++++++++++++++++++++++---------- apiserver/server.py | 25 ++++++++++++++++--------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/apiserver/feed.py b/apiserver/feed.py index 7a42ac9..ef0f60f 100644 --- a/apiserver/feed.py +++ b/apiserver/feed.py @@ -11,11 +11,14 @@ from feeds import hackernews, reddit, tildes OUTLINE_API = 'https://outlineapi.com/article' READ_API = 'http://127.0.0.1:33843' +INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif'] +INVALID_DOMAINS = ['youtube.com'] + def list(): feed = [] feed += [(x, 'hackernews') for x in hackernews.feed()[:10]] - feed += [(x, 'reddit') for x in reddit.feed()[:5]] - feed += [(x, 'tildes') for x in tildes.feed()[:5]] + feed += [(x, 'reddit') for x in reddit.feed()[:10]] + feed += [(x, 'tildes') for x in tildes.feed()[:10]] return feed def get_article(url): @@ -58,17 +61,29 @@ def update_story(story): res = reddit.story(story['ref']) elif story['source'] == 'tildes': res = tildes.story(story['ref']) - else: - return if res: - story.update(res) + story.update(res) # join dicts + else: + logging.info('Article not ready yet') + return False + if story.get('url', '') and not story.get('text', ''): - if not story['url'].endswith('.pdf'): - logging.info('Getting article ' + story['url']) - story['text'] = get_article(story['url']) - else: - story['text'] = '

Unsupported article type.

' + for ext in INVALID_FILES: + if story['url'].endswith(ext): + logging.info('URL invalid file type ({})'.format(ext)) + return False + + for domain in INVALID_DOMAINS: + if domain in story['url']: + logging.info('URL invalid domain ({})'.format(domain)) + return False + + logging.info('Getting article ' + story['url']) + story['text'] = get_article(story['url']) + if not story['text']: return False + + return True if __name__ == '__main__': test_news_cache = {} diff --git a/apiserver/server.py b/apiserver/server.py index 7acb63b..138a1a8 100644 --- a/apiserver/server.py +++ b/apiserver/server.py @@ -25,14 +25,13 @@ with shelve.open(DATA_FILE) as db: news_ref_to_id = db.get('news_ref_to_id', {}) news_cache = db.get('news_cache', {}) - flask_app = Flask(__name__) cors = CORS(flask_app) @flask_app.route('/') def index(): front_page = [news_cache[news_ref_to_id[ref]] for ref in news_list] - front_page = [copy.copy(x) for x in front_page if 'title' in x] + front_page = [copy.copy(x) for x in front_page if 'text' in x and x['text']][:100] for story in front_page: if 'comments' in story: story.pop('comments') if 'text' in story: story.pop('text') @@ -61,6 +60,16 @@ def new_id(): nid = gen_rand_id() return nid +def remove_ref(old_ref, archive=False): + while old_ref in news_list: + news_list.remove(old_ref) + old_story = news_cache.pop(news_ref_to_id[old_ref]) + old_id = news_ref_to_id.pop(old_ref) + logging.info('Removed ref {} id {}.'.format(old_ref, old_id)) + if archive: + with shelve.open(DATA_FILE) as db: + db[old_id] = old_story + try: while True: if news_index == 0: @@ -76,18 +85,16 @@ try: logging.info('Added {} new refs.'.format(len(new_items))) while len(news_list) > CACHE_LENGTH: - old_ref = news_list.pop() - old_story = news_cache.pop(news_ref_to_id[old_ref]) - old_id = news_ref_to_id.pop(old_ref) - logging.info('Removed ref {} id {}.'.format(old_ref, old_id)) - with shelve.open(DATA_FILE) as db: - db[old_id] = old_story + old_ref = news_list[-1] + remove_ref(old_ref, archive=True) if news_index < len(news_list): update_ref = news_list[news_index] update_id = news_ref_to_id[update_ref] news_story = news_cache[update_id] - feed.update_story(news_story) + valid = feed.update_story(news_story) + if not valid: + remove_ref(update_ref) time.sleep(3)