From 2822974b6e3b34c538d5f519c1c48e293225232e Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Sun, 15 Dec 2019 22:47:33 +0000 Subject: [PATCH] Stop using archive.is on articles (hits CAPTCHAs) --- apiserver/feed.py | 25 +++---------------------- apiserver/server.py | 2 +- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/apiserver/feed.py b/apiserver/feed.py index 55a1f2d..7b64535 100644 --- a/apiserver/feed.py +++ b/apiserver/feed.py @@ -13,9 +13,8 @@ OUTLINE_API = 'https://outlineapi.com/article' ARCHIVE_API = 'https://archive.fo/submit/' READ_API = 'http://127.0.0.1:33843' -ARCHIVE_FIRST = ['bloomberg.com', 'wsj.com'] INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif'] -INVALID_DOMAINS = ['youtube.com'] +INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] TWO_DAYS = 60*60*24*2 def list(): @@ -26,24 +25,6 @@ def list(): return feed def get_article(url): - if any([domain in url for domain in ARCHIVE_FIRST]): - try: - logging.info('Article from {}, archiving first...'.format(url)) - data = {'submitid': '9tjtS1EYe5wy8AJiYgVfH9P97uHU1IHG4lO67hsQpHOC3KKJrhqVIoQG2U7Rg%2Fpr', 'url': url} - r = requests.post(ARCHIVE_API, data=data, timeout=20, allow_redirects=False) - if r.status_code == 200: - logging.info('Submitted for archiving. Skipping to wait...') - return '' - elif 'location' in r.headers: - url = r.headers['location'] - else: - raise Exception('Bad response code ' + str(r.status_code)) - except KeyboardInterrupt: - raise - except BaseException as e: - logging.error('Problem archiving article: {}'.format(str(e))) - return '' - try: params = {'source_url': url} headers = {'Referer': 'https://outline.com/'} @@ -89,7 +70,7 @@ def get_first_image(text): except: return '' -def update_story(story, manual=False): +def update_story(story, is_manual=False): res = {} logging.info('Updating story ' + str(story['ref'])) @@ -109,7 +90,7 @@ def update_story(story, manual=False): logging.info('Article not ready yet') return False - if story['date'] and not manual and story['date'] + TWO_DAYS < time.time(): + if story['date'] and not is_manual and story['date'] + TWO_DAYS < time.time(): logging.info('Article too old, removing') return False diff --git a/apiserver/server.py b/apiserver/server.py index f471391..5c7dfa8 100644 --- a/apiserver/server.py +++ b/apiserver/server.py @@ -113,7 +113,7 @@ def submit(): news_story = dict(id=nid, ref=ref, source=source) news_cache[nid] = news_story - valid = feed.update_story(news_story, manual=True) + valid = feed.update_story(news_story, is_manual=True) if valid: archive.update(news_story) return {'nid': nid}