From 6430fe5e9f56538ab1e985dac9f047f82522ac2a Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Thu, 25 Jun 2020 23:36:47 +0000
Subject: [PATCH] Check content-type

---
 apiserver/feed.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 7b64535..33d15b0 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,11 +9,10 @@ from bs4 import BeautifulSoup
 
 from feeds import hackernews, reddit, tildes, manual
 
-OUTLINE_API = 'https://outlineapi.com/article'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 ARCHIVE_API = 'https://archive.fo/submit/'
 READ_API = 'http://127.0.0.1:33843'
 
-INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif']
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 TWO_DAYS = 60*60*24*2
 
@@ -64,12 +63,19 @@ def get_first_image(text):
         first_img = soup.find('img')
         url = first_img['src']
         headers = {'User-Agent': 'Twitterbot/1.0'}
-        length = requests.get(url, headers=headers).headers['Content-length']
+        length = requests.get(url, headers=headers).headers['content-length']
         if int(length) > 1000000: raise
         return url
     except:
         return ''
 
+def get_content_type(url):
+    try:
+        headers = {'User-Agent': 'Twitterbot/1.0'}
+        return requests.get(url, headers=headers).headers['content-type']
+    except:
+        return ''
+
 def update_story(story, is_manual=False):
     res = {}
 
@@ -95,12 +101,14 @@ def update_story(story, is_manual=False):
         return False
 
     if story.get('url', '') and not story.get('text', ''):
-        if any([story['url'].endswith(ext) for ext in INVALID_FILES]):
-            logging.info('URL invalid file type')
+        if not get_content_type(story['url']).startswith('text/'):
+            logging.info('URL invalid file type / content type:')
+            logging.info(story['url'])
             return False
 
         if any([domain in story['url'] for domain in INVALID_DOMAINS]):
-            logging.info('URL invalid domain')
+            logging.info('URL invalid domain:')
+            logging.info(story['url'])
             return False
 
         logging.info('Getting article ' + story['url'])