From 6430fe5e9f56538ab1e985dac9f047f82522ac2a Mon Sep 17 00:00:00 2001
From: Tanner Collin <git@tannercollin.com>
Date: Thu, 25 Jun 2020 23:36:47 +0000
Subject: [PATCH] Check content-type

---
 apiserver/feed.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 7b64535..33d15b0 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,11 +9,10 @@ from bs4 import BeautifulSoup
 
 from feeds import hackernews, reddit, tildes, manual
 
-OUTLINE_API = 'https://outlineapi.com/article'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 ARCHIVE_API = 'https://archive.fo/submit/'
 READ_API = 'http://127.0.0.1:33843'
 
-INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif']
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 TWO_DAYS = 60*60*24*2
 
@@ -64,12 +63,19 @@ def get_first_image(text):
         first_img = soup.find('img')
         url = first_img['src']
         headers = {'User-Agent': 'Twitterbot/1.0'}
-        length = requests.get(url, headers=headers).headers['Content-length']
+        length = requests.get(url, headers=headers).headers['content-length']
         if int(length) > 1000000: raise
         return url
     except:
         return ''
 
+def get_content_type(url):  # returns the response's Content-Type header, or '' on any failure
+    try:  # best-effort lookup: callers treat '' as "not a text/* page"
+        headers = {'User-Agent': 'Twitterbot/1.0'}
+        return requests.get(url, headers=headers, timeout=10).headers.get('content-type', '')
+    except Exception:  # not a bare except, so KeyboardInterrupt/SystemExit still propagate
+        return ''
+
 def update_story(story, is_manual=False):
     res = {}
 
@@ -95,12 +101,14 @@ def update_story(story, is_manual=False):
         return False
 
     if story.get('url', '') and not story.get('text', ''):
-        if any([story['url'].endswith(ext) for ext in INVALID_FILES]):
-            logging.info('URL invalid file type')
+        if not get_content_type(story['url']).startswith('text/'):
+            logging.info('URL invalid file type / content type:')
+            logging.info(story['url'])
             return False
 
         if any([domain in story['url'] for domain in INVALID_DOMAINS]):
-            logging.info('URL invalid domain')
+            logging.info('URL invalid domain:')
+            logging.info(story['url'])
             return False
 
         logging.info('Getting article ' + story['url'])