Check content-type
This commit is contained in:
		| @@ -9,11 +9,10 @@ from bs4 import BeautifulSoup | ||||
|  | ||||
| from feeds import hackernews, reddit, tildes, manual | ||||
|  | ||||
| OUTLINE_API = 'https://outlineapi.com/article' | ||||
| OUTLINE_API = 'https://api.outline.com/v3/parse_article' | ||||
| ARCHIVE_API = 'https://archive.fo/submit/' | ||||
| READ_API = 'http://127.0.0.1:33843' | ||||
|  | ||||
| INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif'] | ||||
| INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] | ||||
| TWO_DAYS = 60*60*24*2 | ||||
|  | ||||
| @@ -64,12 +63,19 @@ def get_first_image(text): | ||||
|         first_img = soup.find('img') | ||||
|         url = first_img['src'] | ||||
|         headers = {'User-Agent': 'Twitterbot/1.0'} | ||||
|         length = requests.get(url, headers=headers).headers['Content-length'] | ||||
|         length = requests.get(url, headers=headers).headers['content-length'] | ||||
|         if int(length) > 1000000: raise | ||||
|         return url | ||||
|     except: | ||||
|         return '' | ||||
|  | ||||
| def get_content_type(url): | ||||
|     try: | ||||
|         headers = {'User-Agent': 'Twitterbot/1.0'} | ||||
|         return requests.get(url, headers=headers).headers['content-type'] | ||||
|     except: | ||||
|         return '' | ||||
|  | ||||
| def update_story(story, is_manual=False): | ||||
|     res = {} | ||||
|  | ||||
| @@ -95,12 +101,14 @@ def update_story(story, is_manual=False): | ||||
|         return False | ||||
|  | ||||
|     if story.get('url', '') and not story.get('text', ''): | ||||
|         if any([story['url'].endswith(ext) for ext in INVALID_FILES]): | ||||
|             logging.info('URL invalid file type') | ||||
|         if not get_content_type(story['url']).startswith('text/'): | ||||
|             logging.info('URL invalid file type / content type:') | ||||
|             logging.info(story['url']) | ||||
|             return False | ||||
|  | ||||
|         if any([domain in story['url'] for domain in INVALID_DOMAINS]): | ||||
|             logging.info('URL invalid domain') | ||||
|             logging.info('URL invalid domain:') | ||||
|             logging.info(story['url']) | ||||
|             return False | ||||
|  | ||||
|         logging.info('Getting article ' + story['url']) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user