Check content-type
This commit is contained in:
parent
a4cf719cb8
commit
6430fe5e9f
|
@ -9,11 +9,10 @@ from bs4 import BeautifulSoup
|
|||
|
||||
from feeds import hackernews, reddit, tildes, manual
|
||||
|
||||
OUTLINE_API = 'https://outlineapi.com/article'
|
||||
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
|
||||
ARCHIVE_API = 'https://archive.fo/submit/'
|
||||
READ_API = 'http://127.0.0.1:33843'
|
||||
|
||||
INVALID_FILES = ['.pdf', '.png', '.jpg', '.gif']
|
||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||
TWO_DAYS = 60*60*24*2
|
||||
|
||||
|
@ -64,12 +63,19 @@ def get_first_image(text):
|
|||
first_img = soup.find('img')
|
||||
url = first_img['src']
|
||||
headers = {'User-Agent': 'Twitterbot/1.0'}
|
||||
length = requests.get(url, headers=headers).headers['Content-length']
|
||||
length = requests.get(url, headers=headers).headers['content-length']
|
||||
if int(length) > 1000000: raise
|
||||
return url
|
||||
except:
|
||||
return ''
|
||||
|
||||
def get_content_type(url):
    """Return the Content-Type header reported for *url*.

    The request is sent with a Twitterbot User-Agent. Returns an empty
    string if the request fails or the response carries no content-type
    header, so callers can safely call string methods on the result.
    """
    headers = {'User-Agent': 'Twitterbot/1.0'}
    try:
        # stream=True defers downloading the body: only the headers are
        # needed, and the target may be a large binary file. The context
        # manager releases the connection once the header has been read.
        with requests.get(url, headers=headers, stream=True) as response:
            return response.headers['content-type']
    except Exception:
        # Best effort: network errors and a missing header both map to ''.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit
        # propagate.
        return ''
|
||||
|
||||
def update_story(story, is_manual=False):
|
||||
res = {}
|
||||
|
||||
|
@ -95,12 +101,14 @@ def update_story(story, is_manual=False):
|
|||
return False
|
||||
|
||||
if story.get('url', '') and not story.get('text', ''):
|
||||
if any([story['url'].endswith(ext) for ext in INVALID_FILES]):
|
||||
logging.info('URL invalid file type')
|
||||
if not get_content_type(story['url']).startswith('text/'):
|
||||
logging.info('URL invalid file type / content type:')
|
||||
logging.info(story['url'])
|
||||
return False
|
||||
|
||||
if any([domain in story['url'] for domain in INVALID_DOMAINS]):
|
||||
logging.info('URL invalid domain')
|
||||
logging.info('URL invalid domain:')
|
||||
logging.info(story['url'])
|
||||
return False
|
||||
|
||||
logging.info('Getting article ' + story['url'])
|
||||
|
|
Loading…
Reference in New Issue
Block a user