increase declutter timeout.

Author: Jason Schwarzenberger
Date:   2020-11-04 15:14:51 +13:00
parent e6976db25d
commit 98a0c2257c
3 changed files with 16 additions and 9 deletions

File 1 of 3: declutter scraper

@@ -5,19 +5,22 @@ logging.basicConfig(
 import requests

 DECLUTTER_API = 'https://declutter.1j.nz/details'
+TIMEOUT = 30

 def get_html(url):
     try:
         logging.info(f"Declutter Scraper: {url}")
         details = get_details(url)
+        if not details:
+            return ''
         return details['content']
     except:
         raise

 def get_details(url):
     try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=20)
+        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -25,4 +28,4 @@ def get_details(url):
         raise
     except BaseException as e:
         logging.error('Problem decluttering article: {}'.format(str(e)))
-        return {}
+        return None
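
For context, a minimal standalone sketch (not from the repository) of the request this scraper now makes: the article URL is POSTed to the declutter service with the raised 30-second timeout, and any failure falls back to an empty string, mirroring the new guard in get_html(). The helper name fetch_content is hypothetical.

import requests

DECLUTTER_API = 'https://declutter.1j.nz/details'
TIMEOUT = 30  # raised from the previous hard-coded 20 seconds

def fetch_content(url):
    try:
        # POST the target URL and wait up to TIMEOUT seconds for the extracted article
        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
        r.raise_for_status()
        details = r.json()
    except (requests.RequestException, ValueError):
        details = None  # same failure value get_details() now returns (also covers malformed JSON)
    return details['content'] if details else ''  # same guard get_html() now applies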

File 2 of 3: local reader scraper

@@ -5,19 +5,21 @@ logging.basicConfig(
 import requests

 READ_API = 'http://127.0.0.1:33843/details'
+TIMEOUT = 20

 def get_html(url):
     try:
         logging.info(f"Local Scraper: {url}")
         details = get_details(url)
+        if not details:
+            return ''
         return details['content']
     except:
         raise

 def get_details(url):
     try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
+        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
@@ -25,4 +27,4 @@ def get_details(url):
         raise
     except BaseException as e:
         logging.error('Problem getting article: {}'.format(str(e)))
-        return {}
+        return None
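
The None return value is the point of the extra guard: previously a failed request came back as {}, and details['content'] in get_html() raised a KeyError instead of degrading to an empty string. A small illustration (not part of the commit):

def render(details):
    # Same shape as the guard added to get_html(): None (new failure value)
    # and {} (old failure value) are both falsy.
    if not details:
        return ''
    return details['content']

assert render(None) == ''                                 # new failure path: clean fallback
assert render({'content': '<p>hello</p>'}) == '<p>hello</p>'
# Without the guard, render({}) would raise KeyError('content'),
# which is what the old code did whenever the reader service failed.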

File 3 of 3: outline scraper

@@ -6,11 +6,13 @@ import requests
 OUTLINE_REFERER = 'https://outline.com/'
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+TIMEOUT = 20

 def get_html(url):
     try:
         details = get_details(url)
+        if not details:
+            return ''
         return details['html']
     except:
         raise
@@ -20,11 +22,11 @@ def get_details(url):
         logging.info(f"Outline Scraper: {url}")
         params = {'source_url': url}
         headers = {'Referer': OUTLINE_REFERER}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
         if r.status_code == 429:
             logging.info('Rate limited by outline, sleeping 30s and skipping...')
             time.sleep(30)
-            return ''
+            return None
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         data = r.json()['data']
@@ -35,4 +37,4 @@ def get_details(url):
         raise
     except BaseException as e:
         logging.error('Problem outlining article: {}'.format(str(e)))
-        return {}
+        return None
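
All three scrapers now share the same contract: get_details() returns a dict on success and None on failure, and get_html() returns the extracted HTML or an empty string. A hypothetical dispatcher (assumed module layout, not part of this commit) could therefore fall through them in order:

from scrapers import declutter, outline, local  # assumed module names

def best_effort_html(url):
    # Try each scraper in turn; an empty string means "failed, try the next one".
    for scraper in (declutter, outline, local):
        html = scraper.get_html(url)
        if html:
            return html
    return ''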