diff --git a/apiserver/scrapers/declutter.py b/apiserver/scrapers/declutter.py index 45a40e9..d516423 100644 --- a/apiserver/scrapers/declutter.py +++ b/apiserver/scrapers/declutter.py @@ -5,19 +5,22 @@ logging.basicConfig( import requests DECLUTTER_API = 'https://declutter.1j.nz/details' +TIMEOUT = 30 def get_html(url): try: logging.info(f"Declutter Scraper: {url}") details = get_details(url) + if not details: + return '' return details['content'] except: raise def get_details(url): try: - r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=20) + r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT) if r.status_code != 200: raise Exception('Bad response code ' + str(r.status_code)) return r.json() @@ -25,4 +28,4 @@ def get_details(url): raise except BaseException as e: logging.error('Problem decluttering article: {}'.format(str(e))) - return {} \ No newline at end of file + return None \ No newline at end of file diff --git a/apiserver/scrapers/local.py b/apiserver/scrapers/local.py index 0cb4ef2..489eae8 100644 --- a/apiserver/scrapers/local.py +++ b/apiserver/scrapers/local.py @@ -5,19 +5,21 @@ logging.basicConfig( import requests READ_API = 'http://127.0.0.1:33843/details' - +TIMEOUT = 20 def get_html(url): try: logging.info(f"Local Scraper: {url}") details = get_details(url) + if not details: + return '' return details['content'] except: raise def get_details(url): try: - r = requests.post(READ_API, data=dict(url=url), timeout=20) + r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT) if r.status_code != 200: raise Exception('Bad response code ' + str(r.status_code)) return r.json() @@ -25,4 +27,4 @@ def get_details(url): raise except BaseException as e: logging.error('Problem getting article: {}'.format(str(e))) - return {} \ No newline at end of file + return None \ No newline at end of file diff --git a/apiserver/scrapers/outline.py b/apiserver/scrapers/outline.py index ba65047..b3c0574 100644 --- a/apiserver/scrapers/outline.py +++ b/apiserver/scrapers/outline.py @@ -6,11 +6,13 @@ import requests OUTLINE_REFERER = 'https://outline.com/' OUTLINE_API = 'https://api.outline.com/v3/parse_article' - +TIMEOUT = 20 def get_html(url): try: details = get_details(url) + if not details: + return '' return details['html'] except: raise @@ -20,11 +22,11 @@ def get_details(url): logging.info(f"Outline Scraper: {url}") params = {'source_url': url} headers = {'Referer': OUTLINE_REFERER} - r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20) + r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT) if r.status_code == 429: logging.info('Rate limited by outline, sleeping 30s and skipping...') time.sleep(30) - return '' + return None if r.status_code != 200: raise Exception('Bad response code ' + str(r.status_code)) data = r.json()['data'] @@ -35,4 +37,4 @@ def get_details(url): raise except BaseException as e: logging.error('Problem outlining article: {}'.format(str(e))) - return {} \ No newline at end of file + return None \ No newline at end of file