From 00954c6cac8d938c5a1f24c4ebc51006e33d5807 Mon Sep 17 00:00:00 2001
From: Jason
Date: Wed, 11 Nov 2020 09:26:54 +0000
Subject: [PATCH] local browser scraper

---
 apiserver/feed.py             |  5 +++--
 apiserver/scrapers/browser.py | 41 +++++++++++++++++++++++++++++++++++
 apiserver/settings.py.example |  2 +-
 3 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 apiserver/scrapers/browser.py

diff --git a/apiserver/feed.py b/apiserver/feed.py
index dab64a5..5e5605f 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -10,7 +10,7 @@ import itertools
 
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
-from scrapers import outline, declutter, local
+from scrapers import outline, declutter, browser, local
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -26,7 +26,7 @@ for key, value in settings.SITEMAP.items():
 
 def get_list():
     feeds = {}
-    
+
     if settings.NUM_HACKERNEWS:
         feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
 
@@ -63,6 +63,7 @@ def get_article(url):
     scrapers = {
         'declutter': declutter,
         'outline': outline,
+        'browser': browser,
         'local': local,
     }
     available = settings.SCRAPERS or ['local']
diff --git a/apiserver/scrapers/browser.py b/apiserver/scrapers/browser.py
new file mode 100644
index 0000000..3de7dd0
--- /dev/null
+++ b/apiserver/scrapers/browser.py
@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    level=logging.DEBUG)
+import requests
+
+READ_API = 'http://127.0.0.1:33843/browser/details'
+READ_COMMENT_API = 'http://127.0.0.1:33843/browser/commentd'
+TIMEOUT = 60
+
+
+def get_html(url):
+    logging.info(f"Browser Scraper: {url}")
+    details = get_details(url)
+    if not details:
+        return ''
+    return details['content']
+
+def get_details(url):
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem scraping article: {}'.format(str(e)))
+        return None
+
+def get_comments(url):
+    try:
+        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+        return None
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index 089a559..62cec23 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -23,7 +23,7 @@ SUBSTACK = {}
 CATEGORY = {}
 # CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
 
-SCRAPERS = ['declutter', 'outline', 'local']
+SCRAPERS = ['browser', 'declutter', 'outline', 'local']
 
 # Reddit account info
 # leave blank if not using Reddit
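
A minimal usage sketch of the new scraper (illustrative only, not part of the patch; it assumes the companion browser service is listening on 127.0.0.1:33843 and uses a placeholder article URL):

    from scrapers import browser

    # get_article() in feed.py walks settings.SCRAPERS in order and takes the
    # first scraper that returns content; the browser scraper can also be
    # called directly:
    html = browser.get_html('https://example.com/article')  # hypothetical URL
    if html:
        print(html[:200])  # first 200 characters of the extracted content

Listing 'browser' first in SCRAPERS, as settings.py.example now does, means the local headless-browser service is tried before the declutter, outline, and local fallbacks.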