local browser scraper
This commit is contained in:
parent
637bc38476
commit
00954c6cac
|
@ -10,7 +10,7 @@ import itertools
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
from feeds import hackernews, reddit, tildes, substack, manual, news
|
from feeds import hackernews, reddit, tildes, substack, manual, news
|
||||||
from scrapers import outline, declutter, local
|
from scrapers import outline, declutter, browser, local
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ for key, value in settings.SITEMAP.items():
|
||||||
|
|
||||||
def get_list():
|
def get_list():
|
||||||
feeds = {}
|
feeds = {}
|
||||||
|
|
||||||
if settings.NUM_HACKERNEWS:
|
if settings.NUM_HACKERNEWS:
|
||||||
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
|
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
|
||||||
|
|
||||||
|
@ -63,6 +63,7 @@ def get_article(url):
|
||||||
scrapers = {
|
scrapers = {
|
||||||
'declutter': declutter,
|
'declutter': declutter,
|
||||||
'outline': outline,
|
'outline': outline,
|
||||||
|
'browser': browser,
|
||||||
'local': local,
|
'local': local,
|
||||||
}
|
}
|
||||||
available = settings.SCRAPERS or ['local']
|
available = settings.SCRAPERS or ['local']
|
||||||
|
|
41
apiserver/scrapers/browser.py
Normal file
41
apiserver/scrapers/browser.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)
import requests

# Endpoint of the local headless-browser scraping service (article details).
READ_API = 'http://127.0.0.1:33843/browser/details'
# FIX: was READ_COMMENT__API (double underscore) — get_comments() below
# references READ_COMMENT_API, so the old name raised NameError at call time.
# NOTE(review): the path ends in 'commentd' — possibly a typo for 'comments';
# confirm against the browser service's route table.
READ_COMMENT_API = 'http://127.0.0.1:33843/browser/commentd'
# Seconds to wait on the scraping service before aborting the request.
TIMEOUT = 60
|
||||||
|
|
||||||
|
|
||||||
|
def get_html(url):
    """Scrape *url* via the browser service and return its article content.

    Returns an empty string when the scrape fails (get_details gave None).
    """
    logging.info(f"Reader Scraper: {url}")
    details = get_details(url)
    # Falsy details means the scrape failed — degrade to an empty body.
    return details['content'] if details else ''
|
||||||
|
|
||||||
|
def get_details(url):
    """POST *url* to the browser details endpoint and return the parsed JSON.

    Best-effort: any failure (network error, non-200 status, bad JSON) is
    logged and yields None; KeyboardInterrupt is always re-raised so the
    process can be stopped cleanly.
    """
    try:
        response = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem Scraping article: {}'.format(str(e)))
        return None
|
||||||
|
|
||||||
|
def get_comments(url):
    """POST *url* to the browser comments endpoint and return the parsed JSON.

    Best-effort: any failure (network error, non-200 status, bad JSON) is
    logged and yields None; KeyboardInterrupt is always re-raised so the
    process can be stopped cleanly.
    """
    try:
        response = requests.post(READ_COMMENT_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None
|
|
@ -23,7 +23,7 @@ SUBSTACK = {}
|
||||||
CATEGORY = {}
|
CATEGORY = {}
|
||||||
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
|
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
|
||||||
|
|
||||||
SCRAPERS = ['declutter', 'outline', 'local']
|
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
|
||||||
|
|
||||||
# Reddit account info
|
# Reddit account info
|
||||||
# leave blank if not using Reddit
|
# leave blank if not using Reddit
|
||||||
|
|
Loading…
Reference in New Issue
Block a user