local browser scraper

2020-11-11 09:26:54 +00:00
parent 637bc38476
commit 00954c6cac
3 changed files with 45 additions and 3 deletions
@@ -10,7 +10,7 @@ import itertools

 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
-from scrapers import outline, declutter, local
+from scrapers import outline, declutter, browser, local

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']

@@ -26,7 +26,7 @@ for key, value in settings.SITEMAP.items():

 def get_list():
    feeds = {}
-    
+
    if settings.NUM_HACKERNEWS:
        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

@@ -63,6 +63,7 @@ def get_article(url):
    scrapers = {
        'declutter': declutter,
        'outline': outline,
+        'browser': browser,
        'local': local,
    }
    available = settings.SCRAPERS or ['local']
@@ -0,0 +1,41 @@
+import logging
+# Configure the root logger when this module is imported: each record shows
+# timestamp, logger name, level and message; DEBUG and above are emitted.
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+# Endpoints of the scraping service on localhost that get_details() and
+# get_comments() POST the article URL to.
+READ_API = 'http://127.0.0.1:33843/browser/details'
+# Named READ_COMMENT_API (single underscore) because that is the name
+# get_comments() looks up; the previous double-underscore spelling made every
+# call fail with a swallowed NameError and return None.
+# NOTE(review): the path ends in 'commentd' -- confirm against the service's
+# routes (possibly meant to be 'comments').
+READ_COMMENT_API = 'http://127.0.0.1:33843/browser/commentd'
+# Passed as the `timeout` argument of requests.post (seconds).
+TIMEOUT = 60
+
+
+def get_html(url):
+    """Return the 'content' field of the scraped details for *url*.
+
+    Logs the URL, fetches the details via get_details(), and returns an
+    empty string when that lookup yields nothing (None or an empty result).
+    """
+    logging.info(f"Reader Scraper: {url}")
+    scraped = get_details(url)
+    return scraped['content'] if scraped else ''
+
+def get_details(url):
+    """POST *url* to READ_API and return the decoded JSON response.
+
+    Any failure (request error, non-200 status, JSON decoding problem) is
+    logged and reported as None. KeyboardInterrupt is re-raised so the
+    process can still be interrupted.
+    """
+    try:
+        response = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
+        if response.status_code == 200:
+            return response.json()
+        # Non-200: raise so the failure goes through the shared logging path.
+        raise Exception('Bad response code ' + str(response.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as err:
+        logging.error('Problem Scraping article: {}'.format(str(err)))
+        return None
+
+def get_comments(url):
+    """POST *url* to READ_COMMENT_API and return the decoded JSON response.
+
+    Any failure (request error, non-200 status, JSON decoding problem) is
+    logged and reported as None. KeyboardInterrupt is re-raised so the
+    process can still be interrupted.
+    """
+    try:
+        response = requests.post(READ_COMMENT_API, data={'url': url}, timeout=TIMEOUT)
+        if response.status_code == 200:
+            return response.json()
+        # Non-200: raise so the failure goes through the shared logging path.
+        raise Exception('Bad response code ' + str(response.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as err:
+        logging.error('Problem getting comments for article: {}'.format(str(err)))
+        return None
@@ -23,7 +23,7 @@ SUBSTACK = {}
 CATEGORY = {}
 # CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},

-SCRAPERS = ['declutter', 'outline', 'local']
+SCRAPERS = ['browser', 'declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit