local browser scraper

This commit is contained in:
Jason 2020-11-11 09:26:54 +00:00
parent 637bc38476
commit 00954c6cac
3 changed files with 45 additions and 3 deletions

View File

@ -10,7 +10,7 @@ import itertools
import settings
from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local
from scrapers import outline, declutter, browser, local
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@ -63,6 +63,7 @@ def get_article(url):
scrapers = {
'declutter': declutter,
'outline': outline,
'browser': browser,
'local': local,
}
available = settings.SCRAPERS or ['local']

View File

@ -0,0 +1,41 @@
import logging
# Configure the root logger as a side effect of importing this module:
# timestamped "<time> - <logger name> - <level> - <message>" records at DEBUG
# level and above. logging.basicConfig() does nothing when the root logger
# already has handlers, so this only takes effect if nothing configured
# logging earlier in the process.
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
# Third-party HTTP client used for the POST calls to the local browser service.
import requests
# Endpoints of the browser scraping service listening on localhost.
READ_API = 'http://127.0.0.1:33843/browser/details'
# NOTE(review): the 'commentd' path segment looks like a typo for 'comments';
# it is left unchanged here -- confirm against the service's actual routes.
READ_COMMENT_API = 'http://127.0.0.1:33843/browser/commentd'
# Original (double-underscore) spelling of the constant above, kept as an
# alias so both names resolve to the same endpoint.
READ_COMMENT__API = READ_COMMENT_API
# Seconds to wait for the service before a request is abandoned.
TIMEOUT = 60
def get_html(url):
    """Return the scraped article content for *url*.

    Looks the article up through get_details() and returns the value stored
    under its 'content' key. When get_details() yields nothing (None or an
    empty result), an empty string is returned instead.
    """
    logging.info(f"Reader Scraper: {url}")
    article = get_details(url)
    return article['content'] if article else ''
def get_details(url):
    """Ask the local browser service for the details of the page at *url*.

    POSTs the url as form data to READ_API and returns the decoded JSON body
    of a 200 response. Any failure (connection problem, timeout, non-200
    status, undecodable body) is logged and reported as None; only
    KeyboardInterrupt is allowed to propagate.
    """
    try:
        response = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code == 200:
            return response.json()
        # Routed through the generic handler below so it is logged like
        # every other failure.
        raise Exception('Bad response code ' + str(response.status_code))
    except KeyboardInterrupt:
        # Let the user abort a long-running scrape.
        raise
    except BaseException as err:
        logging.error('Problem Scraping article: {}'.format(str(err)))
    return None
def get_comments(url):
    """Ask the local browser service for the comments of the page at *url*.

    POSTs the url as form data to the comments endpoint and returns the
    decoded JSON body of a 200 response. Any failure (connection problem,
    timeout, non-200 status, undecodable body) is logged and reported as
    None; only KeyboardInterrupt is allowed to propagate.
    """
    try:
        # The module defines the endpoint constant as READ_COMMENT__API
        # (double underscore). Referring to READ_COMMENT_API here raised a
        # NameError on every call, which the broad handler below swallowed,
        # so no request was ever sent and the function always returned None.
        r = requests.post(READ_COMMENT__API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        # Let the user abort a long-running scrape.
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None

View File

@ -23,7 +23,7 @@ SUBSTACK = {}
CATEGORY = {}
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
SCRAPERS = ['declutter', 'outline', 'local']
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
# Reddit account info
# leave blank if not using Reddit