local browser scraper
This commit is contained in:
parent
637bc38476
commit
00954c6cac
|
@ -10,7 +10,7 @@ import itertools
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
from feeds import hackernews, reddit, tildes, substack, manual, news
|
from feeds import hackernews, reddit, tildes, substack, manual, news
|
||||||
from scrapers import outline, declutter, local
|
from scrapers import outline, declutter, browser, local
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ for key, value in settings.SITEMAP.items():
|
||||||
|
|
||||||
def get_list():
|
def get_list():
|
||||||
feeds = {}
|
feeds = {}
|
||||||
|
|
||||||
if settings.NUM_HACKERNEWS:
|
if settings.NUM_HACKERNEWS:
|
||||||
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
|
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
|
||||||
|
|
||||||
|
@ -63,6 +63,7 @@ def get_article(url):
|
||||||
scrapers = {
|
scrapers = {
|
||||||
'declutter': declutter,
|
'declutter': declutter,
|
||||||
'outline': outline,
|
'outline': outline,
|
||||||
|
'browser': browser,
|
||||||
'local': local,
|
'local': local,
|
||||||
}
|
}
|
||||||
available = settings.SCRAPERS or ['local']
|
available = settings.SCRAPERS or ['local']
|
||||||
|
|
41
apiserver/scrapers/browser.py
Normal file
41
apiserver/scrapers/browser.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)
import requests

# Endpoint of the local headless-browser scraping service (article details).
READ_API = 'http://127.0.0.1:33843/browser/details'
# FIX: was READ_COMMENT__API (double underscore) — get_comments() below
# references READ_COMMENT_API, so the old name raised NameError at call time.
# NOTE(review): the path ends in 'commentd' — possibly a typo for 'comments';
# confirm against the browser service's route table.
READ_COMMENT_API = 'http://127.0.0.1:33843/browser/commentd'
# Seconds to wait on the scraping service before aborting the request.
TIMEOUT = 60
|
||||||
|
|
||||||
|
|
||||||
|
def get_html(url):
    """Scrape *url* via the browser service and return its article content.

    Returns an empty string when the scrape fails (get_details gave None).
    """
    logging.info(f"Reader Scraper: {url}")
    details = get_details(url)
    # Falsy details means the scrape failed — degrade to an empty body.
    return details['content'] if details else ''
|
||||||
|
|
||||||
|
def get_details(url):
    """POST *url* to the browser details endpoint and return the parsed JSON.

    Best-effort: any failure (network error, non-200 status, bad JSON) is
    logged and yields None; KeyboardInterrupt is always re-raised so the
    process can be stopped cleanly.
    """
    try:
        response = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem Scraping article: {}'.format(str(e)))
        return None
|
||||||
|
|
||||||
|
def get_comments(url):
    """POST *url* to the browser comments endpoint and return the parsed JSON.

    Best-effort: any failure (network error, non-200 status, bad JSON) is
    logged and yields None; KeyboardInterrupt is always re-raised so the
    process can be stopped cleanly.
    """
    try:
        response = requests.post(READ_COMMENT_API, data={'url': url}, timeout=TIMEOUT)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting comments for article: {}'.format(str(e)))
        return None
|
|
@ -23,7 +23,7 @@ SUBSTACK = {}
|
||||||
CATEGORY = {}
|
CATEGORY = {}
|
||||||
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
|
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
|
||||||
|
|
||||||
SCRAPERS = ['declutter', 'outline', 'local']
|
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
|
||||||
|
|
||||||
# Reddit account info
|
# Reddit account info
|
||||||
# leave blank if not using Reddit
|
# leave blank if not using Reddit
|
||||||
|
|
Loading…
Reference in New Issue
Block a user