forked from tanner/qotnews
local browser scraper
parent 637bc38476
commit 00954c6cac
@@ -10,7 +10,7 @@ import itertools
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
-from scrapers import outline, declutter, local
+from scrapers import outline, declutter, browser, local

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']

@@ -26,7 +26,7 @@ for key, value in settings.SITEMAP.items():
 def get_list():
     feeds = {}

     if settings.NUM_HACKERNEWS:
         feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
@@ -63,6 +63,7 @@ def get_article(url):
     scrapers = {
         'declutter': declutter,
         'outline': outline,
+        'browser': browser,
         'local': local,
     }
     available = settings.SCRAPERS or ['local']
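For reference, the dispatch this hunk extends: get_article tries each scraper named in settings.SCRAPERS in order and falls back to 'local'. A minimal sketch of that loop, assuming each scraper module exposes get_html(url) -> str; the exact loop body below is an assumption, not the repo's verbatim code:

# sketch only: assumes each scraper module exposes get_html(url) -> str
def get_article(url):
    scrapers = {
        'declutter': declutter,
        'outline': outline,
        'browser': browser,
        'local': local,
    }
    available = settings.SCRAPERS or ['local']  # priority order, 'local' as fallback
    for name in available:
        if name not in scrapers:
            continue  # ignore unknown scraper names in settings
        try:
            html = scrapers[name].get_html(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except BaseException:
            continue  # a failing scraper just yields to the next one
    return ''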
41 apiserver/scrapers/browser.py Normal file
@ -0,0 +1,41 @@
|
|||
import logging
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
level=logging.DEBUG)
|
||||
import requests
|
||||
|
||||
READ_API = 'http://127.0.0.1:33843/browser/details'
|
||||
READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
|
||||
TIMEOUT = 60
|
||||
|
||||
|
||||
def get_html(url):
|
||||
logging.info(f"Reader Scraper: {url}")
|
||||
details = get_details(url)
|
||||
if not details:
|
||||
return ''
|
||||
return details['content']
|
||||
|
||||
def get_details(url):
|
||||
try:
|
||||
r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
|
||||
if r.status_code != 200:
|
||||
raise Exception('Bad response code ' + str(r.status_code))
|
||||
return r.json()
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except BaseException as e:
|
||||
logging.error('Problem Scraping article: {}'.format(str(e)))
|
||||
return None
|
||||
|
||||
def get_comments(url):
|
||||
try:
|
||||
r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
|
||||
if r.status_code != 200:
|
||||
raise Exception('Bad response code ' + str(r.status_code))
|
||||
return r.json()
|
||||
except KeyboardInterrupt:
|
||||
raise
|
||||
except BaseException as e:
|
||||
logging.error('Problem getting comments for article: {}'.format(str(e)))
|
||||
return None
|
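A quick way to exercise the new module, assuming the companion browser/reader service is already listening on 127.0.0.1:33843 as READ_API expects; the test URL is illustrative:

# hypothetical smoke test for scrapers/browser.py
from scrapers import browser

html = browser.get_html('https://example.com/some-article')
print('scraped {} characters'.format(len(html)))  # empty string on failure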
@@ -23,7 +23,7 @@ SUBSTACK = {}
 CATEGORY = {}
 # CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},

-SCRAPERS = ['declutter', 'outline', 'local']
+SCRAPERS = ['browser', 'declutter', 'outline', 'local']

 # Reddit account info
 # leave blank if not using Reddit