forked from tanner/qotnews
renaming things.
The commit renames the browser scraper to headless and the local scraper to simple, updating the reader-server endpoints and the default SCRAPERS list to match.
@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -63,14 +63,14 @@ def get_list():
 
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
 
     for scraper in available:
         if scraper not in scrapers.keys():
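The hunk cuts off inside the loop over `available`. Presumably the loop tries each configured scraper in order until one returns HTML; a minimal sketch of that fallback pattern follows, where the `continue`, the `get_html` call, and the error handling are assumptions rather than lines from this diff:

def get_article(url):
    scrapers = {
        'headless': headless,
        'simple': simple,
        'outline': outline,
        'declutter': declutter,
    }
    # Fall back to the local scrapers when nothing is configured,
    # and always keep 'simple' as the scraper of last resort.
    available = settings.SCRAPERS or ['headless', 'simple']
    if 'simple' not in available:
        available += ['simple']

    for scraper in available:
        if scraper not in scrapers.keys():
            continue  # ignore unknown names in settings.SCRAPERS (assumed)
        try:
            html = scrapers[scraper].get_html(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except BaseException:
            pass  # try the next scraper in the list (assumed)
    return ''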
@@ -4,13 +4,13 @@ logging.basicConfig(
         level=logging.DEBUG)
 import requests
 
-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60
 
 
 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
@@ -25,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None
 
 def get_comments(url):
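Most of `get_details` sits outside these hunks. Judging from READ_API and TIMEOUT above, it presumably POSTs the article URL to the local reader server on port 33843 and returns the parsed JSON; a sketch under that assumption (the form field name and the `get_comments` body are not shown in this diff):

def get_details(url):
    try:
        # Ask the reader server to render the page; field name assumed.
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None

def get_comments(url):
    # Presumably the same request shape against READ_COMMENT__API (assumed).
    ...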
@@ -4,11 +4,11 @@ logging.basicConfig(
         level=logging.DEBUG)
 import requests
 
-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20
 
 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
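`get_html` also continues past this hunk; presumably it extracts the rendered markup from the details dict, roughly like this (the field name is an assumption):

def get_html(url):
    logging.info(f"Simple Scraper: {url}")
    details = get_details(url)
    if not details:
        return ''
    return details['content']  # key name assumed, not shown in this diff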
@@ -51,7 +51,7 @@ CATEGORY = {}
 # ],
 # }
 
-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
 
 # Reddit account info
 # leave blank if not using Reddit
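With this change the default order tries the headless browser first and keeps `simple` as the guaranteed fallback. A deployment that wants to skip the third-party services could override the list in its local settings (values hypothetical); per `get_article`, 'simple' is appended automatically if omitted:

# Hypothetical override in a local settings.py: local scrapers only.
SCRAPERS = ['headless', 'simple']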