diff --git a/.gitmodules b/.gitmodules
index b25f9f9..be8a66a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
+	path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
+	url = https://github.com/iamadamdev/bypass-paywalls-chrome/
diff --git a/apiserver/feed.py b/apiserver/feed.py
index 2ce9f1b..1e8c6eb 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
 
     for scraper in available:
         if scraper not in scrapers.keys():
diff --git a/apiserver/scrapers/browser.py b/apiserver/scrapers/headless.py
similarity index 81%
rename from apiserver/scrapers/browser.py
rename to apiserver/scrapers/headless.py
index 3de7dd0..30639c3 100644
--- a/apiserver/scrapers/browser.py
+++ b/apiserver/scrapers/headless.py
@@ -4,13 +4,13 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
 
-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60
 
 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
@@ -25,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None
 
 def get_comments(url):
diff --git a/apiserver/scrapers/local.py b/apiserver/scrapers/simple.py
similarity index 85%
rename from apiserver/scrapers/local.py
rename to apiserver/scrapers/simple.py
index dd81f93..6613bf0 100644
--- a/apiserver/scrapers/local.py
+++ b/apiserver/scrapers/simple.py
@@ -4,11 +4,11 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
 
-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20
 
 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index 87d608d..797d6ba 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -51,7 +51,7 @@ CATEGORY = {}
 #     ],
 # }
 
-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
 
 # Reddit account info
 # leave blank if not using Reddit
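For reference, the fallback chain that get_article now encodes: each configured scraper is tried in order, the first non-empty result wins, and 'simple' is always appended as the last resort. A minimal runnable sketch; the tail of the loop is not part of this diff and is assumed from the scrapers' get_html contract of returning '' on failure:

    import settings
    from scrapers import outline, declutter, headless, simple

    def get_article(url):
        scrapers = {
            'headless': headless,
            'simple': simple,
            'outline': outline,
            'declutter': declutter,
        }
        # Use the configured list, and always keep 'simple' as the cheap
        # last resort.
        available = settings.SCRAPERS or ['headless', 'simple']
        if 'simple' not in available:
            available += ['simple']

        for scraper in available:
            if scraper not in scrapers.keys():
                continue  # ignore unknown names in settings.SCRAPERS
            html = scrapers[scraper].get_html(url)  # assumed: '' on failure
            if html:
                return html
        return ''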
diff --git a/readerserver/main.js b/readerserver/main.js
index f0fe218..b318f53 100644
--- a/readerserver/main.js
+++ b/readerserver/main.js
@@ -2,12 +2,18 @@ const port = 33843;
 const express = require('express');
 const app = express();
 const simple = require('./scraper/simple');
-const browser = require('./scraper/browser');
+const headless = require('./scraper/headless');
 
 app.use(express.urlencoded({ extended: true }));
 
 app.get('/', (req, res) => {
-	const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
+	const routes = [
+		'/simple',
+		'/simple/details',
+		'/headless',
+		'/headless/details',
+		'/headless/comments'
+	];
 	const html = routes.map(route => `
@@ -19,11 +25,11 @@ app.get('/', (req, res) => {
 	`).join(
 	');
 	res.send(html);
 });
 
-app.post('/', simple.scrape);
-app.post('/details', simple.details);
-app.post('/browser', browser.scrape);
-app.post('/browser/details', browser.details);
-app.post('/browser/comments', browser.comments);
+app.post('/simple/', simple.scrape);
+app.post('/simple/details', simple.details);
+app.post('/headless', headless.scrape);
+app.post('/headless/details', headless.details);
+app.post('/headless/comments', headless.comments);
 
 app.listen(port, () => {
 	console.log(`Example app listening on port ${port}!`);
diff --git a/readerserver/scraper/browser/scripts/bypass-paywalls-chrome b/readerserver/scraper/browser/scripts/bypass-paywalls-chrome
deleted file mode 160000
index 0f129c5..0000000
--- a/readerserver/scraper/browser/scripts/bypass-paywalls-chrome
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21
diff --git a/readerserver/scraper/browser/_browser.js b/readerserver/scraper/headless/_browser.js
similarity index 81%
rename from readerserver/scraper/browser/_browser.js
rename to readerserver/scraper/headless/_browser.js
index e1cbc19..308134c 100644
--- a/readerserver/scraper/browser/_browser.js
+++ b/readerserver/scraper/headless/_browser.js
@@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
 		}
 		return route.continue();
 	});
-	await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-	await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
-	await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
+	await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
 	await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
 	await tab.waitForTimeout(2000);
diff --git a/readerserver/scraper/browser/_comments.js b/readerserver/scraper/headless/_comments.js
similarity index 100%
rename from readerserver/scraper/browser/_comments.js
rename to readerserver/scraper/headless/_comments.js
diff --git a/readerserver/scraper/browser/index.js b/readerserver/scraper/headless/index.js
similarity index 100%
rename from readerserver/scraper/browser/index.js
rename to readerserver/scraper/headless/index.js
diff --git a/readerserver/scraper/headless/scripts/bypass-paywalls-chrome b/readerserver/scraper/headless/scripts/bypass-paywalls-chrome
new file mode 160000
index 0000000..fff7f48
--- /dev/null
+++ b/readerserver/scraper/headless/scripts/bypass-paywalls-chrome
@@ -0,0 +1 @@
+Subproject commit fff7f483db947e690977bfc80955a53329d3d349
diff --git a/readerserver/scraper/browser/scripts/cosmetic-filters.js b/readerserver/scraper/headless/scripts/cosmetic-filters.js
similarity index 100%
rename from readerserver/scraper/browser/scripts/cosmetic-filters.js
rename to readerserver/scraper/headless/scripts/cosmetic-filters.js
diff --git a/readerserver/scraper/browser/scripts/fix-relative-links.js b/readerserver/scraper/headless/scripts/fix-relative-links.js
similarity index 100%
rename from readerserver/scraper/browser/scripts/fix-relative-links.js
rename to readerserver/scraper/headless/scripts/fix-relative-links.js
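A quick way to smoke-test the renamed routes, sketched in Python. The server parses form bodies via express.urlencoded as shown above; the form-encoded 'url' field, the example URL, and the 60-second timeout are assumptions, not part of this diff:

    import requests

    BASE = 'http://127.0.0.1:33843'

    # The old '/', '/details' and '/browser/*' routes are gone; everything
    # now lives under /simple/* and /headless/*.
    for route in ['/simple/details', '/headless/details', '/headless/comments']:
        resp = requests.post(BASE + route,
                             data={'url': 'https://example.com'},
                             timeout=60)
        print(route, resp.status_code)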