renaming things.

Jason Schwarzenberger 2020-11-17 15:50:31 +13:00
parent 55d50a86d8
commit 3b885e4327
13 changed files with 34 additions and 28 deletions

.gitmodules

@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
+	path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
+	url = https://github.com/iamadamdev/bypass-paywalls-chrome/


@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
     for scraper in available:
         if scraper not in scrapers.keys():
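For context, the loop at the end of this hunk is where the fallback happens: each name in available is tried in order, and the first scraper that returns content wins. A minimal sketch of that pattern, reusing the scrapers dict and settings module from the hunk above and assuming each scraper module exposes get_html(url) that returns falsy on failure (the loop body is illustrative, not the repository's exact code):

    def get_article(url):
        # Scrapers run in the order configured in settings.SCRAPERS;
        # 'simple' is always appended as the last-resort fallback.
        available = settings.SCRAPERS or ['headless', 'simple']
        if 'simple' not in available:
            available += ['simple']
        for scraper in available:
            if scraper not in scrapers:
                continue  # ignore unknown names from settings
            html = scrapers[scraper].get_html(url)
            if html:
                return html
        return ''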


@@ -4,13 +4,13 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60
 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
@@ -25,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None
 def get_comments(url):
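Each of these Python scrapers is a thin HTTP client around the reader server. A plausible sketch of get_details, assuming the endpoint accepts a form-encoded url field and replies with JSON (the request and response shape here is an assumption, not confirmed by the diff):

    import logging
    import requests

    READ_API = 'http://127.0.0.1:33843/headless/details'
    TIMEOUT = 60

    def get_details(url):
        try:
            # Hand the target URL to the headless reader service and
            # return its parsed JSON payload.
            r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
            r.raise_for_status()
            return r.json()
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            logging.error('Problem scraping article: {}'.format(str(e)))
            return None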


@@ -4,11 +4,11 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20
 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''


@@ -51,7 +51,7 @@ CATEGORY = {}
 #     ],
 # }
-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
 # Reddit account info
 # leave blank if not using Reddit


@@ -2,12 +2,18 @@ const port = 33843;
 const express = require('express');
 const app = express();
 const simple = require('./scraper/simple');
-const browser = require('./scraper/browser');
+const headless = require('./scraper/headless');
 app.use(express.urlencoded({ extended: true }));
 app.get('/', (req, res) => {
-  const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
+  const routes = [
+    '/simple',
+    '/simple/details',
+    '/headless',
+    '/headless/details',
+    '/headless/comments'
+  ];
   const html = routes.map(route => `
     <form method="POST" action="${route}" accept-charset="UTF-8">
@@ -19,11 +25,11 @@ app.get('/', (req, res) => {
   </form>`).join('<hr />');
   res.send(html);
 });
-app.post('/', simple.scrape);
-app.post('/details', simple.details);
-app.post('/browser', browser.scrape);
-app.post('/browser/details', browser.details);
-app.post('/browser/comments', browser.comments);
+app.post('/simple/', simple.scrape);
+app.post('/simple/details', simple.details);
+app.post('/headless', headless.scrape);
+app.post('/headless/details', headless.details);
+app.post('/headless/comments', headless.comments);
 app.listen(port, () => {
   console.log(`Example app listening on port ${port}!`);
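A quick smoke test of the renamed routes, assuming the reader server is running locally on its default port and that the details endpoints accept a form-encoded url field (illustrative only):

    import requests

    BASE = 'http://127.0.0.1:33843'
    article = {'url': 'https://example.com/story'}

    # Fast, fetch-based extraction.
    simple = requests.post(f'{BASE}/simple/details', data=article, timeout=20)

    # Full browser rendering; slower, so allow a longer timeout.
    headless = requests.post(f'{BASE}/headless/details', data=article, timeout=60)

    print(simple.status_code, headless.status_code)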

@@ -1 +0,0 @@
-Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21


@@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
     }
     return route.continue();
   });
-  await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-  await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
-  await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
   await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
   await tab.waitForTimeout(2000);

@@ -0,0 +1 @@
+Subproject commit fff7f483db947e690977bfc80955a53329d3d349
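Note that the gitlink at the new path pins a different upstream commit (fff7f48, versus 0f129c5 at the old path), so the submodule is updated as well as moved. After pulling this commit, running git submodule sync followed by git submodule update --init should realign a working tree with the renamed path.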