renaming things.

Jason Schwarzenberger 2020-11-17 15:50:31 +13:00
parent 55d50a86d8
commit 3b885e4327
13 changed files with 34 additions and 28 deletions

.gitmodules vendored

@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
+	path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
+	url = https://github.com/iamadamdev/bypass-paywalls-chrome/

@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
     for scraper in available:
         if scraper not in scrapers.keys():
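
The selection logic above keeps its shape through the rename: try each configured scraper in order and fall back until one yields content, with 'simple' always appended as the last resort. A minimal sketch of that fallback pattern (the callables here are hypothetical stand-ins, not the project's actual scraper modules):

import logging

def fallback_scrape(url, scrapers, available):
    # Try each available scraper in turn; the first non-empty HTML wins.
    for name in available:
        if name not in scrapers:
            continue
        try:
            html = scrapers[name](url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except Exception as e:
            logging.error('Scraper {} failed: {}'.format(name, e))
    return ''

# Hypothetical stand-ins for the real modules (headless, simple, ...):
scrapers = {
    'headless': lambda url: '',              # e.g. headless.get_html
    'simple': lambda url: '<p>article</p>',  # e.g. simple.get_html
}
print(fallback_scrape('https://example.com', scrapers, ['headless', 'simple']))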

@@ -4,13 +4,13 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
+READ_API = 'http://127.0.0.1:33843/headless/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
 TIMEOUT = 60
 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
@@ -25,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None
 def get_comments(url):
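
For context, get_details on both scrapers appears to POST the article URL to the local reader server and decode a JSON reply; a sketch of the headless variant under that assumption (endpoint taken from the READ_API constant above):

import logging
import requests

READ_API = 'http://127.0.0.1:33843/headless/details'
TIMEOUT = 60

def get_details(url):
    # Assumed request shape: a 'url' form field, with the extracted
    # article details coming back as JSON.
    try:
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        r.raise_for_status()
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None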

@@ -4,11 +4,11 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:33843/simple/details'
 TIMEOUT = 20
 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''

@@ -51,7 +51,7 @@ CATEGORY = {}
 # ],
 # }
-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
 # Reddit account info
 # leave blank if not using Reddit
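
Since get_article treats this list as the fallback priority and appends 'simple' when it is missing, a deployment can reorder or trim it; a hypothetical override that skips the headless browser entirely:

# settings.py (hypothetical override): order is the fallback priority.
# 'simple' need not be listed; get_article appends it as the last resort.
SCRAPERS = ['outline', 'declutter', 'simple']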

@@ -2,12 +2,18 @@ const port = 33843;
 const express = require('express');
 const app = express();
 const simple = require('./scraper/simple');
-const browser = require('./scraper/browser');
+const headless = require('./scraper/headless');
 app.use(express.urlencoded({ extended: true }));
 app.get('/', (req, res) => {
-  const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
+  const routes = [
+    '/simple',
+    '/simple/details',
+    '/headless',
+    '/headless/details',
+    '/headless/comments'
+  ];
   const html = routes.map(route => `
     <form method="POST" action="${route}" accept-charset="UTF-8">
@@ -19,11 +25,11 @@ app.get('/', (req, res) => {
     </form>`).join('<hr />');
   res.send(html);
 });
-app.post('/', simple.scrape);
-app.post('/details', simple.details);
-app.post('/browser', browser.scrape);
-app.post('/browser/details', browser.details);
-app.post('/browser/comments', browser.comments);
+app.post('/simple/', simple.scrape);
+app.post('/simple/details', simple.details);
+app.post('/headless', headless.scrape);
+app.post('/headless/details', headless.details);
+app.post('/headless/comments', headless.comments);
 app.listen(port, () => {
   console.log(`Example app listening on port ${port}!`);
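
Callers now have to use the prefixed routes. A quick smoke test against a locally running reader server, assuming the handlers accept a url form field as the Python scrapers above suggest:

import requests

BASE = 'http://127.0.0.1:33843'

# Old routes: POST / and /details (simple), /browser/* (headless).
# New routes: everything lives under /simple/* and /headless/*.
for route in ('/simple/details', '/headless/details', '/headless/comments'):
    r = requests.post(BASE + route, data={'url': 'https://example.com'}, timeout=60)
    print(route, r.status_code)

Note that the handler is mounted at '/simple/' while the routes list advertises '/simple'; Express ignores trailing slashes under its default non-strict routing, so both spellings resolve to the same handler.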

@@ -1 +0,0 @@
-Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21

@@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
     }
     return route.continue();
   });
-  await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-  await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
-  await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
+  await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
   await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
   await tab.waitForTimeout(2000);
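
The three init scripts are injected before navigation, so they run before any page script. If it helps to see the same flow outside the diff, here is an equivalent sketch in Playwright for Python (sync API; the script paths assume the readerserver working directory):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Mirror the renamed addInitScript calls from getDetails above.
    page.add_init_script(path='scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js')
    page.add_init_script(path='scraper/headless/scripts/cosmetic-filters.js')
    page.add_init_script(path='scraper/headless/scripts/fix-relative-links.js')
    page.goto('https://example.com', timeout=60000, wait_until='domcontentloaded')
    page.wait_for_timeout(2000)
    print(page.title())
    browser.close()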

@@ -0,0 +1 @@
+Subproject commit fff7f483db947e690977bfc80955a53329d3d349