Compare commits

6 Commits

Author                 SHA1        Message                              Date
Jason Schwarzenberger  e0960d59f3  update readme.                       2020-11-18 13:26:34 +13:00
Jason Schwarzenberger  f5b38f5c6b  remove readerserver, add declutter.  2020-11-18 12:59:35 +13:00
Jason Schwarzenberger  c9da2a078b  increase setTimeouts.                2020-11-18 10:06:45 +13:00
Jason Schwarzenberger  78654e0c63  reduce setTimeout.                   2020-11-17 16:07:33 +13:00
Jason Schwarzenberger  3b885e4327  renaming things.                     2020-11-17 15:54:14 +13:00
Jason Schwarzenberger  55d50a86d8  hmmm                                 2020-11-17 15:13:38 +13:00
23 changed files with 34 additions and 1872 deletions

.gitmodules
View File

@@ -1,3 +1,3 @@
[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
[submodule "readerserver"]
path = readerserver
url = https://github.com/master5o1/declutter.git

View File

@@ -20,7 +20,7 @@ $ sudo apt install yarn
Clone this repo:
```text
$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
$ cd qotnews
```
@@ -37,14 +37,14 @@ $ source env/bin/activate
Configure Praw for your Reddit account (optional):
* Go to https://www.reddit.com/prefs/apps
* Click "Create app"
* Name: whatever
* App type: script
* Description: blank
* About URL: blank
* Redirect URL: your GitHub profile
* Submit, copy the client ID and client secret into `settings.py` below
- Go to https://www.reddit.com/prefs/apps
- Click "Create app"
- Name: whatever
- App type: script
- Description: blank
- About URL: blank
- Redirect URL: your GitHub profile
- Submit, copy the client ID and client secret into `settings.py` below
```text
(env) $ vim settings.py.example
@@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
[program:qotnewsreader]
user=qotnews
directory=/home/qotnews/qotnews/readerserver
command=node main.js
command=node index.js
autostart=true
autorestart=true
stderr_logfile=/var/log/qotnewsreader.log

View File

@@ -12,7 +12,7 @@ import settings
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, browser, local
from scrapers import outline, declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
def get_article(url):
scrapers = {
'declutter': declutter,
'headless': headless,
'simple': simple,
'outline': outline,
'browser': browser,
'local': local,
'declutter': declutter,
}
available = settings.SCRAPERS or ['local']
if 'local' not in available:
available += ['local']
available = settings.SCRAPERS or ['headless', 'simple']
if 'simple' not in available:
available += ['simple']
for scraper in available:
if scraper not in scrapers.keys():

View File

@@ -3,14 +3,15 @@ logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import READER_PORT
READ_API = 'http://127.0.0.1:33843/browser/details'
READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
TIMEOUT = 60
def get_html(url):
logging.info(f"Reader Scraper: {url}")
logging.info(f"Headless Browser Scraper: {url}")
details = get_details(url)
if not details:
return ''
@@ -25,7 +26,7 @@ def get_details(url):
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem Scraping article: {}'.format(str(e)))
logging.error('Problem scraping article: {}'.format(str(e)))
return None
def get_comments(url):

View File

@@ -3,12 +3,13 @@ logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
from settings import READER_PORT
READ_API = 'http://127.0.0.1:33843/details'
READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
TIMEOUT = 20
def get_html(url):
logging.info(f"Local Scraper: {url}")
logging.info(f"Simple Scraper: {url}")
details = get_details(url)
if not details:
return ''

View File

@@ -142,7 +142,7 @@ def static_story(sid):
url=url,
description=description)
http_server = WSGIServer(('', 33842), flask_app)
http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)
def _add_new_refs():
for ref, source, urlref in feed.get_list():

View File

@@ -4,6 +4,10 @@
HOSTNAME = 'news.t0.vc'
MAX_STORY_AGE = 3*24*60*60
SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
API_PORT = 33842
READER_PORT = 3000
# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site
@@ -51,8 +55,6 @@ CATEGORY = {}
# ],
# }
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
# Reddit account info
# leave blank if not using Reddit
REDDIT_CLIENT_ID = ''

readerserver Submodule

@@ -0,0 +1 @@
Subproject commit 9c0336b0af4be942991a7a3771c09ec08938bde8

View File

@@ -1,92 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Editor
*.swp
*.swo

View File

@@ -1,30 +0,0 @@
const port = 33843;
const express = require('express');
const app = express();
const simple = require('./scraper/simple');
const browser = require('./scraper/browser');
app.use(express.urlencoded({ extended: true }));
app.get('/', (req, res) => {
const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
const html = routes.map(route => `
<form method="POST" action="${route}" accept-charset="UTF-8">
<fieldset>
<legend>route: POST ${route}</legend>
<input name="url">
<button type="submit">SUBMIT</button>
</fieldset>
</form>`).join('<hr />');
res.send(html);
});
app.post('/', simple.scrape);
app.post('/details', simple.details);
app.post('/browser', browser.scrape);
app.post('/browser/details', browser.details);
app.post('/browser/comments', browser.comments);
app.listen(port, () => {
console.log(`Example app listening on port ${port}!`);
});
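
For context, the removed reader server spoke plain form-encoded POST on port 33843, and its `/details` route replied with the Readability article object as JSON. A minimal client sketch against that old route (a hypothetical caller, not code from the repo; node-fetch is already a dependency in the package.json that follows, and the article URL is illustrative):

```javascript
// Hypothetical client for the old reader server's /details route.
const fetch = require('node-fetch');

async function getDetails(articleUrl) {
  const res = await fetch('http://127.0.0.1:33843/details', {
    method: 'POST',
    // URLSearchParams body -> application/x-www-form-urlencoded,
    // matching the server's express.urlencoded() parser.
    body: new URLSearchParams({ url: articleUrl }),
  });
  if (!res.ok) throw new Error(`reader server returned ${res.status}`);
  return res.json(); // Readability article: { title, content, ... }
}

getDetails('https://example.com/story')
  .then(article => console.log(article.title))
  .catch(console.error);
```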

View File

@@ -1,15 +0,0 @@
{
"name": "readerserver",
"version": "1.0.0",
"main": "main.js",
"license": "MIT",
"dependencies": {
"@mozilla/readability": "^0.3.0",
"dompurify": "^1.0.11",
"express": "^4.17.1",
"jsdom": "^15.1.1",
"node-fetch": "^2.6.1",
"playwright": "^1.5.2",
"request": "^2.88.0"
}
}

View File

@@ -1,45 +0,0 @@
const { firefox } = require("playwright");
const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const { getUserAgent } = require('../../utils/user-agent');
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
module.exports.getDetails = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.route(/.*/, (route) => {
const routeUrl = route.request().url();
const blockedDomains = Object.keys(blockedRegexes);
const domain = matchUrlDomain(blockedDomains, routeUrl);
if (domain && routeUrl.match(blockedRegexes[domain])) {
return route.abort();
}
return route.continue();
});
await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
await tab.waitForTimeout(2000);
const body = await tab.content();
const doc = new JSDOM(body, { url });
const reader = new Readability(doc.window.document);
const article = reader.parse();
return article;
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};
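
The deleted browser scraper stacks three Playwright techniques: route interception to abort requests for known paywall scripts, addInitScript to run the bypass and cosmetic filters before any page code, and a Readability parse of the rendered HTML. A trimmed, self-contained sketch of just the interception step (the blocked pattern and target URL are illustrative, not the full blocklist):

```javascript
const { firefox } = require('playwright');

(async () => {
  const browser = await firefox.launch({ headless: true });
  const page = await browser.newPage();
  // Abort any request whose URL matches a blocked pattern; let the rest through.
  await page.route(/.*/, (route) => {
    const url = route.request().url();
    if (/\.tinypass\.com\//.test(url)) return route.abort();
    return route.continue();
  });
  await page.goto('https://example.com', { waitUntil: 'domcontentloaded' });
  console.log(await page.title());
  await browser.close();
})();
```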

View File

@@ -1,34 +0,0 @@
const { JSDOM } = require("jsdom");
const { firefox } = require("playwright");
const { getUserAgent } = require('../../utils/user-agent');
const { disqusThread } = require('../../utils/disqus-thread');
const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
module.exports.getComments = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
const text = await response.text();
const dom = new JSDOM(text, response.url());
const script = dom.window.document.querySelector('#disqus-threadData')
const data = JSON.parse(script.innerHTML);
return disqusThread(data);
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};

View File

@@ -1,40 +0,0 @@
const { getDetails } = require('./_browser');
const { getComments } = require('./_comments');
module.exports.scrape = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article || !article.content) {
throw new Error('failed to get details.');
}
return res.send(article.content);
} catch (e) {
return res.sendStatus(500);
}
};
module.exports.details = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article) {
throw new Error('failed to get details.');
}
return res.send(article);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};
module.exports.comments = async (req, res) => {
try {
const comments = await getComments(req.body.url);
if (!comments) {
throw new Error('failed to get comments.');
}
return res.send(comments);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};

View File

@@ -1 +0,0 @@
Subproject commit 44f3d1b114400d73aba2bf2551a34f9f142eda76

View File

@@ -1,104 +0,0 @@
(function () {
removeHiddenElements();
if (matchDomain("stuff.co.nz")) {
removeSelectors([
".support-brief-container",
'[class*="donation-in-"]',
".sics-component__sharebar",
".breaking-news-pointer",
".bigbyline-container",
[
".sics-component__html-injector.sics-component__story__paragraph",
"READ MORE:",
],
]);
}
if (matchDomain("nzherald.co.nz")) {
removeSelectors([
"[href$='#commenting-widget']",
".related-articles",
".article__print-button",
".share-bar",
".c-suggest-links.read-more-links",
".website-of-year",
".meta-data",
".article__kicker",
".author__image",
]);
}
if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
removeSelectors([".c-advert-app", ".c-sub-nav"]);
}
if (matchDomain(["newsroom.co.nz"])) {
removeSelectors([".article_content__section", ".bio"]);
}
if (matchDomain(["newshub.co.nz"])) {
removeSelectors([
".c-ArticleHeading-authorPicture",
".relatedarticles",
".ArticleAttribution",
'.GlobalFooter'
]);
}
if (matchDomain(["tvnz.co.nz"])) {
removeSelectors([".signup-container container"]);
}
if (matchDomain(["thespinoff.co.nz"])) {
removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
}
function matchDomain(domains) {
const hostname = window.location.hostname;
if (typeof domains === "string") {
domains = [domains];
}
return domains.some(
(domain) => hostname === domain || hostname.endsWith("." + domain)
);
}
function removeDOMElement(...elements) {
for (const element of elements) {
if (element) {
element.remove();
}
}
}
function pageContains(selector, text) {
const elements = document.querySelectorAll(selector);
return Array.prototype.filter.call(elements, function (element) {
return RegExp(text).test(element.textContent);
});
}
function removeHiddenElements() {
window.setTimeout(function () {
const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
Array.from(document.querySelectorAll(selector))
.filter((element) => {
const computed = getComputedStyle(element);
const displayNone = computed["display"] === "none";
const visibilityHidden = computed["visibility"] === "hidden";
return displayNone || visibilityHidden;
})
.forEach((element) => element && element.remove());
}, 1000);
}
function removeSelectors(selectors) {
window.setTimeout(function () {
const elements = selectors.flatMap((s) => {
if (typeof s === "string") {
return Array.from(document.querySelectorAll(s));
}
if (s && s.constructor.name === "Array") {
return pageContains(...s);
}
return undefined;
});
removeDOMElement(...elements);
}, 1000);
}
})();

View File

@@ -1,14 +0,0 @@
(function () {
const { host, protocol } = window.location;
const url = `${protocol}//${host}`;
[
['[src^="/"]', 'src'],
['[href^="/"]', 'href']
].forEach(([selector, attribute]) => {
Array.from(document.querySelectorAll(selector))
.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
.forEach((e) => {
e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
});
});
})();
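
This init script absolutizes root-relative href/src attributes so links survive being re-served from another origin; protocol-relative URLs are skipped because the `/^\/[^\/]/` test requires exactly one leading slash. A standalone jsdom rendition of the same rewrite (page URL and markup are illustrative):

```javascript
const { JSDOM } = require('jsdom');

const dom = new JSDOM(
  '<a href="/about">about</a> <img src="//cdn.example.com/x.png">',
  { url: 'https://example.com/story' }
);
const { document } = dom.window;

// Same filter as above: exactly one leading slash, so protocol-relative
// URLs (//cdn.example.com/...) are left untouched.
for (const [selector, attribute] of [['[src^="/"]', 'src'], ['[href^="/"]', 'href']]) {
  for (const e of document.querySelectorAll(selector)) {
    const attr = e.attributes[attribute];
    if (attr && /^\/[^\/]/.test(attr.value)) {
      attr.value = `https://example.com${attr.value}`;
    }
  }
}

console.log(document.body.innerHTML);
// -> <a href="https://example.com/about">about</a> <img src="//cdn.example.com/x.png">
```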

View File

@@ -1,59 +0,0 @@
const fetch = require('node-fetch');
const { JSDOM } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const { getUserAgent } = require('../utils/user-agent');
const extract = (url, body) => {
const doc = new JSDOM(body, { url: url });
const reader = new Readability(doc.window.document);
return reader.parse();
};
module.exports.scrape = async (req, res) => {
try {
const { userAgent, headers } = getUserAgent(req.body.url);
const response = await fetch(req.body.url, {
headers: {
...headers,
'User-Agent': userAgent
}
});
if (!response.ok) {
return res.sendStatus(response.statusCode);
}
const html = await response.text();
const article = await extract(req.body.url, html);
if (article && article.content) {
return res.send(article.content);
}
return res.sendStatus(404);
} catch (e) {
console.error(e);
return res.sendStatus(500);
}
};
module.exports.details = async (req, res) => {
try {
const { userAgent, headers } = getUserAgent(req.body.url);
const response = await fetch(req.body.url, {
headers: {
...headers,
'User-Agent': userAgent
}
});
if (!response.ok) {
return res.sendStatus(response.statusCode);
}
const html = await response.text();
const article = await extract(req.body.url, html);
if (article) {
return res.send(article);
}
return res.sendStatus(404);
} catch (e) {
console.error(e);
return res.sendStatus(500);
}
};

View File

@@ -1,11 +0,0 @@
const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
const googleBotIp = '66.249.66.1';
module.exports.googleBot = {
userAgent: googleBotUserAgent,
ip: googleBotIp,
headers: {
'User-Agent': googleBotUserAgent,
'X-Forwarded-For': googleBotIp,
}
}

View File

@@ -1,21 +0,0 @@
module.exports.disqusThread = data => {
const comments = data.response.posts.reduce((c, post) => ({
...c,
[post.id.toString()]: {
author: post.author.name,
authorLink: post.author.profileUrl,
date: post.createdAt,
text: post.raw_message,
score: post.points,
children: [],
id: post.id.toString(),
parent: (post.parent || '').toString(),
}
}), {});
Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
const comment = comments[id];
comments[comment.parent].children.push(comment);
});
const parents = Object.keys(comments).filter(id => comments[id].parent).map(id => comments[id]);
return parents;
};
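
disqusThread converts Disqus's flat post list into a nested tree: it indexes every post by id with an empty children array, then pushes each comment into its parent's bucket. A self-contained sketch of the same indexing technique (sample data with the fields trimmed to the essentials; real Disqus posts carry more):

```javascript
// Flat list -> nested tree, same indexing technique as above.
const posts = [
  { id: '1', parent: '', text: 'root comment' },
  { id: '2', parent: '1', text: 'first reply' },
  { id: '3', parent: '1', text: 'second reply' },
];

// Index by id, giving every comment an empty children array.
const byId = Object.fromEntries(posts.map(p => [p.id, { ...p, children: [] }]));

// Attach each comment to its parent's children.
for (const comment of Object.values(byId)) {
  if (comment.parent && byId[comment.parent]) {
    byId[comment.parent].children.push(comment);
  }
}

// Keep only the parentless comments as the top level of the tree.
const roots = Object.values(byId).filter(c => !c.parent);
console.log(JSON.stringify(roots, null, 2));
```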

View File

@@ -1,98 +0,0 @@
module.exports.blockedRegexes = {
"adweek.com": /.+\.lightboxcdn\.com\/.+/,
"afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
"businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
"chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
"economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
"editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
"foreignpolicy.com": /.+\.tinypass\.com\/.+/,
"fortune.com": /.+\.tinypass\.com\/.+/,
"haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
"haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
"inquirer.com": /.+\.tinypass\.com\/.+/,
"lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
"lrb.co.uk": /.+\.tinypass\.com\/.+/,
"nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
"medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
"interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
"repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
"spectator.co.uk": /.+\.tinypass\.com\/.+/,
"spectator.com.au": /.+\.tinypass\.com\/.+/,
"telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
"thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
"thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
"thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
"thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
"wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
"barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
"sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
"latercera.com": /.+\.cxense\.com\/+/,
"lesechos.fr": /.+\.tinypass\.com\/.+/,
"washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
"thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
"technologyreview.com": /.+\.blueconic\.net\/.+/,
};
module.exports.useGoogleBotSites = [
"adelaidenow.com.au",
"barrons.com",
"couriermail.com.au",
"dailytelegraph.com.au",
"fd.nl",
"genomeweb.com",
"haaretz.co.il",
"haaretz.com",
"heraldsun.com.au",
"mexiconewsdaily.com",
"ntnews.com.au",
"quora.com",
"seekingalpha.com",
"telegraph.co.uk",
"theaustralian.com.au",
"themarker.com",
"themercury.com.au",
"thenational.scot",
"thetimes.co.uk",
"wsj.com",
"kansascity.com",
"republic.ru",
"nzz.ch",
"handelsblatt.com",
"washingtonpost.com",
"df.cl",
];
function matchDomain(domains, hostname) {
let matchedDomain = false;
if (typeof domains === "string") {
domains = [domains];
}
domains.some(
(domain) =>
(hostname === domain || hostname.endsWith("." + domain)) &&
(matchedDomain = domain)
);
return matchedDomain;
}
function matchUrlDomain(domains, url) {
return matchDomain(domains, urlHost(url));
}
function urlHost(url) {
if (url && url.startsWith("http")) {
try {
return new URL(url).hostname;
} catch (e) {
console.log(`url not valid: ${url} error: ${e}`);
}
}
return url;
}
module.exports.matchDomain = matchDomain;
module.exports.matchUrlDomain = matchUrlDomain;
module.exports.urlHost = urlHost;
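
matchDomain/matchUrlDomain reduce a URL to its hostname and test it against a domain list with an `endsWith("." + domain)` suffix match, returning the matched domain so the caller can look up its regex. A usage sketch mirroring the browser scraper's route handler (the request URL is illustrative; the require path assumes the readerserver root):

```javascript
const { blockedRegexes, matchUrlDomain } = require('./utils/sites');

// economist.com is on the blocklist above, and this URL matches its regex.
const requestUrl =
  'https://www.economist.com/engassets/_next/static/chunks/framework-1234.js';

const domain = matchUrlDomain(Object.keys(blockedRegexes), requestUrl);
if (domain && requestUrl.match(blockedRegexes[domain])) {
  console.log(`block ${requestUrl} (matched ${domain})`);
}
```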

View File

@@ -1,18 +0,0 @@
const { googleBot } = require('./constants');
const { matchUrlDomain, useGoogleBotSites } = require("./sites");
module.exports.getUserAgent = (url) => {
const useGoogleBot = useGoogleBotSites.some(function (item) {
return typeof item === "string" && matchUrlDomain(item, url);
});
if (!useGoogleBot) {
return {};
}
return {
userAgent: googleBot.userAgent,
headers: {
"X-Forwarded-For": googleBot.ip
}
}
};
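
getUserAgent only spoofs Googlebot for domains on the useGoogleBotSites list; every other URL gets an empty object, i.e. the browser's default identity. For example (assuming the lists and constants above; the URLs are illustrative):

```javascript
const { getUserAgent } = require('./utils/user-agent');

// wsj.com is on useGoogleBotSites, so it gets the Googlebot identity:
console.log(getUserAgent('https://www.wsj.com/articles/example'));
// { userAgent: 'Googlebot/2.1 (+http://www.google.com/bot.html)',
//   headers: { 'X-Forwarded-For': '66.249.66.1' } }

// Anything else falls through to an empty object (browser defaults):
console.log(getUserAgent('https://example.com/'));
// {}
```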

File diff suppressed because it is too large.