Compare commits


No commits in common. "e0960d59f33363e1ae60250161d35ceefe1700fa" and "55e7f6bb1400d1d87169d670e427986f30b20484" have entirely different histories.

23 changed files with 1872 additions and 34 deletions

.gitmodules

@@ -1,3 +1,3 @@
-[submodule "readerserver"]
-	path = readerserver
-	url = https://github.com/master5o1/declutter.git
+[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
+	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
+	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git


@@ -20,7 +20,7 @@ $ sudo apt install yarn
 Clone this repo:
 ```text
-$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
+$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
 $ cd qotnews
 ```
@@ -37,14 +37,14 @@ $ source env/bin/activate
 Configure Praw for your Reddit account (optional):
-- Go to https://www.reddit.com/prefs/apps
-- Click "Create app"
-- Name: whatever
-- App type: script
-- Description: blank
-- About URL: blank
-- Redirect URL: your GitHub profile
-- Submit, copy the client ID and client secret into `settings.py` below
+* Go to https://www.reddit.com/prefs/apps
+* Click "Create app"
+* Name: whatever
+* App type: script
+* Description: blank
+* About URL: blank
+* Redirect URL: your GitHub profile
+* Submit, copy the client ID and client secret into `settings.py` below
 ```text
 (env) $ vim settings.py.example
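
For reference, that last step just means filling in plain variables in `settings.py`, along these lines (a sketch only: `REDDIT_CLIENT_ID` appears in the settings.py.example diff further down, while the other field names and all values here are illustrative assumptions):

```python
# Illustrative placeholders -- copy the real values from your Reddit app page.
REDDIT_CLIENT_ID = 'abc123exampleid'
REDDIT_CLIENT_SECRET = 'def456examplesecret'
REDDIT_USER_AGENT = 'qotnews scraper by /u/your-username'
```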
@@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
 [program:qotnewsreader]
 user=qotnews
 directory=/home/qotnews/qotnews/readerserver
-command=node index.js
+command=node main.js
 autostart=true
 autorestart=true
 stderr_logfile=/var/log/qotnewsreader.log


@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline, declutter, browser, local
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'headless': headless,
-        'simple': simple,
-        'outline': outline,
         'declutter': declutter,
+        'outline': outline,
+        'browser': browser,
+        'local': local,
     }
-    available = settings.SCRAPERS or ['headless', 'simple']
-    if 'simple' not in available:
-        available += ['simple']
+    available = settings.SCRAPERS or ['local']
+    if 'local' not in available:
+        available += ['local']
     for scraper in available:
         if scraper not in scrapers.keys():
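
The hunk is cut off inside the fallback loop. For context, `get_article` presumably just walks the configured scrapers in order and returns the first usable result; a minimal sketch of that flow, assuming the loop body continues in the obvious way (only the dict, the `available` setup, and the first two loop lines are actually part of this diff):

```python
def get_article(url):
    # Sketch of the post-rename fallback flow; everything past the membership
    # check is an assumption, not something this diff shows.
    scrapers = {
        'declutter': declutter,
        'outline': outline,
        'browser': browser,
        'local': local,
    }
    available = settings.SCRAPERS or ['local']
    if 'local' not in available:
        available += ['local']
    for scraper in available:
        if scraper not in scrapers.keys():
            continue  # ignore unknown names in settings.SCRAPERS
        try:
            html = scrapers[scraper].get_html(url)
            if html:
                return html  # first scraper that succeeds wins
        except KeyboardInterrupt:
            raise
        except BaseException:
            pass  # fall through to the next scraper
    return ''
```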


@@ -3,15 +3,14 @@ logging.basicConfig(
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     level=logging.DEBUG)
 import requests
-from settings import READER_PORT
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
+READ_API = 'http://127.0.0.1:33843/browser/details'
+READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
 TIMEOUT = 60
 def get_html(url):
-    logging.info(f"Headless Browser Scraper: {url}")
+    logging.info(f"Reader Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''
@@ -26,7 +25,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
+        logging.error('Problem Scraping article: {}'.format(str(e)))
         return None
 def get_comments(url):
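
The hunk stops right at the `get_comments` definition. Presumably it mirrors `get_details` but posts to `READ_COMMENT__API` and hands back the nested comment tree that the reader server's `/browser/comments` route produces; a hedged sketch, assuming the same form-encoded POST that `readerserver/main.js` (further down in this diff) expects:

```python
def get_comments(url):
    # Sketch only: assumed to follow the same shape as get_details above.
    logging.info(f"Reader Scraper (comments): {url}")
    try:
        r = requests.post(READ_COMMENT__API, data=dict(url=url), timeout=TIMEOUT)
        r.raise_for_status()
        return r.json()  # nested comment tree assembled by utils/disqus-thread.js
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping comments: {}'.format(str(e)))
        return None
```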


@@ -3,13 +3,12 @@ logging.basicConfig(
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     level=logging.DEBUG)
 import requests
-from settings import READER_PORT
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
+READ_API = 'http://127.0.0.1:33843/details'
 TIMEOUT = 20
 def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
+    logging.info(f"Local Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''


@@ -142,7 +142,7 @@ def static_story(sid):
         url=url,
         description=description)
-http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)
+http_server = WSGIServer(('', 33842), flask_app)
 def _add_new_refs():
     for ref, source, urlref in feed.get_list():


@@ -4,10 +4,6 @@
 HOSTNAME = 'news.t0.vc'
 MAX_STORY_AGE = 3*24*60*60
-SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
-API_PORT = 33842
-READER_PORT = 3000
 # Feed Lengths
 # Number of top items from each site to pull
 # set to 0 to disable that site
@@ -55,6 +51,8 @@ CATEGORY = {}
 # ],
 # }
+SCRAPERS = ['browser', 'declutter', 'outline', 'local']
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''
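
Since `SCRAPERS` now lives in the optional block of `settings.py`, a deployment can trim it to the backends it actually runs; per the `feed.py` change above, `'local'` is appended automatically if it is missing. A minimal example override (hypothetical values):

```python
# Only run the headless browser scraper and outline.com;
# feed.py will still append 'local' as the final fallback.
SCRAPERS = ['browser', 'outline']
```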

@@ -1 +0,0 @@
-Subproject commit 9c0336b0af4be942991a7a3771c09ec08938bde8

readerserver/.gitignore

@@ -0,0 +1,92 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Editor
*.swp
*.swo

readerserver/main.js

@@ -0,0 +1,30 @@
const port = 33843;
const express = require('express');
const app = express();
const simple = require('./scraper/simple');
const browser = require('./scraper/browser');
app.use(express.urlencoded({ extended: true }));
app.get('/', (req, res) => {
const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
const html = routes.map(route => `
<form method="POST" action="${route}" accept-charset="UTF-8">
<fieldset>
<legend>route: POST ${route}</legend>
<input name="url">
<button type="submit">SUBMIT</button>
</fieldset>
</form>`).join('<hr />');
res.send(html);
});
app.post('/', simple.scrape);
app.post('/details', simple.details);
app.post('/browser', browser.scrape);
app.post('/browser/details', browser.details);
app.post('/browser/comments', browser.comments);
app.listen(port, () => {
console.log(`Example app listening on port ${port}!`);
});

readerserver/package.json

@@ -0,0 +1,15 @@
{
"name": "readerserver",
"version": "1.0.0",
"main": "main.js",
"license": "MIT",
"dependencies": {
"@mozilla/readability": "^0.3.0",
"dompurify": "^1.0.11",
"express": "^4.17.1",
"jsdom": "^15.1.1",
"node-fetch": "^2.6.1",
"playwright": "^1.5.2",
"request": "^2.88.0"
}
}


@@ -0,0 +1,45 @@
const { firefox } = require("playwright");
const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const { getUserAgent } = require('../../utils/user-agent');
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
module.exports.getDetails = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.route(/.*/, (route) => {
const routeUrl = route.request().url();
const blockedDomains = Object.keys(blockedRegexes);
const domain = matchUrlDomain(blockedDomains, routeUrl);
if (domain && routeUrl.match(blockedRegexes[domain])) {
return route.abort();
}
return route.continue();
});
await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
await tab.waitForTimeout(2000);
const body = await tab.content();
const doc = new JSDOM(body, { url });
const reader = new Readability(doc.window.document);
const article = reader.parse();
return article;
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};


@@ -0,0 +1,34 @@
const { JSDOM } = require("jsdom");
const { firefox } = require("playwright");
const { getUserAgent } = require('../../utils/user-agent');
const { disqusThread } = require('../../utils/disqus-thread');
const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
module.exports.getComments = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
const text = await response.text();
const dom = new JSDOM(text, response.url());
const script = dom.window.document.querySelector('#disqus-threadData')
const data = JSON.parse(script.innerHTML);
return disqusThread(data);
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};


@@ -0,0 +1,40 @@
const { getDetails } = require('./_browser');
const { getComments } = require('./_comments');
module.exports.scrape = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article || !article.content) {
throw new Error('failed to get details.');
}
return res.send(article.content);
} catch (e) {
return res.sendStatus(500);
}
};
module.exports.details = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article) {
throw new Error('failed to get details.');
}
return res.send(article);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};
module.exports.comments = async (req, res) => {
try {
const comments = await getComments(req.body.url);
if (!comments) {
throw new Error('failed to get comments.');
}
return res.send(comments);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};

@@ -0,0 +1 @@
+Subproject commit 44f3d1b114400d73aba2bf2551a34f9f142eda76


@@ -0,0 +1,104 @@
(function () {
removeHiddenElements();
if (matchDomain("stuff.co.nz")) {
removeSelectors([
".support-brief-container",
'[class*="donation-in-"]',
".sics-component__sharebar",
".breaking-news-pointer",
".bigbyline-container",
[
".sics-component__html-injector.sics-component__story__paragraph",
"READ MORE:",
],
]);
}
if (matchDomain("nzherald.co.nz")) {
removeSelectors([
"[href$='#commenting-widget']",
".related-articles",
".article__print-button",
".share-bar",
".c-suggest-links.read-more-links",
".website-of-year",
".meta-data",
".article__kicker",
".author__image",
]);
}
if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
removeSelectors([".c-advert-app", ".c-sub-nav"]);
}
if (matchDomain(["newsroom.co.nz"])) {
removeSelectors([".article_content__section", ".bio"]);
}
if (matchDomain(["newshub.co.nz"])) {
removeSelectors([
".c-ArticleHeading-authorPicture",
".relatedarticles",
".ArticleAttribution",
'.GlobalFooter'
]);
}
if (matchDomain(["tvnz.co.nz"])) {
removeSelectors([".signup-container container"]);
}
if (matchDomain(["thespinoff.co.nz"])) {
removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
}
function matchDomain(domains) {
const hostname = window.location.hostname;
if (typeof domains === "string") {
domains = [domains];
}
return domains.some(
(domain) => hostname === domain || hostname.endsWith("." + domain)
);
}
function removeDOMElement(...elements) {
for (const element of elements) {
if (element) {
element.remove();
}
}
}
function pageContains(selector, text) {
const elements = document.querySelectorAll(selector);
return Array.prototype.filter.call(elements, function (element) {
return RegExp(text).test(element.textContent);
});
}
function removeHiddenElements() {
window.setTimeout(function () {
const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
Array.from(document.querySelectorAll(selector))
.filter((element) => {
const computed = getComputedStyle(element);
const displayNone = computed["display"] === "none";
const visibilityHidden = computed["visibility"] === "hidden";
return displayNone || visibilityHidden;
})
.forEach((element) => element && element.remove());
}, 1000);
}
function removeSelectors(selectors) {
window.setTimeout(function () {
const elements = selectors.flatMap((s) => {
if (typeof s === "string") {
return Array.from(document.querySelectorAll(s));
}
if (s && s.constructor.name === "Array") {
return pageContains(...s);
}
return undefined;
});
removeDOMElement(...elements);
}, 1000);
}
})();


@@ -0,0 +1,14 @@
(function () {
const { host, protocol } = window.location;
const url = `${protocol}//${host}`;
[
['[src^="/"]', 'src'],
['[href^="/"]', 'href']
].forEach(([selector, attribute]) => {
Array.from(document.querySelectorAll(selector))
.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
.forEach((e) => {
e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
});
});
})();


@@ -0,0 +1,59 @@
const fetch = require('node-fetch');
const { JSDOM } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const { getUserAgent } = require('../utils/user-agent');
const extract = (url, body) => {
const doc = new JSDOM(body, { url: url });
const reader = new Readability(doc.window.document);
return reader.parse();
};
module.exports.scrape = async (req, res) => {
try {
const { userAgent, headers } = getUserAgent(req.body.url);
const response = await fetch(req.body.url, {
headers: {
...headers,
'User-Agent': userAgent
}
});
if (!response.ok) {
return res.sendStatus(response.statusCode);
}
const html = await response.text();
const article = await extract(req.body.url, html);
if (article && article.content) {
return res.send(article.content);
}
return res.sendStatus(404);
} catch (e) {
console.error(e);
return res.sendStatus(500);
}
};
module.exports.details = async (req, res) => {
try {
const { userAgent, headers } = getUserAgent(req.body.url);
const response = await fetch(req.body.url, {
headers: {
...headers,
'User-Agent': userAgent
}
});
if (!response.ok) {
return res.sendStatus(response.statusCode);
}
const html = await response.text();
const article = await extract(req.body.url, html);
if (article) {
return res.send(article);
}
return res.sendStatus(404);
} catch (e) {
console.error(e);
return res.sendStatus(500);
}
};


@@ -0,0 +1,11 @@
const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
const googleBotIp = '66.249.66.1';
module.exports.googleBot = {
userAgent: googleBotUserAgent,
ip: googleBotIp,
headers: {
'User-Agent': googleBotUserAgent,
'X-Forwarded-For': googleBotIp,
}
}


@@ -0,0 +1,21 @@
module.exports.disqusThread = data => {
const comments = data.response.posts.reduce((c, post) => ({
...c,
[post.id.toString()]: {
author: post.author.name,
authorLink: post.author.profileUrl,
date: post.createdAt,
text: post.raw_message,
score: post.points,
children: [],
id: post.id.toString(),
parent: (post.parent || '').toString(),
}
}), {});
Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
const comment = comments[id];
comments[comment.parent].children.push(comment);
});
const parents = Object.keys(comments).filter(id => comments[id].parent).map(id => comments[id]);
return parents;
};


@@ -0,0 +1,98 @@
module.exports.blockedRegexes = {
"adweek.com": /.+\.lightboxcdn\.com\/.+/,
"afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
"businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
"chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
"economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
"editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
"foreignpolicy.com": /.+\.tinypass\.com\/.+/,
"fortune.com": /.+\.tinypass\.com\/.+/,
"haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
"haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
"inquirer.com": /.+\.tinypass\.com\/.+/,
"lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
"lrb.co.uk": /.+\.tinypass\.com\/.+/,
"nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
"medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
"interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
"repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
"spectator.co.uk": /.+\.tinypass\.com\/.+/,
"spectator.com.au": /.+\.tinypass\.com\/.+/,
"telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
"thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
"thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
"thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
"thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
"wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
"barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
"elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
"sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
"latercera.com": /.+\.cxense\.com\/+/,
"lesechos.fr": /.+\.tinypass\.com\/.+/,
"washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
"thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
"technologyreview.com": /.+\.blueconic\.net\/.+/,
};
module.exports.useGoogleBotSites = [
"adelaidenow.com.au",
"barrons.com",
"couriermail.com.au",
"dailytelegraph.com.au",
"fd.nl",
"genomeweb.com",
"haaretz.co.il",
"haaretz.com",
"heraldsun.com.au",
"mexiconewsdaily.com",
"ntnews.com.au",
"quora.com",
"seekingalpha.com",
"telegraph.co.uk",
"theaustralian.com.au",
"themarker.com",
"themercury.com.au",
"thenational.scot",
"thetimes.co.uk",
"wsj.com",
"kansascity.com",
"republic.ru",
"nzz.ch",
"handelsblatt.com",
"washingtonpost.com",
"df.cl",
];
function matchDomain(domains, hostname) {
let matchedDomain = false;
if (typeof domains === "string") {
domains = [domains];
}
domains.some(
(domain) =>
(hostname === domain || hostname.endsWith("." + domain)) &&
(matchedDomain = domain)
);
return matchedDomain;
}
function matchUrlDomain(domains, url) {
return matchDomain(domains, urlHost(url));
}
function urlHost(url) {
if (url && url.startsWith("http")) {
try {
return new URL(url).hostname;
} catch (e) {
console.log(`url not valid: ${url} error: ${e}`);
}
}
return url;
}
module.exports.matchDomain = matchDomain;
module.exports.matchUrlDomain = matchUrlDomain;
module.exports.urlHost = urlHost;


@@ -0,0 +1,18 @@
const { googleBot } = require('./constants');
const { matchUrlDomain, useGoogleBotSites } = require("./sites");
module.exports.getUserAgent = (url) => {
const useGoogleBot = useGoogleBotSites.some(function (item) {
return typeof item === "string" && matchUrlDomain(item, url);
});
if (!useGoogleBot) {
return {};
}
return {
userAgent: googleBot.userAgent,
headers: {
"X-Forwarded-For": googleBot.ip
}
}
};

readerserver/yarn.lock

File diff suppressed because it is too large