Compare commits

...

6 Commits

Author SHA1 Message Date
Jason Schwarzenberger e0960d59f3 update readme. 4 years ago
Jason Schwarzenberger f5b38f5c6b remove readerserver, add declutter. 4 years ago
Jason Schwarzenberger c9da2a078b increase setTimeouts. 4 years ago
Jason Schwarzenberger 78654e0c63 reduce setTimeout. 4 years ago
Jason Schwarzenberger 3b885e4327 renaming things. 4 years ago
Jason Schwarzenberger 55d50a86d8 hmmm 4 years ago
23 changed files (lines changed):

1. .gitmodules (6)
2. README.md (20)
3. apiserver/feed.py (14)
4. apiserver/scrapers/headless.py (9)
5. apiserver/scrapers/simple.py (5)
6. apiserver/server.py (2)
7. apiserver/settings.py.example (6)
8. readerserver (1)
9. readerserver/.gitignore (92)
10. readerserver/main.js (30)
11. readerserver/package.json (15)
12. readerserver/scraper/browser/_browser.js (45)
13. readerserver/scraper/browser/_comments.js (34)
14. readerserver/scraper/browser/index.js (40)
15. readerserver/scraper/browser/scripts/bypass-paywalls-chrome (1)
16. readerserver/scraper/browser/scripts/cosmetic-filters.js (104)
17. readerserver/scraper/browser/scripts/fix-relative-links.js (14)
18. readerserver/scraper/simple.js (59)
19. readerserver/utils/constants.js (11)
20. readerserver/utils/disqus-thread.js (21)
21. readerserver/utils/sites.js (98)
22. readerserver/utils/user-agent.js (18)
23. readerserver/yarn.lock (1261)

.gitmodules (6 lines changed, vendored)

@@ -1,3 +1,3 @@
-[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
+[submodule "readerserver"]
+	path = readerserver
+	url = https://github.com/master5o1/declutter.git

README.md (20 lines changed)

@@ -20,7 +20,7 @@ $ sudo apt install yarn
 Clone this repo:
 ```text
-$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
+$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
 $ cd qotnews
 ```
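
Note: the new clone command pulls the reader server in as a git submodule (declutter). If the repo was already cloned without `--recurse-submodules`, the submodule can be fetched afterwards with the standard git command:

```text
$ git submodule update --init --recursive
```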
@@ -37,14 +37,14 @@ $ source env/bin/activate
 Configure Praw for your Reddit account (optional):
-* Go to https://www.reddit.com/prefs/apps
-* Click "Create app"
-* Name: whatever
-* App type: script
-* Description: blank
-* About URL: blank
-* Redirect URL: your GitHub profile
-* Submit, copy the client ID and client secret into `settings.py` below
+- Go to https://www.reddit.com/prefs/apps
+- Click "Create app"
+- Name: whatever
+- App type: script
+- Description: blank
+- About URL: blank
+- Redirect URL: your GitHub profile
+- Submit, copy the client ID and client secret into `settings.py` below
 ```text
 (env) $ vim settings.py.example
@@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
 [program:qotnewsreader]
 user=qotnews
 directory=/home/qotnews/qotnews/readerserver
-command=node main.js
+command=node index.js
 autostart=true
 autorestart=true
 stderr_logfile=/var/log/qotnewsreader.log

apiserver/feed.py (14 lines changed)

@@ -12,7 +12,7 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, browser, local
+from scrapers import outline, declutter, headless, simple

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']

@@ -63,14 +63,14 @@ def get_list():
 def get_article(url):
     scrapers = {
-        'declutter': declutter,
+        'headless': headless,
+        'simple': simple,
         'outline': outline,
-        'browser': browser,
-        'local': local,
+        'declutter': declutter,
     }
-    available = settings.SCRAPERS or ['local']
-    if 'local' not in available:
-        available += ['local']
+    available = settings.SCRAPERS or ['headless', 'simple']
+    if 'simple' not in available:
+        available += ['simple']
     for scraper in available:
         if scraper not in scrapers.keys():
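
The renamed scrapers keep the same fallback design: the configured list is always padded with `simple`, so an article fetch can never end up with zero scrapers. A minimal sketch of how the truncated loop presumably continues (the `get_html` interface and error handling here are assumptions, not part of this diff):

```python
def get_article(url):
    scrapers = {
        'headless': headless,
        'simple': simple,
        'outline': outline,
        'declutter': declutter,
    }
    available = settings.SCRAPERS or ['headless', 'simple']
    if 'simple' not in available:
        available += ['simple']

    for scraper in available:
        if scraper not in scrapers.keys():
            continue  # ignore unknown names in settings.SCRAPERS
        try:
            # assumed interface: every scraper module exposes get_html(url)
            html = scrapers[scraper].get_html(url)
            if html:
                return html
        except KeyboardInterrupt:
            raise
        except BaseException:
            pass  # fall through and try the next scraper
    return ''
```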

apiserver/scrapers/headless.py (9 lines changed)

@@ -3,14 +3,15 @@ logging.basicConfig(
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     level=logging.DEBUG)

 import requests
+from settings import READER_PORT

-READ_API = 'http://127.0.0.1:33843/browser/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/browser/comments'
+READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
+READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
 TIMEOUT = 60

 def get_html(url):
-    logging.info(f"Reader Scraper: {url}")
+    logging.info(f"Headless Browser Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''

@@ -25,7 +26,7 @@ def get_details(url):
     except KeyboardInterrupt:
         raise
     except BaseException as e:
-        logging.error('Problem Scraping article: {}'.format(str(e)))
+        logging.error('Problem scraping article: {}'.format(str(e)))
         return None

 def get_comments(url):
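
Both scraper modules are thin HTTP clients for the reader server. A minimal sketch of the request they make, assuming the server takes a form-encoded `url` field (consistent with the `express.urlencoded` middleware visible in the removed `main.js` below) and answers with a Readability-style JSON object:

```python
import logging
import requests

from settings import READER_PORT

READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
TIMEOUT = 60

def get_details(url):
    try:
        # the reader server fetches and parses the page,
        # returning a dict with title, content, etc.
        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
        r.raise_for_status()
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem scraping article: {}'.format(str(e)))
        return None
```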

apiserver/scrapers/simple.py (5 lines changed)

@@ -3,12 +3,13 @@ logging.basicConfig(
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
     level=logging.DEBUG)

 import requests
+from settings import READER_PORT

-READ_API = 'http://127.0.0.1:33843/details'
+READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
 TIMEOUT = 20

 def get_html(url):
-    logging.info(f"Local Scraper: {url}")
+    logging.info(f"Simple Scraper: {url}")
     details = get_details(url)
     if not details:
         return ''

apiserver/server.py (2 lines changed)

@@ -142,7 +142,7 @@ def static_story(sid):
         url=url,
         description=description)

-http_server = WSGIServer(('', 33842), flask_app)
+http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)

 def _add_new_refs():
     for ref, source, urlref in feed.get_list():

apiserver/settings.py.example (6 lines changed)

@@ -4,6 +4,10 @@
 HOSTNAME = 'news.t0.vc'
 MAX_STORY_AGE = 3*24*60*60

+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
+API_PORT = 33842
+READER_PORT = 3000
+
 # Feed Lengths
 # Number of top items from each site to pull
 # set to 0 to disable that site

@@ -51,8 +55,6 @@ CATEGORY = {}
 #   ],
 # }

-SCRAPERS = ['browser', 'declutter', 'outline', 'local']
-
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''
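
Taken together, the move makes both ports configurable from one place, next to the scraper order. A minimal `settings.py` for the new layout might look like this (a sketch assembled from the defaults shown in this diff):

```python
# settings.py (minimal sketch; values are the defaults from settings.py.example)
HOSTNAME = 'news.t0.vc'
MAX_STORY_AGE = 3*24*60*60

SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
API_PORT = 33842     # port the Flask/WSGI API server binds to
READER_PORT = 3000   # port the declutter reader server listens on

# Reddit account info
# leave blank if not using Reddit
REDDIT_CLIENT_ID = ''
```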

readerserver (submodule, 1 line added)

@@ -0,0 +1 @@
+Subproject commit 9c0336b0af4be942991a7a3771c09ec08938bde8

readerserver/.gitignore (92 lines removed)

@@ -1,92 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# TypeScript v1 declaration files
typings/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.env.test
# parcel-bundler cache (https://parceljs.org/)
.cache
# next.js build output
.next
# nuxt.js build output
.nuxt
# vuepress build output
.vuepress/dist
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# Editor
*.swp
*.swo

readerserver/main.js (30 lines removed)

@@ -1,30 +0,0 @@
const port = 33843;
const express = require('express');
const app = express();
const simple = require('./scraper/simple');
const browser = require('./scraper/browser');

app.use(express.urlencoded({ extended: true }));

app.get('/', (req, res) => {
  const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
  const html = routes.map(route => `
  <form method="POST" action="${route}" accept-charset="UTF-8">
    <fieldset>
      <legend>route: POST ${route}</legend>
      <input name="url">
      <button type="submit">SUBMIT</button>
    </fieldset>
  </form>`).join('<hr />');
  res.send(html);
});

app.post('/', simple.scrape);
app.post('/details', simple.details);
app.post('/browser', browser.scrape);
app.post('/browser/details', browser.details);
app.post('/browser/comments', browser.comments);

app.listen(port, () => {
  console.log(`Example app listening on port ${port}!`);
});
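
For reference, the removed server above listened on port 33843 and exposed `/details` plus `/browser/...` routes; its declutter replacement serves the same roles under `/simple/` and `/headless/` prefixes (as the apiserver changes above show). A quick smoke test against the new endpoints, assuming declutter is running locally on the default port:

```python
import requests

READER = 'http://127.0.0.1:3000'  # READER_PORT default from settings.py.example
story = {'url': 'https://example.com/story'}  # hypothetical article URL

simple_details = requests.post(READER + '/simple/details', data=story).json()
headless_details = requests.post(READER + '/headless/details', data=story).json()
headless_comments = requests.post(READER + '/headless/comments', data=story).json()

print(simple_details.get('title'))
```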

readerserver/package.json (15 lines removed)

@@ -1,15 +0,0 @@
{
  "name": "readerserver",
  "version": "1.0.0",
  "main": "main.js",
  "license": "MIT",
  "dependencies": {
    "@mozilla/readability": "^0.3.0",
    "dompurify": "^1.0.11",
    "express": "^4.17.1",
    "jsdom": "^15.1.1",
    "node-fetch": "^2.6.1",
    "playwright": "^1.5.2",
    "request": "^2.88.0"
  }
}

readerserver/scraper/browser/_browser.js (45 lines removed)

@@ -1,45 +0,0 @@
const { firefox } = require("playwright");
const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const { getUserAgent } = require('../../utils/user-agent');
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");

module.exports.getDetails = async (url) => {
  const { userAgent, headers } = getUserAgent(url);
  const browser = await firefox.launch({ args: [], headless: true });
  const tab = await browser.newPage({
    extraHTTPHeaders: headers,
    userAgent,
    viewport: { width: 2000, height: 10000 },
  });

  try {
    await tab.route(/.*/, (route) => {
      const routeUrl = route.request().url();
      const blockedDomains = Object.keys(blockedRegexes);
      const domain = matchUrlDomain(blockedDomains, routeUrl);
      if (domain && routeUrl.match(blockedRegexes[domain])) {
        return route.abort();
      }
      return route.continue();
    });
    await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
    await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
    await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
    await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
    await tab.waitForTimeout(2000);
    const body = await tab.content();
    const doc = new JSDOM(body, { url });
    const reader = new Readability(doc.window.document);
    const article = reader.parse();
    return article;
  } catch (e) {
    throw e;
  } finally {
    await tab.close();
    await browser.close();
  }
};
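
The interesting part of `getDetails` is the per-request filter: every network request the page makes is checked against `blockedRegexes`, and matching paywall or metering scripts are aborted before they load. A rough Python equivalent of that blocking pattern, using Playwright's sync API (the `BLOCKED_REGEXES` dict here is an illustrative stand-in for the table in `sites.js`):

```python
import re
from playwright.sync_api import sync_playwright

# illustrative stand-in for module.exports.blockedRegexes in sites.js
BLOCKED_REGEXES = {
    'wsj.com': re.compile(r'cdn\.ampproject\.org/v\d/amp-access-.+\.js'),
}

def fetch_rendered_html(url):
    with sync_playwright() as p:
        browser = p.firefox.launch(headless=True)
        page = browser.new_page(viewport={'width': 2000, 'height': 10000})

        def filter_requests(route):
            # abort requests that match a blocklist regex for their domain
            for domain, regex in BLOCKED_REGEXES.items():
                if domain in route.request.url and regex.search(route.request.url):
                    return route.abort()
            return route.continue_()

        page.route('**/*', filter_requests)
        page.goto(url, timeout=60000, wait_until='domcontentloaded')
        page.wait_for_timeout(2000)  # let init scripts finish cleaning the DOM
        html = page.content()
        browser.close()
        return html
```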

readerserver/scraper/browser/_comments.js (34 lines removed)

@@ -1,34 +0,0 @@
const { JSDOM } = require("jsdom");
const { firefox } = require("playwright");
const { getUserAgent } = require('../../utils/user-agent');
const { disqusThread } = require('../../utils/disqus-thread');

const DISQUS_EMBED = 'https://disqus.com/embed/comments/';

module.exports.getComments = async (url) => {
  const { userAgent, headers } = getUserAgent(url);
  const browser = await firefox.launch({ args: [], headless: true });
  const tab = await browser.newPage({
    extraHTTPHeaders: headers,
    userAgent,
    viewport: { width: 2000, height: 10000 },
  });

  try {
    await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
    const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
    const text = await response.text();
    const dom = new JSDOM(text, response.url());
    const script = dom.window.document.querySelector('#disqus-threadData')
    const data = JSON.parse(script.innerHTML);
    return disqusThread(data);
  } catch (e) {
    throw e;
  } finally {
    await tab.close();
    await browser.close();
  }
};

readerserver/scraper/browser/index.js (40 lines removed)

@@ -1,40 +0,0 @@
const { getDetails } = require('./_browser');
const { getComments } = require('./_comments');

module.exports.scrape = async (req, res) => {
  try {
    const article = await getDetails(req.body.url);
    if (!article || !article.content) {
      throw new Error('failed to get details.');
    }
    return res.send(article.content);
  } catch (e) {
    return res.sendStatus(500);
  }
};

module.exports.details = async (req, res) => {
  try {
    const article = await getDetails(req.body.url);
    if (!article) {
      throw new Error('failed to get details.');
    }
    return res.send(article);
  } catch (e) {
    console.log(e);
    return res.sendStatus(500);
  }
};

module.exports.comments = async (req, res) => {
  try {
    const comments = await getComments(req.body.url);
    if (!comments) {
      throw new Error('failed to get comments.');
    }
    return res.send(comments);
  } catch (e) {
    console.log(e);
    return res.sendStatus(500);
  }
};

readerserver/scraper/browser/scripts/bypass-paywalls-chrome (submodule, 1 line removed)

@@ -1 +0,0 @@
-Subproject commit 44f3d1b114400d73aba2bf2551a34f9f142eda76

readerserver/scraper/browser/scripts/cosmetic-filters.js (104 lines removed)

@@ -1,104 +0,0 @@
(function () {
  removeHiddenElements();

  if (matchDomain("stuff.co.nz")) {
    removeSelectors([
      ".support-brief-container",
      '[class*="donation-in-"]',
      ".sics-component__sharebar",
      ".breaking-news-pointer",
      ".bigbyline-container",
      [
        ".sics-component__html-injector.sics-component__story__paragraph",
        "READ MORE:",
      ],
    ]);
  }
  if (matchDomain("nzherald.co.nz")) {
    removeSelectors([
      "[href$='#commenting-widget']",
      ".related-articles",
      ".article__print-button",
      ".share-bar",
      ".c-suggest-links.read-more-links",
      ".website-of-year",
      ".meta-data",
      ".article__kicker",
      ".author__image",
    ]);
  }
  if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
    removeSelectors([".c-advert-app", ".c-sub-nav"]);
  }
  if (matchDomain(["newsroom.co.nz"])) {
    removeSelectors([".article_content__section", ".bio"]);
  }
  if (matchDomain(["newshub.co.nz"])) {
    removeSelectors([
      ".c-ArticleHeading-authorPicture",
      ".relatedarticles",
      ".ArticleAttribution",
      '.GlobalFooter'
    ]);
  }
  if (matchDomain(["tvnz.co.nz"])) {
    removeSelectors([".signup-container container"]);
  }
  if (matchDomain(["thespinoff.co.nz"])) {
    removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
  }

  function matchDomain(domains) {
    const hostname = window.location.hostname;
    if (typeof domains === "string") {
      domains = [domains];
    }
    return domains.some(
      (domain) => hostname === domain || hostname.endsWith("." + domain)
    );
  }

  function removeDOMElement(...elements) {
    for (const element of elements) {
      if (element) {
        element.remove();
      }
    }
  }

  function pageContains(selector, text) {
    const elements = document.querySelectorAll(selector);
    return Array.prototype.filter.call(elements, function (element) {
      return RegExp(text).test(element.textContent);
    });
  }

  function removeHiddenElements() {
    window.setTimeout(function () {
      const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
      Array.from(document.querySelectorAll(selector))
        .filter((element) => {
          const computed = getComputedStyle(element);
          const displayNone = computed["display"] === "none";
          const visibilityHidden = computed["visibility"] === "hidden";
          return displayNone || visibilityHidden;
        })
        .forEach((element) => element && element.remove());
    }, 1000);
  }

  function removeSelectors(selectors) {
    window.setTimeout(function () {
      const elements = selectors.flatMap((s) => {
        if (typeof s === "string") {
          return Array.from(document.querySelectorAll(s));
        }
        if (s && s.constructor.name === "Array") {
          return pageContains(...s);
        }
        return undefined;
      });
      removeDOMElement(...elements);
    }, 1000);
  }
})();

readerserver/scraper/browser/scripts/fix-relative-links.js (14 lines removed)

@@ -1,14 +0,0 @@
(function () {
  const { host, protocol } = window.location;
  const url = `${protocol}//${host}`;
  [
    ['[src^="/"]', 'src'],
    ['[href^="/"]', 'href']
  ].forEach(([selector, attribute]) => {
    Array.from(document.querySelectorAll(selector))
      .filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
      .forEach((e) => {
        e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
      });
  });
})();

readerserver/scraper/simple.js (59 lines removed)

@@ -1,59 +0,0 @@
const fetch = require('node-fetch');
const { JSDOM } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const { getUserAgent } = require('../utils/user-agent');

const extract = (url, body) => {
  const doc = new JSDOM(body, { url: url });
  const reader = new Readability(doc.window.document);
  return reader.parse();
};

module.exports.scrape = async (req, res) => {
  try {
    const { userAgent, headers } = getUserAgent(req.body.url);
    const response = await fetch(req.body.url, {
      headers: {
        ...headers,
        'User-Agent': userAgent
      }
    });
    if (!response.ok) {
      return res.sendStatus(response.statusCode);
    }
    const html = await response.text();
    const article = await extract(req.body.url, html);
    if (article && article.content) {
      return res.send(article.content);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};

module.exports.details = async (req, res) => {
  try {
    const { userAgent, headers } = getUserAgent(req.body.url);
    const response = await fetch(req.body.url, {
      headers: {
        ...headers,
        'User-Agent': userAgent
      }
    });
    if (!response.ok) {
      return res.sendStatus(response.statusCode);
    }
    const html = await response.text();
    const article = await extract(req.body.url, html);
    if (article) {
      return res.send(article);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};

readerserver/utils/constants.js (11 lines removed)

@@ -1,11 +0,0 @@
const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
const googleBotIp = '66.249.66.1';

module.exports.googleBot = {
  userAgent: googleBotUserAgent,
  ip: googleBotIp,
  headers: {
    'User-Agent': googleBotUserAgent,
    'X-Forwarded-For': googleBotIp,
  }
}

readerserver/utils/disqus-thread.js (21 lines removed)

@@ -1,21 +0,0 @@
module.exports.disqusThread = data => {
  const comments = data.response.posts.reduce((c, post) => ({
    ...c,
    [post.id.toString()]: {
      author: post.author.name,
      authorLink: post.author.profileUrl,
      date: post.createdAt,
      text: post.raw_message,
      score: post.points,
      children: [],
      id: post.id.toString(),
      parent: (post.parent || '').toString(),
    }
  }), {});

  Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
    const comment = comments[id];
    comments[comment.parent].children.push(comment);
  });

  const parents = Object.keys(comments).filter(id => !comments[id].parent).map(id => comments[id]);
  return parents;
};
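
`disqusThread` indexes every post by id, hangs each reply off its parent's `children` array, and returns the top-level comments as thread roots. The same threading pass in Python, for clarity (field names follow the Disqus payload used above):

```python
def disqus_thread(data):
    comments = {}
    for post in data['response']['posts']:
        cid = str(post['id'])
        comments[cid] = {
            'author': post['author']['name'],
            'authorLink': post['author']['profileUrl'],
            'date': post['createdAt'],
            'text': post['raw_message'],
            'score': post['points'],
            'children': [],
            'id': cid,
            'parent': str(post['parent']) if post.get('parent') else '',
        }

    # link every reply into its parent's children list
    for comment in comments.values():
        if comment['parent']:
            comments[comment['parent']]['children'].append(comment)

    # the thread roots are the comments with no parent
    return [c for c in comments.values() if not c['parent']]
```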

readerserver/utils/sites.js (98 lines removed)

@@ -1,98 +0,0 @@
module.exports.blockedRegexes = {
  "adweek.com": /.+\.lightboxcdn\.com\/.+/,
  "afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
  "businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
  "chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
  "economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
  "editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
  "foreignpolicy.com": /.+\.tinypass\.com\/.+/,
  "fortune.com": /.+\.tinypass\.com\/.+/,
  "haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
  "haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
  "inquirer.com": /.+\.tinypass\.com\/.+/,
  "lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
  "lrb.co.uk": /.+\.tinypass\.com\/.+/,
  "nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
  "medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
  "interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
  "repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
  "spectator.co.uk": /.+\.tinypass\.com\/.+/,
  "spectator.com.au": /.+\.tinypass\.com\/.+/,
  "telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
  "thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
  "thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
  "thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
  "thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
  "wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
  "barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
  "sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
  "latercera.com": /.+\.cxense\.com\/+/,
  "lesechos.fr": /.+\.tinypass\.com\/.+/,
  "washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
  "thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
  "technologyreview.com": /.+\.blueconic\.net\/.+/,
};

module.exports.useGoogleBotSites = [
  "adelaidenow.com.au",
  "barrons.com",
  "couriermail.com.au",
  "dailytelegraph.com.au",
  "fd.nl",
  "genomeweb.com",
  "haaretz.co.il",
  "haaretz.com",
  "heraldsun.com.au",
  "mexiconewsdaily.com",
  "ntnews.com.au",
  "quora.com",
  "seekingalpha.com",
  "telegraph.co.uk",
  "theaustralian.com.au",
  "themarker.com",
  "themercury.com.au",
  "thenational.scot",
  "thetimes.co.uk",
  "wsj.com",
  "kansascity.com",
  "republic.ru",
  "nzz.ch",
  "handelsblatt.com",
  "washingtonpost.com",
  "df.cl",
];

function matchDomain(domains, hostname) {
  let matchedDomain = false;
  if (typeof domains === "string") {
    domains = [domains];
  }
  domains.some(
    (domain) =>
      (hostname === domain || hostname.endsWith("." + domain)) &&
      (matchedDomain = domain)
  );
  return matchedDomain;
}

function matchUrlDomain(domains, url) {
  return matchDomain(domains, urlHost(url));
}

function urlHost(url) {
  if (url && url.startsWith("http")) {
    try {
      return new URL(url).hostname;
    } catch (e) {
      console.log(`url not valid: ${url} error: ${e}`);
    }
  }
  return url;
}

module.exports.matchDomain = matchDomain;
module.exports.matchUrlDomain = matchUrlDomain;
module.exports.urlHost = urlHost;
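
`matchDomain` counts a hostname as a match if it equals the domain or is any subdomain of it, and `urlHost` falls back to returning the input unchanged when it is not an absolute URL. The same checks in Python, for reference:

```python
from urllib.parse import urlparse

def match_domain(domains, hostname):
    if isinstance(domains, str):
        domains = [domains]
    for domain in domains:
        # exact host, or any subdomain (e.g. www.wsj.com matches wsj.com)
        if hostname == domain or hostname.endswith('.' + domain):
            return domain
    return False

def url_host(url):
    if url and url.startswith('http'):
        return urlparse(url).hostname or url
    return url

def match_url_domain(domains, url):
    return match_domain(domains, url_host(url))
```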

readerserver/utils/user-agent.js (18 lines removed)

@@ -1,18 +0,0 @@
const { googleBot } = require('./constants');
const { matchUrlDomain, useGoogleBotSites } = require("./sites");

module.exports.getUserAgent = (url) => {
  const useGoogleBot = useGoogleBotSites.some(function (item) {
    return typeof item === "string" && matchUrlDomain(item, url);
  });

  if (!useGoogleBot) {
    return {};
  }

  return {
    userAgent: googleBot.userAgent,
    headers: {
      "X-Forwarded-For": googleBot.ip
    }
  }
};

readerserver/yarn.lock (1261 lines changed)

File diff suppressed because it is too large.