forked from tanner/qotnews
Compare commits
No commits in common. "e0960d59f33363e1ae60250161d35ceefe1700fa" and "55e7f6bb1400d1d87169d670e427986f30b20484" have entirely different histories.
e0960d59f3
...
55e7f6bb14
6
.gitmodules
vendored
6
.gitmodules
vendored
|
@ -1,3 +1,3 @@
|
||||||
[submodule "readerserver"]
|
[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
|
||||||
path = readerserver
|
path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
|
||||||
url = https://github.com/master5o1/declutter.git
|
url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
|
||||||
|
|
20
README.md
20
README.md
|
@ -20,7 +20,7 @@ $ sudo apt install yarn
|
||||||
Clone this repo:
|
Clone this repo:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
$ git clone --recurse-submodules https://git.1j.nz/jason/qotnews.git
|
$ git clone https://gogs.tannercollin.com/tanner/qotnews.git
|
||||||
$ cd qotnews
|
$ cd qotnews
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -37,14 +37,14 @@ $ source env/bin/activate
|
||||||
|
|
||||||
Configure Praw for your Reddit account (optional):
|
Configure Praw for your Reddit account (optional):
|
||||||
|
|
||||||
- Go to https://www.reddit.com/prefs/apps
|
* Go to https://www.reddit.com/prefs/apps
|
||||||
- Click "Create app"
|
* Click "Create app"
|
||||||
- Name: whatever
|
* Name: whatever
|
||||||
- App type: script
|
* App type: script
|
||||||
- Description: blank
|
* Description: blank
|
||||||
- About URL: blank
|
* About URL: blank
|
||||||
- Redirect URL: your GitHub profile
|
* Redirect URL: your GitHub profile
|
||||||
- Submit, copy the client ID and client secret into `settings.py` below
|
* Submit, copy the client ID and client secret into `settings.py` below
|
||||||
|
|
||||||
```text
|
```text
|
||||||
(env) $ vim settings.py.example
|
(env) $ vim settings.py.example
|
||||||
|
@ -109,7 +109,7 @@ stdout_logfile_maxbytes=1MB
|
||||||
[program:qotnewsreader]
|
[program:qotnewsreader]
|
||||||
user=qotnews
|
user=qotnews
|
||||||
directory=/home/qotnews/qotnews/readerserver
|
directory=/home/qotnews/qotnews/readerserver
|
||||||
command=node index.js
|
command=node main.js
|
||||||
autostart=true
|
autostart=true
|
||||||
autorestart=true
|
autorestart=true
|
||||||
stderr_logfile=/var/log/qotnewsreader.log
|
stderr_logfile=/var/log/qotnewsreader.log
|
||||||
|
|
|
@ -12,7 +12,7 @@ import settings
|
||||||
from feeds import hackernews, reddit, tildes, substack, manual
|
from feeds import hackernews, reddit, tildes, substack, manual
|
||||||
from feeds.sitemap import Sitemap
|
from feeds.sitemap import Sitemap
|
||||||
from feeds.category import Category
|
from feeds.category import Category
|
||||||
from scrapers import outline, declutter, headless, simple
|
from scrapers import outline, declutter, browser, local
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||||
|
|
||||||
|
@ -63,14 +63,14 @@ def get_list():
|
||||||
|
|
||||||
def get_article(url):
|
def get_article(url):
|
||||||
scrapers = {
|
scrapers = {
|
||||||
'headless': headless,
|
|
||||||
'simple': simple,
|
|
||||||
'outline': outline,
|
|
||||||
'declutter': declutter,
|
'declutter': declutter,
|
||||||
|
'outline': outline,
|
||||||
|
'browser': browser,
|
||||||
|
'local': local,
|
||||||
}
|
}
|
||||||
available = settings.SCRAPERS or ['headless', 'simple']
|
available = settings.SCRAPERS or ['local']
|
||||||
if 'simple' not in available:
|
if 'local' not in available:
|
||||||
available += ['simple']
|
available += ['local']
|
||||||
|
|
||||||
for scraper in available:
|
for scraper in available:
|
||||||
if scraper not in scrapers.keys():
|
if scraper not in scrapers.keys():
|
||||||
|
|
|
@ -3,15 +3,14 @@ logging.basicConfig(
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
import requests
|
import requests
|
||||||
from settings import READER_PORT
|
|
||||||
|
|
||||||
READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
|
READ_API = 'http://127.0.0.1:33843/browser/details'
|
||||||
READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
|
# Comments endpoint of the reader server. The path must match the
# `/browser/comments` route registered in readerserver/main.js
# (the previous value ended in `commentd`, which is not a registered route).
READ_COMMENT__API = 'http://127.0.0.1:33843/browser/comments'
|
||||||
TIMEOUT = 60
|
TIMEOUT = 60
|
||||||
|
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
logging.info(f"Headless Browser Scraper: {url}")
|
logging.info(f"Reader Scraper: {url}")
|
||||||
details = get_details(url)
|
details = get_details(url)
|
||||||
if not details:
|
if not details:
|
||||||
return ''
|
return ''
|
||||||
|
@ -26,7 +25,7 @@ def get_details(url):
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
raise
|
raise
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
logging.error('Problem scraping article: {}'.format(str(e)))
|
logging.error('Problem Scraping article: {}'.format(str(e)))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_comments(url):
|
def get_comments(url):
|
|
@ -3,13 +3,12 @@ logging.basicConfig(
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
import requests
|
import requests
|
||||||
from settings import READER_PORT
|
|
||||||
|
|
||||||
READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
|
READ_API = 'http://127.0.0.1:33843/details'
|
||||||
TIMEOUT = 20
|
TIMEOUT = 20
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
logging.info(f"Simple Scraper: {url}")
|
logging.info(f"Local Scraper: {url}")
|
||||||
details = get_details(url)
|
details = get_details(url)
|
||||||
if not details:
|
if not details:
|
||||||
return ''
|
return ''
|
|
@ -142,7 +142,7 @@ def static_story(sid):
|
||||||
url=url,
|
url=url,
|
||||||
description=description)
|
description=description)
|
||||||
|
|
||||||
http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)
|
http_server = WSGIServer(('', 33842), flask_app)
|
||||||
|
|
||||||
def _add_new_refs():
|
def _add_new_refs():
|
||||||
for ref, source, urlref in feed.get_list():
|
for ref, source, urlref in feed.get_list():
|
||||||
|
|
|
@ -4,10 +4,6 @@
|
||||||
HOSTNAME = 'news.t0.vc'
|
HOSTNAME = 'news.t0.vc'
|
||||||
MAX_STORY_AGE = 3*24*60*60
|
MAX_STORY_AGE = 3*24*60*60
|
||||||
|
|
||||||
SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
|
|
||||||
API_PORT = 33842
|
|
||||||
READER_PORT = 3000
|
|
||||||
|
|
||||||
# Feed Lengths
|
# Feed Lengths
|
||||||
# Number of top items from each site to pull
|
# Number of top items from each site to pull
|
||||||
# set to 0 to disable that site
|
# set to 0 to disable that site
|
||||||
|
@ -55,6 +51,8 @@ CATEGORY = {}
|
||||||
# ],
|
# ],
|
||||||
# }
|
# }
|
||||||
|
|
||||||
|
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
|
||||||
|
|
||||||
# Reddit account info
|
# Reddit account info
|
||||||
# leave blank if not using Reddit
|
# leave blank if not using Reddit
|
||||||
REDDIT_CLIENT_ID = ''
|
REDDIT_CLIENT_ID = ''
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
Subproject commit 9c0336b0af4be942991a7a3771c09ec08938bde8
|
|
92
readerserver/.gitignore
vendored
Normal file
92
readerserver/.gitignore
vendored
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
# Logs
|
||||||
|
logs
|
||||||
|
*.log
|
||||||
|
npm-debug.log*
|
||||||
|
yarn-debug.log*
|
||||||
|
yarn-error.log*
|
||||||
|
lerna-debug.log*
|
||||||
|
|
||||||
|
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||||
|
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
|
||||||
|
|
||||||
|
# Runtime data
|
||||||
|
pids
|
||||||
|
*.pid
|
||||||
|
*.seed
|
||||||
|
*.pid.lock
|
||||||
|
|
||||||
|
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||||
|
lib-cov
|
||||||
|
|
||||||
|
# Coverage directory used by tools like istanbul
|
||||||
|
coverage
|
||||||
|
*.lcov
|
||||||
|
|
||||||
|
# nyc test coverage
|
||||||
|
.nyc_output
|
||||||
|
|
||||||
|
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||||
|
.grunt
|
||||||
|
|
||||||
|
# Bower dependency directory (https://bower.io/)
|
||||||
|
bower_components
|
||||||
|
|
||||||
|
# node-waf configuration
|
||||||
|
.lock-wscript
|
||||||
|
|
||||||
|
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||||
|
build/Release
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
node_modules/
|
||||||
|
jspm_packages/
|
||||||
|
|
||||||
|
# TypeScript v1 declaration files
|
||||||
|
typings/
|
||||||
|
|
||||||
|
# TypeScript cache
|
||||||
|
*.tsbuildinfo
|
||||||
|
|
||||||
|
# Optional npm cache directory
|
||||||
|
.npm
|
||||||
|
|
||||||
|
# Optional eslint cache
|
||||||
|
.eslintcache
|
||||||
|
|
||||||
|
# Optional REPL history
|
||||||
|
.node_repl_history
|
||||||
|
|
||||||
|
# Output of 'npm pack'
|
||||||
|
*.tgz
|
||||||
|
|
||||||
|
# Yarn Integrity file
|
||||||
|
.yarn-integrity
|
||||||
|
|
||||||
|
# dotenv environment variables file
|
||||||
|
.env
|
||||||
|
.env.test
|
||||||
|
|
||||||
|
# parcel-bundler cache (https://parceljs.org/)
|
||||||
|
.cache
|
||||||
|
|
||||||
|
# next.js build output
|
||||||
|
.next
|
||||||
|
|
||||||
|
# nuxt.js build output
|
||||||
|
.nuxt
|
||||||
|
|
||||||
|
# vuepress build output
|
||||||
|
.vuepress/dist
|
||||||
|
|
||||||
|
# Serverless directories
|
||||||
|
.serverless/
|
||||||
|
|
||||||
|
# FuseBox cache
|
||||||
|
.fusebox/
|
||||||
|
|
||||||
|
# DynamoDB Local files
|
||||||
|
.dynamodb/
|
||||||
|
|
||||||
|
# Editor
|
||||||
|
*.swp
|
||||||
|
*.swo
|
30
readerserver/main.js
Normal file
30
readerserver/main.js
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
// Reader server entry point: exposes the scrapers as form-encoded POST endpoints.
const port = 33843;
const express = require('express');
const app = express();
const simple = require('./scraper/simple');
const browser = require('./scraper/browser');

// Handlers read the target address from a urlencoded `url` body field.
app.use(express.urlencoded({ extended: true }));

// GET / renders one small HTML form per POST route so each can be tried by hand.
app.get('/', (req, res) => {
  const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
  const forms = [];
  for (const route of routes) {
    forms.push(`
<form method="POST" action="${route}" accept-charset="UTF-8">
<fieldset>
<legend>route: POST ${route}</legend>
<input name="url">
<button type="submit">SUBMIT</button>
</fieldset>
</form>`);
  }
  res.send(forms.join('<hr />'));
});

// POST routes, registered in the same order as before.
const postRoutes = [
  ['/', simple.scrape],
  ['/details', simple.details],
  ['/browser', browser.scrape],
  ['/browser/details', browser.details],
  ['/browser/comments', browser.comments],
];
postRoutes.forEach(([route, handler]) => {
  app.post(route, handler);
});

app.listen(port, () => {
  console.log(`Example app listening on port ${port}!`);
});
|
15
readerserver/package.json
Normal file
15
readerserver/package.json
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
{
|
||||||
|
"name": "readerserver",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"main": "main.js",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"@mozilla/readability": "^0.3.0",
|
||||||
|
"dompurify": "^1.0.11",
|
||||||
|
"express": "^4.17.1",
|
||||||
|
"jsdom": "^15.1.1",
|
||||||
|
"node-fetch": "^2.6.1",
|
||||||
|
"playwright": "^1.5.2",
|
||||||
|
"request": "^2.88.0"
|
||||||
|
}
|
||||||
|
}
|
45
readerserver/scraper/browser/_browser.js
Normal file
45
readerserver/scraper/browser/_browser.js
Normal file
|
@ -0,0 +1,45 @@
|
||||||
|
const { firefox } = require("playwright");
|
||||||
|
const { JSDOM } = require("jsdom");
|
||||||
|
const { Readability } = require("@mozilla/readability");
|
||||||
|
|
||||||
|
const { getUserAgent } = require('../../utils/user-agent');
|
||||||
|
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
|
||||||
|
|
||||||
|
/**
 * Loads `url` in headless Firefox and runs Readability over the rendered page.
 *
 * Requests matching the paywall/tracking patterns in `blockedRegexes` are
 * aborted, and three init scripts (paywall bypass, cosmetic filters,
 * relative-link fixer) are injected before navigation.
 *
 * @param {string} url - address of the article to load.
 * @returns {Promise<object|null>} whatever `Readability#parse()` returns.
 * @throws rethrows any launch, navigation or parsing error to the caller.
 */
module.exports.getDetails = async (url) => {
  // Local require (stdlib): resolve the init scripts relative to this file
  // so the server does not have to be started from the readerserver directory.
  const path = require("path");
  const scriptsDir = path.join(__dirname, "scripts");

  const { userAgent, headers } = getUserAgent(url);

  // blockedRegexes is keyed by the *site being read*, so look the page's own
  // domain up once; most patterns target third-party hosts (e.g. tinypass.com)
  // that would never match when only the request's host is compared.
  const blockedDomains = Object.keys(blockedRegexes);
  const pageDomain = matchUrlDomain(blockedDomains, url);
  const pagePattern = pageDomain ? blockedRegexes[pageDomain] : null;

  const browser = await firefox.launch({ args: [], headless: true });
  try {
    const tab = await browser.newPage({
      extraHTTPHeaders: headers,
      userAgent,
      viewport: { width: 2000, height: 10000 },
    });

    await tab.route(/.*/, (route) => {
      const routeUrl = route.request().url();
      if (pagePattern && routeUrl.match(pagePattern)) {
        return route.abort();
      }
      // Original rule, kept for compatibility: the request's own host is a listed site.
      const domain = matchUrlDomain(blockedDomains, routeUrl);
      if (domain && routeUrl.match(blockedRegexes[domain])) {
        return route.abort();
      }
      return route.continue();
    });

    await tab.addInitScript({
      path: path.join(scriptsDir, "bypass-paywalls-chrome", "src", "js", "contentScript.js"),
    });
    await tab.addInitScript({ path: path.join(scriptsDir, "cosmetic-filters.js") });
    await tab.addInitScript({ path: path.join(scriptsDir, "fix-relative-links.js") });

    await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
    // Fixed settle time after DOMContentLoaded; presumably lets the injected
    // scripts' delayed clean-up run before the snapshot is taken.
    await tab.waitForTimeout(2000);

    const body = await tab.content();
    const doc = new JSDOM(body, { url });
    return new Readability(doc.window.document).parse();
  } finally {
    // The browser is closed on every path, including a failed newPage();
    // closing it also closes the tab.
    await browser.close();
  }
};
|
34
readerserver/scraper/browser/_comments.js
Normal file
34
readerserver/scraper/browser/_comments.js
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
const { JSDOM } = require("jsdom");
|
||||||
|
const { firefox } = require("playwright");
|
||||||
|
const { getUserAgent } = require('../../utils/user-agent');
|
||||||
|
const { disqusThread } = require('../../utils/disqus-thread');
|
||||||
|
|
||||||
|
const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
|
||||||
|
|
||||||
|
/**
 * Loads `url` in headless Firefox, captures the Disqus embed response the page
 * triggers, and converts the thread data embedded in it into a comment tree.
 *
 * @param {string} url - address of the page whose Disqus comments are wanted.
 * @returns {Promise<Array>} the result of `disqusThread()` for the captured thread.
 * @throws if navigation fails, no Disqus embed response is seen in time, or the
 *         embed has no `#disqus-threadData` element.
 */
module.exports.getComments = async (url) => {
  const { userAgent, headers } = getUserAgent(url);

  const browser = await firefox.launch({ args: [], headless: true });
  try {
    const tab = await browser.newPage({
      extraHTTPHeaders: headers,
      userAgent,
      viewport: { width: 2000, height: 10000 },
    });

    // Start listening before navigating so an embed response that arrives while
    // goto() is still pending is not missed. The 90 s budget covers the previous
    // sequence of up to 60 s navigation followed by the default 30 s wait.
    const [response] = await Promise.all([
      tab.waitForResponse((res) => res.url().includes(DISQUS_EMBED), { timeout: 90000 }),
      tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" }),
    ]);

    const text = await response.text();
    // JSDOM takes an options object; a bare string as second argument is ignored.
    const dom = new JSDOM(text, { url: response.url() });
    const script = dom.window.document.querySelector('#disqus-threadData');
    if (!script) {
      throw new Error('disqus thread data not found in embed response.');
    }
    const data = JSON.parse(script.innerHTML);

    return disqusThread(data);
  } finally {
    // Closed on every path, including a failed newPage(); this also closes the tab.
    await browser.close();
  }
};
|
40
readerserver/scraper/browser/index.js
Normal file
40
readerserver/scraper/browser/index.js
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
const { getDetails } = require('./_browser');
|
||||||
|
const { getComments } = require('./_comments');
|
||||||
|
|
||||||
|
/**
 * Express handler: responds with the readable HTML content of `req.body.url`,
 * obtained through the headless-browser scraper.
 *
 * Sends the article's `content` on success, or status 500 when scraping fails
 * or yields no content.
 */
module.exports.scrape = async (req, res) => {
  try {
    const article = await getDetails(req.body.url);
    if (!article || !article.content) {
      throw new Error('failed to get details.');
    }
    return res.send(article.content);
  } catch (e) {
    // Log the cause like the sibling handlers do, instead of dropping it silently.
    console.log(e);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
|
/**
 * Express handler: responds with the full article object for `req.body.url`
 * as produced by the headless-browser scraper, or status 500 on any failure.
 */
module.exports.details = async (req, res) => {
  try {
    const result = await getDetails(req.body.url);
    if (result) {
      return res.send(result);
    }
    throw new Error('failed to get details.');
  } catch (err) {
    console.log(err);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
|
/**
 * Express handler: responds with the comment list scraped for `req.body.url`,
 * or status 500 when the scraper throws or returns nothing.
 */
module.exports.comments = async (req, res) => {
  try {
    const thread = await getComments(req.body.url);
    if (thread) {
      return res.send(thread);
    }
    throw new Error('failed to get comments.');
  } catch (err) {
    console.log(err);
    return res.sendStatus(500);
  }
};
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 44f3d1b114400d73aba2bf2551a34f9f142eda76
|
104
readerserver/scraper/browser/scripts/cosmetic-filters.js
Normal file
104
readerserver/scraper/browser/scripts/cosmetic-filters.js
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
// Page-side clean-up: one second after it runs, strips hidden elements and
// per-site clutter (share bars, donation prompts, related links, ...).
(function () {
  // [domains, selectors]. A selector entry is either a CSS selector string or
  // a [cssSelector, textPattern] pair that only matches elements whose text
  // content matches the pattern.
  const siteFilters = [
    [
      "stuff.co.nz",
      [
        ".support-brief-container",
        '[class*="donation-in-"]',
        ".sics-component__sharebar",
        ".breaking-news-pointer",
        ".bigbyline-container",
        [
          ".sics-component__html-injector.sics-component__story__paragraph",
          "READ MORE:",
        ],
      ],
    ],
    [
      "nzherald.co.nz",
      [
        "[href$='#commenting-widget']",
        ".related-articles",
        ".article__print-button",
        ".share-bar",
        ".c-suggest-links.read-more-links",
        ".website-of-year",
        ".meta-data",
        ".article__kicker",
        ".author__image",
      ],
    ],
    [["rnz.co.nz", "radionz.co.nz"], [".c-advert-app", ".c-sub-nav"]],
    [["newsroom.co.nz"], [".article_content__section", ".bio"]],
    [
      ["newshub.co.nz"],
      [
        ".c-ArticleHeading-authorPicture",
        ".relatedarticles",
        ".ArticleAttribution",
        '.GlobalFooter',
      ],
    ],
    [["tvnz.co.nz"], [".signup-container container"]],
    [["thespinoff.co.nz"], [".the-spinoff-club-interruptive", ".bulletin-signup"]],
  ];

  removeHiddenElements();

  for (const [domains, selectors] of siteFilters) {
    if (matchDomain(domains)) {
      removeSelectors(selectors);
    }
  }

  // True when the current hostname equals, or is a subdomain of, any given domain.
  function matchDomain(domains) {
    const hostname = window.location.hostname;
    const list = typeof domains === "string" ? [domains] : domains;
    for (const domain of list) {
      if (hostname === domain || hostname.endsWith("." + domain)) {
        return true;
      }
    }
    return false;
  }

  // Removes every truthy element passed in.
  function removeDOMElement(...elements) {
    elements.forEach((element) => {
      if (element) {
        element.remove();
      }
    });
  }

  // Elements matching `selector` whose text content matches the pattern `text`.
  function pageContains(selector, text) {
    const matches = [];
    for (const element of document.querySelectorAll(selector)) {
      if (RegExp(text).test(element.textContent)) {
        matches.push(element);
      }
    }
    return matches;
  }

  // After one second, removes all non-metadata elements computed as
  // display:none or visibility:hidden.
  function removeHiddenElements() {
    window.setTimeout(function () {
      const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
      const hidden = Array.from(document.querySelectorAll(selector)).filter((element) => {
        const computed = getComputedStyle(element);
        return computed["display"] === "none" || computed["visibility"] === "hidden";
      });
      for (const element of hidden) {
        if (element) {
          element.remove();
        }
      }
    }, 1000);
  }

  // After one second, removes everything matched by the given selector entries.
  function removeSelectors(selectors) {
    window.setTimeout(function () {
      const elements = selectors.flatMap((entry) => {
        if (typeof entry === "string") {
          return Array.from(document.querySelectorAll(entry));
        }
        if (Array.isArray(entry)) {
          return pageContains(...entry);
        }
        return undefined;
      });
      removeDOMElement(...elements);
    }, 1000);
  }
})();
|
14
readerserver/scraper/browser/scripts/fix-relative-links.js
Normal file
14
readerserver/scraper/browser/scripts/fix-relative-links.js
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
// Rewrites root-relative `src` / `href` attributes ("/path") into absolute
// URLs on the current origin. Protocol-relative values ("//host/...") are
// left untouched.
(function () {
  const { host, protocol } = window.location;
  const origin = `${protocol}//${host}`;
  const targets = [
    ['[src^="/"]', 'src'],
    ['[href^="/"]', 'href'],
  ];

  function fixLinks() {
    targets.forEach(([selector, attribute]) => {
      Array.from(document.querySelectorAll(selector)).forEach((element) => {
        const value = element.getAttribute(attribute);
        if (value !== null && /^\/[^\/]/.test(value)) {
          element.setAttribute(attribute, `${origin}${value}`);
        }
      });
    });
  }

  // This file is injected as an init script, i.e. it runs before the page's
  // markup has been parsed; querying immediately would find nothing to fix.
  // Wait for the DOM unless it is already available.
  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", fixLinks);
  } else {
    fixLinks();
  }
})();
|
59
readerserver/scraper/simple.js
Normal file
59
readerserver/scraper/simple.js
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
const fetch = require('node-fetch');
|
||||||
|
const { JSDOM } = require('jsdom');
|
||||||
|
const { Readability } = require('@mozilla/readability');
|
||||||
|
|
||||||
|
const { getUserAgent } = require('../utils/user-agent');
|
||||||
|
|
||||||
|
// Parses `body` as the HTML document located at `url` and returns the
// result of Readability's parse().
const extract = (url, body) => {
  const { window } = new JSDOM(body, { url });
  return new Readability(window.document).parse();
};
|
||||||
|
|
||||||
|
/**
 * Express handler: fetches `req.body.url` over plain HTTP and responds with
 * the readable HTML content extracted from it.
 *
 * Responds with the upstream status when the fetch is not OK, 404 when no
 * content could be extracted, and 500 on any thrown error.
 */
module.exports.scrape = async (req, res) => {
  try {
    const { url } = req.body;
    const { userAgent, headers } = getUserAgent(url);

    // Only override User-Agent when one was chosen; an undefined header value
    // would be stringified and sent as the literal text "undefined".
    const requestHeaders = { ...headers };
    if (userAgent) {
      requestHeaders['User-Agent'] = userAgent;
    }

    const response = await fetch(url, { headers: requestHeaders });
    if (!response.ok) {
      // Fetch responses expose the HTTP code as `status` (there is no `statusCode`).
      return res.sendStatus(response.status);
    }

    const html = await response.text();
    const article = await extract(url, html);
    if (article && article.content) {
      return res.send(article.content);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
|
/**
 * Express handler: fetches `req.body.url` over plain HTTP and responds with
 * the full article object extracted from it.
 *
 * Responds with the upstream status when the fetch is not OK, 404 when
 * nothing could be extracted, and 500 on any thrown error.
 */
module.exports.details = async (req, res) => {
  try {
    const { url } = req.body;
    const { userAgent, headers } = getUserAgent(url);

    // Only override User-Agent when one was chosen; an undefined header value
    // would be stringified and sent as the literal text "undefined".
    const requestHeaders = { ...headers };
    if (userAgent) {
      requestHeaders['User-Agent'] = userAgent;
    }

    const response = await fetch(url, { headers: requestHeaders });
    if (!response.ok) {
      // Fetch responses expose the HTTP code as `status` (there is no `statusCode`).
      return res.sendStatus(response.status);
    }

    const html = await response.text();
    const article = await extract(url, html);
    if (article) {
      return res.send(article);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};
|
11
readerserver/utils/constants.js
Normal file
11
readerserver/utils/constants.js
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
// Identity presented when a site should be fetched as Googlebot.
const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
const googleBotIp = '66.249.66.1';

// The same two values again, pre-assembled as request headers.
const googleBotHeaders = {
  'User-Agent': googleBotUserAgent,
  'X-Forwarded-For': googleBotIp,
};

module.exports.googleBot = {
  userAgent: googleBotUserAgent,
  ip: googleBotIp,
  headers: googleBotHeaders,
};
|
21
readerserver/utils/disqus-thread.js
Normal file
21
readerserver/utils/disqus-thread.js
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
/**
 * Converts Disqus thread data into a nested comment tree.
 *
 * @param {object} data - thread data; `data.response.posts` is the flat post list.
 * @returns {Array<object>} the top-level comments, each shaped as
 *   `{ author, authorLink, date, text, score, children, id, parent }`, with
 *   replies nested under `children`.
 */
module.exports.disqusThread = data => {
  // Index every post by its id (as a string) so replies can find their parent.
  const comments = {};
  for (const post of data.response.posts) {
    const id = post.id.toString();
    comments[id] = {
      author: post.author.name,
      authorLink: post.author.profileUrl,
      date: post.createdAt,
      text: post.raw_message,
      score: post.points,
      children: [],
      id,
      parent: (post.parent || '').toString(),
    };
  }

  // Attach replies to their parent and collect the roots. A reply whose
  // parent is not part of this response is kept as a root rather than
  // crashing or being dropped.
  const roots = [];
  for (const id of Object.keys(comments)) {
    const comment = comments[id];
    const parent = comment.parent ? comments[comment.parent] : undefined;
    if (parent && parent !== comment) {
      parent.children.push(comment);
    } else {
      roots.push(comment);
    }
  }
  return roots;
};
|
98
readerserver/utils/sites.js
Normal file
98
readerserver/utils/sites.js
Normal file
|
@ -0,0 +1,98 @@
|
||||||
|
// Per-site request patterns: key is a site domain, value is a regex tested
// against request URLs; matching requests are aborted by the browser scraper.
const blockedRegexes = {
  "adweek.com": /.+\.lightboxcdn\.com\/.+/,
  "afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
  "businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
  "chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
  "economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
  "editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
  "foreignpolicy.com": /.+\.tinypass\.com\/.+/,
  "fortune.com": /.+\.tinypass\.com\/.+/,
  "haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
  "haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
  "inquirer.com": /.+\.tinypass\.com\/.+/,
  "lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
  "lrb.co.uk": /.+\.tinypass\.com\/.+/,
  "nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
  "medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
  "interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
  "repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
  "spectator.co.uk": /.+\.tinypass\.com\/.+/,
  "spectator.com.au": /.+\.tinypass\.com\/.+/,
  "telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
  "thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
  "thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
  "thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
  "thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
  "wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
  "barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
  "sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
  "latercera.com": /.+\.cxense\.com\/+/,
  "lesechos.fr": /.+\.tinypass\.com\/.+/,
  "washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
  "thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
  "technologyreview.com": /.+\.blueconic\.net\/.+/,
};

module.exports.blockedRegexes = blockedRegexes;
|
||||||
|
|
||||||
|
// Site domains (subdomains included by the matcher) that are requested with
// the Googlebot identity.
const useGoogleBotSites = [
  "adelaidenow.com.au",
  "barrons.com",
  "couriermail.com.au",
  "dailytelegraph.com.au",
  "fd.nl",
  "genomeweb.com",
  "haaretz.co.il",
  "haaretz.com",
  "heraldsun.com.au",
  "mexiconewsdaily.com",
  "ntnews.com.au",
  "quora.com",
  "seekingalpha.com",
  "telegraph.co.uk",
  "theaustralian.com.au",
  "themarker.com",
  "themercury.com.au",
  "thenational.scot",
  "thetimes.co.uk",
  "wsj.com",
  "kansascity.com",
  "republic.ru",
  "nzz.ch",
  "handelsblatt.com",
  "washingtonpost.com",
  "df.cl",
];

module.exports.useGoogleBotSites = useGoogleBotSites;
|
||||||
|
|
||||||
|
/**
 * Finds the first listed domain that `hostname` equals or is a subdomain of.
 *
 * @param {string|string[]} domains - one domain or a list of domains.
 * @param {string} hostname - host to test.
 * @returns {string|false} the matching domain, or `false` when none matches
 *   (including when `hostname` is not a string).
 */
function matchDomain(domains, hostname) {
  // A missing or non-string hostname cannot match anything; without this
  // guard `hostname.endsWith` would throw.
  if (typeof hostname !== "string") {
    return false;
  }
  const candidates = typeof domains === "string" ? [domains] : domains;
  const found = candidates.find(
    (domain) => hostname === domain || hostname.endsWith("." + domain)
  );
  return found === undefined ? false : found;
}
|
||||||
|
|
||||||
|
// Same as matchDomain, but takes a URL and matches against its host.
function matchUrlDomain(domains, url) {
  const hostname = urlHost(url);
  return matchDomain(domains, hostname);
}
|
||||||
|
|
||||||
|
/**
 * Extracts the hostname from an http(s) URL.
 *
 * @param {*} url - candidate URL.
 * @returns {*} the hostname when `url` is a string starting with "http" that
 *   parses as a URL; otherwise `url` itself, unchanged.
 */
function urlHost(url) {
  // Only strings can be parsed; other values (undefined, arrays from repeated
  // form fields, ...) are passed through instead of throwing on `startsWith`.
  if (typeof url === "string" && url.startsWith("http")) {
    try {
      return new URL(url).hostname;
    } catch (e) {
      console.log(`url not valid: ${url} error: ${e}`);
    }
  }
  return url;
}
|
||||||
|
|
||||||
|
// Expose the matching helpers alongside the tables exported above.
Object.assign(module.exports, { matchDomain, matchUrlDomain, urlHost });
|
18
readerserver/utils/user-agent.js
Normal file
18
readerserver/utils/user-agent.js
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
const { googleBot } = require('./constants');
|
||||||
|
const { matchUrlDomain, useGoogleBotSites } = require("./sites");
|
||||||
|
|
||||||
|
/**
 * Chooses the identity to present when requesting `url`.
 *
 * @param {string} url - address about to be requested.
 * @returns {{userAgent?: string, headers?: object}} the Googlebot user agent
 *   plus an X-Forwarded-For header when the URL's domain is listed in
 *   `useGoogleBotSites`; otherwise an empty object.
 */
module.exports.getUserAgent = (url) => {
  for (const site of useGoogleBotSites) {
    if (typeof site === "string" && matchUrlDomain(site, url)) {
      return {
        userAgent: googleBot.userAgent,
        headers: {
          "X-Forwarded-For": googleBot.ip
        }
      };
    }
  }
  return {};
};
|
1261
readerserver/yarn.lock
Normal file
1261
readerserver/yarn.lock
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user