renaming things.

This commit is contained in:
Jason Schwarzenberger
2020-11-17 15:50:31 +13:00
parent 55d50a86d8
commit 3b885e4327
13 changed files with 34 additions and 28 deletions

View File

@@ -0,0 +1,45 @@
const { firefox } = require("playwright");
const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const { getUserAgent } = require('../../utils/user-agent');
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
module.exports.getDetails = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.route(/.*/, (route) => {
const routeUrl = route.request().url();
const blockedDomains = Object.keys(blockedRegexes);
const domain = matchUrlDomain(blockedDomains, routeUrl);
if (domain && routeUrl.match(blockedRegexes[domain])) {
return route.abort();
}
return route.continue();
});
await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
await tab.waitForTimeout(2000);
const body = await tab.content();
const doc = new JSDOM(body, { url });
const reader = new Readability(doc.window.document);
const article = reader.parse();
return article;
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};

View File

@@ -0,0 +1,34 @@
const { JSDOM } = require("jsdom");
const { firefox } = require("playwright");
const { getUserAgent } = require('../../utils/user-agent');
const { disqusThread } = require('../../utils/disqus-thread');
const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
module.exports.getComments = async (url) => {
const { userAgent, headers } = getUserAgent(url);
const browser = await firefox.launch({ args: [], headless: true });
const tab = await browser.newPage({
extraHTTPHeaders: headers,
userAgent,
viewport: { width: 2000, height: 10000 },
});
try {
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
const text = await response.text();
const dom = new JSDOM(text, response.url());
const script = dom.window.document.querySelector('#disqus-threadData')
const data = JSON.parse(script.innerHTML);
return disqusThread(data);
} catch (e) {
throw e;
} finally {
await tab.close();
await browser.close();
}
};

View File

@@ -0,0 +1,40 @@
const { getDetails } = require('./_browser');
const { getComments } = require('./_comments');
module.exports.scrape = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article || !article.content) {
throw new Error('failed to get details.');
}
return res.send(article.content);
} catch (e) {
return res.sendStatus(500);
}
};
module.exports.details = async (req, res) => {
try {
const article = await getDetails(req.body.url);
if (!article) {
throw new Error('failed to get details.');
}
return res.send(article);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};
module.exports.comments = async (req, res) => {
try {
const comments = await getComments(req.body.url);
if (!comments) {
throw new Error('failed to get comments.');
}
return res.send(comments);
} catch (e) {
console.log(e);
return res.sendStatus(500);
}
};

View File

@@ -0,0 +1,104 @@
(function () {
removeHiddenElements();
if (matchDomain("stuff.co.nz")) {
removeSelectors([
".support-brief-container",
'[class*="donation-in-"]',
".sics-component__sharebar",
".breaking-news-pointer",
".bigbyline-container",
[
".sics-component__html-injector.sics-component__story__paragraph",
"READ MORE:",
],
]);
}
if (matchDomain("nzherald.co.nz")) {
removeSelectors([
"[href$='#commenting-widget']",
".related-articles",
".article__print-button",
".share-bar",
".c-suggest-links.read-more-links",
".website-of-year",
".meta-data",
".article__kicker",
".author__image",
]);
}
if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
removeSelectors([".c-advert-app", ".c-sub-nav"]);
}
if (matchDomain(["newsroom.co.nz"])) {
removeSelectors([".article_content__section", ".bio"]);
}
if (matchDomain(["newshub.co.nz"])) {
removeSelectors([
".c-ArticleHeading-authorPicture",
".relatedarticles",
".ArticleAttribution",
'.GlobalFooter'
]);
}
if (matchDomain(["tvnz.co.nz"])) {
removeSelectors([".signup-container container"]);
}
if (matchDomain(["thespinoff.co.nz"])) {
removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
}
function matchDomain(domains) {
const hostname = window.location.hostname;
if (typeof domains === "string") {
domains = [domains];
}
return domains.some(
(domain) => hostname === domain || hostname.endsWith("." + domain)
);
}
function removeDOMElement(...elements) {
for (const element of elements) {
if (element) {
element.remove();
}
}
}
function pageContains(selector, text) {
const elements = document.querySelectorAll(selector);
return Array.prototype.filter.call(elements, function (element) {
return RegExp(text).test(element.textContent);
});
}
function removeHiddenElements() {
window.setTimeout(function () {
const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
Array.from(document.querySelectorAll(selector))
.filter((element) => {
const computed = getComputedStyle(element);
const displayNone = computed["display"] === "none";
const visibilityHidden = computed["visibility"] === "hidden";
return displayNone || visibilityHidden;
})
.forEach((element) => element && element.remove());
}, 1000);
}
function removeSelectors(selectors) {
window.setTimeout(function () {
const elements = selectors.flatMap((s) => {
if (typeof s === "string") {
return Array.from(document.querySelectorAll(s));
}
if (s && s.constructor.name === "Array") {
return pageContains(...s);
}
return undefined;
});
removeDOMElement(...elements);
}, 1000);
}
})();

View File

@@ -0,0 +1,14 @@
(function () {
const { host, protocol } = window.location;
const url = `${protocol}//${host}`;
[
['[src^="/"]', 'src'],
['[href^="/"]', 'href']
].forEach(([selector, attribute]) => {
Array.from(document.querySelectorAll(selector))
.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
.forEach((e) => {
e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
});
});
})();