forked from tanner/qotnews
parent
c9da2a078b
commit
f5b38f5c6b
21 changed files with 14 additions and 1863 deletions
@ -1,4 +1,3 @@ |
|||||||
[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"] |
[submodule "readerserver"] |
||||||
path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome |
path = readerserver |
||||||
url = https://github.com/iamadamdev/bypass-paywalls-chrome/ |
url = https://github.com/master5o1/declutter.git |
||||||
branch = master |
|
||||||
|
@ -0,0 +1 @@ |
|||||||
|
Subproject commit 9c0336b0af4be942991a7a3771c09ec08938bde8 |
@ -1,92 +0,0 @@ |
|||||||
# Logs |
|
||||||
logs |
|
||||||
*.log |
|
||||||
npm-debug.log* |
|
||||||
yarn-debug.log* |
|
||||||
yarn-error.log* |
|
||||||
lerna-debug.log* |
|
||||||
|
|
||||||
# Diagnostic reports (https://nodejs.org/api/report.html) |
|
||||||
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json |
|
||||||
|
|
||||||
# Runtime data |
|
||||||
pids |
|
||||||
*.pid |
|
||||||
*.seed |
|
||||||
*.pid.lock |
|
||||||
|
|
||||||
# Directory for instrumented libs generated by jscoverage/JSCover |
|
||||||
lib-cov |
|
||||||
|
|
||||||
# Coverage directory used by tools like istanbul |
|
||||||
coverage |
|
||||||
*.lcov |
|
||||||
|
|
||||||
# nyc test coverage |
|
||||||
.nyc_output |
|
||||||
|
|
||||||
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) |
|
||||||
.grunt |
|
||||||
|
|
||||||
# Bower dependency directory (https://bower.io/) |
|
||||||
bower_components |
|
||||||
|
|
||||||
# node-waf configuration |
|
||||||
.lock-wscript |
|
||||||
|
|
||||||
# Compiled binary addons (https://nodejs.org/api/addons.html) |
|
||||||
build/Release |
|
||||||
|
|
||||||
# Dependency directories |
|
||||||
node_modules/ |
|
||||||
jspm_packages/ |
|
||||||
|
|
||||||
# TypeScript v1 declaration files |
|
||||||
typings/ |
|
||||||
|
|
||||||
# TypeScript cache |
|
||||||
*.tsbuildinfo |
|
||||||
|
|
||||||
# Optional npm cache directory |
|
||||||
.npm |
|
||||||
|
|
||||||
# Optional eslint cache |
|
||||||
.eslintcache |
|
||||||
|
|
||||||
# Optional REPL history |
|
||||||
.node_repl_history |
|
||||||
|
|
||||||
# Output of 'npm pack' |
|
||||||
*.tgz |
|
||||||
|
|
||||||
# Yarn Integrity file |
|
||||||
.yarn-integrity |
|
||||||
|
|
||||||
# dotenv environment variables file |
|
||||||
.env |
|
||||||
.env.test |
|
||||||
|
|
||||||
# parcel-bundler cache (https://parceljs.org/) |
|
||||||
.cache |
|
||||||
|
|
||||||
# next.js build output |
|
||||||
.next |
|
||||||
|
|
||||||
# nuxt.js build output |
|
||||||
.nuxt |
|
||||||
|
|
||||||
# vuepress build output |
|
||||||
.vuepress/dist |
|
||||||
|
|
||||||
# Serverless directories |
|
||||||
.serverless/ |
|
||||||
|
|
||||||
# FuseBox cache |
|
||||||
.fusebox/ |
|
||||||
|
|
||||||
# DynamoDB Local files |
|
||||||
.dynamodb/ |
|
||||||
|
|
||||||
# Editor |
|
||||||
*.swp |
|
||||||
*.swo |
|
@ -1,36 +0,0 @@ |
|||||||
// Entry point of the reader server: exposes the scraper handlers over HTTP.
const port = 33843;
const express = require('express');
const app = express();
const simple = require('./scraper/simple');
const headless = require('./scraper/headless');

// Handlers read `req.body.url`, which the index page submits as a form field.
app.use(express.urlencoded({ extended: true }));

// Single source of truth for the POST endpoints, so the index page and the
// registered routes cannot drift apart.
const postRoutes = [
  ['/simple', simple.scrape],
  ['/simple/details', simple.details],
  ['/headless', headless.scrape],
  ['/headless/details', headless.details],
  ['/headless/comments', headless.comments],
];

// Index page: one small HTML form per endpoint for manual testing.
app.get('/', (req, res) => {
  const html = postRoutes.map(([route]) => `
    <form method="POST" action="${route}" accept-charset="UTF-8">
      <fieldset>
        <legend>route: POST ${route}</legend>
        <input name="url">
        <button type="submit">SUBMIT</button>
      </fieldset>
    </form>`).join('<hr />');
  res.send(html);
});

postRoutes.forEach(([route, handler]) => {
  app.post(route, handler);
});

app.listen(port, () => {
  console.log(`Example app listening on port ${port}!`);
});
|
@ -1,15 +0,0 @@ |
|||||||
{ |
|
||||||
"name": "readerserver", |
|
||||||
"version": "1.0.0", |
|
||||||
"main": "main.js", |
|
||||||
"license": "MIT", |
|
||||||
"dependencies": { |
|
||||||
"@mozilla/readability": "^0.3.0", |
|
||||||
"dompurify": "^1.0.11", |
|
||||||
"express": "^4.17.1", |
|
||||||
"jsdom": "^15.1.1", |
|
||||||
"node-fetch": "^2.6.1", |
|
||||||
"playwright": "^1.5.2", |
|
||||||
"request": "^2.88.0" |
|
||||||
} |
|
||||||
} |
|
@ -1,45 +0,0 @@ |
|||||||
const { firefox } = require("playwright"); |
|
||||||
const { JSDOM } = require("jsdom"); |
|
||||||
const { Readability } = require("@mozilla/readability"); |
|
||||||
|
|
||||||
const { getUserAgent } = require('../../utils/user-agent'); |
|
||||||
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites"); |
|
||||||
|
|
||||||
/**
 * Loads `url` in headless Firefox, lets the injected clean-up scripts run, and
 * extracts the article with Readability.
 *
 * @param {string} url - Address of the article to load.
 * @returns {Promise<object|null>} Whatever `Readability#parse` returns for the page.
 * @throws Propagates any Playwright / JSDOM / Readability error to the caller.
 */
module.exports.getDetails = async (url) => {
  const { userAgent, headers } = getUserAgent(url);

  // The block list is keyed by site. Resolve the rule for the article's own
  // site once, so third-party requests made by that page (paywall vendor
  // scripts on another host) can be matched against it.
  const blockedDomains = Object.keys(blockedRegexes);
  const pageDomain = matchUrlDomain(blockedDomains, url);
  const pageRule = pageDomain ? blockedRegexes[pageDomain] : undefined;

  const browser = await firefox.launch({ args: [], headless: true });
  try {
    // Created inside the try so a failure here still closes the browser.
    const tab = await browser.newPage({
      extraHTTPHeaders: headers,
      userAgent,
      viewport: { width: 2000, height: 10000 },
    });

    await tab.route(/.*/, (route) => {
      const routeUrl = route.request().url();
      // Also keep the previous check, which matched the request's own host.
      const requestDomain = matchUrlDomain(blockedDomains, routeUrl);
      const requestRule = requestDomain ? blockedRegexes[requestDomain] : undefined;
      const blocked = [pageRule, requestRule].some((rule) => rule && rule.test(routeUrl));
      return blocked ? route.abort() : route.continue();
    });
    await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
    await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
    await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
    await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
    // Give the init scripts' timers (500 ms / 1500 ms) time to fire.
    await tab.waitForTimeout(2000);

    const body = await tab.content();
    const doc = new JSDOM(body, { url });
    const reader = new Readability(doc.window.document);
    return reader.parse();
  } finally {
    // Closing the browser also closes every page it opened.
    await browser.close();
  }
};
|
@ -1,34 +0,0 @@ |
|||||||
const { JSDOM } = require("jsdom"); |
|
||||||
const { firefox } = require("playwright"); |
|
||||||
const { getUserAgent } = require('../../utils/user-agent'); |
|
||||||
const { disqusThread } = require('../../utils/disqus-thread'); |
|
||||||
|
|
||||||
const DISQUS_EMBED = 'https://disqus.com/embed/comments/'; |
|
||||||
|
|
||||||
/**
 * Loads `url` in headless Firefox, captures the Disqus embed response the page
 * triggers, and converts its embedded thread JSON with `disqusThread`.
 *
 * @param {string} url - Address of the page whose comments are wanted.
 * @returns {Promise<*>} The value returned by `disqusThread`.
 * @throws {Error} When the embed response has no `#disqus-threadData` element;
 *   Playwright navigation/timeout errors and JSON parse errors also propagate.
 */
module.exports.getComments = async (url) => {
  const { userAgent, headers } = getUserAgent(url);

  const browser = await firefox.launch({ args: [], headless: true });
  try {
    // Created inside the try so a failure here still closes the browser.
    const tab = await browser.newPage({
      extraHTTPHeaders: headers,
      userAgent,
      viewport: { width: 2000, height: 10000 },
    });

    // Start listening BEFORE navigating: an embed response that arrives while
    // goto() is still pending would otherwise never be seen. The timeout keeps
    // the old upper bound (60 s navigation + 30 s default wait).
    const [response] = await Promise.all([
      tab.waitForResponse((res) => res.url().includes(DISQUS_EMBED), { timeout: 90000 }),
      tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" }),
    ]);

    const text = await response.text();
    // JSDOM takes an options object as its second argument, not a bare string.
    const dom = new JSDOM(text, { url: response.url() });
    const script = dom.window.document.querySelector('#disqus-threadData');
    if (!script) {
      throw new Error('disqus thread data not found.');
    }
    const data = JSON.parse(script.innerHTML);

    return disqusThread(data);
  } finally {
    // Closing the browser also closes every page it opened.
    await browser.close();
  }
};
|
@ -1,40 +0,0 @@ |
|||||||
const { getDetails } = require('./_browser'); |
|
||||||
const { getComments } = require('./_comments'); |
|
||||||
|
|
||||||
/**
 * POST handler: responds with the readable HTML content of `req.body.url`,
 * rendered through the headless browser. Responds 500 on any failure.
 */
module.exports.scrape = async (req, res) => {
  try {
    const article = await getDetails(req.body.url);
    if (!article || !article.content) {
      throw new Error('failed to get details.');
    }
    return res.send(article.content);
  } catch (e) {
    // Log like the sibling handlers do instead of discarding the cause.
    console.log(e);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
/**
 * POST handler: responds with the full article object for `req.body.url`,
 * rendered through the headless browser. Logs and responds 500 on failure.
 */
module.exports.details = async (req, res) => {
  try {
    const article = await getDetails(req.body.url);
    if (article) {
      return res.send(article);
    }
    throw new Error('failed to get details.');
  } catch (err) {
    console.log(err);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
/**
 * POST handler: responds with the comment thread found for `req.body.url`.
 * Logs and responds 500 on failure or when nothing is returned.
 */
module.exports.comments = async (req, res) => {
  try {
    const thread = await getComments(req.body.url);
    if (thread) {
      return res.send(thread);
    }
    throw new Error('failed to get comments.');
  } catch (err) {
    console.log(err);
    return res.sendStatus(500);
  }
};
|
@ -1 +0,0 @@ |
|||||||
Subproject commit fb1b09fccbb64f1d782753cc5c425eb0723e596f |
|
@ -1,108 +0,0 @@ |
|||||||
// Page-side clean-up script: strips hidden elements everywhere and removes
// known clutter (share bars, sign-up prompts, ...) on specific sites.
(function () {
  // [domains, selectors] pairs. A selector is either a CSS selector string or a
  // [cssSelector, textPattern] pair that removes only elements whose text
  // matches the pattern.
  const siteFilters = [
    [
      ["stuff.co.nz"],
      [
        ".support-brief-container",
        '[class*="donation-in-"]',
        ".sics-component__sharebar",
        ".breaking-news-pointer",
        ".bigbyline-container",
        [
          ".sics-component__html-injector.sics-component__story__paragraph",
          "READ MORE:",
        ],
      ],
    ],
    [
      ["nzherald.co.nz"],
      [
        "[href$='#commenting-widget']",
        ".related-articles",
        ".article__print-button",
        ".share-bar",
        ".c-suggest-links.read-more-links",
        ".website-of-year",
        ".meta-data",
        ".article__kicker",
        ".author__image",
      ],
    ],
    [["rnz.co.nz", "radionz.co.nz"], [".c-advert-app", ".c-sub-nav"]],
    [["newsroom.co.nz"], [".article_content__section", ".bio"]],
    [
      ["newshub.co.nz"],
      [
        ".c-ArticleHeading-authorPicture",
        ".relatedarticles",
        ".ArticleAttribution",
        '.GlobalFooter',
      ],
    ],
    // NOTE(review): this selects a <container> element inside
    // .signup-container; possibly ".signup-container.container" was meant.
    // Left unchanged — confirm against the live markup.
    [["tvnz.co.nz"], [".signup-container container"]],
    [
      ["thespinoff.co.nz"],
      [
        ".the-spinoff-club-interruptive",
        ".bulletin-signup",
        ".sponsor_post_footer",
      ],
    ],
  ];

  removeHiddenElements();

  for (const [domains, selectors] of siteFilters) {
    if (matchDomain(domains)) {
      removeSelectors(selectors);
    }
  }

  // True when the current hostname equals, or is a subdomain of, any given
  // domain. Accepts a single domain string or an array of them.
  function matchDomain(domains) {
    const hostname = window.location.hostname;
    const candidates = typeof domains === "string" ? [domains] : domains;
    return candidates.some(
      (domain) => hostname === domain || hostname.endsWith("." + domain)
    );
  }

  // Removes every truthy element passed in.
  function removeDOMElement(...elements) {
    for (const element of elements) {
      if (element) {
        element.remove();
      }
    }
  }

  // Elements matching `selector` whose text content matches `text`, which is
  // interpreted as a regular expression source.
  function pageContains(selector, text) {
    const pattern = RegExp(text); // built once, not once per element
    return Array.from(document.querySelectorAll(selector)).filter((element) =>
      pattern.test(element.textContent)
    );
  }

  // After 1500 ms, removes every element (outside script/head/meta/link/style)
  // whose computed style is display:none or visibility:hidden.
  function removeHiddenElements() {
    window.setTimeout(function () {
      const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
      Array.from(document.querySelectorAll(selector))
        .filter((element) => {
          const computed = getComputedStyle(element);
          const displayNone = computed["display"] === "none";
          const visibilityHidden = computed["visibility"] === "hidden";
          return displayNone || visibilityHidden;
        })
        .forEach((element) => element && element.remove());
    }, 1500);
  }

  // After 500 ms, removes everything the given selectors resolve to.
  function removeSelectors(selectors) {
    window.setTimeout(function () {
      const elements = selectors.flatMap((s) => {
        if (typeof s === "string") {
          return Array.from(document.querySelectorAll(s));
        }
        if (Array.isArray(s)) {
          return pageContains(...s);
        }
        return []; // unknown entry shapes contribute nothing
      });
      removeDOMElement(...elements);
    }, 500);
  }
})();
|
@ -1,14 +0,0 @@ |
|||||||
// Page-side script: rewrites root-relative `src` / `href` attributes
// ("/path") into absolute URLs on the current protocol and host.
(function () {
  const { host, protocol } = window.location;
  const url = `${protocol}//${host}`;

  // A single leading slash. Protocol-relative values ("//cdn...") are left
  // alone, while a bare "/" (link to the site root) is now rewritten too.
  const rootRelative = /^\/(?!\/)/;

  for (const attribute of ['src', 'href']) {
    const elements = Array.from(document.querySelectorAll(`[${attribute}^="/"]`));
    for (const element of elements) {
      const value = element.getAttribute(attribute);
      if (value !== null && rootRelative.test(value)) {
        element.setAttribute(attribute, `${url}${value}`);
      }
    }
  }
})();
|
@ -1,59 +0,0 @@ |
|||||||
const fetch = require('node-fetch'); |
|
||||||
const { JSDOM } = require('jsdom'); |
|
||||||
const { Readability } = require('@mozilla/readability'); |
|
||||||
|
|
||||||
const { getUserAgent } = require('../utils/user-agent'); |
|
||||||
|
|
||||||
/**
 * Parses `body` as the HTML document located at `url` and returns the result
 * of running Readability over it.
 */
const extract = (url, body) => {
  const { document } = new JSDOM(body, { url: url }).window;
  return new Readability(document).parse();
};
|
||||||
|
|
||||||
/**
 * POST handler: fetches `req.body.url` with a plain HTTP request and responds
 * with the readable HTML content. Mirrors the upstream status when the fetch
 * is not OK, responds 404 when no content could be extracted, and 500 on
 * unexpected errors.
 */
module.exports.scrape = async (req, res) => {
  try {
    const { userAgent, headers } = getUserAgent(req.body.url);
    // Only set User-Agent when one was chosen, so an undefined value is never
    // handed to the HTTP client as a header.
    const requestHeaders = { ...headers };
    if (userAgent) {
      requestHeaders['User-Agent'] = userAgent;
    }
    const response = await fetch(req.body.url, { headers: requestHeaders });
    if (!response.ok) {
      // fetch responses expose `status`; `statusCode` is undefined on them.
      return res.sendStatus(response.status);
    }
    const html = await response.text();
    const article = extract(req.body.url, html);
    if (article && article.content) {
      return res.send(article.content);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};
|
||||||
|
|
||||||
/**
 * POST handler: fetches `req.body.url` with a plain HTTP request and responds
 * with the full extracted article object. Mirrors the upstream status when the
 * fetch is not OK, responds 404 when nothing could be extracted, and 500 on
 * unexpected errors.
 */
module.exports.details = async (req, res) => {
  try {
    const { userAgent, headers } = getUserAgent(req.body.url);
    // Only set User-Agent when one was chosen, so an undefined value is never
    // handed to the HTTP client as a header.
    const requestHeaders = { ...headers };
    if (userAgent) {
      requestHeaders['User-Agent'] = userAgent;
    }
    const response = await fetch(req.body.url, { headers: requestHeaders });
    if (!response.ok) {
      // fetch responses expose `status`; `statusCode` is undefined on them.
      return res.sendStatus(response.status);
    }
    const html = await response.text();
    const article = extract(req.body.url, html);
    if (article) {
      return res.send(article);
    }
    return res.sendStatus(404);
  } catch (e) {
    console.error(e);
    return res.sendStatus(500);
  }
};
|
@ -1,11 +0,0 @@ |
|||||||
// Identity presented when a site should be fetched as Googlebot.
const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
const googleBotIp = '66.249.66.1';

// The same identity expressed as ready-to-send request headers.
const googleBotHeaders = {
  'User-Agent': googleBotUserAgent,
  'X-Forwarded-For': googleBotIp,
};

module.exports.googleBot = {
  userAgent: googleBotUserAgent,
  ip: googleBotIp,
  headers: googleBotHeaders,
};
|
@ -1,21 +0,0 @@ |
|||||||
/**
 * Converts a Disqus thread payload into a tree of comments.
 *
 * Reads `data.response.posts`; each post contributes `author.name`,
 * `author.profileUrl`, `createdAt`, `raw_message`, `points`, `id` and `parent`.
 *
 * @param {object} data - Parsed Disqus thread data.
 * @returns {Array<object>} Top-level comments, each with its replies nested
 *   under `children`. A reply whose parent is not in the payload is returned
 *   at the top level instead of being dropped.
 */
module.exports.disqusThread = data => {
  // Index every post by its string id (plain assignment, no per-post copy).
  const comments = {};
  for (const post of data.response.posts) {
    const id = post.id.toString();
    comments[id] = {
      author: post.author.name,
      authorLink: post.author.profileUrl,
      date: post.createdAt,
      text: post.raw_message,
      score: post.points,
      children: [],
      id,
      parent: (post.parent || '').toString(),
    };
  }

  // Attach replies to their parent; everything else is a thread root.
  const roots = [];
  for (const id of Object.keys(comments)) {
    const comment = comments[id];
    const parent = comment.parent ? comments[comment.parent] : undefined;
    if (parent) {
      parent.children.push(comment);
    } else {
      roots.push(comment);
    }
  }
  return roots;
};
|
@ -1,98 +0,0 @@ |
|||||||
// Map of site domain -> pattern of request URLs to abort while loading pages.
// Literal dots in host and file names are escaped so they match only a dot.
module.exports.blockedRegexes = {
  "adweek.com": /.+\.lightboxcdn\.com\/.+/,
  "afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
  "businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
  "chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
  "economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
  "editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani\.it\/pelcro\.js)/,
  "foreignpolicy.com": /.+\.tinypass\.com\/.+/,
  "fortune.com": /.+\.tinypass\.com\/.+/,
  "haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
  "haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int\.js.+/,
  "inquirer.com": /.+\.tinypass\.com\/.+/,
  "lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
  "lrb.co.uk": /.+\.tinypass\.com\/.+/,
  "nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
  "medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
  "interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
  "repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
  "spectator.co.uk": /.+\.tinypass\.com\/.+/,
  "spectator.com.au": /.+\.tinypass\.com\/.+/,
  "telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
  "thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
  "thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
  "thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
  "thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
  "wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
  "barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
  "elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2\.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
  "sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
  "latercera.com": /.+\.cxense\.com\/+/,
  "lesechos.fr": /.+\.tinypass\.com\/.+/,
  "washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
  "thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
  "technologyreview.com": /.+\.blueconic\.net\/.+/,
};
|
||||||
|
|
||||||
// Domains (subdomains included) that are requested with the Googlebot
// user agent and forwarded-for address.
module.exports.useGoogleBotSites = [
  "adelaidenow.com.au",
  "barrons.com",
  "couriermail.com.au",
  "dailytelegraph.com.au",
  "fd.nl",
  "genomeweb.com",
  "haaretz.co.il",
  "haaretz.com",
  "heraldsun.com.au",
  "mexiconewsdaily.com",
  "ntnews.com.au",
  "quora.com",
  "seekingalpha.com",
  "telegraph.co.uk",
  "theaustralian.com.au",
  "themarker.com",
  "themercury.com.au",
  "thenational.scot",
  "thetimes.co.uk",
  "wsj.com",
  "kansascity.com",
  "republic.ru",
  "nzz.ch",
  "handelsblatt.com",
  "washingtonpost.com",
  "df.cl",
];
|
||||||
|
|
||||||
/**
 * Finds the first domain that `hostname` equals or is a subdomain of.
 *
 * @param {string|string[]} domains - One domain or a list of domains.
 * @param {string} hostname - Hostname to test.
 * @returns {string|false} The matching domain, or `false` when none matches
 *   or when `hostname` is not a string (e.g. a missing URL).
 */
function matchDomain(domains, hostname) {
  // A missing or malformed hostname cannot match anything; do not throw.
  if (typeof hostname !== "string") {
    return false;
  }
  const candidates = typeof domains === "string" ? [domains] : domains;
  for (const domain of candidates) {
    if (hostname === domain || hostname.endsWith("." + domain)) {
      return domain;
    }
  }
  return false;
}
|
||||||
|
|
||||||
/**
 * Same as `matchDomain`, but takes a URL and matches on its host as
 * resolved by `urlHost`.
 */
function matchUrlDomain(domains, url) {
  const hostname = urlHost(url);
  return matchDomain(domains, hostname);
}
|
||||||
|
|
||||||
/**
 * Extracts the hostname from an http(s) URL.
 *
 * @param {*} url - Usually a URL string.
 * @returns {*} The hostname when `url` is a string starting with "http" that
 *   parses as a URL; otherwise `url` itself, unchanged (a parse failure is
 *   logged first).
 */
function urlHost(url) {
  // Non-string input (undefined, or an array from a repeated form field) has
  // no startsWith(); hand it back untouched instead of throwing.
  if (typeof url !== "string" || !url.startsWith("http")) {
    return url;
  }
  try {
    return new URL(url).hostname;
  } catch (e) {
    console.log(`url not valid: ${url} error: ${e}`);
    return url;
  }
}
|
||||||
|
|
||||||
module.exports.matchDomain = matchDomain; |
|
||||||
module.exports.matchUrlDomain = matchUrlDomain; |
|
||||||
module.exports.urlHost = urlHost; |
|
@ -1,18 +0,0 @@ |
|||||||
const { googleBot } = require('./constants'); |
|
||||||
const { matchUrlDomain, useGoogleBotSites } = require("./sites"); |
|
||||||
|
|
||||||
/**
 * Chooses the request identity for `url`.
 *
 * @param {string} url - Address about to be requested.
 * @returns {{userAgent?: string, headers?: object}} The Googlebot user agent
 *   and an `X-Forwarded-For` header when the URL's host is in
 *   `useGoogleBotSites`; an empty object otherwise.
 */
module.exports.getUserAgent = (url) => {
  // One lookup against the whole list: the URL is parsed once (and an invalid
  // URL logged once) instead of once per listed site.
  const useGoogleBot = Boolean(matchUrlDomain(useGoogleBotSites, url));

  if (!useGoogleBot) {
    return {};
  }
  return {
    userAgent: googleBot.userAgent,
    headers: {
      "X-Forwarded-For": googleBot.ip
    }
  };
};
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue