forked from tanner/qotnews

remove readerserver, add declutter.

.gitmodules (vendored) | 7
@@ -1,4 +1,3 @@
-[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
-	path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
-	url = https://github.com/iamadamdev/bypass-paywalls-chrome/
-	branch = master
+[submodule "readerserver"]
+	path = readerserver
+	url = https://github.com/master5o1/declutter.git
@@ -3,9 +3,10 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
 import requests
+from settings import READER_PORT
 
-READ_API = 'http://127.0.0.1:33843/headless/details'
-READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
+READ_API = 'http://127.0.0.1:{}/headless/details'.format(READER_PORT or 3000)
+READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(READER_PORT or 3000)
 TIMEOUT = 60
 
 
@@ -3,8 +3,9 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
 import requests
+from settings import READER_PORT
 
-READ_API = 'http://127.0.0.1:33843/simple/details'
+READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
 TIMEOUT = 20
 
 def get_html(url):
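Both Python hunks above follow the same pattern: the scraper POSTs the target URL as a form field to the reader service and reads back the extracted article. The removed Express routes further down (express.urlencoded plus handlers that read req.body.url) show the server side of that contract, and declutter takes over the same role. A minimal sketch of such a call, using a hypothetical helper name rather than the repository's actual get_html body:

import logging
import requests

from settings import READER_PORT

READ_API = 'http://127.0.0.1:{}/simple/details'.format(READER_PORT or 3000)
TIMEOUT = 20

def fetch_details(url):
    # Hypothetical helper for illustration: form-encode the target URL
    # (the reader service reads it from req.body.url) and return the
    # parsed article object from the /details route, or None on failure.
    try:
        r = requests.post(READ_API, data={'url': url}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.json()
    except requests.exceptions.RequestException as e:
        logging.error('Problem requesting article: {}'.format(str(e)))
        return None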
@@ -142,7 +142,7 @@ def static_story(sid):
             url=url,
             description=description)
 
-http_server = WSGIServer(('', 33842), flask_app)
+http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)
 
 def _add_new_refs():
     for ref, source, urlref in feed.get_list():
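The API server hunk above applies the same fallback idea to the Flask app's listening port. A self-contained sketch of that wiring, assuming the WSGIServer in question is gevent's and that settings is the repository's settings module (the next hunk adds API_PORT to it); the placeholder route exists only to make the sketch runnable:

from flask import Flask
from gevent.pywsgi import WSGIServer  # assumption: WSGIServer comes from gevent

import settings

flask_app = Flask(__name__)

@flask_app.route('/')
def index():
    # Placeholder route for the sketch only.
    return 'qotnews api'

# Prefer the new settings.API_PORT; fall back to the old hard-coded port.
http_server = WSGIServer(('', settings.API_PORT or 33842), flask_app)

if __name__ == '__main__':
    http_server.serve_forever()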
@@ -4,6 +4,10 @@
 HOSTNAME = 'news.t0.vc'
 MAX_STORY_AGE = 3*24*60*60
 
+SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
+API_PORT = 33842
+READER_PORT = 3000
+
 # Feed Lengths
 # Number of top items from each site to pull
 # set to 0 to disable that site
@@ -51,8 +55,6 @@ CATEGORY = {}
 #     ],
 # }
 
-SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
-
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''

readerserver (submodule) | 1
Submodule readerserver added at 9c0336b0af

readerserver/.gitignore (vendored) | 92
@@ -1,92 +0,0 @@
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-jspm_packages/
-
-# TypeScript v1 declaration files
-typings/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Optional REPL history
-.node_repl_history
-
-# Output of 'npm pack'
-*.tgz
-
-# Yarn Integrity file
-.yarn-integrity
-
-# dotenv environment variables file
-.env
-.env.test
-
-# parcel-bundler cache (https://parceljs.org/)
-.cache
-
-# next.js build output
-.next
-
-# nuxt.js build output
-.nuxt
-
-# vuepress build output
-.vuepress/dist
-
-# Serverless directories
-.serverless/
-
-# FuseBox cache
-.fusebox/
-
-# DynamoDB Local files
-.dynamodb/
-
-# Editor
-*.swp
-*.swo
@@ -1,36 +0,0 @@
-const port = 33843;
-const express = require('express');
-const app = express();
-const simple = require('./scraper/simple');
-const headless = require('./scraper/headless');
-
-app.use(express.urlencoded({ extended: true }));
-
-app.get('/', (req, res) => {
-	const routes = [
-		'/simple',
-		'/simple/details',
-		'/headless',
-		'/headless/details',
-		'/headless/comments'
-	];
-
-	const html = routes.map(route => `
-	<form method="POST" action="${route}" accept-charset="UTF-8">
-		<fieldset>
-			<legend>route: POST ${route}</legend>
-			<input name="url">
-			<button type="submit">SUBMIT</button>
-		</fieldset>
-	</form>`).join('<hr />');
-	res.send(html);
-});
-app.post('/simple/', simple.scrape);
-app.post('/simple/details', simple.details);
-app.post('/headless', headless.scrape);
-app.post('/headless/details', headless.details);
-app.post('/headless/comments', headless.comments);
-
-app.listen(port, () => {
-	console.log(`Example app listening on port ${port}!`);
-});
@@ -1,15 +0,0 @@
-{
-  "name": "readerserver",
-  "version": "1.0.0",
-  "main": "main.js",
-  "license": "MIT",
-  "dependencies": {
-    "@mozilla/readability": "^0.3.0",
-    "dompurify": "^1.0.11",
-    "express": "^4.17.1",
-    "jsdom": "^15.1.1",
-    "node-fetch": "^2.6.1",
-    "playwright": "^1.5.2",
-    "request": "^2.88.0"
-  }
-}
@@ -1,45 +0,0 @@
-const { firefox } = require("playwright");
-const { JSDOM } = require("jsdom");
-const { Readability } = require("@mozilla/readability");
-
-const { getUserAgent } = require('../../utils/user-agent');
-const { blockedRegexes, matchUrlDomain } = require("../../utils/sites");
-
-module.exports.getDetails = async (url) => {
-	const { userAgent, headers } = getUserAgent(url);
-
-	const browser = await firefox.launch({ args: [], headless: true });
-	const tab = await browser.newPage({
-		extraHTTPHeaders: headers,
-		userAgent,
-		viewport: { width: 2000, height: 10000 },
-	});
-
-	try {
-		await tab.route(/.*/, (route) => {
-			const routeUrl = route.request().url();
-			const blockedDomains = Object.keys(blockedRegexes);
-			const domain = matchUrlDomain(blockedDomains, routeUrl);
-			if (domain && routeUrl.match(blockedRegexes[domain])) {
-				return route.abort();
-			}
-			return route.continue();
-		});
-		await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
-		await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
-		await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
-		await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
-		await tab.waitForTimeout(2000);
-
-		const body = await tab.content();
-		const doc = new JSDOM(body, { url });
-		const reader = new Readability(doc.window.document);
-		const article = reader.parse();
-		return article;
-	} catch (e) {
-		throw e;
-	} finally {
-		await tab.close();
-		await browser.close();
-	}
-};
@@ -1,34 +0,0 @@
-const { JSDOM } = require("jsdom");
-const { firefox } = require("playwright");
-const { getUserAgent } = require('../../utils/user-agent');
-const { disqusThread } = require('../../utils/disqus-thread');
-
-const DISQUS_EMBED = 'https://disqus.com/embed/comments/';
-
-module.exports.getComments = async (url) => {
-	const { userAgent, headers } = getUserAgent(url);
-
-	const browser = await firefox.launch({ args: [], headless: true });
-	const tab = await browser.newPage({
-		extraHTTPHeaders: headers,
-		userAgent,
-		viewport: { width: 2000, height: 10000 },
-	});
-
-	try {
-		await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
-
-		const response = await tab.waitForResponse(response => response.url().includes(DISQUS_EMBED));
-		const text = await response.text();
-		const dom = new JSDOM(text, response.url());
-		const script = dom.window.document.querySelector('#disqus-threadData')
-		const data = JSON.parse(script.innerHTML);
-
-		return disqusThread(data);
-	} catch (e) {
-		throw e;
-	} finally {
-		await tab.close();
-		await browser.close();
-	}
-};
@@ -1,40 +0,0 @@
-const { getDetails } = require('./_browser');
-const { getComments } = require('./_comments');
-
-module.exports.scrape = async (req, res) => {
-	try {
-		const article = await getDetails(req.body.url);
-		if (!article || !article.content) {
-			throw new Error('failed to get details.');
-		}
-		return res.send(article.content);
-	} catch (e) {
-		return res.sendStatus(500);
-	}
-};
-
-module.exports.details = async (req, res) => {
-	try {
-		const article = await getDetails(req.body.url);
-		if (!article) {
-			throw new Error('failed to get details.');
-		}
-		return res.send(article);
-	} catch (e) {
-		console.log(e);
-		return res.sendStatus(500);
-	}
-};
-
-module.exports.comments = async (req, res) => {
-	try {
-		const comments = await getComments(req.body.url);
-		if (!comments) {
-			throw new Error('failed to get comments.');
-		}
-		return res.send(comments);
-	} catch (e) {
-		console.log(e);
-		return res.sendStatus(500);
-	}
-};
Submodule readerserver/scraper/headless/scripts/bypass-paywalls-chrome deleted from fb1b09fccb
@@ -1,108 +0,0 @@
-(function () {
-	removeHiddenElements();
-
-	if (matchDomain("stuff.co.nz")) {
-		removeSelectors([
-			".support-brief-container",
-			'[class*="donation-in-"]',
-			".sics-component__sharebar",
-			".breaking-news-pointer",
-			".bigbyline-container",
-			[
-				".sics-component__html-injector.sics-component__story__paragraph",
-				"READ MORE:",
-			],
-		]);
-	}
-	if (matchDomain("nzherald.co.nz")) {
-		removeSelectors([
-			"[href$='#commenting-widget']",
-			".related-articles",
-			".article__print-button",
-			".share-bar",
-			".c-suggest-links.read-more-links",
-			".website-of-year",
-			".meta-data",
-			".article__kicker",
-			".author__image",
-		]);
-	}
-	if (matchDomain(["rnz.co.nz", "radionz.co.nz"])) {
-		removeSelectors([".c-advert-app", ".c-sub-nav"]);
-	}
-	if (matchDomain(["newsroom.co.nz"])) {
-		removeSelectors([".article_content__section", ".bio"]);
-	}
-	if (matchDomain(["newshub.co.nz"])) {
-		removeSelectors([
-			".c-ArticleHeading-authorPicture",
-			".relatedarticles",
-			".ArticleAttribution",
-			'.GlobalFooter'
-		]);
-	}
-	if (matchDomain(["tvnz.co.nz"])) {
-		removeSelectors([".signup-container container"]);
-	}
-	if (matchDomain(["thespinoff.co.nz"])) {
-		removeSelectors([
-			".the-spinoff-club-interruptive",
-			".bulletin-signup",
-			".sponsor_post_footer"
-		]);
-	}
-
-	function matchDomain(domains) {
-		const hostname = window.location.hostname;
-		if (typeof domains === "string") {
-			domains = [domains];
-		}
-		return domains.some(
-			(domain) => hostname === domain || hostname.endsWith("." + domain)
-		);
-	}
-
-	function removeDOMElement(...elements) {
-		for (const element of elements) {
-			if (element) {
-				element.remove();
-			}
-		}
-	}
-
-	function pageContains(selector, text) {
-		const elements = document.querySelectorAll(selector);
-		return Array.prototype.filter.call(elements, function (element) {
-			return RegExp(text).test(element.textContent);
-		});
-	}
-
-	function removeHiddenElements() {
-		window.setTimeout(function () {
-			const selector = "*:not(script):not(head):not(meta):not(link):not(style)";
-			Array.from(document.querySelectorAll(selector))
-				.filter((element) => {
-					const computed = getComputedStyle(element);
-					const displayNone = computed["display"] === "none";
-					const visibilityHidden = computed["visibility"] === "hidden";
-					return displayNone || visibilityHidden;
-				})
-				.forEach((element) => element && element.remove());
-		}, 1500);
-	}
-
-	function removeSelectors(selectors) {
-		window.setTimeout(function () {
-			const elements = selectors.flatMap((s) => {
-				if (typeof s === "string") {
-					return Array.from(document.querySelectorAll(s));
-				}
-				if (s && s.constructor.name === "Array") {
-					return pageContains(...s);
-				}
-				return undefined;
-			});
-			removeDOMElement(...elements);
-		}, 500);
-	}
-})();
@@ -1,14 +0,0 @@
-(function () {
-	const { host, protocol } = window.location;
-	const url = `${protocol}//${host}`;
-	[
-		['[src^="/"]', 'src'],
-		['[href^="/"]', 'href']
-	].forEach(([selector, attribute]) => {
-		Array.from(document.querySelectorAll(selector))
-			.filter(e => e.attributes[attribute] && /^\/[^\/]/.test(e.attributes[attribute].value))
-			.forEach((e) => {
-				e.attributes[attribute].value = `${url}${e.attributes[attribute].value}`;
-			});
-	});
-})();
@@ -1,59 +0,0 @@
-const fetch = require('node-fetch');
-const { JSDOM } = require('jsdom');
-const { Readability } = require('@mozilla/readability');
-
-const { getUserAgent } = require('../utils/user-agent');
-
-const extract = (url, body) => {
-	const doc = new JSDOM(body, { url: url });
-	const reader = new Readability(doc.window.document);
-	return reader.parse();
-};
-
-module.exports.scrape = async (req, res) => {
-	try {
-		const { userAgent, headers } = getUserAgent(req.body.url);
-		const response = await fetch(req.body.url, {
-			headers: {
-				...headers,
-				'User-Agent': userAgent
-			}
-		});
-		if (!response.ok) {
-			return res.sendStatus(response.statusCode);
-		}
-		const html = await response.text();
-		const article = await extract(req.body.url, html);
-		if (article && article.content) {
-			return res.send(article.content);
-		}
-		return res.sendStatus(404);
-	} catch (e) {
-		console.error(e);
-		return res.sendStatus(500);
-	}
-};
-
-module.exports.details = async (req, res) => {
-	try {
-		const { userAgent, headers } = getUserAgent(req.body.url);
-		const response = await fetch(req.body.url, {
-			headers: {
-				...headers,
-				'User-Agent': userAgent
-			}
-		});
-		if (!response.ok) {
-			return res.sendStatus(response.statusCode);
-		}
-		const html = await response.text();
-		const article = await extract(req.body.url, html);
-		if (article) {
-			return res.send(article);
-		}
-		return res.sendStatus(404);
-	} catch (e) {
-		console.error(e);
-		return res.sendStatus(500);
-	}
-};
@@ -1,11 +0,0 @@
-const googleBotUserAgent = 'Googlebot/2.1 (+http://www.google.com/bot.html)';
-const googleBotIp = '66.249.66.1';
-
-module.exports.googleBot = {
-	userAgent: googleBotUserAgent,
-	ip: googleBotIp,
-	headers: {
-		'User-Agent': googleBotUserAgent,
-		'X-Forwarded-For': googleBotIp,
-	}
-}
@@ -1,21 +0,0 @@
-module.exports.disqusThread = data => {
-	const comments = data.response.posts.reduce((c, post) => ({
-		...c,
-		[post.id.toString()]: {
-			author: post.author.name,
-			authorLink: post.author.profileUrl,
-			date: post.createdAt,
-			text: post.raw_message,
-			score: post.points,
-			children: [],
-			id: post.id.toString(),
-			parent: (post.parent || '').toString(),
-		}
-	}), {});
-	Object.keys(comments).filter(id => !!comments[id].parent).forEach(id => {
-		const comment = comments[id];
-		comments[comment.parent].children.push(comment);
-	});
-	const parents = Object.keys(comments).filter(id => comments[id].parent).map(id => comments[id]);
-	return parents;
-};
@@ -1,98 +0,0 @@
-module.exports.blockedRegexes = {
-	"adweek.com": /.+\.lightboxcdn\.com\/.+/,
-	"afr.com": /afr\.com\/assets\/vendorsReactRedux_client.+\.js/,
-	"businessinsider.com": /(.+\.tinypass\.com\/.+|cdn\.onesignal\.com\/sdks\/.+\.js)/,
-	"chicagotribune.com": /.+:\/\/.+\.tribdss\.com\//,
-	"economist.com": /(.+\.tinypass\.com\/.+|economist\.com\/engassets\/_next\/static\/chunks\/framework.+\.js)/,
-	"editorialedomani.it": /(js\.pelcro\.com\/.+|editorialedomani.it\/pelcro\.js)/,
-	"foreignpolicy.com": /.+\.tinypass\.com\/.+/,
-	"fortune.com": /.+\.tinypass\.com\/.+/,
-	"haaretz.co.il": /haaretz\.co\.il\/htz\/js\/inter\.js/,
-	"haaretz.com": /haaretz\.com\/hdc\/web\/js\/minified\/header-scripts-int.js.+/,
-	"inquirer.com": /.+\.tinypass\.com\/.+/,
-	"lastampa.it": /.+\.repstatic\.it\/minify\/sites\/lastampa\/.+\/config\.cache\.php\?name=social_js/,
-	"lrb.co.uk": /.+\.tinypass\.com\/.+/,
-	"nzherald.co.nz": /(.+nzherald\.co\.nz\/.+\/subs\/p\.js|.+nzherald\.co\.nz\/.+\/react\.js|.+nzherald\.co\.nz\/.+\/appear\.js|.+nzherald\.co\.nz\/.+\/tracking\/.+|.+nzherald\.co\.nz\/.+\/default\.js|.+\/newsbarscript\.js)/,
-	"medscape.com": /.+\.medscapestatic\.com\/.*medscape-library\.js/,
-	"interest.co.nz": /(.+\.presspatron\.com.+|.+interest\.co\.nz.+pp-ablock-banner\.js)/,
-	"repubblica.it": /scripts\.repubblica\.it\/pw\/pw\.js.+/,
-	"spectator.co.uk": /.+\.tinypass\.com\/.+/,
-	"spectator.com.au": /.+\.tinypass\.com\/.+/,
-	"telegraph.co.uk": /.+telegraph\.co\.uk.+martech.+/,
-	"thecourier.com.au": /.+cdn-au\.piano\.io\/api\/tinypass.+\.js/,
-	"thenation.com": /thenation\.com\/.+\/paywall-script\.php/,
-	"thenational.scot": /(.+\.tinypass\.com\/.+|.+thenational\.scot.+omniture\.js|.+thenational\.scot.+responsive-sync.+)/,
-	"thewrap.com": /thewrap\.com\/.+\/wallkit\.js/,
-	"wsj.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"historyextra.com": /.+\.evolok\.net\/.+\/authorize\/.+/,
-	"barrons.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"irishtimes.com": /cdn\.ampproject\.org\/v\d\/amp-access-.+\.js/,
-	"elmercurio.com": /(merreader\.emol\.cl\/assets\/js\/merPramV2.js|staticmer\.emol\.cl\/js\/inversiones\/PramModal.+\.js)/,
-	"sloanreview.mit.edu": /(.+\.tinypass\.com\/.+|.+\.netdna-ssl\.com\/wp-content\/themes\/smr\/assets\/js\/libs\/welcome-ad\.js)/,
-	"latercera.com": /.+\.cxense\.com\/+/,
-	"lesechos.fr": /.+\.tinypass\.com\/.+/,
-	"washingtonpost.com": /.+\.washingtonpost\.com\/.+\/pwapi-proxy\.min\.js/,
-	"thehindu.com": /ajax\.cloudflare\.com\/cdn-cgi\/scripts\/.+\/cloudflare-static\/rocket-loader\.min\.js/,
-	"technologyreview.com": /.+\.blueconic\.net\/.+/,
-};
-
-module.exports.useGoogleBotSites = [
-	"adelaidenow.com.au",
-	"barrons.com",
-	"couriermail.com.au",
-	"dailytelegraph.com.au",
-	"fd.nl",
-	"genomeweb.com",
-	"haaretz.co.il",
-	"haaretz.com",
-	"heraldsun.com.au",
-	"mexiconewsdaily.com",
-	"ntnews.com.au",
-	"quora.com",
-	"seekingalpha.com",
-	"telegraph.co.uk",
-	"theaustralian.com.au",
-	"themarker.com",
-	"themercury.com.au",
-	"thenational.scot",
-	"thetimes.co.uk",
-	"wsj.com",
-	"kansascity.com",
-	"republic.ru",
-	"nzz.ch",
-	"handelsblatt.com",
-	"washingtonpost.com",
-	"df.cl",
-];
-
-function matchDomain(domains, hostname) {
-	let matchedDomain = false;
-	if (typeof domains === "string") {
-		domains = [domains];
-	}
-	domains.some(
-		(domain) =>
-			(hostname === domain || hostname.endsWith("." + domain)) &&
-			(matchedDomain = domain)
-	);
-	return matchedDomain;
-}
-
-function matchUrlDomain(domains, url) {
-	return matchDomain(domains, urlHost(url));
-}
-
-function urlHost(url) {
-	if (url && url.startsWith("http")) {
-		try {
-			return new URL(url).hostname;
-		} catch (e) {
-			console.log(`url not valid: ${url} error: ${e}`);
-		}
-	}
-	return url;
-}
-
-module.exports.matchDomain = matchDomain;
-module.exports.matchUrlDomain = matchUrlDomain;
-module.exports.urlHost = urlHost;
@@ -1,18 +0,0 @@
-const { googleBot } = require('./constants');
-const { matchUrlDomain, useGoogleBotSites } = require("./sites");
-
-module.exports.getUserAgent = (url) => {
-	const useGoogleBot = useGoogleBotSites.some(function (item) {
-		return typeof item === "string" && matchUrlDomain(item, url);
-	});
-
-	if (!useGoogleBot) {
-		return {};
-	}
-	return {
-		userAgent: googleBot.userAgent,
-		headers: {
-			"X-Forwarded-For": googleBot.ip
-		}
-	}
-};

File diff suppressed because it is too large