forked from tanner/qotnews
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
45 lines
1.5 KiB
45 lines
1.5 KiB
const { firefox } = require("playwright"); |
|
const { JSDOM } = require("jsdom"); |
|
const { Readability } = require("@mozilla/readability"); |
|
|
|
const { getUserAgent } = require('../../utils/user-agent'); |
|
const { blockedRegexes, matchUrlDomain } = require("../../utils/sites"); |
|
|
|
/**
 * Loads `url` in a headless Firefox instance (via Playwright), blocks requests
 * that match the configured per-domain regexes, injects the helper init
 * scripts, and runs Mozilla Readability over the rendered HTML.
 *
 * @param {string} url - Address of the page to load and extract.
 * @returns {Promise<object|null>} Whatever `Readability#parse()` returns for
 *   the rendered document.
 * @throws Propagates any error raised while launching the browser, navigating,
 *   or parsing; the browser is always closed before the error surfaces.
 */
module.exports.getDetails = async (url) => {
  const { userAgent, headers } = getUserAgent(url);
  // Computed once per call rather than once per intercepted request.
  const blockedDomains = Object.keys(blockedRegexes);

  const browser = await firefox.launch({ args: [], headless: true });

  try {
    // Created inside the try so a failure here still closes the browser.
    const tab = await browser.newPage({
      extraHTTPHeaders: headers,
      userAgent,
      viewport: { width: 2000, height: 10000 },
    });

    try {
      // Intercept every request and abort those matching a blocked pattern
      // registered for the request's domain.
      await tab.route(/.*/, (route) => {
        const routeUrl = route.request().url();
        const domain = matchUrlDomain(blockedDomains, routeUrl);
        if (domain && routeUrl.match(blockedRegexes[domain])) {
          return route.abort();
        }
        return route.continue();
      });

      // Init scripts are evaluated in the page before its own scripts run.
      await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
      await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
      await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });

      await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
      // Fixed 2 s pause after DOMContentLoaded; presumably gives page scripts
      // time to finish rendering before the snapshot — TODO confirm.
      await tab.waitForTimeout(2000);

      const body = await tab.content();
      // Passing `url` gives JSDOM a document URL for the parsed HTML.
      const doc = new JSDOM(body, { url });
      const reader = new Readability(doc.window.document);
      return reader.parse();
    } finally {
      await tab.close();
    }
  } finally {
    // Runs even if closing the tab (or creating it) failed.
    await browser.close();
  }
};
|
|
|