renaming things.
This commit is contained in:
parent
55d50a86d8
commit
3b885e4327
6
.gitmodules
vendored
6
.gitmodules
vendored
|
@ -1,3 +1,3 @@
|
||||||
[submodule "readerserver/scraper/browser/scripts/bypass-paywalls-chrome"]
|
[submodule "readerserver/scraper/headless/scripts/bypass-paywalls-chrome"]
|
||||||
path = readerserver/scraper/browser/scripts/bypass-paywalls-chrome
|
path = readerserver/scraper/headless/scripts/bypass-paywalls-chrome
|
||||||
url = https://github.com/iamadamdev/bypass-paywalls-chrome.git
|
url = https://github.com/iamadamdev/bypass-paywalls-chrome/
|
||||||
|
|
|
@ -12,7 +12,7 @@ import settings
|
||||||
from feeds import hackernews, reddit, tildes, substack, manual
|
from feeds import hackernews, reddit, tildes, substack, manual
|
||||||
from feeds.sitemap import Sitemap
|
from feeds.sitemap import Sitemap
|
||||||
from feeds.category import Category
|
from feeds.category import Category
|
||||||
from scrapers import outline, declutter, browser, local
|
from scrapers import outline, declutter, headless, simple
|
||||||
|
|
||||||
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
|
||||||
|
|
||||||
|
@ -63,14 +63,14 @@ def get_list():
|
||||||
|
|
||||||
def get_article(url):
|
def get_article(url):
|
||||||
scrapers = {
|
scrapers = {
|
||||||
'declutter': declutter,
|
'headless': headless,
|
||||||
|
'simple': simple,
|
||||||
'outline': outline,
|
'outline': outline,
|
||||||
'browser': browser,
|
'declutter': declutter,
|
||||||
'local': local,
|
|
||||||
}
|
}
|
||||||
available = settings.SCRAPERS or ['local']
|
available = settings.SCRAPERS or ['headless', 'simple']
|
||||||
if 'local' not in available:
|
if 'simple' not in available:
|
||||||
available += ['local']
|
available += ['simple']
|
||||||
|
|
||||||
for scraper in available:
|
for scraper in available:
|
||||||
if scraper not in scrapers.keys():
|
if scraper not in scrapers.keys():
|
||||||
|
|
|
@ -4,13 +4,13 @@ logging.basicConfig(
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
READ_API = 'http://127.0.0.1:33843/browser/details'
|
READ_API = 'http://127.0.0.1:33843/headless/details'
|
||||||
READ_COMMENT__API = 'http://127.0.0.1:33843/browser/commentd'
|
READ_COMMENT__API = 'http://127.0.0.1:33843/headless/comments'
|
||||||
TIMEOUT = 60
|
TIMEOUT = 60
|
||||||
|
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
logging.info(f"Reader Scraper: {url}")
|
logging.info(f"Headless Browser Scraper: {url}")
|
||||||
details = get_details(url)
|
details = get_details(url)
|
||||||
if not details:
|
if not details:
|
||||||
return ''
|
return ''
|
||||||
|
@ -25,7 +25,7 @@ def get_details(url):
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
raise
|
raise
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
logging.error('Problem Scraping article: {}'.format(str(e)))
|
logging.error('Problem scraping article: {}'.format(str(e)))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_comments(url):
|
def get_comments(url):
|
|
@ -4,11 +4,11 @@ logging.basicConfig(
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
READ_API = 'http://127.0.0.1:33843/details'
|
READ_API = 'http://127.0.0.1:33843/simple/details'
|
||||||
TIMEOUT = 20
|
TIMEOUT = 20
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
logging.info(f"Local Scraper: {url}")
|
logging.info(f"Simple Scraper: {url}")
|
||||||
details = get_details(url)
|
details = get_details(url)
|
||||||
if not details:
|
if not details:
|
||||||
return ''
|
return ''
|
|
@ -51,7 +51,7 @@ CATEGORY = {}
|
||||||
# ],
|
# ],
|
||||||
# }
|
# }
|
||||||
|
|
||||||
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
|
SCRAPERS = ['headless', 'outline', 'declutter', 'simple']
|
||||||
|
|
||||||
# Reddit account info
|
# Reddit account info
|
||||||
# leave blank if not using Reddit
|
# leave blank if not using Reddit
|
||||||
|
|
|
@ -2,12 +2,18 @@ const port = 33843;
|
||||||
const express = require('express');
|
const express = require('express');
|
||||||
const app = express();
|
const app = express();
|
||||||
const simple = require('./scraper/simple');
|
const simple = require('./scraper/simple');
|
||||||
const browser = require('./scraper/browser');
|
const headless = require('./scraper/headless');
|
||||||
|
|
||||||
app.use(express.urlencoded({ extended: true }));
|
app.use(express.urlencoded({ extended: true }));
|
||||||
|
|
||||||
app.get('/', (req, res) => {
|
app.get('/', (req, res) => {
|
||||||
const routes = ['/', '/details', '/browser', '/browser/details', '/browser/comments'];
|
const routes = [
|
||||||
|
'/simple',
|
||||||
|
'/simple/details',
|
||||||
|
'/headless',
|
||||||
|
'/headless/details',
|
||||||
|
'/headless/comments'
|
||||||
|
];
|
||||||
|
|
||||||
const html = routes.map(route => `
|
const html = routes.map(route => `
|
||||||
<form method="POST" action="${route}" accept-charset="UTF-8">
|
<form method="POST" action="${route}" accept-charset="UTF-8">
|
||||||
|
@ -19,11 +25,11 @@ app.get('/', (req, res) => {
|
||||||
</form>`).join('<hr />');
|
</form>`).join('<hr />');
|
||||||
res.send(html);
|
res.send(html);
|
||||||
});
|
});
|
||||||
app.post('/', simple.scrape);
|
app.post('/simple/', simple.scrape);
|
||||||
app.post('/details', simple.details);
|
app.post('/simple/details', simple.details);
|
||||||
app.post('/browser', browser.scrape);
|
app.post('/headless', headless.scrape);
|
||||||
app.post('/browser/details', browser.details);
|
app.post('/headless/details', headless.details);
|
||||||
app.post('/browser/comments', browser.comments);
|
app.post('/headless/comments', headless.comments);
|
||||||
|
|
||||||
app.listen(port, () => {
|
app.listen(port, () => {
|
||||||
console.log(`Example app listening on port ${port}!`);
|
console.log(`Example app listening on port ${port}!`);
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
Subproject commit 0f129c5d6c206fde389878fed4d26fec90923a21
|
|
|
@ -25,9 +25,9 @@ module.exports.getDetails = async (url) => {
|
||||||
}
|
}
|
||||||
return route.continue();
|
return route.continue();
|
||||||
});
|
});
|
||||||
await tab.addInitScript({ path: "scraper/browser/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
|
await tab.addInitScript({ path: "scraper/headless/scripts/bypass-paywalls-chrome/src/js/contentScript.js" });
|
||||||
await tab.addInitScript({ path: "scraper/browser/scripts/cosmetic-filters.js" });
|
await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filters.js" });
|
||||||
await tab.addInitScript({ path: "scraper/browser/scripts/fix-relative-links.js" });
|
await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
|
||||||
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
|
await tab.goto(url, { timeout: 60000, waitUntil: "domcontentloaded" });
|
||||||
await tab.waitForTimeout(2000);
|
await tab.waitForTimeout(2000);
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit fff7f483db947e690977bfc80955a53329d3d349
|
Loading…
Reference in New Issue
Block a user