diff --git a/apiserver/feed.py b/apiserver/feed.py
index 81959f7..0f2a770 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,9 +9,7 @@ from bs4 import BeautifulSoup
 
 import settings
 from feeds import hackernews, reddit, tildes, substack, manual, news
-
-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-READ_API = 'http://127.0.0.1:33843'
+from scrapers import outline, declutter, local
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 TWO_DAYS = 60*60*24*2
@@ -57,36 +55,23 @@ def list():
 
 def get_article(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        html = r.json()['data']['html']
-        if 'URL is not supported by Outline' in html:
-            raise Exception('URL not supported by Outline')
-        return html
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-
-    logging.info('Trying our server instead...')
-
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=20)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return ''
+    """Return readable article HTML for url, or '' if every scraper fails.
+
+    Scrapers are tried in order of preference: declutter, outline, then the
+    local reader server. A scraper that raises or yields nothing is skipped
+    so the next one gets a chance.
+    """
+    for scraper in (declutter, outline, local):
+        try:
+            html = scraper.get_html(url)
+        except Exception as e:
+            # Exception (not a bare except) lets KeyboardInterrupt and
+            # SystemExit propagate while still surviving scraper failures.
+            logging.info('{} gave no article ({!r}), trying next scraper...'.format(scraper.__name__, e))
+            continue
+        if html:
+            return html
+
+    return ''
 
 def get_content_type(url):
     try:
diff --git a/apiserver/scrapers/declutter.py b/apiserver/scrapers/declutter.py
new file mode 100644
index 0000000..711f34a
--- /dev/null
+++ b/apiserver/scrapers/declutter.py
@@ -0,0 +1,28 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+DECLUTTER_API = 'https://declutter.1j.nz/details'
+
+
+def get_html(url):
+    """Return the decluttered article HTML; raises KeyError if unavailable."""
+    logging.info(f'Declutter Scraper: {url}')
+    details = get_details(url)
+    return details['content']
+
+
+def get_details(url):
+    """Return the declutter service's JSON for url, or {} on any failure."""
+    try:
+        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=20)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except Exception as e:
+        # Exception rather than BaseException, so KeyboardInterrupt and
+        # SystemExit propagate without needing a separate re-raise clause.
+        logging.error('Problem decluttering article: {}'.format(str(e)))
+        return {}
\ No newline at end of file
diff --git a/apiserver/scrapers/local.py b/apiserver/scrapers/local.py
new file mode 100644
index 0000000..1451f20
--- /dev/null
+++ b/apiserver/scrapers/local.py
@@ -0,0 +1,28 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import requests
+
+READ_API = 'http://127.0.0.1:33843/details'
+
+
+def get_html(url):
+    """Return the article HTML from the local reader; raises KeyError if unavailable."""
+    logging.info(f'Local Scraper: {url}')
+    details = get_details(url)
+    return details['content']
+
+
+def get_details(url):
+    """Return the local reader server's JSON for url, or {} on any failure."""
+    try:
+        r = requests.post(READ_API, data=dict(url=url), timeout=20)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except Exception as e:
+        # Exception rather than BaseException, so KeyboardInterrupt and
+        # SystemExit propagate without needing a separate re-raise clause.
+        logging.error('Problem getting article: {}'.format(str(e)))
+        return {}
\ No newline at end of file
diff --git a/apiserver/scrapers/outline.py b/apiserver/scrapers/outline.py
new file mode 100644
index 0000000..e5d6c2a
--- /dev/null
+++ b/apiserver/scrapers/outline.py
@@ -0,0 +1,41 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+import time
+
+import requests
+
+OUTLINE_REFERER = 'https://outline.com/'
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+
+
+def get_html(url):
+    """Return the article HTML from Outline; raises KeyError if unavailable."""
+    details = get_details(url)
+    return details['html']
+
+
+def get_details(url):
+    """Return Outline's parsed article data for url, or {} on any failure."""
+    try:
+        logging.info(f'Outline Scraper: {url}')
+        params = {'source_url': url}
+        headers = {'Referer': OUTLINE_REFERER}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            # Same failure value as every other error path, so get_html()
+            # raises KeyError instead of a TypeError from indexing a str.
+            return {}
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return data
+    except Exception as e:
+        # Exception rather than BaseException: Ctrl-C and SystemExit propagate.
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return {}
\ No newline at end of file
diff --git a/readerserver/main.js b/readerserver/main.js
index 242db7a..219faeb 100644
--- a/readerserver/main.js
+++ b/readerserver/main.js
@@ -1,52 +1,14 @@
+const port = 33843;
 const express = require('express');
 const app = express();
-const port = 33843;
-
-const request = require('request');
-const JSDOM = require('jsdom').JSDOM;
-const { Readability } = require('readability');
+const simple = require('./simple');
 
 app.use(express.urlencoded({ extended: true }));
-
-app.get('/', (req, res) => {
-  res.send('');
-});
-
-const requestCallback = (url, res) => (error, response, body) => {
-  if (!error && response.statusCode == 200) {
-    console.log('Response OK.');
-
-    const doc = new JSDOM(body, {url: url});
-    const reader = new Readability(doc.window.document);
-    const article = reader.parse();
-
-    if (article && article.content) {
-      res.send(article.content);
-    } else {
-      res.sendStatus(404);
-    }
-  } else {
-    console.log('Response error:', error ? error.toString() : response.statusCode);
-    res.sendStatus(response ? response.statusCode : 404);
-  }
-};
-
-app.post('/', (req, res) => {
-  const url = req.body.url;
-  const requestOptions = {
-    url: url,
-    //headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
-    //headers: {'User-Agent': 'Twitterbot/1.0'},
-    headers: {
-      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
-      'X-Forwarded-For': '66.249.66.1',
-    },
-  };
-
-  console.log('Parse request for:', url);
-
-  request(requestOptions, requestCallback(url, res));
-});
+app.get('/', (req, res) => res.send(simple.FORM));
+app.post('/', (req, res) => simple.scrape(req, res));
+app.post('/details', (req, res) => simple.details(req, res));
+// app.post('/browser', (req, res) => browser.scrape(req, res));
+// app.post('/browser/details', (req, res) => browser.details(req, res));
 
 app.listen(port, () => {
   console.log(`Example app listening on port ${port}!`);
diff --git a/readerserver/simple.js b/readerserver/simple.js
new file mode 100644
index 0000000..69fcd0d
--- /dev/null
+++ b/readerserver/simple.js
@@ -0,0 +1,54 @@
+const request = require('request');
+const JSDOM = require('jsdom').JSDOM;
+const { Readability } = require('readability');
+
+// Options passed to `request` when fetching the target page.
+const options = url => ({
+  url: url,
+  headers: {
+    'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
+    'X-Forwarded-For': '66.249.66.1',
+  },
+});
+
+// Parse the fetched HTML with Readability; callers treat a falsy result as "not found".
+const extract = (url, body) => {
+  const doc = new JSDOM(body, { url: url });
+  const reader = new Readability(doc.window.document);
+  return reader.parse();
+};
+
+// Fetch req.body.url and hand the parsed article to `respond`. Fetch failures are
+// answered here with the upstream status code, or 404 when there is no response.
+const fetchArticle = (req, res, respond) => {
+  // Bind the URL once: the request callback used to reference an undeclared
+  // `url`, which threw a ReferenceError after every successful fetch.
+  const url = req.body.url;
+  return request(options(url), (error, response, body) => {
+    if (error || response.statusCode != 200) {
+      console.log('Response error:', error ? error.toString() : response.statusCode);
+      return res.sendStatus(response ? response.statusCode : 404);
+    }
+    return respond(extract(url, body));
+  });
+};
+
+// NOTE(review): form markup reconstructed as a single `url` field posted to `/`,
+// which is the field the handlers below read - confirm against the intended UI.
+module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
+
+// Respond with only the article's HTML content.
+module.exports.scrape = (req, res) => fetchArticle(req, res, article => {
+  if (article && article.content) {
+    return res.send(article.content);
+  }
+  return res.sendStatus(404);
+});
+
+// Respond with the whole parsed article object.
+module.exports.details = (req, res) => fetchArticle(req, res, article => {
+  if (article) {
+    return res.send(article);
+  }
+  return res.sendStatus(404);
+});
\ No newline at end of file