move scraping for article content to files.

This commit is contained in:
Jason Schwarzenberger 2020-11-04 15:00:58 +13:00
parent 33e21e7f30
commit 9edc8b7cca
6 changed files with 160 additions and 72 deletions

View File

@ -9,9 +9,7 @@ from bs4 import BeautifulSoup
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, news
from scrapers import outline, declutter, local
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843'
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
TWO_DAYS = 60*60*24*2 TWO_DAYS = 60*60*24*2
@ -57,36 +55,27 @@ def list():
def get_article(url): def get_article(url):
try: try:
params = {'source_url': url} return declutter.get_html(url)
headers = {'Referer': 'https://outline.com/'}
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
if r.status_code == 429:
logging.info('Rate limited by outline, sleeping 30s and skipping...')
time.sleep(30)
return ''
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
html = r.json()['data']['html']
if 'URL is not supported by Outline' in html:
raise Exception('URL not supported by Outline')
return html
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
except BaseException as e: except:
logging.error('Problem outlining article: {}'.format(str(e))) pass
logging.info('Trying our server instead...')
try: try:
r = requests.post(READ_API, data=dict(url=url), timeout=20) return outline.get_html(url)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
except BaseException as e: except:
logging.error('Problem getting article: {}'.format(str(e))) pass
return ''
try:
return local.get_html(url)
except KeyboardInterrupt:
raise
except:
pass
return ''
def get_content_type(url): def get_content_type(url):
try: try:

View File

@ -0,0 +1,28 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
DECLUTTER_API = 'https://declutter.1j.nz/details'
def get_html(url):
    # Fetch the decluttered article and return just its HTML content.
    # Propagates KeyError when get_details() failed and returned {} —
    # the caller treats any exception as "this scraper failed, try the next".
    # (The original wrapped this in `try: ... except: raise`, a no-op.)
    logging.info(f'Declutter Scraper: {url}')
    details = get_details(url)
    return details['content']
def get_details(url):
    """POST `url` to the declutter service and return the parsed JSON reply.

    On any failure (non-200 status, network error, bad JSON) the problem is
    logged and an empty dict is returned; KeyboardInterrupt is re-raised so
    Ctrl-C still works.
    """
    try:
        response = requests.post(DECLUTTER_API, data={'url': url}, timeout=20)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem decluttering article: {}'.format(str(e)))
        return {}

View File

@ -0,0 +1,28 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
READ_API = 'http://127.0.0.1:33843/details'
def get_html(url):
    # Fetch the article via the local readability server and return just
    # its HTML content. Propagates KeyError when get_details() failed and
    # returned {}, so the caller falls through to the next scraper.
    # (The original wrapped this in `try: ... except: raise`, a no-op.)
    logging.info(f'Local Scraper: {url}')
    details = get_details(url)
    return details['content']
def get_details(url):
    """POST `url` to the local reader server and return the parsed JSON reply.

    On any failure (non-200 status, network error, bad JSON) the problem is
    logged and an empty dict is returned; KeyboardInterrupt is re-raised so
    Ctrl-C still works.
    """
    try:
        response = requests.post(READ_API, data={'url': url}, timeout=20)
        if response.status_code != 200:
            raise Exception('Bad response code ' + str(response.status_code))
        return response.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem getting article: {}'.format(str(e)))
        return {}

View File

@ -0,0 +1,38 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
OUTLINE_REFERER = 'https://outline.com/'
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
def get_html(url):
    # Return just the article HTML extracted by the Outline API.
    # Propagates KeyError when get_details() failed and returned {} —
    # the caller treats any exception as "this scraper failed, try the next".
    # (The original wrapped this in `try: ... except: raise`, a no-op.)
    details = get_details(url)
    return details['html']
def get_details(url):
    """Ask the Outline API to parse the article at `url`.

    Returns the `data` dict from the API response, or {} on any failure
    (rate limiting, bad status, unsupported URL, network error).
    KeyboardInterrupt is re-raised so Ctrl-C still works.
    """
    # BUG FIX: this module never imported `time`, so the 429 back-off raised
    # NameError (silently swallowed by the BaseException handler below).
    import time
    try:
        logging.info(f'Outline Scraper: {url}')
        params = {'source_url': url}
        headers = {'Referer': OUTLINE_REFERER}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            # BUG FIX: return {} (not '') so get_html raises KeyError like
            # every other failure path, instead of TypeError on ''['html'].
            return {}
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return data
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return {}

View File

@ -1,52 +1,14 @@
const port = 33843;
const express = require('express'); const express = require('express');
const app = express(); const app = express();
const port = 33843; const simple = require('./simple');
const request = require('request');
const JSDOM = require('jsdom').JSDOM;
const { Readability } = require('readability');
app.use(express.urlencoded({ extended: true })); app.use(express.urlencoded({ extended: true }));
app.get('/', (req, res) => res.send(simple.FORM));
app.get('/', (req, res) => { app.post('/', (req, res) => simple.scrape(req, res));
res.send('<form method="POST" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>'); app.post('/details', (req, res) => simple.details(req, res));
}); // app.post('/browser', (req, res) => browser.scrape(req, res));
// app.post('/browser/details', (req, res) => browser.details(req, res));
const requestCallback = (url, res) => (error, response, body) => {
if (!error && response.statusCode == 200) {
console.log('Response OK.');
const doc = new JSDOM(body, {url: url});
const reader = new Readability(doc.window.document);
const article = reader.parse();
if (article && article.content) {
res.send(article.content);
} else {
res.sendStatus(404);
}
} else {
console.log('Response error:', error ? error.toString() : response.statusCode);
res.sendStatus(response ? response.statusCode : 404);
}
};
app.post('/', (req, res) => {
const url = req.body.url;
const requestOptions = {
url: url,
//headers: {'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'},
//headers: {'User-Agent': 'Twitterbot/1.0'},
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'X-Forwarded-For': '66.249.66.1',
},
};
console.log('Parse request for:', url);
request(requestOptions, requestCallback(url, res));
});
app.listen(port, () => { app.listen(port, () => {
console.log(`Example app listening on port ${port}!`); console.log(`Example app listening on port ${port}!`);

43
readerserver/simple.js Normal file
View File

@ -0,0 +1,43 @@
const request = require('request');
const JSDOM = require('jsdom').JSDOM;
const { Readability } = require('readability');
// Build the `request` options for fetching a page, spoofing Googlebot
// headers so sites serve their full (non-paywalled) markup.
const options = (url) => {
  const headers = {
    'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
    'X-Forwarded-For': '66.249.66.1',
  };
  return { url, headers };
};
// Run Mozilla's Readability over the fetched markup and return the parsed
// article object, or null when nothing readable could be extracted.
const extract = (url, body) => {
  const dom = new JSDOM(body, { url });
  const reader = new Readability(dom.window.document);
  return reader.parse();
};
// Minimal HTML form served at GET / for manually submitting a URL to scrape.
module.exports.FORM = '<form method="POST" action="/" accept-charset="UTF-8"><input name="url"><button type="submit">SUBMIT</button></form>';
// POST / — fetch the submitted url, extract the readable article, and
// respond with its HTML content (or an error status code).
module.exports.scrape = (req, res) => request(options(req.body.url), (error, response, body) => {
  if (error || response.statusCode !== 200) {
    console.log('Response error:', error ? error.toString() : response.statusCode);
    return res.sendStatus(response ? response.statusCode : 404);
  }
  // BUG FIX: `url` was not in scope here (ReferenceError at runtime);
  // use the submitted req.body.url, same value passed to options() above.
  const article = extract(req.body.url, body);
  if (article && article.content) {
    return res.send(article.content);
  }
  return res.sendStatus(404);
});
// POST /details — like scrape, but responds with the full parsed article
// object (title, content, byline, …) as JSON instead of just the HTML.
module.exports.details = (req, res) => request(options(req.body.url), (error, response, body) => {
  if (error || response.statusCode !== 200) {
    console.log('Response error:', error ? error.toString() : response.statusCode);
    return res.sendStatus(response ? response.statusCode : 404);
  }
  // BUG FIX: `url` was not in scope here (ReferenceError at runtime);
  // use the submitted req.body.url, same value passed to options() above.
  const article = extract(req.body.url, body);
  if (article) {
    return res.send(article);
  }
  return res.sendStatus(404);
});