Compare commits

...

8 Commits

Author SHA1 Message Date
Jason Schwarzenberger
736cdc8576 fix mistake. 2020-11-03 17:04:46 +13:00
Jason Schwarzenberger
244d416f6e settings config of sitemap/substack publications. 2020-11-03 17:01:29 +13:00
Jason Schwarzenberger
5f98a2e76a Merge remote-tracking branch 'tanner/master' into master
And adding relevant settings.py.example/etc.
2020-11-03 16:44:02 +13:00
Jason Schwarzenberger
0567cdfd9b move sort to render. 2020-11-03 16:30:22 +13:00
Jason Schwarzenberger
4f90671cec order feed by reverse chronological 2020-11-03 16:21:23 +13:00
Jason Schwarzenberger
e63a1456a5 add logos. 2020-11-03 16:07:07 +13:00
Jason Schwarzenberger
76f1d57702 sitemap based feed. 2020-11-03 16:00:03 +13:00
ca78a6d7a9 Move feed and Praw config to settings.py 2020-11-02 02:26:54 +00:00
10 changed files with 230 additions and 41 deletions

View File

@ -35,7 +35,7 @@ $ source env/bin/activate
(env) $ pip install -r requirements.txt (env) $ pip install -r requirements.txt
``` ```
Configure Praw for your Reddit account: Configure Praw for your Reddit account (optional):
* Go to https://www.reddit.com/prefs/apps * Go to https://www.reddit.com/prefs/apps
* Click "Create app" * Click "Create app"
@ -44,16 +44,14 @@ Configure Praw for your Reddit account:
* Description: blank * Description: blank
* About URL: blank * About URL: blank
* Redirect URL: your GitHub profile * Redirect URL: your GitHub profile
* Submit, copy the client ID and client secret into `praw.ini`: * Submit, copy the client ID and client secret into `settings.py` below
```text ```text
(env) $ vim praw.ini (env) $ vim settings.py.example
[bot]
client_id=paste here
client_secret=paste here
user_agent=script by github/your-username-here
``` ```
Edit it and save it as `settings.py`.
Now you can run the server: Now you can run the server:
```text ```text

View File

@ -105,7 +105,7 @@ ENV/
# DB # DB
db.sqlite3 db.sqlite3
praw.ini settings.py
data.db data.db
data.db.bak data.db.bak
data/archive/* data/archive/*

View File

@ -7,7 +7,8 @@ import requests
import time import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from feeds import hackernews, reddit, tildes, substack, manual import settings
from feeds import hackernews, reddit, tildes, substack, manual, sitemap
OUTLINE_API = 'https://api.outline.com/v3/parse_article' OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843' READ_API = 'http://127.0.0.1:33843'
@ -15,17 +16,36 @@ READ_API = 'http://127.0.0.1:33843'
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
TWO_DAYS = 60*60*24*2 TWO_DAYS = 60*60*24*2
webworm = substack.Publication("https://www.webworm.co") substacks = {}
bulletin = substack.Publication("https://thespinoff.substack.com") for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url'])
sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = sitemap.Sitemap(value['url'])
def list(): def list():
feed = [] feed = []
feed += [(x, 'hackernews') for x in hackernews.feed()[:10]] if settings.NUM_HACKERNEWS:
feed += [(x, 'tildes') for x in tildes.feed()[:10]] feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
feed += [(x, 'substack') for x in substack.top.feed()[:15]]
feed += [(x, 'reddit') for x in reddit.feed()[:15]] if settings.NUM_REDDIT:
feed += [(x, 'webworm') for x in webworm.feed()[:15]] feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
feed += [(x, 'the bulletin') for x in bulletin.feed()[:15]]
if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
# Per-publication Substack feeds. Each settings entry is a plain dict
# ({'url': ..., 'count': ...}), so the limit must be read by key.
for key, publication in substacks.items():
    count = settings.SUBSTACK[key]['count']
    feed += [(x, key) for x in publication.feed()[:count]]
# Sitemap-backed feeds, limited the same way.
for key, sites in sitemaps.items():
    count = settings.SITEMAP[key]['count']
    feed += [(x, key) for x in sites.feed()[:count]]
return feed return feed
def get_article(url): def get_article(url):
@ -83,12 +103,12 @@ def update_story(story, is_manual=False):
res = reddit.story(story['ref']) res = reddit.story(story['ref'])
elif story['source'] == 'tildes': elif story['source'] == 'tildes':
res = tildes.story(story['ref']) res = tildes.story(story['ref'])
elif story['source'] == 'webworm':
res = webworm.story(story['ref'])
elif story['source'] == 'the bulletin':
res = bulletin.story(story['ref'])
elif story['source'] == 'substack': elif story['source'] == 'substack':
res = substack.top.story(story['ref']) res = substack.top.story(story['ref'])
elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref'])
elif story['source'] in substacks.keys():
res = substacks[story['source']].story(story['ref'])
elif story['source'] == 'manual': elif story['source'] == 'manual':
res = manual.story(story['ref']) res = manual.story(story['ref'])

View File

@ -12,18 +12,24 @@ from praw.exceptions import PRAWException
from praw.models import MoreComments from praw.models import MoreComments
from prawcore.exceptions import PrawcoreException from prawcore.exceptions import PrawcoreException
import settings
from utils import render_md, clean from utils import render_md, clean
SUBREDDITS = 'newzealand'
SITE_LINK = lambda x : 'https://old.reddit.com{}'.format(x) SITE_LINK = lambda x : 'https://old.reddit.com{}'.format(x)
SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x) SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
reddit = praw.Reddit('bot') if settings.NUM_REDDIT:
reddit = praw.Reddit(
client_id=settings.REDDIT_CLIENT_ID,
client_secret=settings.REDDIT_CLIENT_SECRET,
user_agent=settings.REDDIT_USER_AGENT,
)
subs = '+'.join(settings.SUBREDDITS)
def feed(): def feed():
try: try:
return [x.id for x in reddit.subreddit(SUBREDDITS).hot()] return [x.id for x in reddit.subreddit(subs).hot()]
except KeyboardInterrupt: except KeyboardInterrupt:
raise raise
except PRAWException as e: except PRAWException as e:

110
apiserver/feeds/sitemap.py Normal file
View File

@ -0,0 +1,110 @@
import logging

# Log everything from DEBUG upwards, with timestamp, logger name and level.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# When run directly as a script, put the working directory first on the
# import path before the remaining imports are resolved.
if __name__ == '__main__':
    import sys
    sys.path.insert(0, '.')
import time
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

from utils import clean
# Endpoint of Outline's article-parsing API, queried by get_article_details().
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
# Browser-like User-Agent header that xml() sends with every request.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
def unix(date_str):
    """Convert a UTC sitemap timestamp into Unix epoch seconds.

    Args:
        date_str: Timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns:
        Whole seconds since the Unix epoch, as an int.

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    # The trailing 'Z' marks UTC, but strptime yields a naive datetime that
    # timestamp() would interpret in the machine's local zone; pin it to UTC.
    return int(parsed.replace(tzinfo=timezone.utc).timestamp())
def xml(route, ref=None):
    """Fetch the document at ``route(ref)`` and return its body text.

    Args:
        route: Callable that takes ``ref`` and returns the URL to request.
        ref: Value handed to ``route``; defaults to None.

    Returns:
        The response text on HTTP 200, otherwise False (the problem is
        logged as an error).

    Raises:
        KeyboardInterrupt: Re-raised so the process can still be interrupted.
    """
    try:
        response = requests.get(
            route(ref),
            headers={'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'},
            timeout=5,
        )
        if response.status_code == 200:
            return response.text
        raise Exception('Bad response code ' + str(response.status_code))
    except KeyboardInterrupt:
        raise
    except BaseException as err:
        # Any other failure is reported and signalled with False.
        logging.error('Problem hitting URL: {}'.format(str(err)))
        return False
def get_article_details(url):
    """Ask the Outline API to parse the article at ``url``.

    Args:
        url: Address of the article to parse.

    Returns:
        A ``(data, method)`` tuple. On success ``data`` is the ``'data'``
        object from Outline's JSON response and ``method`` is ``"outline"``.
        On any failure, including being rate limited, both are None.

    Raises:
        KeyboardInterrupt: Re-raised so the process can still be interrupted.
    """
    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            # Callers unpack two values, so skip with the same shape that
            # every other failure path returns.
            return (None, None)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        # Outline reports unsupported pages inside an otherwise OK response.
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)
class Sitemap:
    """Story feed backed by a single XML sitemap.

    Args:
        url: Address of the sitemap document to read.
    """

    def __init__(self, url):
        self.sitemap_url = url

    def _articles(self):
        """Fetch the sitemap and return its dated ``<url>`` entries.

        Returns:
            A list of ``<url>`` elements that carry both a ``<loc>`` and a
            ``<lastmod>`` child, or None when the sitemap could not be
            fetched or has no ``<urlset>`` element.
        """
        markup = xml(lambda x: self.sitemap_url)
        if not markup:
            return None
        soup = BeautifulSoup(markup, features='lxml')
        urlset = soup.find('urlset')
        if urlset is None:
            # Not a urlset document; there is nothing to list.
            return None
        return [
            a for a in urlset.find_all('url')
            if a.find('lastmod') is not None and a.find('loc') is not None
        ]

    def feed(self):
        """Return the article URLs listed in the sitemap.

        Returns:
            A list of ``<loc>`` strings in document order; empty when the
            sitemap is unavailable or lists no dated entries.
        """
        articles = self._articles()
        if not articles:
            return []
        return [a.find('loc').text for a in articles]

    def story(self, ref):
        """Build the story dict for the article whose URL is ``ref``.

        Args:
            ref: Article URL, as returned by ``feed()``.

        Returns:
            A dict describing the story, or False when the sitemap is
            unavailable, ``ref`` is not listed in it, or the article could
            not be parsed by Outline.

        Raises:
            ValueError: If the entry's ``<lastmod>`` is not in the format
                ``unix()`` expects.
        """
        articles = self._articles()
        if not articles:
            return False

        # First sitemap entry whose location matches the requested URL.
        entry = next((a for a in articles if a.find('loc').text == ref), None)
        if entry is None:
            return False

        (data, method) = get_article_details(ref)
        if not data:
            return False
        if 'outline' not in method:
            return False

        s = {}
        s['author'] = data['author']
        s['author_link'] = ''
        s['date'] = unix(entry.find('lastmod').text)
        s['score'] = 0
        s['title'] = data['title']
        s['link'] = data['article_url']
        s['url'] = data['article_url']
        s['comments'] = []
        s['num_comments'] = 0
        s['text'] = data['html']
        return s
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    # feed() returns [] when the sitemap can't be fetched or is empty, so
    # only look up a story when there is one to look up.
    if posts:
        print(site.story(posts[0]))

View File

@ -1,4 +0,0 @@
[bot]
client_id=
client_secret=
user_agent=

View File

@ -11,6 +11,7 @@ greenlet==0.4.16
idna==2.10 idna==2.10
itsdangerous==1.1.0 itsdangerous==1.1.0
Jinja2==2.11.2 Jinja2==2.11.2
lxml==4.6.1
MarkupSafe==1.1.1 MarkupSafe==1.1.1
packaging==20.4 packaging==20.4
praw==6.4.0 praw==6.4.0

View File

@ -0,0 +1,51 @@
# QotNews settings
# edit this file and save it as settings.py

# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site
NUM_HACKERNEWS = 15
NUM_REDDIT = 10
NUM_TILDES = 5
NUM_SUBSTACK = 10

# Sitemap-based feeds, keyed by source name.
# Each entry gives the sitemap 'url' and the 'count' of items to pull.
# The feed code reads this dict at import time, so it must exist even
# when empty; uncomment or add entries to enable sites.
SITEMAP = {
    # 'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
    # 'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
}

# Individual Substack publications, keyed by source name.
# Same shape as SITEMAP; must exist even when empty.
SUBSTACK = {
    # 'webworm': { 'url': "https://www.webworm.co", 'count': 10},
    # 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
}

# Reddit account info
# The Reddit client is only created when NUM_REDDIT is non-zero, so set
# NUM_REDDIT = 0 above if not using Reddit; these may then be left blank.
REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = ''

SUBREDDITS = [
    'Economics',
    'AcademicPhilosophy',
    'DepthHub',
    'Foodforthought',
    'HistoryofIdeas',
    'LaymanJournals',
    'PhilosophyofScience',
    'PoliticsPDFs',
    'Scholar',
    'StateOfTheUnion',
    'TheAgora',
    'TrueFilm',
    'TrueReddit',
    'UniversityofReddit',
    'culturalstudies',
    'hardscience',
    'indepthsports',
    'indepthstories',
    'ludology',
    'neurophilosophy',
    'resilientcommunities',
    'worldevents',
]

View File

@ -22,18 +22,19 @@ class Feed extends React.Component {
const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id; const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
console.log('updated:', updated); console.log('updated:', updated);
this.setState({ stories: result.stories }); const { stories } = result;
localStorage.setItem('stories', JSON.stringify(result.stories)); this.setState({ stories });
localStorage.setItem('stories', JSON.stringify(stories));
if (updated) { if (updated) {
localForage.clear(); localForage.clear();
result.stories.forEach((x, i) => { stories.forEach((x, i) => {
fetch('/api/' + x.id) fetch('/api/' + x.id)
.then(res => res.json()) .then(res => res.json())
.then(result => { .then(({ story }) => {
localForage.setItem(x.id, result.story) localForage.setItem(x.id, story)
.then(console.log('preloaded', x.id, x.title)); .then(console.log('preloaded', x.id, x.title));
this.props.updateCache(x.id, result.story); this.props.updateCache(x.id, story);
}, error => { } }, error => { }
); );
}); });
@ -49,6 +50,10 @@ class Feed extends React.Component {
const stories = this.state.stories; const stories = this.state.stories;
const error = this.state.error; const error = this.state.error;
if (stories) {
stories.sort((a, b) => b.date - a.date);
}
return ( return (
<div className='container'> <div className='container'>
<Helmet> <Helmet>

File diff suppressed because one or more lines are too long