Compare commits

...

8 Commits

Author SHA1 Message Date
Jason Schwarzenberger 736cdc8576 fix mistake. 4 years ago
Jason Schwarzenberger 244d416f6e settings config of sitemap/substack publications. 4 years ago
Jason Schwarzenberger 5f98a2e76a Merge remote-tracking branch 'tanner/master' into master 4 years ago
Jason Schwarzenberger 0567cdfd9b move sort to render. 4 years ago
Jason Schwarzenberger 4f90671cec order feed by reverse chronological 4 years ago
Jason Schwarzenberger e63a1456a5 add logos. 4 years ago
Jason Schwarzenberger 76f1d57702 sitemap based feed. 4 years ago
Tanner Collin ca78a6d7a9 Move feed and Praw config to settings.py 4 years ago
10 Changed Files

 1. README.md (12 lines changed)
 2. apiserver/.gitignore (2 lines changed)
 3. apiserver/feed.py (46 lines changed)
 4. apiserver/feeds/reddit.py (14 lines changed)
 5. apiserver/feeds/sitemap.py (110 lines changed)
 6. apiserver/praw.ini.example (4 lines changed)
 7. apiserver/requirements.txt (1 line changed)
 8. apiserver/settings.py.example (51 lines changed)
 9. webclient/src/Feed.js (23 lines changed)
10. webclient/src/utils.js (8 lines changed)

README.md

@@ -35,7 +35,7 @@ $ source env/bin/activate
 (env) $ pip install -r requirements.txt
 ```
-Configure Praw for your Reddit account:
+Configure Praw for your Reddit account (optional):
 * Go to https://www.reddit.com/prefs/apps
 * Click "Create app"
@@ -44,16 +44,14 @@ Configure Praw for your Reddit account:
 * Description: blank
 * About URL: blank
 * Redirect URL: your GitHub profile
-* Submit, copy the client ID and client secret into `praw.ini`:
+* Submit, copy the client ID and client secret into `settings.py` below
 ```text
-(env) $ vim praw.ini
-[bot]
-client_id=paste here
-client_secret=paste here
-user_agent=script by github/your-username-here
+(env) $ vim settings.py.example
 ```
+Edit it and save it as `settings.py`.
 Now you can run the server:
 ```text
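
The README now sends users to `settings.py` instead of `praw.ini`. As a rough sketch (not part of the diff), the Reddit section of a saved `settings.py` would end up looking something like this once the values from the Reddit app page are pasted in; the credential strings are placeholders:

```python
# Placeholder values; paste your own client ID and secret from
# https://www.reddit.com/prefs/apps
REDDIT_CLIENT_ID = 'paste here'
REDDIT_CLIENT_SECRET = 'paste here'
REDDIT_USER_AGENT = 'script by github/your-username-here'
```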

apiserver/.gitignore

@@ -105,7 +105,7 @@ ENV/
 # DB
 db.sqlite3
-praw.ini
+settings.py
 data.db
 data.db.bak
 data/archive/*

apiserver/feed.py

@@ -7,7 +7,8 @@ import requests
 import time
 from bs4 import BeautifulSoup
 
-from feeds import hackernews, reddit, tildes, substack, manual
+import settings
+from feeds import hackernews, reddit, tildes, substack, manual, sitemap
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -15,17 +16,36 @@ READ_API = 'http://127.0.0.1:33843'
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 TWO_DAYS = 60*60*24*2
 
-webworm = substack.Publication("https://www.webworm.co")
-bulletin = substack.Publication("https://thespinoff.substack.com")
+substacks = {}
+for key, value in settings.SUBSTACK.items():
+    substacks[key] = substack.Publication(value['url'])
+
+sitemaps = {}
+for key, value in settings.SITEMAP.items():
+    sitemaps[key] = sitemap.Sitemap(value['url'])
 
 def list():
     feed = []
-    feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
-    feed += [(x, 'tildes') for x in tildes.feed()[:10]]
-    feed += [(x, 'substack') for x in substack.top.feed()[:15]]
-    feed += [(x, 'reddit') for x in reddit.feed()[:15]]
-    feed += [(x, 'webworm') for x in webworm.feed()[:15]]
-    feed += [(x, 'the bulletin') for x in bulletin.feed()[:15]]
+    if settings.NUM_HACKERNEWS:
+        feed += [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+
+    if settings.NUM_REDDIT:
+        feed += [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+
+    if settings.NUM_TILDES:
+        feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+
+    if settings.NUM_SUBSTACK:
+        feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+
+    for key, publication in substacks.items():
+        count = settings.SUBSTACK[key]['count']
+        feed += [(x, key) for x in publication.feed()[:count]]
+
+    for key, sites in sitemaps.items():
+        count = settings.SITEMAP[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]
+
     return feed
 
 def get_article(url):
@@ -83,12 +103,12 @@ def update_story(story, is_manual=False):
         res = reddit.story(story['ref'])
     elif story['source'] == 'tildes':
         res = tildes.story(story['ref'])
-    elif story['source'] == 'webworm':
-        res = webworm.story(story['ref'])
-    elif story['source'] == 'the bulletin':
-        res = bulletin.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] in sitemaps.keys():
+        res = sitemaps[story['source']].story(story['ref'])
+    elif story['source'] in substacks.keys():
+        res = substacks[story['source']].story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])
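
With feed.py reading its publications from settings, adding a source no longer means touching code: the dict key in settings becomes the story's `source` label, and `list()` and `update_story()` route through the matching `Sitemap` or `Publication` object. A minimal sketch, assuming the (hypothetical, commented-out-by-default) example entry from `settings.py.example` is enabled:

```python
# settings.py fragment (hypothetical entry, following settings.py.example)
SITEMAP = {
    'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10 },
}
```

feed.list() would then include up to 10 `(ref, 'nzherald')` tuples, and update_story() would resolve each of them via `sitemaps['nzherald'].story(ref)`.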

apiserver/feeds/reddit.py

@@ -12,18 +12,24 @@ from praw.exceptions import PRAWException
 from praw.models import MoreComments
 from prawcore.exceptions import PrawcoreException
 
+import settings
 from utils import render_md, clean
 
-SUBREDDITS = 'newzealand'
-
 SITE_LINK = lambda x : 'https://old.reddit.com{}'.format(x)
 SITE_AUTHOR_LINK = lambda x : 'https://old.reddit.com/u/{}'.format(x)
 
-reddit = praw.Reddit('bot')
+if settings.NUM_REDDIT:
+    reddit = praw.Reddit(
+        client_id=settings.REDDIT_CLIENT_ID,
+        client_secret=settings.REDDIT_CLIENT_SECRET,
+        user_agent=settings.REDDIT_USER_AGENT,
+    )
+
+subs = '+'.join(settings.SUBREDDITS)
 
 def feed():
     try:
-        return [x.id for x in reddit.subreddit(SUBREDDITS).hot()]
+        return [x.id for x in reddit.subreddit(subs).hot()]
     except KeyboardInterrupt:
         raise
     except PRAWException as e:
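
Instead of a single hard-coded subreddit string, reddit.py now joins the configured list into one multireddit name, which PRAW accepts in `reddit.subreddit(...)`. A small sketch with a trimmed, hypothetical list:

```python
# '+' joins subreddit names into a single multireddit string,
# e.g. reddit.subreddit('Economics+DepthHub+TrueReddit').hot()
SUBREDDITS = ['Economics', 'DepthHub', 'TrueReddit']  # trimmed example
subs = '+'.join(SUBREDDITS)
print(subs)  # Economics+DepthHub+TrueReddit
```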

apiserver/feeds/sitemap.py (new file)

@@ -0,0 +1,110 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import time
import requests
from datetime import datetime
from bs4 import BeautifulSoup

from utils import clean

OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'

def unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())

def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

def get_article_details(url):
    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            return (None, None)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)

class Sitemap:
    def __init__(self, url):
        self.sitemap_url = url

    def feed(self):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        return [x.find('loc').text for x in articles] or []

    def story(self, ref):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        articles = list(filter(None, [a if a.find('loc').text == ref else None for a in articles]))

        if len(articles) == 0:
            return False

        r = articles[0]
        if not r:
            return False

        (data, method) = get_article_details(ref)

        if not data:
            return False
        if 'outline' not in method:
            return False

        s = {}
        s['author'] = data['author']
        s['author_link'] = ''
        s['date'] = unix(r.find('lastmod').text)
        s['score'] = 0
        s['title'] = data['title']
        s['link'] = data['article_url']
        s['url'] = data['article_url']
        s['comments'] = []
        s['num_comments'] = 0
        s['text'] = data['html']

        return s

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))
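
The new `Sitemap` source expects a conventional news sitemap: a `<urlset>` of `<url>` entries, each carrying a `<loc>` URL and an ISO-8601 `<lastmod>` timestamp (entries without `<lastmod>` are dropped). A minimal sketch of that parse, using made-up data rather than a real sitemap:

```python
from bs4 import BeautifulSoup

# Made-up sitemap fragment with the shape Sitemap.feed()/story() rely on.
SAMPLE = """
<urlset>
  <url>
    <loc>https://example.com/story-1</loc>
    <lastmod>2020-11-04T02:30:00Z</lastmod>
  </url>
  <url>
    <loc>https://example.com/no-date</loc>
  </url>
</urlset>
"""

soup = BeautifulSoup(SAMPLE, features='lxml')
# Keep only entries that have a <lastmod>, mirroring the filter in the class.
urls = [u for u in soup.find('urlset').findAll('url') if u.find('lastmod')]
print([u.find('loc').text for u in urls])  # ['https://example.com/story-1']
```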

apiserver/praw.ini.example (deleted)

@@ -1,4 +0,0 @@
-[bot]
-client_id=
-client_secret=
-user_agent=

apiserver/requirements.txt

@@ -11,6 +11,7 @@ greenlet==0.4.16
 idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
+lxml==4.6.1
 MarkupSafe==1.1.1
 packaging==20.4
 praw==6.4.0

apiserver/settings.py.example (new file)

@@ -0,0 +1,51 @@
# QotNews settings
# edit this file and save it as settings.py

# Feed Lengths
# Number of top items from each site to pull
# set to 0 to disable that site
NUM_HACKERNEWS = 15
NUM_REDDIT = 10
NUM_TILDES = 5
NUM_SUBSTACK = 10

# SITEMAP = {
#     'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
#     'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
# }

# SUBSTACK = {
#     'webworm': { 'url': "https://www.webworm.co", 'count': 10},
#     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
# }

# Reddit account info
# leave blank if not using Reddit
REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = ''

SUBREDDITS = [
    'Economics',
    'AcademicPhilosophy',
    'DepthHub',
    'Foodforthought',
    'HistoryofIdeas',
    'LaymanJournals',
    'PhilosophyofScience',
    'PoliticsPDFs',
    'Scholar',
    'StateOfTheUnion',
    'TheAgora',
    'TrueFilm',
    'TrueReddit',
    'UniversityofReddit',
    'culturalstudies',
    'hardscience',
    'indepthsports',
    'indepthstories',
    'ludology',
    'neurophilosophy',
    'resilientcommunities',
    'worldevents',
]
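
One thing to note when copying this example: feed.py iterates `settings.SITEMAP` and `settings.SUBSTACK` at import time, but both are commented out above, so a settings.py taken verbatim from this file would raise an AttributeError when the server starts. A minimal sketch of the safe default, assuming no extra publications are wanted:

```python
# Define these even when unused; feed.py loops over them at import time.
SITEMAP = {}
SUBSTACK = {}
```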

webclient/src/Feed.js

@@ -22,20 +22,21 @@ class Feed extends React.Component {
         const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
         console.log('updated:', updated);
 
-        this.setState({ stories: result.stories });
-        localStorage.setItem('stories', JSON.stringify(result.stories));
+        const { stories } = result;
+        this.setState({ stories });
+        localStorage.setItem('stories', JSON.stringify(stories));
 
         if (updated) {
           localForage.clear();
-          result.stories.forEach((x, i) => {
+          stories.forEach((x, i) => {
             fetch('/api/' + x.id)
               .then(res => res.json())
-              .then(result => {
-                localForage.setItem(x.id, result.story)
+              .then(({ story }) => {
+                localForage.setItem(x.id, story)
                   .then(console.log('preloaded', x.id, x.title));
-                this.props.updateCache(x.id, result.story);
-              }, error => {}
+                this.props.updateCache(x.id, story);
+              }, error => { }
               );
           });
         }
       },
@@ -49,6 +50,10 @@ class Feed extends React.Component {
     const stories = this.state.stories;
     const error = this.state.error;
 
+    if (stories) {
+      stories.sort((a, b) => b.date - a.date);
+    }
+
     return (
       <div className='container'>
         <Helmet>
@@ -73,7 +78,7 @@ class Feed extends React.Component {
               </div>
             )}
           </div>
           :
           <p>loading...</p>
         }
       </div>

webclient/src/utils.js: file diff suppressed because one or more lines are too long