Compare commits

...

18 Commits

Author SHA1 Message Date
Jason Schwarzenberger
9f4ff4acf0 remove unnecessary sitemap.xml request. 2020-11-04 11:22:15 +13:00
Jason Schwarzenberger
db6aad84ec fix mistake. 2020-11-04 11:12:01 +13:00
Jason Schwarzenberger
29f8a8b8cc add news site categories feed. 2020-11-04 11:08:50 +13:00
Jason
abf8589e02 fix sitemap 2020-11-03 10:53:40 +00:00
Jason
b759f46582 use extruct for opengraph/json-ld/microdata of articles 2020-11-03 10:31:36 +00:00
Jason Schwarzenberger
736cdc8576 fix mistake. 2020-11-03 17:04:46 +13:00
Jason Schwarzenberger
244d416f6e settings config of sitemap/substack publications. 2020-11-03 17:01:29 +13:00
Jason Schwarzenberger
5f98a2e76a Merge remote-tracking branch 'tanner/master' into master
And adding relevant setings.py.example/etc.
2020-11-03 16:44:02 +13:00
Jason Schwarzenberger
0567cdfd9b move sort to render. 2020-11-03 16:30:22 +13:00
Jason Schwarzenberger
4f90671cec order feed by reverse chronological 2020-11-03 16:21:23 +13:00
Jason Schwarzenberger
e63a1456a5 add logos. 2020-11-03 16:07:07 +13:00
Jason Schwarzenberger
76f1d57702 sitemap based feed. 2020-11-03 16:00:03 +13:00
Jason Schwarzenberger
de80389ed0 add logos. 2020-11-03 12:48:19 +13:00
Jason Schwarzenberger
4e64cf682a add the bulletin. 2020-11-03 12:41:16 +13:00
Jason Schwarzenberger
c5fe5d25a0 add substack.py top sites, replacing webworm.py 2020-11-03 12:28:39 +13:00
Jason
283a2b1545 fix webworm comments 2020-11-02 22:06:43 +00:00
Jason Schwarzenberger
0d6a86ace2 fix webworm dates. 2020-11-03 10:31:14 +13:00
Jason Schwarzenberger
f23bf628e0 add webworm/substack as a feed. 2020-11-02 17:09:59 +13:00
7 changed files with 442 additions and 37 deletions

View File

@@ -8,7 +8,7 @@ import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import settings import settings
from feeds import hackernews, reddit, tildes, manual from feeds import hackernews, reddit, tildes, substack, manual, news
OUTLINE_API = 'https://api.outline.com/v3/parse_article' OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843' READ_API = 'http://127.0.0.1:33843'
@@ -16,6 +16,16 @@ READ_API = 'http://127.0.0.1:33843'
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com'] INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
TWO_DAYS = 60*60*24*2 TWO_DAYS = 60*60*24*2
substacks = {}
for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url'])
categories = {}
for key, value in settings.CATEGORY.items():
categories[key] = news.Category(value['url'])
sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'])
def list(): def list():
feed = [] feed = []
if settings.NUM_HACKERNEWS: if settings.NUM_HACKERNEWS:
@@ -27,6 +37,22 @@ def list():
if settings.NUM_TILDES: if settings.NUM_TILDES:
feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]] feed += [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count']
feed += [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key]['count']
feed += [(x, key) for x in sites.feed()[:count]]
for key, sites in sitemaps.items():
count = settings.SITEMAP[key]['count']
feed += [(x, key) for x in sites.feed()[:count]]
return feed return feed
def get_article(url): def get_article(url):
@@ -84,6 +110,14 @@ def update_story(story, is_manual=False):
res = reddit.story(story['ref']) res = reddit.story(story['ref'])
elif story['source'] == 'tildes': elif story['source'] == 'tildes':
res = tildes.story(story['ref']) res = tildes.story(story['ref'])
elif story['source'] == 'substack':
res = substack.top.story(story['ref'])
elif story['source'] in categories.keys():
res = categories[story['source']].story(story['ref'])
elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref'])
elif story['source'] in substacks.keys():
res = substacks[story['source']].story(story['ref'])
elif story['source'] == 'manual': elif story['source'] == 'manual':
res = manual.story(story['ref']) res = manual.story(story['ref'])

180
apiserver/feeds/news.py Normal file
View File

@@ -0,0 +1,180 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import extruct
from utils import clean
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
def unix(date_str):
date_tzfix = date_str
if ":" == date_tzfix[-3]:
date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
for f in formats:
try:
return int(datetime.strptime(date_str, f).timestamp())
except:
pass
try:
return int(datetime.strptime(date_tzfix, f).timestamp())
except:
pass
return 0
def xml(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False
def parse_extruct(s, data):
for rdfa in data['rdfa']:
for key, props in rdfa.items():
if 'http://ogp.me/ns#title' in props:
for values in props['http://ogp.me/ns#title']:
s['title'] = values['@value']
if 'http://ogp.me/ns/article#modified_time' in props:
for values in props['http://ogp.me/ns/article#modified_time']:
print(f"modified_time: {values['@value']}")
s['date'] = unix(values['@value'])
if 'http://ogp.me/ns/article#published_time' in props:
for values in props['http://ogp.me/ns/article#published_time']:
print(f"published_time: {values['@value']}")
s['date'] = unix(values['@value'])
for og in data['opengraph']:
titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
if len(modified):
s['date'] = unix(modified[0])
if len(published):
s['date'] = unix(published[0])
s['date'] = unix(published[0] or modified[0] or '')
if len(titles):
s['title'] = titles[0]
for md in data['microdata']:
if md['type'] == 'https://schema.org/NewsArticle':
props = md['properties']
s['title'] = props['headline']
if props['dateModified']:
s['date'] = unix(props['dateModified'])
if props['datePublished']:
s['date'] = unix(props['datePublished'])
if 'author' in props and props['author']:
s['author'] = props['author']['properties']['name']
for ld in data['json-ld']:
if ld['@type'] == 'Article':
s['title'] = ld['headline']
if ld['dateModified']:
s['date'] = unix(ld['dateModified'])
if ld['datePublished']:
s['date'] = unix(ld['datePublished'])
if 'author' in ld and ld['author']:
s['author'] = ld['author']['name']
return s
class Sitemap:
def __init__(self, url):
self.sitemap_url = url
def feed(self):
markup = xml(lambda x: self.sitemap_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url')
articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
return [x.find('loc').text for x in articles] or []
def story(self, ref):
markup = xml(lambda x: ref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = ref
s['url'] = ref
s['date'] = 0
data = extruct.extract(markup)
s = parse_extruct(s, data)
return s
class Category:
def __init__(self, url):
self.category_url = url
self.base_url = '/'.join(url.split('/')[:3])
def feed(self):
markup = xml(lambda x: self.category_url)
if not markup: return []
soup = BeautifulSoup(markup, features='html.parser')
links = soup.find_all('a', href=True)
links = [link.get('href') for link in links]
links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
return links
def story(self, ref):
markup = xml(lambda x: ref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = ref
s['url'] = ref
s['date'] = 0
data = extruct.extract(markup)
s = parse_extruct(s, data)
return s
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Sitemap: Stuff")
site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))
print("Sitemap: NZ Herald")
site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))
print("Category: RadioNZ Te Ao Māori")
site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))

160
apiserver/feeds/substack.py Normal file
View File

@@ -0,0 +1,160 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from utils import clean
SUBSTACK_API_TOP_POSTS = lambda x: "https://substack.com/api/v1/reader/top-posts"
def author_link(author_id, base_url):
return f"{base_url}/people/{author_id}"
def api_comments(post_id, base_url):
return f"{base_url}/api/v1/post/{post_id}/comments?all_comments=true&sort=best_first"
def api_stories(x, base_url):
return f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100"
def unix(date_str):
return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())
def api(route, ref=None):
try:
r = requests.get(route(ref), timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting Substack API: {}, trying again'.format(str(e)))
try:
r = requests.get(route(ref), timeout=15)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting Substack API: {}'.format(str(e)))
return False
def comment(i):
if 'body' not in i:
return False
c = {}
c['date'] = unix(i.get('date'))
c['author'] = i.get('name', '')
c['score'] = i.get('reactions').get('')
c['text'] = clean(i.get('body', '') or '')
c['comments'] = [comment(j) for j in i['children']]
c['comments'] = list(filter(bool, c['comments']))
return c
class Publication:
def __init__(self, domain):
self.BASE_DOMAIN = domain
def feed(self):
stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
return [str(i.get("id")) for i in stories or []]
def story(self, ref):
stories = api(lambda x: api_stories(x, self.BASE_DOMAIN))
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
if len(stories) == 0:
return False
r = stories[0]
if not r:
return False
s = {}
s['author'] = ''
s['author_link'] = ''
s['date'] = unix(r.get('post_date'))
s['score'] = r.get('reactions').get('')
s['title'] = r.get('title', '')
s['link'] = r.get('canonical_url', '')
s['url'] = r.get('canonical_url', '')
comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'))
s['comments'] = [comment(i) for i in comments.get('comments')]
s['comments'] = list(filter(bool, s['comments']))
s['num_comments'] = r.get('comment_count', 0)
authors = list(filter(None, [self._bylines(byline) for byline in r.get('publishedBylines')]))
if len(authors):
s['author'] = authors[0].get('name')
s['author_link'] = authors[0].get('link')
return s
def _bylines(self, b):
if 'id' not in b:
return None
a = {}
a['name'] = b.get('name')
a['link'] = author_link(b.get('id'), self.BASE_DOMAIN)
return a
class Top:
def feed(self):
stories = api(SUBSTACK_API_TOP_POSTS)
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
return [str(i.get("id")) for i in stories or []]
def story(self, ref):
stories = api(SUBSTACK_API_TOP_POSTS)
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
if len(stories) == 0:
return False
r = stories[0]
if not r:
return False
s = {}
pub = r.get('pub')
base_url = pub.get('base_url')
s['author'] = pub.get('author_name')
s['author_link'] = author_link(pub.get('author_id'), base_url)
s['date'] = unix(r.get('post_date'))
s['score'] = r.get('score')
s['title'] = r.get('title', '')
s['link'] = r.get('canonical_url', '')
s['url'] = r.get('canonical_url', '')
comments = api(lambda x: api_comments(x, base_url), r.get('id'))
s['comments'] = [comment(i) for i in comments.get('comments')]
s['comments'] = list(filter(bool, s['comments']))
s['num_comments'] = r.get('comment_count', 0)
return s
top = Top()
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
top_posts = top.feed()
print(top.story(top_posts[0]))
webworm = Publication("https://www.webworm.co/")
posts = webworm.feed()
print(posts[:1])
print(webworm.story(posts[0]))

View File

@@ -4,6 +4,7 @@ certifi==2020.6.20
chardet==3.0.4 chardet==3.0.4
click==7.1.2 click==7.1.2
commonmark==0.9.1 commonmark==0.9.1
extruct==0.10.0
Flask==1.1.2 Flask==1.1.2
Flask-Cors==3.0.8 Flask-Cors==3.0.8
gevent==20.6.2 gevent==20.6.2
@@ -11,6 +12,7 @@ greenlet==0.4.16
idna==2.10 idna==2.10
itsdangerous==1.1.0 itsdangerous==1.1.0
Jinja2==2.11.2 Jinja2==2.11.2
lxml==4.6.1
MarkupSafe==1.1.1 MarkupSafe==1.1.1
packaging==20.4 packaging==20.4
praw==6.4.0 praw==6.4.0

View File

@@ -7,6 +7,21 @@
NUM_HACKERNEWS = 15 NUM_HACKERNEWS = 15
NUM_REDDIT = 10 NUM_REDDIT = 10
NUM_TILDES = 5 NUM_TILDES = 5
NUM_SUBSTACK = 10
# SITEMAP = {
# 'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
# 'stuff': { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
# }
# SUBSTACK = {
# 'webworm': { 'url': "https://www.webworm.co", 'count': 10},
# 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
# }
# CATEGORY = {
# 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
# }
# Reddit account info # Reddit account info
# leave blank if not using Reddit # leave blank if not using Reddit

View File

@@ -22,19 +22,20 @@ class Feed extends React.Component {
const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id; const updated = !this.state.stories || this.state.stories[0].id !== result.stories[0].id;
console.log('updated:', updated); console.log('updated:', updated);
this.setState({ stories: result.stories }); const { stories } = result;
localStorage.setItem('stories', JSON.stringify(result.stories)); this.setState({ stories });
localStorage.setItem('stories', JSON.stringify(stories));
if (updated) { if (updated) {
localForage.clear(); localForage.clear();
result.stories.forEach((x, i) => { stories.forEach((x, i) => {
fetch('/api/' + x.id) fetch('/api/' + x.id)
.then(res => res.json()) .then(res => res.json())
.then(result => { .then(({ story }) => {
localForage.setItem(x.id, result.story) localForage.setItem(x.id, story)
.then(console.log('preloaded', x.id, x.title)); .then(console.log('preloaded', x.id, x.title));
this.props.updateCache(x.id, result.story); this.props.updateCache(x.id, story);
}, error => {} }, error => { }
); );
}); });
} }
@@ -49,6 +50,10 @@ class Feed extends React.Component {
const stories = this.state.stories; const stories = this.state.stories;
const error = this.state.error; const error = this.state.error;
if (stories) {
stories.sort((a, b) => b.date - a.date);
}
return ( return (
<div className='container'> <div className='container'>
<Helmet> <Helmet>

File diff suppressed because one or more lines are too long