Compare commits

...

5 Commits

Author SHA1 Message Date
Jason Schwarzenberger 9f4ff4acf0 remove unnecessary sitemap.xml request. 4 years ago
Jason Schwarzenberger db6aad84ec fix mistake. 4 years ago
Jason Schwarzenberger 29f8a8b8cc add news site categories feed. 4 years ago
Jason abf8589e02 fix sitemap 4 years ago
Jason b759f46582 use extruct for opengraph/json-ld/microdata of articles 4 years ago
  1. apiserver/feed.py (17 lines changed)
  2. apiserver/feeds/news.py (180 lines changed)
  3. apiserver/feeds/sitemap.py (110 lines changed)
  4. apiserver/feeds/substack.py (2 lines changed)
  5. apiserver/requirements.txt (1 line changed)
  6. apiserver/settings.py.example (4 lines changed)

apiserver/feed.py

@@ -8,7 +8,7 @@ import time
 from bs4 import BeautifulSoup
 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, sitemap
+from feeds import hackernews, reddit, tildes, substack, manual, news
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -19,9 +19,12 @@ TWO_DAYS = 60*60*24*2
 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
+categories = {}
+for key, value in settings.CATEGORY.items():
+    categories[key] = news.Category(value['url'])
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = sitemap.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'])
 def list():
     feed = []
@@ -38,11 +41,15 @@ def list():
     feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
     for key, publication in substacks.items():
-        count = settings.SUBSTACK[key].count
+        count = settings.SUBSTACK[key]['count']
         feed += [(x, key) for x in publication.feed()[:count]]
+    for key, sites in categories.items():
+        count = settings.CATEGORY[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]
     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key].count
+        count = settings.SITEMAP[key]['count']
         feed += [(x, key) for x in sites.feed()[:count]]
@@ -105,6 +112,8 @@ def update_story(story, is_manual=False):
         res = tildes.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] in categories.keys():
+        res = categories[story['source']].story(story['ref'])
     elif story['source'] in sitemaps.keys():
         res = sitemaps[story['source']].story(story['ref'])
     elif story['source'] in substacks.keys():
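
For context, a rough sketch of how an enabled category source would flow through this new path; the key, URL, and count below are illustrative values copied from the commented example in settings.py.example, not something defined by this diff:

# settings.py (hypothetical entry, mirroring the commented example in settings.py.example)
CATEGORY = {
    'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10 },
}

# At import time feed.py builds one scraper per entry:
#   categories['rnz national'] = news.Category("https://www.rnz.co.nz/news/national")
# list() then appends up to 10 tuples like (article_url, 'rnz national'),
# and update_story() resolves each one via categories['rnz national'].story(article_url).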

apiserver/feeds/news.py

@@ -0,0 +1,180 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import requests
from datetime import datetime
from bs4 import BeautifulSoup
import extruct

from utils import clean

OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'

def unix(date_str):
    date_tzfix = date_str
    if ":" == date_tzfix[-3]:
        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
    for f in formats:
        try:
            return int(datetime.strptime(date_str, f).timestamp())
        except:
            pass
        try:
            return int(datetime.strptime(date_tzfix, f).timestamp())
        except:
            pass
    return 0

def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
def parse_extruct(s, data):
    for rdfa in data['rdfa']:
        for key, props in rdfa.items():
            if 'http://ogp.me/ns#title' in props:
                for values in props['http://ogp.me/ns#title']:
                    s['title'] = values['@value']
            if 'http://ogp.me/ns/article#modified_time' in props:
                for values in props['http://ogp.me/ns/article#modified_time']:
                    print(f"modified_time: {values['@value']}")
                    s['date'] = unix(values['@value'])
            if 'http://ogp.me/ns/article#published_time' in props:
                for values in props['http://ogp.me/ns/article#published_time']:
                    print(f"published_time: {values['@value']}")
                    s['date'] = unix(values['@value'])

    for og in data['opengraph']:
        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
        if len(modified):
            s['date'] = unix(modified[0])
        if len(published):
            s['date'] = unix(published[0])
            s['date'] = unix(published[0] or modified[0] or '')
        if len(titles):
            s['title'] = titles[0]

    for md in data['microdata']:
        if md['type'] == 'https://schema.org/NewsArticle':
            props = md['properties']
            s['title'] = props['headline']
            if props['dateModified']:
                s['date'] = unix(props['dateModified'])
            if props['datePublished']:
                s['date'] = unix(props['datePublished'])
            if 'author' in props and props['author']:
                s['author'] = props['author']['properties']['name']

    for ld in data['json-ld']:
        if ld['@type'] == 'Article':
            s['title'] = ld['headline']
            if ld['dateModified']:
                s['date'] = unix(ld['dateModified'])
            if ld['datePublished']:
                s['date'] = unix(ld['datePublished'])
            if 'author' in ld and ld['author']:
                s['author'] = ld['author']['name']

    return s
class Sitemap:
    def __init__(self, url):
        self.sitemap_url = url

    def feed(self):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        return [x.find('loc').text for x in articles] or []

    def story(self, ref):
        markup = xml(lambda x: ref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = ref
        s['url'] = ref
        s['date'] = 0

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        return s

class Category:
    def __init__(self, url):
        self.category_url = url
        self.base_url = '/'.join(url.split('/')[:3])

    def feed(self):
        markup = xml(lambda x: self.category_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='html.parser')
        links = soup.find_all('a', href=True)
        links = [link.get('href') for link in links]
        links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
        links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
        return links

    def story(self, ref):
        markup = xml(lambda x: ref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = ref
        s['url'] = ref
        s['date'] = 0

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        return s

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: Stuff")
    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))

    print("Sitemap: NZ Herald")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))

    print("Category: RadioNZ Te Ao Māori")
    site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))
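
As a rough sanity check of parse_extruct (based only on the keys the function reads, not on extruct's documented output), a hand-built data dict with a single JSON-LD Article shows how the title, date and author fields get filled; the values below are made up:

sample = {
    'rdfa': [],
    'opengraph': [],
    'microdata': [],
    'json-ld': [{
        '@type': 'Article',
        'headline': 'Example headline',
        'dateModified': '2020-11-02T01:00:00Z',
        'datePublished': '2020-11-01T23:30:00Z',
        'author': {'name': 'A. Reporter'},
    }],
}
s = parse_extruct({'title': '', 'date': 0, 'author': ''}, sample)
# s['title']  -> 'Example headline'
# s['date']   -> unix('2020-11-01T23:30:00Z'), since the datePublished check runs last
# s['author'] -> 'A. Reporter'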

apiserver/feeds/sitemap.py

@@ -1,110 +0,0 @@
import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import requests
from datetime import datetime
from bs4 import BeautifulSoup

from utils import clean

OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'

def unix(date_str):
    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())

def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

def get_article_details(url):
    try:
        params = {'source_url': url}
        headers = {'Referer': 'https://outline.com/'}
        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
        if r.status_code == 429:
            logging.info('Rate limited by outline, sleeping 30s and skipping...')
            time.sleep(30)
            return ''
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        data = r.json()['data']
        if 'URL is not supported by Outline' in data['html']:
            raise Exception('URL not supported by Outline')
        return (data, "outline")
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem outlining article: {}'.format(str(e)))
        return (None, None)

class Sitemap:
    def __init__(self, url):
        self.sitemap_url = url

    def feed(self):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        return [x.find('loc').text for x in articles] or []

    def story(self, ref):
        markup = xml(lambda x: self.sitemap_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        articles = soup.find('urlset').findAll('url')
        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
        articles = list(filter(None, [a if a.find('loc').text == ref else None for a in articles]))

        if len(articles) == 0:
            return False
        r = articles[0]
        if not r:
            return False

        (data, method) = get_article_details(ref)
        if not data:
            return False
        if 'outline' not in method:
            return False

        s = {}
        s['author'] = data['author']
        s['author_link'] = ''
        s['date'] = unix(r.find('lastmod').text)
        s['score'] = 0
        s['title'] = data['title']
        s['link'] = data['article_url']
        s['url'] = data['article_url']
        s['comments'] = []
        s['num_comments'] = 0
        s['text'] = data['html']
        return s

# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
    print(site.story(posts[0]))

apiserver/feeds/substack.py

@@ -157,4 +157,4 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
     print(posts[:1])
     print(webworm.story(posts[0]))

apiserver/requirements.txt

@@ -4,6 +4,7 @@ certifi==2020.6.20
 chardet==3.0.4
 click==7.1.2
 commonmark==0.9.1
+extruct==0.10.0
 Flask==1.1.2
 Flask-Cors==3.0.8
 gevent==20.6.2

apiserver/settings.py.example

@@ -19,6 +19,10 @@ NUM_SUBSTACK = 10
 # 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
 # }
+# CATEGORY = {
+# 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# }
+
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''
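
For completeness, a sketch of what the relevant settings might look like once uncommented; the URLs and counts are placeholders (the sitemap URL is taken from the news.py scratchpad), and the SITEMAP block is assumed to have the same shape because feed.py reads both the same way:

SITEMAP = {
    'nzherald': { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10 },
}
CATEGORY = {
    'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10 },
}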
