Compare commits

..

No commits in common. "9f4ff4acf08e33194509018726b7ddbeaf651656" and "736cdc8576066b110779e008e16aad28561136ef" have entirely different histories.

6 changed files with 115 additions and 199 deletions

View File

@ -8,7 +8,7 @@ import time
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import settings import settings
from feeds import hackernews, reddit, tildes, substack, manual, news from feeds import hackernews, reddit, tildes, substack, manual, sitemap
OUTLINE_API = 'https://api.outline.com/v3/parse_article' OUTLINE_API = 'https://api.outline.com/v3/parse_article'
READ_API = 'http://127.0.0.1:33843' READ_API = 'http://127.0.0.1:33843'
@ -19,12 +19,9 @@ TWO_DAYS = 60*60*24*2
substacks = {} substacks = {}
for key, value in settings.SUBSTACK.items(): for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url']) substacks[key] = substack.Publication(value['url'])
categories = {}
for key, value in settings.CATEGORY.items():
categories[key] = news.Category(value['url'])
sitemaps = {} sitemaps = {}
for key, value in settings.SITEMAP.items(): for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url']) sitemaps[key] = sitemap.Sitemap(value['url'])
def list(): def list():
feed = [] feed = []
@ -41,15 +38,11 @@ def list():
feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items(): for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count'] count = settings.SUBSTACK[key].count
feed += [(x, key) for x in publication.feed()[:count]] feed += [(x, key) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key]['count']
feed += [(x, key) for x in sites.feed()[:count]]
for key, sites in sitemaps.items(): for key, sites in sitemaps.items():
count = settings.SITEMAP[key]['count'] count = settings.SITEMAP[key].count
feed += [(x, key) for x in sites.feed()[:count]] feed += [(x, key) for x in sites.feed()[:count]]
@ -112,8 +105,6 @@ def update_story(story, is_manual=False):
res = tildes.story(story['ref']) res = tildes.story(story['ref'])
elif story['source'] == 'substack': elif story['source'] == 'substack':
res = substack.top.story(story['ref']) res = substack.top.story(story['ref'])
elif story['source'] in categories.keys():
res = categories[story['source']].story(story['ref'])
elif story['source'] in sitemaps.keys(): elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref']) res = sitemaps[story['source']].story(story['ref'])
elif story['source'] in substacks.keys(): elif story['source'] in substacks.keys():

View File

@ -1,180 +0,0 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import extruct
from utils import clean
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
def unix(date_str):
date_tzfix = date_str
if ":" == date_tzfix[-3]:
date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
for f in formats:
try:
return int(datetime.strptime(date_str, f).timestamp())
except:
pass
try:
return int(datetime.strptime(date_tzfix, f).timestamp())
except:
pass
return 0
def xml(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False
def parse_extruct(s, data):
for rdfa in data['rdfa']:
for key, props in rdfa.items():
if 'http://ogp.me/ns#title' in props:
for values in props['http://ogp.me/ns#title']:
s['title'] = values['@value']
if 'http://ogp.me/ns/article#modified_time' in props:
for values in props['http://ogp.me/ns/article#modified_time']:
print(f"modified_time: {values['@value']}")
s['date'] = unix(values['@value'])
if 'http://ogp.me/ns/article#published_time' in props:
for values in props['http://ogp.me/ns/article#published_time']:
print(f"published_time: {values['@value']}")
s['date'] = unix(values['@value'])
for og in data['opengraph']:
titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
if len(modified):
s['date'] = unix(modified[0])
if len(published):
s['date'] = unix(published[0])
s['date'] = unix(published[0] or modified[0] or '')
if len(titles):
s['title'] = titles[0]
for md in data['microdata']:
if md['type'] == 'https://schema.org/NewsArticle':
props = md['properties']
s['title'] = props['headline']
if props['dateModified']:
s['date'] = unix(props['dateModified'])
if props['datePublished']:
s['date'] = unix(props['datePublished'])
if 'author' in props and props['author']:
s['author'] = props['author']['properties']['name']
for ld in data['json-ld']:
if ld['@type'] == 'Article':
s['title'] = ld['headline']
if ld['dateModified']:
s['date'] = unix(ld['dateModified'])
if ld['datePublished']:
s['date'] = unix(ld['datePublished'])
if 'author' in ld and ld['author']:
s['author'] = ld['author']['name']
return s
class Sitemap:
def __init__(self, url):
self.sitemap_url = url
def feed(self):
markup = xml(lambda x: self.sitemap_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url')
articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
return [x.find('loc').text for x in articles] or []
def story(self, ref):
markup = xml(lambda x: ref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = ref
s['url'] = ref
s['date'] = 0
data = extruct.extract(markup)
s = parse_extruct(s, data)
return s
class Category:
def __init__(self, url):
self.category_url = url
self.base_url = '/'.join(url.split('/')[:3])
def feed(self):
markup = xml(lambda x: self.category_url)
if not markup: return []
soup = BeautifulSoup(markup, features='html.parser')
links = soup.find_all('a', href=True)
links = [link.get('href') for link in links]
links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
return links
def story(self, ref):
markup = xml(lambda x: ref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = ref
s['url'] = ref
s['date'] = 0
data = extruct.extract(markup)
s = parse_extruct(s, data)
return s
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Sitemap: Stuff")
site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))
print("Sitemap: NZ Herald")
site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))
print("Category: RadioNZ Te Ao Māori")
site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))

110
apiserver/feeds/sitemap.py Normal file
View File

@ -0,0 +1,110 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from utils import clean
OUTLINE_API = 'https://api.outline.com/v3/parse_article'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
def unix(date_str):
return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
def xml(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False
def get_article_details(url):
try:
params = {'source_url': url}
headers = {'Referer': 'https://outline.com/'}
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
if r.status_code == 429:
logging.info('Rate limited by outline, sleeping 30s and skipping...')
time.sleep(30)
return ''
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
data = r.json()['data']
if 'URL is not supported by Outline' in data['html']:
raise Exception('URL not supported by Outline')
return (data, "outline")
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem outlining article: {}'.format(str(e)))
return (None, None)
class Sitemap:
def __init__(self, url):
self.sitemap_url = url
def feed(self):
markup = xml(lambda x: self.sitemap_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url')
articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
return [x.find('loc').text for x in articles] or []
def story(self, ref):
markup = xml(lambda x: self.sitemap_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url')
articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
articles = list(filter(None, [a if a.find('loc').text == ref else None for a in articles]))
if len(articles) == 0:
return False
r = articles[0]
if not r:
return False
(data, method) = get_article_details(ref)
if not data:
return False
if 'outline' not in method:
return False
s = {}
s['author'] = data['author']
s['author_link'] = ''
s['date'] = unix(r.find('lastmod').text)
s['score'] = 0
s['title'] = data['title']
s['link'] = data['article_url']
s['url'] = data['article_url']
s['comments'] = []
s['num_comments'] = 0
s['text'] = data['html']
return s
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
# site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
posts = site.feed()
print(posts[:1])
print(site.story(posts[0]))

View File

@ -157,4 +157,4 @@ if __name__ == '__main__':
webworm = Publication("https://www.webworm.co/") webworm = Publication("https://www.webworm.co/")
posts = webworm.feed() posts = webworm.feed()
print(posts[:1]) print(posts[:1])
print(webworm.story(posts[0])) print(webworm.story(posts[0]))

View File

@ -4,7 +4,6 @@ certifi==2020.6.20
chardet==3.0.4 chardet==3.0.4
click==7.1.2 click==7.1.2
commonmark==0.9.1 commonmark==0.9.1
extruct==0.10.0
Flask==1.1.2 Flask==1.1.2
Flask-Cors==3.0.8 Flask-Cors==3.0.8
gevent==20.6.2 gevent==20.6.2

View File

@ -19,10 +19,6 @@ NUM_SUBSTACK = 10
# 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10}, # 'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
# } # }
# CATEGORY = {
# 'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
# }
# Reddit account info # Reddit account info
# leave blank if not using Reddit # leave blank if not using Reddit
REDDIT_CLIENT_ID = '' REDDIT_CLIENT_ID = ''