diff --git a/apiserver/feed.py b/apiserver/feed.py index 62b782e..a9c7882 100644 --- a/apiserver/feed.py +++ b/apiserver/feed.py @@ -38,11 +38,11 @@ def list(): feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] for key, publication in substacks.items(): - count = settings.SUBSTACK[key].count + count = settings.SUBSTACK[key]['count'] feed += [(x, key) for x in publication.feed()[:count]] for key, sites in sitemaps.items(): - count = settings.SITEMAP[key].count + count = settings.SITEMAP[key]['count'] feed += [(x, key) for x in sites.feed()[:count]] diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py index 5863c4c..cfb83d7 100644 --- a/apiserver/feeds/sitemap.py +++ b/apiserver/feeds/sitemap.py @@ -10,6 +10,7 @@ if __name__ == '__main__': import requests from datetime import datetime from bs4 import BeautifulSoup +import extruct from utils import clean @@ -81,23 +82,40 @@ class Sitemap: if not r: return False - (data, method) = get_article_details(ref) - if not data: - return False - if 'outline' not in method: + html = xml(lambda x: ref) + + if not html: return False + + data = extruct.extract(html) + s = {} - s['author'] = data['author'] s['author_link'] = '' - s['date'] = unix(r.find('lastmod').text) - s['score'] = 0 - s['title'] = data['title'] - s['link'] = data['article_url'] - s['url'] = data['article_url'] + s['score'] = '' s['comments'] = [] s['num_comments'] = 0 - s['text'] = data['html'] - + s['link'] = ref + s['url'] = ref + s['date'] = unix(r.find('lastmod').text) + + for og in data['opengraph']: + titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']])) + if len(titles): + s['title'] = titles[0] + + + for md in data['microdata']: + if md['type'] == 'https://schema.org/NewsArticle': + props = md['properties'] + s['title'] = props['headline'] + if props['author']: + s['author'] = props['author']['properties']['name'] + + for ld in data['json-ld']: + if ld['@type'] == 'Article': + s['title'] = ld['headline'] + if ld['author']: + s['author'] = ld['author']['name'] return s @@ -107,4 +125,4 @@ if __name__ == '__main__': site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/") posts = site.feed() print(posts[:1]) - print(site.story(posts[0])) \ No newline at end of file + print(site.story(posts[0])) diff --git a/apiserver/feeds/substack.py b/apiserver/feeds/substack.py index 7402c87..a60d095 100644 --- a/apiserver/feeds/substack.py +++ b/apiserver/feeds/substack.py @@ -157,4 +157,4 @@ if __name__ == '__main__': webworm = Publication("https://www.webworm.co/") posts = webworm.feed() print(posts[:1]) - print(webworm.story(posts[0])) \ No newline at end of file + print(webworm.story(posts[0])) diff --git a/apiserver/requirements.txt b/apiserver/requirements.txt index c198079..e6141ab 100644 --- a/apiserver/requirements.txt +++ b/apiserver/requirements.txt @@ -4,6 +4,7 @@ certifi==2020.6.20 chardet==3.0.4 click==7.1.2 commonmark==0.9.1 +extruct==0.10.0 Flask==1.1.2 Flask-Cors==3.0.8 gevent==20.6.2