From 76f1d5770277c91d3ea7d5c4d6b33f7aeacf547f Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Tue, 3 Nov 2020 16:00:03 +1300
Subject: [PATCH] sitemap-based feed.

Add a sitemap.Sitemap feed source that reads article URLs (and their
<lastmod> dates) from a site's XML sitemap and fetches article content
through the Outline API. Register Stuff and NZ Herald as sitemap-backed
sources in feed.py.
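A rough usage sketch, mirroring the scratchpad at the bottom of
sitemap.py:

    from feeds import sitemap

    site = sitemap.Sitemap("https://www.stuff.co.nz/sitemap.xml")
    refs = site.feed()           # article URLs that carry a <lastmod> date
    story = site.story(refs[0])  # dict of story fields, or False on failure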
---
 apiserver/feed.py          |  10 +++-
 apiserver/feeds/sitemap.py | 114 +++++++++++++++++++++++++++++++++++++
 apiserver/requirements.txt |   1 +
 3 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 apiserver/feeds/sitemap.py

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 8c9ee73..6e3cf3e 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -7,7 +7,7 @@ import requests
 import time
 from bs4 import BeautifulSoup
 
-from feeds import hackernews, reddit, tildes, substack, manual
+from feeds import hackernews, reddit, tildes, substack, manual, sitemap
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -17,11 +17,15 @@ TWO_DAYS = 60*60*24*2
 
 webworm = substack.Publication("https://www.webworm.co")
 bulletin = substack.Publication("https://thespinoff.substack.com")
+stuff = sitemap.Sitemap("https://www.stuff.co.nz/sitemap.xml")
+nzherald = sitemap.Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
 
 def list():
     feed = []
     feed += [(x, 'hackernews') for x in hackernews.feed()[:10]]
     feed += [(x, 'tildes') for x in tildes.feed()[:10]]
+    feed += [(x, 'stuff') for x in stuff.feed()[:10]]
+    feed += [(x, 'nzherald') for x in nzherald.feed()[:10]]
     feed += [(x, 'substack') for x in substack.top.feed()[:15]]
     feed += [(x, 'reddit') for x in reddit.feed()[:15]]
     feed += [(x, 'webworm') for x in webworm.feed()[:15]]
@@ -89,6 +93,10 @@ def update_story(story, is_manual=False):
         res = bulletin.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] == 'stuff':
+        res = stuff.story(story['ref'])
+    elif story['source'] == 'nzherald':
+        res = nzherald.story(story['ref'])
     elif story['source'] == 'manual':
         res = manual.story(story['ref'])
 
diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py
new file mode 100644
index 0000000..5863c4c
--- /dev/null
+++ b/apiserver/feeds/sitemap.py
@@ -0,0 +1,114 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+import requests
+import time
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+from utils import clean
+
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+
+def unix(date_str):
+    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
+
+def xml(route, ref=None):
+    try:
+        # 66.249.66.1 is a Googlebot IP
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def get_article_details(url):
+    try:
+        params = {'source_url': url}
+        headers = {'Referer': 'https://outline.com/'}
+        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
+        if r.status_code == 429:
+            logging.info('Rate limited by outline, sleeping 30s and skipping...')
+            time.sleep(30)
+            return ''
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        data = r.json()['data']
+        if 'URL is not supported by Outline' in data['html']:
+            raise Exception('URL not supported by Outline')
+        return (data, "outline")
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem outlining article: {}'.format(str(e)))
+        return (None, None)
+
+
+class Sitemap:
+    def __init__(self, url):
+        self.sitemap_url = url
+
+    def feed(self):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        # only keep entries that carry a <lastmod> date
+        articles = [a for a in articles if a.find('lastmod') is not None]
+        return [x.find('loc').text for x in articles]
+
+    def story(self, ref):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return False
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        articles = [a for a in articles if a.find('lastmod') is not None]
+        articles = [a for a in articles if a.find('loc').text == ref]
+
+        if len(articles) == 0:
+            return False
+
+        r = articles[0]
+        if not r:
+            return False
+
+        (data, method) = get_article_details(ref)
+        if not data:
+            return False
+        if 'outline' not in method:
+            return False
+
+        s = {}
+        s['author'] = data['author']
+        s['author_link'] = ''
+        s['date'] = unix(r.find('lastmod').text)
+        s['score'] = 0
+        s['title'] = data['title']
+        s['link'] = data['article_url']
+        s['url'] = data['article_url']
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['text'] = data['html']
+
+        return s
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    # site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
+    posts = site.feed()
+    print(posts[:1])
+    print(site.story(posts[0]))
\ No newline at end of file

diff --git a/apiserver/requirements.txt b/apiserver/requirements.txt
index c34a469..c198079 100644
--- a/apiserver/requirements.txt
+++ b/apiserver/requirements.txt
@@ -11,6 +11,7 @@
 greenlet==0.4.16
 idna==2.10
 itsdangerous==1.1.0
 Jinja2==2.11.2
+lxml==4.6.1
 MarkupSafe==1.1.1
 packaging==20.4
 praw==6.4.0