From 29f8a8b8cca0c0236191b2a25807907f9de1e651 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Wed, 4 Nov 2020 11:08:50 +1300
Subject: [PATCH] add news site categories feed.

---
 apiserver/feed.py             |  13 ++-
 apiserver/feeds/news.py       | 197 ++++++++++++++++++++++++++++++++++
 apiserver/feeds/sitemap.py    | 128 ----------------------
 apiserver/settings.py.example |   4 +
 4 files changed, 212 insertions(+), 130 deletions(-)
 create mode 100644 apiserver/feeds/news.py
 delete mode 100644 apiserver/feeds/sitemap.py

diff --git a/apiserver/feed.py b/apiserver/feed.py
index a9c7882..d47f036 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -8,7 +8,7 @@ import time
 from bs4 import BeautifulSoup
 
 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, sitemap
+from feeds import hackernews, reddit, tildes, substack, manual, news
 
 OUTLINE_API = 'https://api.outline.com/v3/parse_article'
 READ_API = 'http://127.0.0.1:33843'
@@ -19,9 +19,12 @@ TWO_DAYS = 60*60*24*2
 substacks = {}
 for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
+categories = {}
+for key, value in settings.CATEGORY.items():
+    categories[key] = news.Category(value['url'])
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = sitemap.Sitemap(value['url'])
+    sitemaps[key] = news.Sitemap(value['url'])
 
 def list():
     feed = []
@@ -41,6 +44,10 @@ def list():
         count = settings.SUBSTACK[key]['count']
         feed += [(x, key) for x in publication.feed()[:count]]
 
+    for key, sites in categories.items():
+        count = settings.CATEGORY[key]['count']
+        feed += [(x, key) for x in sites.feed()[:count]]
+
     for key, sites in sitemaps.items():
         count = settings.SITEMAP[key]['count']
         feed += [(x, key) for x in sites.feed()[:count]]
@@ -105,6 +112,8 @@ def update_story(story, is_manual=False):
         res = tildes.story(story['ref'])
     elif story['source'] == 'substack':
         res = substack.top.story(story['ref'])
+    elif story['source'] in categories.keys():
+        res = categories[story['source']].story(story['ref'])
     elif story['source'] in sitemaps.keys():
         res = sitemaps[story['source']].story(story['ref'])
     elif story['source'] in substacks.keys():
diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py
new file mode 100644
index 0000000..9c1917f
--- /dev/null
+++ b/apiserver/feeds/news.py
@@ -0,0 +1,197 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0, '.')
+
+import requests
+from datetime import datetime
+from bs4 import BeautifulSoup
+import extruct
+
+from utils import clean
+
+OUTLINE_API = 'https://api.outline.com/v3/parse_article'
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+
+def unix(date_str):
+    date_tzfix = date_str
+    if ":" == date_tzfix[-3]:
+        date_tzfix = date_tzfix[:-3] + date_tzfix[-2:]
+    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z']
+    for f in formats:
+        try:
+            return int(datetime.strptime(date_str, f).timestamp())
+        except ValueError:
+            pass
+        try:
+            return int(datetime.strptime(date_tzfix, f).timestamp())
+        except ValueError:
+            pass
+    return 0
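+
+# e.g. unix('2020-11-04T11:08:50+13:00') -> 1604441330 (2020-11-03T22:08:50 UTC).
+# The colon-stripping above exists because strptime's %z only accepts offsets
+# written with a colon ('+13:00') from Python 3.7, so both the raw and the
+# stripped ('+1300') forms are tried against each format.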
+
+def xml(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def parse_extruct(s, data):
+    for rdfa in data['rdfa']:
+        for key, props in rdfa.items():
+            if 'http://ogp.me/ns#title' in props:
+                for values in props['http://ogp.me/ns#title']:
+                    s['title'] = values['@value']
+            if 'http://ogp.me/ns/article#modified_time' in props:
+                for values in props['http://ogp.me/ns/article#modified_time']:
+                    logging.debug(f"modified_time: {values['@value']}")
+                    s['date'] = unix(values['@value'])
+            if 'http://ogp.me/ns/article#published_time' in props:
+                for values in props['http://ogp.me/ns/article#published_time']:
+                    logging.debug(f"published_time: {values['@value']}")
+                    s['date'] = unix(values['@value'])
+
+    for og in data['opengraph']:
+        titles = [value for key, value in og['properties'] if 'og:title' in key]
+        modified = [value for key, value in og['properties'] if 'article:modified_time' in key]
+        published = [value for key, value in og['properties'] if 'article:published_time' in key]
+        # published_time wins over modified_time when both are present
+        if len(modified):
+            s['date'] = unix(modified[0])
+        if len(published):
+            s['date'] = unix(published[0])
+        if len(titles):
+            s['title'] = titles[0]
+
+    for md in data['microdata']:
+        if md['type'] == 'https://schema.org/NewsArticle':
+            props = md['properties']
+            s['title'] = props['headline']
+            if props.get('dateModified'):
+                s['date'] = unix(props['dateModified'])
+            if props.get('datePublished'):
+                s['date'] = unix(props['datePublished'])
+            if 'author' in props and props['author']:
+                s['author'] = props['author']['properties']['name']
+
+    for ld in data['json-ld']:
+        if ld['@type'] == 'Article':
+            s['title'] = ld['headline']
+            if ld.get('dateModified'):
+                s['date'] = unix(ld['dateModified'])
+            if ld.get('datePublished'):
+                s['date'] = unix(ld['datePublished'])
+            if 'author' in ld and ld['author']:
+                s['author'] = ld['author']['name']
+
+    return s
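+
+# For reference, the extruct structures consumed above look roughly like this
+# (shapes per the extruct docs; the values are illustrative only):
+#   data['opengraph'] == [{'properties': [('og:title', 'Headline'),
+#       ('article:published_time', '2020-11-04T11:08:50+13:00')]}]
+#   data['microdata'] == [{'type': 'https://schema.org/NewsArticle',
+#       'properties': {'headline': 'Headline',
+#                      'author': {'properties': {'name': 'A. Reporter'}}}}]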
+
+class Sitemap:
+    def __init__(self, url):
+        self.sitemap_url = url
+
+    def feed(self):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        # only keep urls that carry a lastmod timestamp
+        articles = [a for a in articles if a.find('lastmod') is not None]
+        return [x.find('loc').text for x in articles] or []
+
+    def story(self, ref):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return False
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        articles = [a for a in articles if a.find('lastmod') is not None]
+        articles = [a for a in articles if a.find('loc').text == ref]
+
+        if len(articles) == 0:
+            return False
+
+        r = articles[0]
+
+        html = xml(lambda x: ref)
+        if not html:
+            return False
+
+        data = extruct.extract(html)
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = ''
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = unix(r.find('lastmod').text)
+
+        s = parse_extruct(s, data)
+        return s
+
+class Category:
+    def __init__(self, url):
+        self.category_url = url
+        # e.g. https://www.rnz.co.nz/news/national -> https://www.rnz.co.nz
+        self.base_url = '/'.join(url.split('/')[:3])
+
+    def feed(self):
+        markup = xml(lambda x: self.category_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='html.parser')
+        links = soup.find_all('a', href=True)
+        links = [link.get('href') for link in links]
+        # make relative links absolute, then keep only links within this category
+        links = [f"{self.base_url}{link}" if link.startswith('/') else link for link in links]
+        links = [link for link in links if link.startswith(self.category_url)]
+        return links
+
+    def story(self, ref):
+        markup = xml(lambda x: ref)
+        if not markup:
+            return False
+
+        data = extruct.extract(markup)
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = ''
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = 0
+
+        s = parse_extruct(s, data)
+        return s
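+
+# Both story() methods return a story dict shaped like the other scrapers'
+# output, or False on failure (values illustrative):
+#   {'title': 'Headline', 'author': 'A. Reporter', 'date': 1604441330,
+#    'link': ref, 'url': ref, 'comments': [], 'num_comments': 0,
+#    'score': '', 'author_link': ''}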
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: Stuff")
+    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    posts = site.feed()
+    print(posts[:1])
+    print(site.story(posts[0]))
+
+    print("Sitemap: NZ Herald")
+    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
+    posts = site.feed()
+    print(posts[:1])
+    print(site.story(posts[0]))
+
+    print("Category: RadioNZ Te Ao Māori")
+    site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
+    posts = site.feed()
+    print(posts[:1])
+    print(site.story(posts[0]))
\ No newline at end of file
diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py
deleted file mode 100644
index 2282a5d..0000000
--- a/apiserver/feeds/sitemap.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0, '.')
-
-import requests
-from datetime import datetime
-from bs4 import BeautifulSoup
-import extruct
-
-from utils import clean
-
-OUTLINE_API = 'https://api.outline.com/v3/parse_article'
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
-
-def unix(date_str):
-    return int(datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ').timestamp())
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-def get_article_details(url):
-    try:
-        params = {'source_url': url}
-        headers = {'Referer': 'https://outline.com/'}
-        r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=20)
-        if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
-            return ''
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        data = r.json()['data']
-        if 'URL is not supported by Outline' in data['html']:
-            raise Exception('URL not supported by Outline')
-        return (data, "outline")
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem outlining article: {}'.format(str(e)))
-        return (None, None)
-
-
-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
-
-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        return [x.find('loc').text for x in articles] or []
-
-    def story(self, ref):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        articles = list(filter(None, [a if a.find('loc').text == ref else None for a in articles]))
-
-        if len(articles) == 0:
-            return False
-
-        r = articles[0]
-        if not r:
-            return False
-
-        html = xml(lambda x: ref)
-
-        if not html:
-            return False
-
-        data = extruct.extract(html)
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = ''
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = unix(r.find('lastmod').text)
-
-        for og in data['opengraph']:
-            titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-            if len(titles):
-                s['title'] = titles[0]
-
-
-        for md in data['microdata']:
-            if md['type'] == 'https://schema.org/NewsArticle':
-                props = md['properties']
-                s['title'] = props['headline']
-                if 'author' in props and props['author']:
-                    s['author'] = props['author']['properties']['name']
-
-        for ld in data['json-ld']:
-            if ld['@type'] == 'Article':
-                s['title'] = ld['headline']
-                if 'author' in ld and ld['author']:
-                    s['author'] = ld['author']['name']
-        return s
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    #site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))
diff --git a/apiserver/settings.py.example b/apiserver/settings.py.example
index 3a21597..0852cdc 100644
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -19,6 +19,10 @@ NUM_SUBSTACK = 10
 #     'the bulletin': { 'url': "https://thespinoff.substack.com", 'count': 10},
 # }
 
+# CATEGORY = {
+#     'rnz national': { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# }
+
 # Reddit account info
 # leave blank if not using Reddit
 REDDIT_CLIENT_ID = ''