diff --git a/apiserver/feed.py b/apiserver/feed.py
index 5e5605f..9314a9f 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,7 +9,9 @@ from bs4 import BeautifulSoup
 import itertools
 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, news
+from feeds import hackernews, reddit, tildes, substack, manual
+from feeds.sitemap import Sitemap
+from feeds.category import Category
 from scrapers import outline, declutter, browser, local
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -19,10 +21,10 @@ for key, value in settings.SUBSTACK.items():
     substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'], value.get('tz'))
+    categories[key] = Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = Sitemap(value['url'], value.get('tz'))
 
 def get_list():
     feeds = {}
diff --git a/apiserver/feeds/_news.py b/apiserver/feeds/_news.py
new file mode 100644
index 0000000..0b5041d
--- /dev/null
+++ b/apiserver/feeds/_news.py
@@ -0,0 +1,90 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+import requests
+from bs4 import BeautifulSoup
+from scrapers import declutter
+import extruct
+
+import settings
+from utils import clean
+from misc.metadata import parse_extruct
+from misc.time import unix
+from misc.api import xml
+
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class Base:
+    def __init__(self, url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        return []
+
+    def story(self, ref):
+        markup = xml(lambda x: ref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = 0
+
+        soup = BeautifulSoup(markup, features='html.parser')
+        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+        favicon = soup.find_all('link', rel="shortcut icon", href=True)
+        others = soup.find_all('link', rel="icon", href=True)
+        icons = icon32 + icon16 + favicon + others
+        base_url = '/'.join(ref.split('/')[:3])
+        icons = list(set([i.get('href') for i in icons]))
+        icons = [i if i.startswith('http') else base_url + i for i in icons]
+
+        if icons:
+            s['icon'] = icons[0]
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                s['comments'] = list(filter(bool, s['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
+        return s
diff --git a/apiserver/feeds/category.py b/apiserver/feeds/category.py
new file mode 100644
index 0000000..a58f82e
--- /dev/null
+++ b/apiserver/feeds/category.py
@@ -0,0 +1,70 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.api import xml
+from _news import Base
+
+def _filter_links(links, category_url, excludes=None):
+    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
+    links = list(filter(None, [link if link != category_url else None for link in links]))
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_category(category_url, excludes=None):
+    base_url = '/'.join(category_url.split('/')[:3])
+    markup = xml(lambda x: category_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='html.parser')
+    links = soup.find_all('a', href=True)
+    links = [link.get('href') for link in links]
+    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
+    links = _filter_links(links, category_url, excludes)
+    return links
+
+class Category(Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.category_url = url
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.category_url, str):
+            links += _get_category(self.category_url, excludes)
+        elif isinstance(self.category_url, list):
+            for url in self.category_url:
+                links += _get_category(url, excludes)
+        return list(set(links))
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Category: RadioNZ")
+    site = Category("https://www.rnz.co.nz/news/")
+    excludes = [
+        'rnz.co.nz/news/sport',
+        'rnz.co.nz/weather',
+        'rnz.co.nz/news/weather',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Category: Newsroom")
+    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+
diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py
deleted file mode 100644
index a71099c..0000000
--- a/apiserver/feeds/news.py
+++ /dev/null
@@ -1,307 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-import requests
-from datetime import datetime
-from bs4 import BeautifulSoup
-from scrapers import declutter
-import dateutil.parser
-import extruct
-import pytz
-
-from utils import clean
-import settings
-
-tzinfos = {
-    'NZDT': pytz.timezone('Pacific/Auckland'),
-    'NZST': pytz.timezone('Pacific/Auckland')
-}
-
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
-#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-def unix(date_str, tz=None):
-    try:
-        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
-
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-
-def parse_extruct(s, data):
-    rdfa_keys = {
-        'title': [
-            'http://ogp.me/ns#title',
-            'https://ogp.me/ns#title',
-        ],
-        'date': [
-            'http://ogp.me/ns/article#modified_time',
-            'https://ogp.me/ns/article#modified_time',
-            'http://ogp.me/ns/article#published_time',
-            'https://ogp.me/ns/article#published_time',
-        ]
-    }
-    for rdfa in data['rdfa']:
-        for key, props in rdfa.items():
-            for attribute, properties in rdfa_keys.items():
-                for prop in properties:
-                    if prop in props:
-                        for values in props[prop]:
-                            s[attribute] = values['@value']
-
-    for og in data['opengraph']:
-        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
-        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
-        if len(modified):
-            s['date'] = modified[0]
-        if len(published):
-            s['date'] = published[0]
-        if len(titles):
-            s['title'] = titles[0]
-
-    for md in data['microdata']:
-        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
-            props = md['properties']
-            s['title'] = props['headline']
-            if props['dateModified']:
-                s['date'] = props['dateModified']
-            if props['datePublished']:
-                s['date'] = props['datePublished']
-            if 'author' in props and props['author']:
-                if 'properties' in props['author']:
-                    s['author'] = props['author']['properties']['name']
-                elif isinstance(props['author'], list):
-                    s['author'] = props['author'][0]['properties']['name']
-
-    for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
-            s['title'] = ld['headline']
-            if ld['dateModified']:
-                s['date'] = ld['dateModified']
-            if ld['datePublished']:
-                s['date'] = ld['datePublished']
-            if 'author' in ld and ld['author']:
-                if 'name' in ld['author']:
-                    s['author'] = ld['author']['name']
-                elif isinstance(ld['author'], list):
-                    s['author'] = ld['author'][0]['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']
-
-    return s
-
-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class _Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
-
-    def feed(self, excludes=None):
-        return []
-
-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(ref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
-        if icons:
-            s['icon'] = icons[0]
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(ref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-        return s
-
-def get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    if a.find('ns2:publication_date'):
-        return a.find('ns2:publication_date').text
-    return ''
-
-class Sitemap(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.sitemap_url, str):
-            links += self._get_sitemap(self.sitemap_url, excludes)
-        elif isinstance(self.sitemap_url, list):
-            for url in self.sitemap_url:
-                links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-    def _filter_links(self, links, excludes=None):
-        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
-        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
-        links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
-        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
-
-        links = [x.find('loc').text for x in links] or []
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_sitemap(self, feed_url, excludes=None):
-        markup = xml(lambda x: feed_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        links = []
-        feed_urls = []
-        if soup.find('sitemapindex'):
-            sitemap = soup.find('sitemapindex').findAll('sitemap')
-            feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-        if soup.find('urlset'):
-            sitemap = soup.find('urlset').findAll('url')
-            links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-
-        feed_urls = self._filter_links(feed_urls, excludes)
-        links = self._filter_links(links, excludes)
-
-        for url in feed_urls:
-            links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-class Category(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.category_url = url
-
-    def _filter_links(self, links, category_url, excludes=None):
-        links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
-        links = list(filter(None, [link if link != category_url else None for link in links]))
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_category(self, category_url, excludes=None):
-        base_url = '/'.join(category_url.split('/')[:3])
-        markup = xml(lambda x: category_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='html.parser')
-        links = soup.find_all('a', href=True)
-        links = [link.get('href') for link in links]
-        links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
-        links = self._filter_links(links, category_url, excludes)
-        return links
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.category_url, str):
-            links += self._get_category(self.category_url, excludes)
-        elif isinstance(self.category_url, list):
-            for url in self.category_url:
-                links += self._get_category(url, excludes)
-        return list(set(links))
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
-    excludes = [
-        'thespinoff.co.nz/sitemap-misc.xml',
-        'thespinoff.co.nz/sitemap-authors.xml',
-        'thespinoff.co.nz/sitemap-tax-category.xml',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
-
diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py
new file mode 100644
index 0000000..ac9f7e7
--- /dev/null
+++ b/apiserver/feeds/sitemap.py
@@ -0,0 +1,97 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.time import unix
+from misc.api import xml
+from _news import Base
+
+def _get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    if a.find('ns2:publication_date'):
+        return a.find('ns2:publication_date').text
+    return ''
+
+def _filter_links(links, excludes=None):
+    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
+    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
+    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
+    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
+
+    links = [x.find('loc').text for x in links] or []
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_sitemap(feed_url, excludes=None):
+    markup = xml(lambda x: feed_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='lxml')
+    links = []
+    feed_urls = []
+    if soup.find('sitemapindex'):
+        sitemap = soup.find('sitemapindex').findAll('sitemap')
+        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+    if soup.find('urlset'):
+        sitemap = soup.find('urlset').findAll('url')
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+
+    feed_urls = _filter_links(feed_urls, excludes)
+    links = _filter_links(links, excludes)
+
+    for url in feed_urls:
+        links += _get_sitemap(url, excludes)
+    return list(set(links))
+
+class Sitemap(Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.sitemap_url, str):
+            links += _get_sitemap(self.sitemap_url, excludes)
+        elif isinstance(self.sitemap_url, list):
+            for url in self.sitemap_url:
+                links += _get_sitemap(url, excludes)
+        return list(set(links))
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: The Spinoff")
+    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    excludes = [
+        'thespinoff.co.nz/sitemap-misc.xml',
+        'thespinoff.co.nz/sitemap-authors.xml',
+        'thespinoff.co.nz/sitemap-tax-category.xml',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Sitemap: Newshub")
+    site = Sitemap([
+        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+    ])
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+    print(site.story(posts[-1]))
diff --git a/apiserver/misc/api.py b/apiserver/misc/api.py
new file mode 100644
index 0000000..9353375
--- /dev/null
+++ b/apiserver/misc/api.py
@@ -0,0 +1,35 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+FORWARD_IP = '66.249.66.1'
+
+def xml(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def json(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
\ No newline at end of file
diff --git a/apiserver/misc/metadata.py b/apiserver/misc/metadata.py
new file mode 100644
index 0000000..c705ab9
--- /dev/null
+++ b/apiserver/misc/metadata.py
@@ -0,0 +1,69 @@
+
+def parse_extruct(s, data):
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    for rdfa in data['rdfa']:
+        for key, props in rdfa.items():
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    if prop in props:
+                        for values in props[prop]:
+                            s[attribute] = values['@value']
+
+    for og in data['opengraph']:
+        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
+        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
+        if len(modified):
+            s['date'] = modified[0]
+        if len(published):
+            s['date'] = published[0]
+        if len(titles):
+            s['title'] = titles[0]
+
+    for md in data['microdata']:
+        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
+            props = md['properties']
+            s['title'] = props['headline']
+            if props['dateModified']:
+                s['date'] = props['dateModified']
+            if props['datePublished']:
+                s['date'] = props['datePublished']
+            if 'author' in props and props['author']:
+                if 'properties' in props['author']:
+                    s['author'] = props['author']['properties']['name']
+                elif isinstance(props['author'], list):
+                    s['author'] = props['author'][0]['properties']['name']
+
+    for ld in data['json-ld']:
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
+            s['title'] = ld['headline']
+            if ld['dateModified']:
+                s['date'] = ld['dateModified']
+            if ld['datePublished']:
+                s['date'] = ld['datePublished']
+            if 'author' in ld and ld['author']:
+                if 'name' in ld['author']:
+                    s['author'] = ld['author']['name']
+                elif isinstance(ld['author'], list):
+                    s['author'] = ld['author'][0]['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']
+
+    return s
\ No newline at end of file
diff --git a/apiserver/misc/time.py b/apiserver/misc/time.py
new file mode 100644
index 0000000..e705cb1
--- /dev/null
+++ b/apiserver/misc/time.py
@@ -0,0 +1,18 @@
+import pytz
+import dateutil.parser
+
+
+TZINFOS = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+def unix(date_str, tz=None, tzinfos=TZINFOS):
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
+    return 0
\ No newline at end of file