diff --git a/apiserver/feeds/category.py b/apiserver/feeds/category.py index 23dd1f2..ce5456a 100644 --- a/apiserver/feeds/category.py +++ b/apiserver/feeds/category.py @@ -53,7 +53,7 @@ class Category(Base): # scratchpad so I can quickly develop the parser if __name__ == '__main__': print("Category: RadioNZ") - site = Category("https://www.rnz.co.nz/news/") + site = Category({ 'url': "https://www.rnz.co.nz/news/" }) excludes = [ 'rnz.co.nz/news/sport', 'rnz.co.nz/weather', @@ -61,12 +61,12 @@ if __name__ == '__main__': ] posts = site.feed(excludes) print(posts[:5]) - print(site.story(posts[0])) + print(site.story(posts[0][0], posts[0][1])) print("Category: Newsroom") - site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland') + site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'}) posts = site.feed() print(posts[:5]) - print(site.story(posts[0])) + print(site.story(posts[0][0], posts[0][1])) diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py index 4efe47d..d7e1990 100644 --- a/apiserver/feeds/sitemap.py +++ b/apiserver/feeds/sitemap.py @@ -76,7 +76,7 @@ class Sitemap(Base): # scratchpad so I can quickly develop the parser if __name__ == '__main__': print("Sitemap: The Spinoff") - site = Sitemap("https://thespinoff.co.nz/sitemap.xml") + site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" }) excludes = [ 'thespinoff.co.nz/sitemap-misc.xml', 'thespinoff.co.nz/sitemap-authors.xml', @@ -84,16 +84,18 @@ if __name__ == '__main__': ] posts = site.feed(excludes) print(posts[:5]) - print(site.story(posts[0])) + print(site.story(posts[0][0], posts[0][1])) print("Sitemap: Newshub") - site = Sitemap([ - 'https://www.newshub.co.nz/home/politics.gnewssitemap.xml', - 'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml', - 'https://www.newshub.co.nz/home/world.gnewssitemap.xml', - 'https://www.newshub.co.nz/home/money.gnewssitemap.xml', - ]) + site = Sitemap({ + 'url': [ + 
'https://www.newshub.co.nz/home/politics.gnewssitemap.xml', + 'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml', + 'https://www.newshub.co.nz/home/world.gnewssitemap.xml', + 'https://www.newshub.co.nz/home/money.gnewssitemap.xml', + ], + }) posts = site.feed() print(posts[:5]) - print(site.story(posts[0])) - print(site.story(posts[:-1])) + print(site.story(posts[0][0], posts[0][1])) + diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py index 8d32143..1594574 100644 --- a/apiserver/misc/news.py +++ b/apiserver/misc/news.py @@ -14,6 +14,7 @@ from utils import clean from misc.metadata import parse_extruct from misc.time import unix from misc.api import xml +import misc.stuff as stuff def comment(i): if 'author' not in i: @@ -89,13 +90,18 @@ class Base: if 'disqus' in markup: try: s['comments'] = declutter.get_comments(urlref) - c['comments'] = list(filter(bool, c['comments'])) + s['comments'] = list(filter(bool, s['comments'])) s['num_comments'] = comment_count(s['comments']) except KeyboardInterrupt: raise except: pass + if urlref.startswith('https://www.stuff.co.nz'): + s['comments'] = stuff.get_comments(urlref) + s['comments'] = list(filter(bool, s['comments'])) + s['num_comments'] = len(s['comments']) + if not s['date']: return False return s diff --git a/apiserver/misc/stuff.py b/apiserver/misc/stuff.py new file mode 100644 index 0000000..8c01665 --- /dev/null +++ b/apiserver/misc/stuff.py @@ -0,0 +1,62 @@ +import re +from bs4 import BeautifulSoup, CData + +if __name__ == '__main__': + import sys + sys.path.insert(0,'.') + +from misc.time import unix +from misc.api import xml + +def _soup_get_text(soup): + if not soup: return None + if soup.text: return soup.text + + s = soup.find(text=lambda tag: isinstance(tag, CData)) + if s and s.string: return s.string.strip() + return None + +def _parse_comment(soup): + c = { + 'author': '', + 'authorLink': '', + 'score': 0, + 'date': 0, + 'text': '', + 'comments': [], + } + + if soup.find('link'): + title
= _soup_get_text(soup.find('link')) + if title and 'By:' in title: + c['author'] = title.replace('By:', '', 1).strip() + if soup.find('dc:creator'): + c['author'] = _soup_get_text(soup.find('dc:creator')) + if soup.find('link'): + c['authorLink'] = _soup_get_text(soup.find('link')) + if soup.find('description'): + c['text'] = _soup_get_text(soup.find('description')) + if soup.find('pubDate'): + c['date'] = unix(soup.find('pubDate').text) + + return c + +def get_comments(url): + regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+" + p = re.compile(regex).match(url) + path = p.groups()[0] + comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}' + markup = xml(lambda x: comment_url) + if not markup: return [] + soup = BeautifulSoup(markup, features='html.parser') + comments = soup.find_all('item') + if not comments: return [] + comments = [_parse_comment(c) for c in comments] + return comments + + +# scratchpad so I can quickly develop the parser +if __name__ == '__main__': + comments = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing') + print(len(comments)) + print(comments[:5]) \ No newline at end of file