From 1fe524207e8ac67ecf573a39057f334b97d7fb21 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger <jason@credisense.io>
Date: Thu, 19 Nov 2020 14:23:01 +1300
Subject: [PATCH] stuff comments.

---
 apiserver/feeds/category.py |  8 ++---
 apiserver/feeds/sitemap.py  | 22 +++++++------
 apiserver/misc/news.py      |  8 ++++-
 apiserver/misc/stuff.py     | 62 +++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+), 15 deletions(-)
 create mode 100644 apiserver/misc/stuff.py

diff --git a/apiserver/feeds/category.py b/apiserver/feeds/category.py
index 23dd1f2..ce5456a 100644
--- a/apiserver/feeds/category.py
+++ b/apiserver/feeds/category.py
@@ -53,7 +53,7 @@ class Category(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Category: RadioNZ")
-    site = Category("https://www.rnz.co.nz/news/")
+    site = Category({ 'url': "https://www.rnz.co.nz/news/" })
     excludes = [
         'rnz.co.nz/news/sport',
         'rnz.co.nz/weather',
@@ -61,12 +61,12 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
     print("Category: Newsroom")
-    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
 
diff --git a/apiserver/feeds/sitemap.py b/apiserver/feeds/sitemap.py
index 4efe47d..d7e1990 100644
--- a/apiserver/feeds/sitemap.py
+++ b/apiserver/feeds/sitemap.py
@@ -76,7 +76,7 @@ class Sitemap(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
     excludes = [
         'thespinoff.co.nz/sitemap-misc.xml',
         'thespinoff.co.nz/sitemap-authors.xml',
@@ -84,16 +84,18 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
     print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
+    site = Sitemap({
+        'url': [
+            'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+        ],
+    })
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
+    print(site.story(posts[0][0], posts[0][1]))
+    
diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py
index 8d32143..1594574 100644
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -14,6 +14,7 @@ from utils import clean
 from misc.metadata import parse_extruct
 from misc.time import unix
 from misc.api import xml
+import misc.stuff as stuff
 
 def comment(i):
     if 'author' not in i:
@@ -89,13 +90,18 @@ class Base:
         if 'disqus' in markup:
             try:
                 s['comments'] = declutter.get_comments(urlref)
-                c['comments'] = list(filter(bool, c['comments']))
+                s['comments'] = list(filter(bool, s['comments']))
                 s['num_comments'] = comment_count(s['comments'])
             except KeyboardInterrupt:
                 raise
             except:
                 pass
 
+        if urlref.startswith('https://www.stuff.co.nz'):
+            s['comments'] = stuff.get_comments(urlref)
+            s['comments'] = list(filter(bool, s['comments']))
+            s['num_comments'] = len(s['comments'])
+
         if not s['date']:
             return False
         return s
diff --git a/apiserver/misc/stuff.py b/apiserver/misc/stuff.py
new file mode 100644
index 0000000..8c01665
--- /dev/null
+++ b/apiserver/misc/stuff.py
@@ -0,0 +1,62 @@
+import re
+from bs4 import BeautifulSoup
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from misc.time import unix
+from misc.api import xml
+
+def _soup_get_text(soup):
+    """Return the tag's text; if that is empty, the stripped first CDATA string under it; else None."""
+    from bs4 import CData  # the module only imports BeautifulSoup, so the bare name `bs4` was undefined here
+    if not soup: return None
+    if soup.text: return soup.text
+    s = soup.find(text=lambda tag: isinstance(tag, CData))
+    return s.string.strip() if s and s.string else None
+
+def _parse_comment(soup):
+    """Build a comment dict from one feed <item> tag; fields that are not found keep the defaults below."""
+    c = {
+        'author': '',
+        'authorLink': '',
+        'score': 0,
+        'date': 0,
+        'text': '',
+        'comments': [],
+    }
+    # NOTE(review): the 'By: <name>' label is presumably in <title> (was read from <link>) - confirm against the feed.
+    title = _soup_get_text(soup.find('title'))
+    if title and 'By:' in title:
+        c['author'] = title.partition('By:')[2].strip()  # strip('By:') removed the chars B/y/: from both ends
+    if soup.find('dc:creator'):
+        c['author'] = _soup_get_text(soup.find('dc:creator'))
+    if soup.find('link'):
+        c['authorLink'] = _soup_get_text(soup.find('link'))
+    if soup.find('description'):
+        c['text'] = _soup_get_text(soup.find('description'))
+    pub_date = soup.find('pubdate') or soup.find('pubDate')  # html.parser lower-cases tag names
+    if pub_date:
+        c['date'] = unix(pub_date.text)
+    return c
+
+def get_comments(url):
+    """Fetch the Gigya comment RSS feed for a stuff.co.nz article URL and return a list of comment dicts ([] if none)."""
+    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
+    p = re.match(regex, url)
+    if not p: return []  # URL has no /<numeric id>/<slug> tail; p.groups() used to raise AttributeError on None
+    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{p.group(1)}'
+    markup = xml(lambda x: comment_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='html.parser')
+    comments = soup.find_all('item')
+    if not comments: return []
+    return [_parse_comment(c) for c in comments]
+
+
+# Scratchpad: run this module directly to fetch and print the comments of one sample article.
+if __name__ == '__main__':
+    sample_url = 'https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing'
+    found = get_comments(sample_url)
+    print(len(found), found[:5], sep='\n')
\ No newline at end of file