From 2ede5ed6ff7471d2f15c474d022ec469c97b04fe Mon Sep 17 00:00:00 2001
From: Tanner Collin <git@tannercollin.com>
Date: Fri, 30 Aug 2019 06:23:14 +0000
Subject: [PATCH] Filter out comments for which comment() returns False

---
 apiserver/feeds/hackernews.py |  5 +++++
 apiserver/feeds/reddit.py     | 22 +++++++++++-----------
 apiserver/feeds/tildes.py     | 32 ++++++++++++++++++++------------
 3 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py
index 74b9cb5..e9c9a31 100644
--- a/apiserver/feeds/hackernews.py
+++ b/apiserver/feeds/hackernews.py
@@ -25,12 +25,16 @@ def feed():
     return api(API_TOPSTORIES) or []
 
 def comment(i):
+    if 'author' not in i:
+        return False
+
     c = {}
     c['author'] = i.get('author', '')
     c['score'] = i.get('points', 0)
     c['date'] = i.get('created_at_i', 0)
     c['text'] = i.get('text', '')
     c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
     return c
 
 def comment_count(i):
@@ -55,6 +59,7 @@ def story(ref):
     s['link'] = SITE_LINK(ref)
     s['url'] = r.get('url', '')
     s['comments'] = [comment(i) for i in r['children']]
+    s['comments'] = list(filter(bool, s['comments']))
     s['num_comments'] = comment_count(s) - 1
 
     if 'text' in r and r['text']:
diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py
index 29f8aaf..2160acb 100644
--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -22,22 +22,21 @@ reddit = praw.Reddit('bot')
 def feed():
     return [x.id for x in reddit.subreddit(SUBREDDITS).hot()]
 
-def good_comment(c):
-    if isinstance(c, MoreComments):
-        return False
-    if c.body == '[removed]':
-        return False
-    if c.author and c.author.name == 'AutoModerator':
-        return False
-    return True
-
 def comment(i):
+    if isinstance(i, MoreComments):
+        return False
+    if '[removed]' in i.body or '[deleted]' in i.body:
+        return False
+    if i.author and i.author.name == 'AutoModerator':
+        return False
+
     c = {}
     c['author'] = i.author.name if i.author else '[Deleted]'
     c['score'] = i.score
     c['date'] = i.created_utc
     c['text'] = render_md(i.body)
-    c['comments'] = [comment(j) for j in i.replies if good_comment(j)]
+    c['comments'] = [comment(j) for j in i.replies]
+    c['comments'] = list(filter(bool, c['comments']))
     return c
 
 def story(ref):
@@ -52,7 +51,8 @@ def story(ref):
     s['title'] = r.title
     s['link'] = SITE_LINK(r.permalink)
     s['url'] = r.url
-    s['comments'] = [comment(i) for i in r.comments if good_comment(i)]
+    s['comments'] = [comment(i) for i in r.comments]
+    s['comments'] = list(filter(bool, s['comments']))
     s['num_comments'] = r.num_comments
 
     if r.selftext:
diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py
index 8154dbe..48286b8 100644
--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -41,6 +41,10 @@ def unix(date_str):
 
 def comment(i):
     i = i.article
+
+    if i.find('div', class_='is-comment-removed'):
+        return False
+
     c = {}
     c['author'] = str(i.find('a', class_='link-user').string)
     c['score'] = 1
@@ -48,6 +52,7 @@ def comment(i):
     c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
     ct = i.find('ol', class_='comment-tree')
     c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
+    c['comments'] = list(filter(bool, c['comments']))
     return c
 
 def story(ref):
@@ -76,6 +81,7 @@ def story(ref):
     s['url'] = ud.a['href'] if ud else s['link']
     sc = a.find('ol', id='comments')
     s['comments'] = [comment(i) for i in sc.findAll('li', recursive=False)]
+    s['comments'] = list(filter(bool, s['comments']))
     ch = a.find('header', class_='topic-comments-header')
     s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
 
@@ -87,17 +93,19 @@ def story(ref):
 
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print(feed())
-    normal = story('gxt')
-    print(normal)
-    no_comments = story('gxr')
-    print(no_comments)
-    self_post = story('gsb')
-    print(self_post)
-    li_comment = story('gqx')
-    print(li_comment)
+    #print(feed())
+    #normal = story('gxt')
+    #print(normal)
+    #no_comments = story('gxr')
+    #print(no_comments)
+    #self_post = story('gsb')
+    #print(self_post)
+    #li_comment = story('gqx')
+    #print(li_comment)
+    broken = story('h23')
+    print(broken)
 
     # make sure there's no self-reference
-    import copy
-    for x in [normal, no_comments, self_post, li_comment]:
-        _ = copy.deepcopy(x)
+    #import copy
+    #for x in [normal, no_comments, self_post, li_comment]:
+    #    _ = copy.deepcopy(x)