From 2ede5ed6ff7471d2f15c474d022ec469c97b04fe Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Fri, 30 Aug 2019 06:23:14 +0000 Subject: [PATCH] Filter out False comments --- apiserver/feeds/hackernews.py | 5 +++++ apiserver/feeds/reddit.py | 22 +++++++++++----------- apiserver/feeds/tildes.py | 32 ++++++++++++++++++++------------ 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py index 74b9cb5..e9c9a31 100644 --- a/apiserver/feeds/hackernews.py +++ b/apiserver/feeds/hackernews.py @@ -25,12 +25,16 @@ def feed(): return api(API_TOPSTORIES) or [] def comment(i): + if 'author' not in i: + return False + c = {} c['author'] = i.get('author', '') c['score'] = i.get('points', 0) c['date'] = i.get('created_at_i', 0) c['text'] = i.get('text', '') c['comments'] = [comment(j) for j in i['children']] + c['comments'] = list(filter(bool, c['comments'])) return c def comment_count(i): @@ -55,6 +59,7 @@ def story(ref): s['link'] = SITE_LINK(ref) s['url'] = r.get('url', '') s['comments'] = [comment(i) for i in r['children']] + s['comments'] = list(filter(bool, s['comments'])) s['num_comments'] = comment_count(s) - 1 if 'text' in r and r['text']: diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py index 29f8aaf..2160acb 100644 --- a/apiserver/feeds/reddit.py +++ b/apiserver/feeds/reddit.py @@ -22,22 +22,21 @@ reddit = praw.Reddit('bot') def feed(): return [x.id for x in reddit.subreddit(SUBREDDITS).hot()] -def good_comment(c): - if isinstance(c, MoreComments): - return False - if c.body == '[removed]': - return False - if c.author and c.author.name == 'AutoModerator': - return False - return True - def comment(i): + if isinstance(i, MoreComments): + return False + if '[removed]' in i.body or '[deleted]' in i.body: + return False + if i.author and i.author.name == 'AutoModerator': + return False + c = {} c['author'] = i.author.name if i.author else '[Deleted]' c['score'] = i.score c['date'] = i.created_utc c['text'] = render_md(i.body) - c['comments'] = [comment(j) for j in i.replies if good_comment(j)] + c['comments'] = [comment(j) for j in i.replies] + c['comments'] = list(filter(bool, c['comments'])) return c def story(ref): @@ -52,7 +51,8 @@ def story(ref): s['title'] = r.title s['link'] = SITE_LINK(r.permalink) s['url'] = r.url - s['comments'] = [comment(i) for i in r.comments if good_comment(i)] + s['comments'] = [comment(i) for i in r.comments] + s['comments'] = list(filter(bool, s['comments'])) s['num_comments'] = r.num_comments if r.selftext: diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py index 8154dbe..48286b8 100644 --- a/apiserver/feeds/tildes.py +++ b/apiserver/feeds/tildes.py @@ -41,6 +41,10 @@ def unix(date_str): def comment(i): i = i.article + + if i.find('div', class_='is-comment-removed'): + return False + c = {} c['author'] = str(i.find('a', class_='link-user').string) c['score'] = 1 @@ -48,6 +52,7 @@ def comment(i): c['text'] = i.find('div', class_='comment-text').encode_contents().decode() ct = i.find('ol', class_='comment-tree') c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else [] + c['comments'] = list(filter(bool, c['comments'])) return c def story(ref): @@ -76,6 +81,7 @@ def story(ref): s['url'] = ud.a['href'] if ud else s['link'] sc = a.find('ol', id='comments') s['comments'] = [comment(i) for i in sc.findAll('li', recursive=False)] + s['comments'] = list(filter(bool, s['comments'])) ch = a.find('header', class_='topic-comments-header') s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0 @@ -87,17 +93,19 @@ def story(ref): # scratchpad so I can quickly develop the parser if __name__ == '__main__': - print(feed()) - normal = story('gxt') - print(normal) - no_comments = story('gxr') - print(no_comments) - self_post = story('gsb') - print(self_post) - li_comment = story('gqx') - print(li_comment) + #print(feed()) + #normal = story('gxt') + #print(normal) + #no_comments = story('gxr') + #print(no_comments) + #self_post = story('gsb') + #print(self_post) + #li_comment = story('gqx') + #print(li_comment) + broken = story('h23') + print(broken) # make sure there's no self-reference - import copy - for x in [normal, no_comments, self_post, li_comment]: - _ = copy.deepcopy(x) + #import copy + #for x in [normal, no_comments, self_post, li_comment]: + # _ = copy.deepcopy(x)