Fix Tildes comment parsing bug

This commit is contained in:
Tanner Collin 2019-08-25 07:46:22 +00:00
parent 2b1a352917
commit cf9e197e6c
2 changed files with 10 additions and 3 deletions

View File

@ -29,6 +29,8 @@ def get_article(url):
def update_story(story):
res = {}
logging.info('Updating story ' + str(story['ref']))
if story['source'] == 'hackernews':
res = hackernews.story(story['ref'])
elif story['source'] == 'reddit':
@ -42,6 +44,7 @@ def update_story(story):
story.update(res)
if story.get('url', '') and not story.get('text', ''):
if not story['url'].endswith('.pdf'):
logging.info('Getting article ' + story['url'])
story['text'] = get_article(story['url'])
else:
story['text'] = '<p>Unsupported article type.</p>'

View File

@ -41,7 +41,8 @@ def comment(i):
c['score'] = 1
c['date'] = unix(i.find('time')['datetime'])
c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
c['comments'] = [comment(j) for j in i.find('ol', class_='comment-tree').findAll('li', recursive=False)] if i.ol else []
ct = i.find('ol', class_='comment-tree')
c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
return c
def story(ref):
@ -61,7 +62,8 @@ def story(ref):
s['link'] = SITE_LINK(ref)
ud = a.find('div', class_='topic-full-link')
s['url'] = ud.a['href'] if ud else s['link']
s['comments'] = [comment(i) for i in a.find('ol', id='comments').findAll('li', recursive=False)]
sc = a.find('ol', id='comments')
s['comments'] = [comment(i) for i in sc.findAll('li', recursive=False)]
ch = a.find('header', class_='topic-comments-header')
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
@ -79,8 +81,10 @@ if __name__ == '__main__':
print(no_comments)
self_post = story('gsb')
print(self_post)
li_comment = story('gqx')
print(li_comment)
# make sure there's no self-reference
import copy
for x in [normal, no_comments, self_post]:
for x in [normal, no_comments, self_post, li_comment]:
_ = copy.deepcopy(x)