Fix Tildes comment parsing bug
This commit is contained in:
parent
2b1a352917
commit
cf9e197e6c
|
@ -29,6 +29,8 @@ def get_article(url):
|
||||||
def update_story(story):
|
def update_story(story):
|
||||||
res = {}
|
res = {}
|
||||||
|
|
||||||
|
logging.info('Updating story ' + str(story['ref']))
|
||||||
|
|
||||||
if story['source'] == 'hackernews':
|
if story['source'] == 'hackernews':
|
||||||
res = hackernews.story(story['ref'])
|
res = hackernews.story(story['ref'])
|
||||||
elif story['source'] == 'reddit':
|
elif story['source'] == 'reddit':
|
||||||
|
@ -42,6 +44,7 @@ def update_story(story):
|
||||||
story.update(res)
|
story.update(res)
|
||||||
if story.get('url', '') and not story.get('text', ''):
|
if story.get('url', '') and not story.get('text', ''):
|
||||||
if not story['url'].endswith('.pdf'):
|
if not story['url'].endswith('.pdf'):
|
||||||
|
logging.info('Getting article ' + story['url'])
|
||||||
story['text'] = get_article(story['url'])
|
story['text'] = get_article(story['url'])
|
||||||
else:
|
else:
|
||||||
story['text'] = '<p>Unsupported article type.</p>'
|
story['text'] = '<p>Unsupported article type.</p>'
|
||||||
|
|
|
@ -41,7 +41,8 @@ def comment(i):
|
||||||
c['score'] = 1
|
c['score'] = 1
|
||||||
c['date'] = unix(i.find('time')['datetime'])
|
c['date'] = unix(i.find('time')['datetime'])
|
||||||
c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
|
c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
|
||||||
c['comments'] = [comment(j) for j in i.find('ol', class_='comment-tree').findAll('li', recursive=False)] if i.ol else []
|
ct = i.find('ol', class_='comment-tree')
|
||||||
|
c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
|
||||||
return c
|
return c
|
||||||
|
|
||||||
def story(ref):
|
def story(ref):
|
||||||
|
@ -61,7 +62,8 @@ def story(ref):
|
||||||
s['link'] = SITE_LINK(ref)
|
s['link'] = SITE_LINK(ref)
|
||||||
ud = a.find('div', class_='topic-full-link')
|
ud = a.find('div', class_='topic-full-link')
|
||||||
s['url'] = ud.a['href'] if ud else s['link']
|
s['url'] = ud.a['href'] if ud else s['link']
|
||||||
s['comments'] = [comment(i) for i in a.find('ol', id='comments').findAll('li', recursive=False)]
|
sc = a.find('ol', id='comments')
|
||||||
|
s['comments'] = [comment(i) for i in sc.findAll('li', recursive=False)]
|
||||||
ch = a.find('header', class_='topic-comments-header')
|
ch = a.find('header', class_='topic-comments-header')
|
||||||
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
|
s['num_comments'] = int(ch.h2.string.split(' ')[0]) if ch else 0
|
||||||
|
|
||||||
|
@ -79,8 +81,10 @@ if __name__ == '__main__':
|
||||||
print(no_comments)
|
print(no_comments)
|
||||||
self_post = story('gsb')
|
self_post = story('gsb')
|
||||||
print(self_post)
|
print(self_post)
|
||||||
|
li_comment = story('gqx')
|
||||||
|
print(li_comment)
|
||||||
|
|
||||||
# make sure there's no self-reference
|
# make sure there's no self-reference
|
||||||
import copy
|
import copy
|
||||||
for x in [normal, no_comments, self_post]:
|
for x in [normal, no_comments, self_post, li_comment]:
|
||||||
_ = copy.deepcopy(x)
|
_ = copy.deepcopy(x)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user