stuff comments.

Jason Schwarzenberger 2020-11-19 14:23:01 +13:00
parent dc3d17b171
commit 1fe524207e
4 changed files with 85 additions and 15 deletions

apiserver/feeds/category.py

@@ -53,7 +53,7 @@ class Category(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Category: RadioNZ")
-    site = Category("https://www.rnz.co.nz/news/")
+    site = Category({ 'url': "https://www.rnz.co.nz/news/" })
     excludes = [
         'rnz.co.nz/news/sport',
         'rnz.co.nz/weather',
@@ -61,12 +61,12 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))

     print("Category: Newsroom")
-    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
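The story() calls changing from posts[0] to posts[0][0], posts[0][1] suggest feed() now returns (ref, url) pairs rather than bare refs. A hypothetical illustration, with names assumed since Base.feed is not part of this diff:

    posts = site.feed()             # assumed shape: [(ref, url), ...]
    ref, urlref = posts[0]
    print(site.story(ref, urlref))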

apiserver/feeds/sitemap.py

@@ -76,7 +76,7 @@ class Sitemap(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
     excludes = [
         'thespinoff.co.nz/sitemap-misc.xml',
         'thespinoff.co.nz/sitemap-authors.xml',
@@ -84,16 +84,18 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))

     print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
+    site = Sitemap({
+        'url': [
+            'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+        ],
+    })
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
     print(site.story(posts[:-1]))
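The constructor change above (a bare URL string or list becomes a config dict) lets one argument carry per-site options such as 'tz'. A hypothetical sketch of how Base might unpack it, since Base.__init__ is not shown in this diff:

    class Base:
        def __init__(self, config):
            self.config = config
            self.url = config.get('url')    # a URL string, or a list of sitemap URLs
            self.tz = config.get('tz')      # e.g. 'Pacific/Auckland'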

apiserver/feeds/news.py

@@ -14,6 +14,7 @@ from utils import clean
 from misc.metadata import parse_extruct
 from misc.time import unix
 from misc.api import xml
+import misc.stuff as stuff

 def comment(i):
     if 'author' not in i:
@@ -89,13 +90,18 @@ class Base:
        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(urlref)
-                c['comments'] = list(filter(bool, c['comments']))
+                s['comments'] = list(filter(bool, s['comments']))
                s['num_comments'] = comment_count(s['comments'])
            except KeyboardInterrupt:
                raise
            except:
                pass

+        if urlref.startswith('https://www.stuff.co.nz'):
+            s['comments'] = stuff.get_comments(urlref)
+            s['comments'] = list(filter(bool, s['comments']))
+            s['num_comments'] = len(s['comments'])
+
        if not s['date']:
            return False
        return s
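Before the new module itself, a quick sanity check of the URL mapping that stuff.get_comments (below) performs. The regex and Gigya endpoint are taken verbatim from the new file; the example article URL comes from its scratchpad:

    import re

    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    url = 'https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing'
    path = re.match(regex, url).groups()[0]
    # path == 'life-style/homed/houses/123418468'
    print(f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}')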

apiserver/misc/stuff.py (new file, 62 lines)

@@ -0,0 +1,62 @@
+import re
+
+from bs4 import BeautifulSoup, CData
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0, '.')
+
+from misc.time import unix
+from misc.api import xml
+
+def _soup_get_text(soup):
+    if not soup: return None
+    if soup.text: return soup.text
+
+    # fall back to a CDATA node when the tag exposes no plain text
+    s = soup.find(text=lambda tag: isinstance(tag, CData))
+    if s and s.string: return s.string.strip()
+
+    return None
+
+def _parse_comment(soup):
+    c = {
+        'author': '',
+        'authorLink': '',
+        'score': 0,
+        'date': 0,
+        'text': '',
+        'comments': [],
+    }
+
+    if soup.find('link'):
+        title = _soup_get_text(soup.find('link'))
+        if title and 'By:' in title:
+            # remove the leading 'By:' label (replace, since strip() takes a character set)
+            c['author'] = title.replace('By:', '', 1).strip()
+    if soup.find('dc:creator'):
+        c['author'] = _soup_get_text(soup.find('dc:creator'))
+    if soup.find('link'):
+        c['authorLink'] = _soup_get_text(soup.find('link'))
+    if soup.find('description'):
+        c['text'] = _soup_get_text(soup.find('description'))
+    # html.parser lowercases tag names, so <pubDate> has to be looked up as 'pubdate'
+    if soup.find('pubdate'):
+        c['date'] = unix(soup.find('pubdate').text)
+
+    return c
+
+def get_comments(url):
+    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
+    p = re.compile(regex).match(url)
+    if not p: return []
+    path = p.groups()[0]
+
+    # Gigya hosts Stuff's comment streams; each article's comments are exposed as RSS
+    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
+    markup = xml(lambda x: comment_url)
+    if not markup: return []
+
+    soup = BeautifulSoup(markup, features='html.parser')
+    comments = soup.find_all('item')
+    if not comments: return []
+    comments = [_parse_comment(c) for c in comments]
+
+    return comments
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    comments = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing')
+    print(len(comments))
+    print(comments[:5])
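For reference, a sketch of the comment item markup that _parse_comment assumes, runnable from the scratchpad context above. The tag names come from the find() calls in the parser; the values are invented:

    from bs4 import BeautifulSoup

    sample = """
    <item>
      <dc:creator>Jane Doe</dc:creator>
      <description><![CDATA[A comment body goes here.]]></description>
      <pubDate>Thu, 19 Nov 2020 01:23:01 +1300</pubDate>
    </item>
    """
    item = BeautifulSoup(sample, features='html.parser').find('item')
    print(_parse_comment(item))   # author and text filled in, date via unix()

Note that html.parser lowercases <pubDate> to pubdate while parsing, which is what the lookup in _parse_comment relies on.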