stuff comments.
This commit is contained in:
parent
dc3d17b171
commit
1fe524207e
|
@ -53,7 +53,7 @@ class Category(Base):
|
||||||
# scratchpad so I can quickly develop the parser
|
# scratchpad so I can quickly develop the parser
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print("Category: RadioNZ")
|
print("Category: RadioNZ")
|
||||||
site = Category("https://www.rnz.co.nz/news/")
|
site = Category({ 'url': "https://www.rnz.co.nz/news/" })
|
||||||
excludes = [
|
excludes = [
|
||||||
'rnz.co.nz/news/sport',
|
'rnz.co.nz/news/sport',
|
||||||
'rnz.co.nz/weather',
|
'rnz.co.nz/weather',
|
||||||
|
@ -61,12 +61,12 @@ if __name__ == '__main__':
|
||||||
]
|
]
|
||||||
posts = site.feed(excludes)
|
posts = site.feed(excludes)
|
||||||
print(posts[:5])
|
print(posts[:5])
|
||||||
print(site.story(posts[0]))
|
print(site.story(posts[0][0], posts[0][1]))
|
||||||
|
|
||||||
print("Category: Newsroom")
|
print("Category: Newsroom")
|
||||||
site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
|
site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
|
||||||
posts = site.feed()
|
posts = site.feed()
|
||||||
print(posts[:5])
|
print(posts[:5])
|
||||||
print(site.story(posts[0]))
|
print(site.story(posts[0][0], posts[0][1]))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@ class Sitemap(Base):
|
||||||
# scratchpad so I can quickly develop the parser
|
# scratchpad so I can quickly develop the parser
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print("Sitemap: The Spinoff")
|
print("Sitemap: The Spinoff")
|
||||||
site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
|
site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
|
||||||
excludes = [
|
excludes = [
|
||||||
'thespinoff.co.nz/sitemap-misc.xml',
|
'thespinoff.co.nz/sitemap-misc.xml',
|
||||||
'thespinoff.co.nz/sitemap-authors.xml',
|
'thespinoff.co.nz/sitemap-authors.xml',
|
||||||
|
@ -84,16 +84,18 @@ if __name__ == '__main__':
|
||||||
]
|
]
|
||||||
posts = site.feed(excludes)
|
posts = site.feed(excludes)
|
||||||
print(posts[:5])
|
print(posts[:5])
|
||||||
print(site.story(posts[0]))
|
print(site.story(posts[0][0], posts[0][1]))
|
||||||
|
|
||||||
print("Sitemap: Newshub")
|
print("Sitemap: Newshub")
|
||||||
site = Sitemap([
|
site = Sitemap({
|
||||||
|
'url': [
|
||||||
'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
|
'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
|
||||||
'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
|
'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
|
||||||
'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
|
'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
|
||||||
'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
|
'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
|
||||||
])
|
],
|
||||||
|
})
|
||||||
posts = site.feed()
|
posts = site.feed()
|
||||||
print(posts[:5])
|
print(posts[:5])
|
||||||
print(site.story(posts[0]))
|
print(site.story(posts[0][0], posts[0][1]))
|
||||||
print(site.story(posts[:-1]))
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ from utils import clean
|
||||||
from misc.metadata import parse_extruct
|
from misc.metadata import parse_extruct
|
||||||
from misc.time import unix
|
from misc.time import unix
|
||||||
from misc.api import xml
|
from misc.api import xml
|
||||||
|
import misc.stuff as stuff
|
||||||
|
|
||||||
def comment(i):
|
def comment(i):
|
||||||
if 'author' not in i:
|
if 'author' not in i:
|
||||||
|
@ -89,13 +90,18 @@ class Base:
|
||||||
if 'disqus' in markup:
|
if 'disqus' in markup:
|
||||||
try:
|
try:
|
||||||
s['comments'] = declutter.get_comments(urlref)
|
s['comments'] = declutter.get_comments(urlref)
|
||||||
c['comments'] = list(filter(bool, c['comments']))
|
s['comments'] = list(filter(bool, s['comments']))
|
||||||
s['num_comments'] = comment_count(s['comments'])
|
s['num_comments'] = comment_count(s['comments'])
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
raise
|
raise
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if urlref.startswith('https://www.stuff.co.nz'):
|
||||||
|
s['comments'] = stuff.get_comments(urlref)
|
||||||
|
s['comments'] = list(filter(bool, s['comments']))
|
||||||
|
s['num_comments'] = len(s['comments'])
|
||||||
|
|
||||||
if not s['date']:
|
if not s['date']:
|
||||||
return False
|
return False
|
||||||
return s
|
return s
|
||||||
|
|
62
apiserver/misc/stuff.py
Normal file
62
apiserver/misc/stuff.py
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0,'.')
|
||||||
|
|
||||||
|
from misc.time import unix
|
||||||
|
from misc.api import xml
|
||||||
|
|
||||||
|
def _soup_get_text(soup):
|
||||||
|
if not soup: return None
|
||||||
|
if soup.text: return soup.text
|
||||||
|
|
||||||
|
s = soup.find(text=lambda tag: isinstance(tag, bs4.CData))
|
||||||
|
if s and s.string: return s.string.strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _parse_comment(soup):
    """Build a comment dict from one parsed RSS ``item`` element.

    Args:
        soup: element exposing ``find(name)``; the caller passes the
            ``item`` tags found in the comments feed.

    Returns:
        dict with keys ``author``, ``authorLink``, ``score``, ``date``,
        ``text`` and ``comments``. ``score`` is always 0 and ``comments``
        is always an empty list; the other fields keep their defaults
        ('' or 0) when the matching element is missing.
    """
    c = {
        'author': '',
        'authorLink': '',
        'score': 0,
        'date': 0,
        'text': '',
        'comments': [],
    }

    # NOTE(review): the "By:" title is read from the `link` element, not
    # `title` -- looks unintended, kept as-is; confirm against the feed.
    link = soup.find('link')
    if link:
        title = _soup_get_text(link)
        if title and 'By:' in title:
            # str.strip('By:') strips the *characters* B, y and : from both
            # ends (mangling names like "Bobby"); take the text after the
            # marker instead.
            c['author'] = title.split('By:', 1)[1].strip()

    # An explicit creator element overrides the author parsed above.
    creator = soup.find('dc:creator')
    if creator:
        c['author'] = _soup_get_text(creator)

    if link:
        c['authorLink'] = _soup_get_text(link)

    description = soup.find('description')
    if description:
        c['text'] = _soup_get_text(description)

    # HTML parsers lowercase tag names, so 'pubDate' never matches when the
    # feed was parsed with html.parser; fall back to the lowercase name.
    pub_date = soup.find('pubDate') or soup.find('pubdate')
    if pub_date:
        c['date'] = unix(pub_date.text)

    return c
|
||||||
|
|
||||||
|
def get_comments(url):
    """Fetch and parse the reader comments for a stuff.co.nz article.

    Args:
        url: article URL of the form
            ``https://www.stuff.co.nz/<section...>/<numeric id>/<slug>``.

    Returns:
        A list of comment dicts as produced by ``_parse_comment``. The list
        is empty when the URL is not an article URL, when the feed request
        yields nothing, or when the feed contains no ``item`` elements.
    """
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    p = re.match(regex, url)
    if p is None:
        # Not an article URL (e.g. the home page or a section index).
        # Previously this raised AttributeError on ``None.groups()``.
        return []
    path = p.group(1)

    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
    # NOTE(review): presumably `xml` calls the given function to obtain the
    # URL to fetch -- confirm against misc.api.
    markup = xml(lambda x: comment_url)
    if not markup:
        return []

    soup = BeautifulSoup(markup, features='html.parser')
    comments = soup.find_all('item')
    if not comments:
        return []
    return [_parse_comment(c) for c in comments]
|
||||||
|
|
||||||
|
|
||||||
|
# Development scratchpad: run this module directly to try the parser
# against a real article.
if __name__ == '__main__':
    found = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing')
    # Print the total count first, then a short sample of parsed comments.
    print(len(found))
    print(found[:5])
|
Loading…
Reference in New Issue
Block a user