forked from tanner/qotnews
		
stuff comments.
@@ -53,7 +53,7 @@ class Category(Base):
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Category: RadioNZ")
-    site = Category("https://www.rnz.co.nz/news/")
+    site = Category({ 'url': "https://www.rnz.co.nz/news/" })
    excludes = [
        'rnz.co.nz/news/sport',
        'rnz.co.nz/weather',
@@ -61,12 +61,12 @@ if __name__ == '__main__':
    ]
    posts = site.feed(excludes)
    print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))

    print("Category: Newsroom")
-    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
    posts = site.feed()
    print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
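Both story() call sites above change from story(posts[0]) to story(posts[0][0], posts[0][1]), which suggests feed() now returns (ref, url) pairs rather than bare references. A minimal consuming loop under that assumption (the tuple shape is inferred from the diff, not shown in it):

    # assumed: feed() yields (ref, url) pairs; story(ref, url) returns a dict or False
    for ref, url in site.feed(excludes)[:5]:
        story = site.story(ref, url)
        if story:
            print(ref, story.get('num_comments', 0))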
@@ -76,7 +76,7 @@ class Sitemap(Base):
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
    excludes = [
        'thespinoff.co.nz/sitemap-misc.xml',
        'thespinoff.co.nz/sitemap-authors.xml',
@@ -84,16 +84,18 @@ if __name__ == '__main__':
    ]
    posts = site.feed(excludes)
    print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))

    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
+    site = Sitemap({
+        'url': [
+            'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+        ],
+    })
    posts = site.feed()
    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
+    print(site.story(posts[0][0], posts[0][1]))
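In both scratchpads the constructors switch from positional arguments to a single config dict, and the Newshub call shows 'url' may also be a list of sitemap URLs. The new __init__ is not part of this diff; a plausible sketch, assuming only the keys visible in the call sites ('url' and an optional 'tz'):

    # hypothetical constructor implied by the call sites above; the actual
    # implementation lives in the unchanged class bodies
    class Sitemap(Base):
        def __init__(self, config):
            self.config = config
            self.url = config['url']    # a single URL string, or a list of them
            self.tz = config.get('tz')  # e.g. 'Pacific/Auckland'; optional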
@@ -14,6 +14,7 @@ from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml
+import misc.stuff as stuff

def comment(i):
    if 'author' not in i:
@@ -89,13 +90,18 @@ class Base:
        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(urlref)
-                c['comments'] = list(filter(bool, c['comments']))
+                s['comments'] = list(filter(bool, s['comments']))
                s['num_comments'] = comment_count(s['comments'])
            except KeyboardInterrupt:
                raise
            except:
                pass

+        if urlref.startswith('https://www.stuff.co.nz'):
+            s['comments'] = stuff.get_comments(urlref)
+            s['comments'] = list(filter(bool, s['comments']))
+            s['num_comments'] = len(s['comments'])
+
        if not s['date']:
            return False
        return s
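Note the two branches tally differently: the Disqus path uses comment_count(), while the new stuff.co.nz path uses len(), counting top-level comments only. comment_count() is defined elsewhere in this file and is not shown in the diff; assuming replies nest under each comment's 'comments' key (the shape stuff.py below also emits), a recursive tally would look roughly like:

    # sketch only: count each comment plus all of its nested replies
    def comment_count(comments):
        return sum(1 + comment_count(c.get('comments', [])) for c in comments)

For stuff.co.nz the two tallies coincide anyway, since the Gigya parser below never fills in nested replies.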

apiserver/misc/stuff.py (new file)
import re
from bs4 import BeautifulSoup, CData

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from misc.time import unix
from misc.api import xml

def _soup_get_text(soup):
    if not soup: return None
    if soup.text: return soup.text

    # fall back to CDATA content when the tag has no plain text
    s = soup.find(text=lambda tag: isinstance(tag, CData))
    if s and s.string: return s.string.strip()
    return None

def _parse_comment(soup):
    c = {
        'author': '',
        'authorLink': '',
        'score': 0,
        'date': 0,
        'text': '',
        'comments': [],  # never populated here; the feed is parsed flat
    }

    if soup.find('link'):
        title = _soup_get_text(soup.find('link'))
        if title and 'By:' in title:
            c['author'] = title.replace('By:', '', 1).strip()
    if soup.find('dc:creator'):
        c['author'] = _soup_get_text(soup.find('dc:creator'))
    if soup.find('link'):
        c['authorLink'] = _soup_get_text(soup.find('link'))
    if soup.find('description'):
        c['text'] = _soup_get_text(soup.find('description'))
    if soup.find('pubdate'):  # html.parser lowercases tag names
        c['date'] = unix(soup.find('pubdate').text)

    return c

def get_comments(url):
    # capture the section path plus numeric article id from the story URL
    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
    p = re.compile(regex).match(url)
    if not p: return []
    path = p.groups()[0]
    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
    markup = xml(lambda x: comment_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    comments = soup.find_all('item')
    if not comments: return []
    comments = [_parse_comment(c) for c in comments]
    return comments


# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    comments = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing')
    print(len(comments))
    print(comments[:5])
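For a quick sense of what _parse_comment consumes: the Gigya endpoint returns RSS with one <item> per comment. A synthetic item run through the parser (the real feed's markup may differ; <link> elements are skipped here because html.parser treats link as a void tag, and the date string assumes misc.time.unix handles RFC 822 dates, which the pubDate handling above already relies on):

    sample = '''
    <item>
        <dc:creator>Jane Reader</dc:creator>
        <description>Great article.</description>
        <pubDate>Tue, 15 Dec 2020 01:30:00 +0000</pubDate>
    </item>
    '''
    soup = BeautifulSoup(sample, features='html.parser')
    print(_parse_comment(soup.find('item')))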