stuff comments: fetch article comments for stuff.co.nz stories from the Gigya-hosted comments RSS feed (new apiserver/misc/stuff.py) and attach them in the base story parser.
@@ -53,7 +53,7 @@ class Category(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Category: RadioNZ")
-    site = Category("https://www.rnz.co.nz/news/")
+    site = Category({ 'url': "https://www.rnz.co.nz/news/" })
     excludes = [
         'rnz.co.nz/news/sport',
         'rnz.co.nz/weather',
@@ -61,12 +61,12 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
     print("Category: Newsroom")
-    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    site = Category({ 'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
 
@@ -76,7 +76,7 @@ class Sitemap(Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    site = Sitemap({ 'url': "https://thespinoff.co.nz/sitemap.xml" })
     excludes = [
         'thespinoff.co.nz/sitemap-misc.xml',
         'thespinoff.co.nz/sitemap-authors.xml',
@@ -84,16 +84,18 @@ if __name__ == '__main__':
     ]
     posts = site.feed(excludes)
     print(posts[:5])
-    print(site.story(posts[0]))
+    print(site.story(posts[0][0], posts[0][1]))
 
     print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
+    site = Sitemap({
+        'url': [
+            'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+            'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+        ],
+    })
     posts = site.feed()
     print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
+    print(site.story(posts[0][0], posts[0][1]))
+
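The scratchpad changes above show the new call convention: Category and Sitemap now take a single config dict ({ 'url': ..., 'tz': ... }) instead of positional arguments, and feed() now appears to return (ref, url) pairs, since story() is called with posts[0][0] and posts[0][1]. Base.__init__ itself is not part of this diff; the following is only a hypothetical sketch of what the call sites assume.

# Hypothetical sketch: Base.__init__ is not shown in this commit.
# The call sites above imply a required 'url' key (a string, or a list of
# sitemap URLs as in the Newshub example) and an optional 'tz' timezone name.
class Base:
    def __init__(self, config):
        self.config = config
        self.url = config['url']        # str or list of URLs (assumption)
        self.tz = config.get('tz')      # e.g. 'Pacific/Auckland', optional (assumption)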
@@ -14,6 +14,7 @@ from utils import clean
 from misc.metadata import parse_extruct
 from misc.time import unix
 from misc.api import xml
+import misc.stuff as stuff
 
 def comment(i):
     if 'author' not in i:
@@ -89,13 +90,18 @@ class Base:
         if 'disqus' in markup:
             try:
                 s['comments'] = declutter.get_comments(urlref)
-                c['comments'] = list(filter(bool, c['comments']))
+                s['comments'] = list(filter(bool, s['comments']))
                 s['num_comments'] = comment_count(s['comments'])
             except KeyboardInterrupt:
                 raise
             except:
                 pass
 
+        if urlref.startswith('https://www.stuff.co.nz'):
+            s['comments'] = stuff.get_comments(urlref)
+            s['comments'] = list(filter(bool, s['comments']))
+            s['num_comments'] = len(s['comments'])
+
         if not s['date']:
             return False
         return s
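The new Stuff branch stores comments in the same s['comments'] slot the Disqus path uses; each entry, as built by _parse_comment in the new misc/stuff.py below, is a flat dict (the parser does not populate nested replies, so 'comments' stays empty and num_comments is simply the list length). A sketch of one entry, with illustrative values only:

# Shape of one entry in s['comments'] for a Stuff article (values are illustrative)
example_comment = {
    'author': 'Jane Doe',            # from <dc:creator>, or the "By:" prefix of <link>
    'authorLink': '',                # from <link>, when present
    'score': 0,                      # not provided by the feed
    'date': 1602500000,              # unix() of <pubDate>
    'text': 'Comment body text',     # from <description>
    'comments': [],                  # replies; left empty by this parser
}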
							
								
								
									
apiserver/misc/stuff.py (new file)
@@ -0,0 +1,62 @@
+import re
+from bs4 import BeautifulSoup, CData
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from misc.time import unix
+from misc.api import xml
+
+def _soup_get_text(soup):
+    if not soup: return None
+    if soup.text: return soup.text
+
+    s = soup.find(text=lambda tag: isinstance(tag, CData))
+    if s and s.string: return s.string.strip()
+    return None
+
+def _parse_comment(soup):
+    c = {
+        'author': '',
+        'authorLink': '',
+        'score': 0,
+        'date': 0,
+        'text': '',
+        'comments': [],
+    }
+
+    if soup.find('link'):
+        title = _soup_get_text(soup.find('link'))
+        if title and 'By:' in title:
+            c['author'] = title.split('By:', 1)[1].strip()
+    if soup.find('dc:creator'):
+        c['author'] = _soup_get_text(soup.find('dc:creator'))
+    if soup.find('link'):
+        c['authorLink'] = _soup_get_text(soup.find('link'))
+    if soup.find('description'):
+        c['text'] = _soup_get_text(soup.find('description'))
+    if soup.find('pubDate'):
+        c['date'] = unix(soup.find('pubDate').text)
+
+    return c
+
+def get_comments(url):
+    regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
+    p = re.compile(regex).match(url)
+    path = p.groups()[0]
+    comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
+    markup = xml(lambda x: comment_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='html.parser')
+    comments = soup.find_all('item')
+    if not comments: return []
+    comments = [_parse_comment(c) for c in comments]
+    return comments
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    comments = get_comments('https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing')
+    print(len(comments))
+    print(comments[:5])
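As a quick worked example of the URL rewriting in get_comments, using the scratchpad URL above: the captured group follows directly from the regex, and the resulting feed URL is just the f-string from the code (not independently verified against the live service).

# Worked example of the path extraction in get_comments (illustrative)
import re

url = 'https://www.stuff.co.nz/life-style/homed/houses/123418468/dear-jacinda-we-need-to-talk-about-housing'
regex = r"https:\/\/www\.stuff\.co\.nz\/(.*\/\d+)/[^\/]+"
path = re.compile(regex).match(url).groups()[0]
# path == 'life-style/homed/houses/123418468'
comment_url = f'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/{path}'
# comment_url == 'https://comments.us1.gigya.com/comments/rss/6201101/Stuff/stuff/life-style/homed/houses/123418468'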