disqus test.

Jason Schwarzenberger 2020-11-05 14:23:51 +13:00
parent 0fc1a44d2b
commit afda5b635c
2 changed files with 65 additions and 47 deletions

View File

@@ -10,6 +10,7 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+from scrapers import declutter
 import extruct
 
 from utils import clean
@@ -86,7 +87,7 @@ def parse_extruct(s, data):
             s['author'] = props['author']['properties']['name']
 
     for ld in data['json-ld']:
-        if ld['@type'] == 'Article':
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
             s['title'] = ld['headline']
             if ld['dateModified']:
                 s['date'] = unix(ld['dateModified'])
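The guarded `@type` check matters because extruct's json-ld output can include wrapper entries with no top-level @type (for example a bare @graph container), so indexing ld['@type'] directly raised KeyError before the comparison ever ran. A minimal sketch of the difference, with made-up data:

# Hypothetical extruct output: the first entry only has a @graph wrapper.
data = {'json-ld': [
    {'@graph': [{'@type': 'NewsArticle', 'headline': 'Example A'}]},
    {'@type': 'NewsArticle', 'headline': 'Example B'},
]}
for ld in data['json-ld']:
    # The old check, ld['@type'] == 'Article', raises KeyError on the first entry.
    if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
        print(ld['headline'])  # only 'Example B' prints; @graph is handled in the next hunk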
@@ -94,24 +95,35 @@ def parse_extruct(s, data):
                 s['date'] = unix(ld['datePublished'])
             if 'author' in ld and ld['author']:
                 s['author'] = ld['author']['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = unix(gld['dateModified'])
+                    if gld['datePublished']:
+                        s['date'] = unix(gld['datePublished'])
     return s
 
-class Sitemap:
-    def __init__(self, url):
-        self.sitemap_url = url
-
-    def feed(self):
-        markup = xml(lambda x: self.sitemap_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        links = [x.find('loc').text for x in articles] or []
-        links = list(set(links))
-        return links
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
 
+class _Base:
     def story(self, ref):
         markup = xml(lambda x: ref)
         if not markup:
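comment() and comment_count() assume the comment-tree shape read above (author, points, date, text, children): comment() normalizes each node and drops dead branches, then comment_count() walks the normalized tree. A small sketch with a made-up payload, assuming the module's unix() and clean() helpers are in scope:

raw = {
    'author': 'alice', 'points': 3, 'date': 1604537031,
    'text': 'parent comment',
    'children': [
        {'author': 'bob', 'points': 1, 'date': 1604537032,
         'text': 'reply', 'children': []},
        {'text': 'deleted comment, no author field', 'children': []},
    ],
}
c = comment(raw)         # the author-less child is filtered out
print(comment_count(c))  # 2: alice plus bob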
@@ -129,12 +141,36 @@ class Sitemap:
         data = extruct.extract(markup)
         s = parse_extruct(s, data)
 
+        if 'disqus' in markup:
+            try:
+                s['comments'] = [comment(c) for c in declutter.get_comments(ref) or []]
+                s['comments'] = list(filter(bool, s['comments']))
+                s['num_comments'] = sum([comment_count(c) for c in s['comments']])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
         if not s['date']:
             return False
 
         return s
 
-class Category:
+class Sitemap(_Base):
+    def __init__(self, url):
+        self.sitemap_url = url
+
+    def feed(self):
+        markup = xml(lambda x: self.sitemap_url)
+        if not markup: return []
+        soup = BeautifulSoup(markup, features='lxml')
+        articles = soup.find('urlset').findAll('url')
+        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
+        links = [x.find('loc').text for x in articles] or []
+        links = list(set(links))
+        return links
+
+class Category(_Base):
     def __init__(self, url):
         self.category_url = url
         self.base_url = '/'.join(url.split('/')[:3])
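The disqus branch in _Base.story() fires whenever the literal string 'disqus' appears in the page markup; declutter.get_comments(ref) is assumed to return a list of raw comment trees, which the branch normalizes with comment() before counting. Since both feed classes now inherit story() from _Base, sitemap and category scrapes gain the same behaviour. A usage-level sketch (the category URL is illustrative):

site = Category('https://www.example.com/news/')  # hypothetical site
for ref in site.feed()[:5]:
    s = site.story(ref)
    if s and s['num_comments']:
        print(s['url'], s['num_comments'])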
@@ -149,30 +185,8 @@ class Category:
         links = list(filter(None, [link if link.startswith(self.category_url) else None for link in links]))
         links = list(filter(None, [link if link != self.category_url else None for link in links]))
         links = list(set(links))
         return links
 
-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-
-        if not s['date']:
-            return False
-
-        return s
 
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
@@ -182,19 +196,10 @@ if __name__ == '__main__':
     print(posts[:1])
     print(site.story(posts[0]))
 
-    print("Sitemap: NZ Herald")
-    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
-    posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))
-
     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
     print(posts[:1])
     print(site.story(posts[0]))
 
-    print("Category: Newsroom Business")
-    site = Category("https://www.newsroom.co.nz/business/")
-    posts = site.feed()
-    print(posts[:1])
-    print(site.story(posts[0]))

View File

@@ -5,6 +5,7 @@ logging.basicConfig(
 import requests
 
 DECLUTTER_API = 'https://declutter.1j.nz/details'
+DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/comments'
 TIMEOUT = 30
 
@@ -26,3 +27,15 @@ def get_details(url):
     except BaseException as e:
         logging.error('Problem decluttering article: {}'.format(str(e)))
         return None
+
+def get_comments(url):
+    try:
+        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem getting comments for article: {}'.format(str(e)))
+        return None
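get_comments() mirrors get_details(): POST the article URL, treat any non-200 status as failure, re-raise KeyboardInterrupt so Ctrl-C still interrupts a crawl, and return None on error so callers can degrade gracefully. A quick usage sketch (the article URL is illustrative, and the response is assumed to be the JSON list of comment trees consumed above):

comments = get_comments('https://www.example.com/some-article')
if comments is None:
    print('comment scrape failed; see the log')
else:
    print('got', len(comments), 'top-level comment threads')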