Compare commits

...

5 Commits

Author SHA1 Message Date
Jason Schwarzenberger 5668fa5dbc fix mistake. 4 years ago
Jason Schwarzenberger b771b52501 add regex to get a unique ref from each sitemap/category based article url. 4 years ago
Jason Schwarzenberger f5c7a658ba cosmetic filters for the spinoff. 4 years ago
Jason Schwarzenberger f5ccd844da fix import error. 4 years ago
Jason Schwarzenberger 6a91b9402f split categories, sitemap and other crap out of news.py 4 years ago
  1. 7
      apiserver/database.py
  2. 28
      apiserver/feed.py
  3. 72
      apiserver/feeds/category.py
  4. 307
      apiserver/feeds/news.py
  5. 99
      apiserver/feeds/sitemap.py
  6. 35
      apiserver/misc/api.py
  7. 69
      apiserver/misc/metadata.py
  8. 101
      apiserver/misc/news.py
  9. 18
      apiserver/misc/time.py
  10. 6
      apiserver/server.py
  11. 34
      apiserver/settings.py.example
  12. 3
      readerserver/scraper/browser/scripts/cosmetic-filters.js

@ -24,6 +24,7 @@ class Reflist(Base):
rid = Column(Integer, primary_key=True)
ref = Column(String(16), unique=True)
urlref = Column(String)
sid = Column(String, ForeignKey('stories.sid'), unique=True)
source = Column(String(16))
@ -75,7 +76,7 @@ def get_stories_by_url(url):
def get_reflist():
session = Session()
q = session.query(Reflist).order_by(Reflist.rid.desc())
return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
def get_stories(maxage=60*60*24*2):
time = datetime.now().timestamp() - maxage
@ -87,10 +88,10 @@ def get_stories(maxage=60*60*24*2):
order_by(Story.meta['date'].desc())
return [x[1] for x in q]
def put_ref(ref, sid, source):
def put_ref(ref, sid, source, urlref):
try:
session = Session()
r = Reflist(ref=ref, sid=sid, source=source)
r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
session.add(r)
session.commit()
except:

@ -9,7 +9,9 @@ from bs4 import BeautifulSoup
import itertools
import settings
from feeds import hackernews, reddit, tildes, substack, manual, news
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
from scrapers import outline, declutter, browser, local
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@ -19,40 +21,40 @@ for key, value in settings.SUBSTACK.items():
substacks[key] = substack.Publication(value['url'])
categories = {}
for key, value in settings.CATEGORY.items():
categories[key] = news.Category(value['url'], value.get('tz'))
categories[key] = Category(value)
sitemaps = {}
for key, value in settings.SITEMAP.items():
sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
sitemaps[key] = Sitemap(value)
def get_list():
feeds = {}
if settings.NUM_HACKERNEWS:
feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
if settings.NUM_REDDIT:
feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
if settings.NUM_TILDES:
feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
if settings.NUM_SUBSTACK:
feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
for key, publication in substacks.items():
count = settings.SUBSTACK[key]['count']
feeds[key] = [(x, key) for x in publication.feed()[:count]]
feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
for key, sites in categories.items():
count = settings.CATEGORY[key].get('count') or 0
excludes = settings.CATEGORY[key].get('excludes')
tz = settings.CATEGORY[key].get('tz')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
for key, sites in sitemaps.items():
count = settings.SITEMAP[key].get('count') or 0
excludes = settings.SITEMAP[key].get('excludes')
feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
values = feeds.values()
feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
@ -99,7 +101,7 @@ def get_content_type(url):
except:
return ''
def update_story(story, is_manual=False):
def update_story(story, is_manual=False, urlref=None):
res = {}
if story['source'] == 'hackernews':
@ -111,9 +113,9 @@ def update_story(story, is_manual=False):
elif story['source'] == 'substack':
res = substack.top.story(story['ref'])
elif story['source'] in categories.keys():
res = categories[story['source']].story(story['ref'])
res = categories[story['source']].story(story['ref'], urlref)
elif story['source'] in sitemaps.keys():
res = sitemaps[story['source']].story(story['ref'])
res = sitemaps[story['source']].story(story['ref'], urlref)
elif story['source'] in substacks.keys():
res = substacks[story['source']].story(story['ref'])
elif story['source'] == 'manual':

@ -0,0 +1,72 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
from bs4 import BeautifulSoup
import settings
from utils import clean
from misc.api import xml
from misc.news import Base
def _filter_links(links, category_url, excludes=None):
links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
links = list(filter(None, [link if link != category_url else None for link in links]))
links = list(set(links))
if excludes:
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
return links
def _get_category(category_url, excludes=None):
base_url = '/'.join(category_url.split('/')[:3])
markup = xml(lambda x: category_url)
if not markup: return []
soup = BeautifulSoup(markup, features='html.parser')
links = soup.find_all('a', href=True)
links = [link.get('href') for link in links]
links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
links = _filter_links(links, category_url, excludes)
return links
class Category(Base):
def __init__(self, config):
self.config = config
self.category_url = config.get('url')
self.tz = config.get('tz')
def feed(self, excludes=None):
links = []
if isinstance(self.category_url, str):
links += _get_category(self.category_url, excludes)
elif isinstance(self.category_url, list):
for url in self.category_url:
links += _get_category(url, excludes)
links = list(set(links))
return [(self.get_id(link), link) for link in links]
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Category: RadioNZ")
site = Category("https://www.rnz.co.nz/news/")
excludes = [
'rnz.co.nz/news/sport',
'rnz.co.nz/weather',
'rnz.co.nz/news/weather',
]
posts = site.feed(excludes)
print(posts[:5])
print(site.story(posts[0]))
print("Category: Newsroom")
site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
posts = site.feed()
print(posts[:5])
print(site.story(posts[0]))

@ -1,307 +0,0 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
import requests
from datetime import datetime
from bs4 import BeautifulSoup
from scrapers import declutter
import dateutil.parser
import extruct
import pytz
from utils import clean
import settings
tzinfos = {
'NZDT': pytz.timezone('Pacific/Auckland'),
'NZST': pytz.timezone('Pacific/Auckland')
}
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
def unix(date_str, tz=None):
try:
dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
if tz:
dt = pytz.timezone(tz).localize(dt)
return int(dt.timestamp())
except:
pass
return 0
def xml(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False
def parse_extruct(s, data):
rdfa_keys = {
'title': [
'http://ogp.me/ns#title',
'https://ogp.me/ns#title',
],
'date': [
'http://ogp.me/ns/article#modified_time',
'https://ogp.me/ns/article#modified_time',
'http://ogp.me/ns/article#published_time',
'https://ogp.me/ns/article#published_time',
]
}
for rdfa in data['rdfa']:
for key, props in rdfa.items():
for attribute, properties in rdfa_keys.items():
for prop in properties:
if prop in props:
for values in props[prop]:
s[attribute] = values['@value']
for og in data['opengraph']:
titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
if len(modified):
s['date'] = modified[0]
if len(published):
s['date'] = published[0]
if len(titles):
s['title'] = titles[0]
for md in data['microdata']:
if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
props = md['properties']
s['title'] = props['headline']
if props['dateModified']:
s['date'] = props['dateModified']
if props['datePublished']:
s['date'] = props['datePublished']
if 'author' in props and props['author']:
if 'properties' in props['author']:
s['author'] = props['author']['properties']['name']
elif isinstance(props['author'], list):
s['author'] = props['author'][0]['properties']['name']
for ld in data['json-ld']:
if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
s['title'] = ld['headline']
if ld['dateModified']:
s['date'] = ld['dateModified']
if ld['datePublished']:
s['date'] = ld['datePublished']
if 'author' in ld and ld['author']:
if 'name' in ld['author']:
s['author'] = ld['author']['name']
elif isinstance(ld['author'], list):
s['author'] = ld['author'][0]['name']
if '@graph' in ld:
for gld in ld['@graph']:
if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
s['title'] = gld['headline']
if gld['dateModified']:
s['date'] = gld['dateModified']
if gld['datePublished']:
s['date'] = gld['datePublished']
return s
def comment(i):
if 'author' not in i:
return False
c = {}
c['author'] = i.get('author', '')
c['score'] = i.get('points', 0)
c['date'] = unix(i.get('date', 0))
c['text'] = clean(i.get('text', '') or '')
c['comments'] = [comment(j) for j in i['children']]
c['comments'] = list(filter(bool, c['comments']))
return c
def comment_count(i):
alive = 1 if i['author'] else 0
return sum([comment_count(c) for c in i['comments']]) + alive
class _Base:
def __init__(url, tz=None):
self.url = url
self.tz = tz
def feed(self, excludes=None):
return []
def story(self, ref):
markup = xml(lambda x: ref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = ref
s['url'] = ref
s['date'] = 0
soup = BeautifulSoup(markup, features='html.parser')
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
favicon = soup.find_all('link', rel="shortcut icon", href=True)
others = soup.find_all('link', rel="icon", href=True)
icons = icon32 + icon16 + favicon + others
base_url = '/'.join(ref.split('/')[:3])
icons = list(set([i.get('href') for i in icons]))
icons = [i if i.startswith('http') else base_url + i for i in icons]
if icons:
s['icon'] = icons[0]
data = extruct.extract(markup)
s = parse_extruct(s, data)
if s['date']:
s['date'] = unix(s['date'], tz=self.tz)
if 'disqus' in markup:
try:
s['comments'] = declutter.get_comments(ref)
c['comments'] = list(filter(bool, c['comments']))
s['num_comments'] = comment_count(s['comments'])
except KeyboardInterrupt:
raise
except:
pass
if not s['date']:
return False
return s
def get_sitemap_date(a):
if a.find('lastmod'):
return a.find('lastmod').text
if a.find('news:publication_date'):
return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return ''
class Sitemap(_Base):
def __init__(self, url, tz=None):
self.tz = tz
self.sitemap_url = url
def feed(self, excludes=None):
links = []
if isinstance(self.sitemap_url, str):
links += self._get_sitemap(self.sitemap_url, excludes)
elif isinstance(self.sitemap_url, list):
for url in self.sitemap_url:
links += self._get_sitemap(url, excludes)
return list(set(links))
def _filter_links(self, links, excludes=None):
too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
links = [x.find('loc').text for x in links] or []
links = list(set(links))
if excludes:
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
return links
def _get_sitemap(self, feed_url, excludes=None):
markup = xml(lambda x: feed_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
links = []
feed_urls = []
if soup.find('sitemapindex'):
sitemap = soup.find('sitemapindex').findAll('sitemap')
feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
if soup.find('urlset'):
sitemap = soup.find('urlset').findAll('url')
links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
feed_urls = self._filter_links(feed_urls, excludes)
links = self._filter_links(links, excludes)
for url in feed_urls:
links += self._get_sitemap(url, excludes)
return list(set(links))
class Category(_Base):
def __init__(self, url, tz=None):
self.tz = tz
self.category_url = url
def _filter_links(self, links, category_url, excludes=None):
links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
links = list(filter(None, [link if link != category_url else None for link in links]))
links = list(set(links))
if excludes:
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
return links
def _get_category(self, category_url, excludes=None):
base_url = '/'.join(category_url.split('/')[:3])
markup = xml(lambda x: category_url)
if not markup: return []
soup = BeautifulSoup(markup, features='html.parser')
links = soup.find_all('a', href=True)
links = [link.get('href') for link in links]
links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
links = self._filter_links(links, category_url, excludes)
return links
def feed(self, excludes=None):
links = []
if isinstance(self.category_url, str):
links += self._get_category(self.category_url, excludes)
elif isinstance(self.category_url, list):
for url in self.category_url:
links += self._get_category(url, excludes)
return list(set(links))
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Sitemap: The Spinoff")
site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
excludes = [
'thespinoff.co.nz/sitemap-misc.xml',
'thespinoff.co.nz/sitemap-authors.xml',
'thespinoff.co.nz/sitemap-tax-category.xml',
]
posts = site.feed(excludes)
print(posts[:5])
print(site.story(posts[0]))
print("Sitemap: Newshub")
site = Sitemap([
'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
])
posts = site.feed()
print(posts[:5])
print(site.story(posts[0]))
print(site.story(posts[:-1]))

@ -0,0 +1,99 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
from datetime import datetime
from bs4 import BeautifulSoup
import settings
from utils import clean
from misc.time import unix
from misc.api import xml
from misc.news import Base
def _get_sitemap_date(a):
if a.find('lastmod'):
return a.find('lastmod').text
if a.find('news:publication_date'):
return a.find('news:publication_date').text
if a.find('ns2:publication_date'):
return a.find('ns2:publication_date').text
return ''
def _filter_links(links, excludes=None):
too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
links = [x.find('loc').text for x in links] or []
links = list(set(links))
if excludes:
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
return links
def _get_sitemap(feed_url, excludes=None):
markup = xml(lambda x: feed_url)
if not markup: return []
soup = BeautifulSoup(markup, features='lxml')
links = []
feed_urls = []
if soup.find('sitemapindex'):
sitemap = soup.find('sitemapindex').findAll('sitemap')
feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
if soup.find('urlset'):
sitemap = soup.find('urlset').findAll('url')
links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
feed_urls = _filter_links(feed_urls, excludes)
links = _filter_links(links, excludes)
for url in feed_urls:
links += _get_sitemap(url, excludes)
return list(set(links))
class Sitemap(Base):
def __init__(self, config):
self.config = config
self.sitemap_url = config.get('url')
self.tz = config.get('tz')
def feed(self, excludes=None):
links = []
if isinstance(self.sitemap_url, str):
links += _get_sitemap(self.sitemap_url, excludes)
elif isinstance(self.sitemap_url, list):
for url in self.sitemap_url:
links += _get_sitemap(url, excludes)
links = list(set(links))
return [(self.get_id(link), link) for link in links]
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Sitemap: The Spinoff")
site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
excludes = [
'thespinoff.co.nz/sitemap-misc.xml',
'thespinoff.co.nz/sitemap-authors.xml',
'thespinoff.co.nz/sitemap-tax-category.xml',
]
posts = site.feed(excludes)
print(posts[:5])
print(site.story(posts[0]))
print("Sitemap: Newshub")
site = Sitemap([
'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
])
posts = site.feed()
print(posts[:5])
print(site.story(posts[0]))
print(site.story(posts[:-1]))

@ -0,0 +1,35 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import requests
USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
FORWARD_IP = '66.249.66.1'
def xml(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.text
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False
def json(route, ref=None):
try:
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
r = requests.get(route(ref), headers=headers, timeout=5)
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
return r.json()
except KeyboardInterrupt:
raise
except BaseException as e:
logging.error('Problem hitting URL: {}'.format(str(e)))
return False

@ -0,0 +1,69 @@
def parse_extruct(s, data):
rdfa_keys = {
'title': [
'http://ogp.me/ns#title',
'https://ogp.me/ns#title',
],
'date': [
'http://ogp.me/ns/article#modified_time',
'https://ogp.me/ns/article#modified_time',
'http://ogp.me/ns/article#published_time',
'https://ogp.me/ns/article#published_time',
]
}
for rdfa in data['rdfa']:
for key, props in rdfa.items():
for attribute, properties in rdfa_keys.items():
for prop in properties:
if prop in props:
for values in props[prop]:
s[attribute] = values['@value']
for og in data['opengraph']:
titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
if len(modified):
s['date'] = modified[0]
if len(published):
s['date'] = published[0]
if len(titles):
s['title'] = titles[0]
for md in data['microdata']:
if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
props = md['properties']
s['title'] = props['headline']
if props['dateModified']:
s['date'] = props['dateModified']
if props['datePublished']:
s['date'] = props['datePublished']
if 'author' in props and props['author']:
if 'properties' in props['author']:
s['author'] = props['author']['properties']['name']
elif isinstance(props['author'], list):
s['author'] = props['author'][0]['properties']['name']
for ld in data['json-ld']:
if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
s['title'] = ld['headline']
if ld['dateModified']:
s['date'] = ld['dateModified']
if ld['datePublished']:
s['date'] = ld['datePublished']
if 'author' in ld and ld['author']:
if 'name' in ld['author']:
s['author'] = ld['author']['name']
elif isinstance(ld['author'], list):
s['author'] = ld['author'][0]['name']
if '@graph' in ld:
for gld in ld['@graph']:
if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
s['title'] = gld['headline']
if gld['dateModified']:
s['date'] = gld['dateModified']
if gld['datePublished']:
s['date'] = gld['datePublished']
return s

@ -0,0 +1,101 @@
import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct
import settings
from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml
def comment(i):
if 'author' not in i:
return False
c = {}
c['author'] = i.get('author', '')
c['score'] = i.get('points', 0)
c['date'] = unix(i.get('date', 0))
c['text'] = clean(i.get('text', '') or '')
c['comments'] = [comment(j) for j in i['children']]
c['comments'] = list(filter(bool, c['comments']))
return c
def comment_count(i):
alive = 1 if i['author'] else 0
return sum([comment_count(c) for c in i['comments']]) + alive
class Base:
def __init__(config):
self.config = config
self.url = config.get('url')
self.tz = config.get('tz')
def get_id(self, link):
patterns = self.config.get('patterns')
if not patterns:
return link
patterns = [re.compile(p) for p in patterns]
patterns = list(filter(None, [p.match(link) for p in patterns]))
patterns = list(set([':'.join(p.groups()) for p in patterns]))
if not patterns:
return link
return patterns[0]
def feed(self, excludes=None):
return []
def story(self, ref, urlref):
if urlref is None:
return False
markup = xml(lambda x: urlref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = urlref
s['url'] = urlref
s['date'] = 0
soup = BeautifulSoup(markup, features='html.parser')
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
favicon = soup.find_all('link', rel="shortcut icon", href=True)
others = soup.find_all('link', rel="icon", href=True)
icons = icon32 + icon16 + favicon + others
base_url = '/'.join(urlref.split('/')[:3])
icons = list(set([i.get('href') for i in icons]))
icons = [i if i.startswith('http') else base_url + i for i in icons]
if icons:
s['icon'] = icons[0]
data = extruct.extract(markup)
s = parse_extruct(s, data)
if s['date']:
s['date'] = unix(s['date'], tz=self.tz)
if 'disqus' in markup:
try:
s['comments'] = declutter.get_comments(urlref)
c['comments'] = list(filter(bool, c['comments']))
s['num_comments'] = comment_count(s['comments'])
except KeyboardInterrupt:
raise
except:
pass
if not s['date']:
return False
return s

@ -0,0 +1,18 @@
import pytz
import dateutil.parser
TZINFOS = {
'NZDT': pytz.timezone('Pacific/Auckland'),
'NZST': pytz.timezone('Pacific/Auckland')
}
def unix(date_str, tz=None, tzinfos=TZINFOS):
try:
dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
if tz:
dt = pytz.timezone(tz).localize(dt)
return int(dt.timestamp())
except:
pass
return 0

@ -145,12 +145,12 @@ def static_story(sid):
http_server = WSGIServer(('', 33842), flask_app)
def _add_new_refs():
for ref, source in feed.get_list():
for ref, source, urlref in feed.get_list():
if database.get_story_by_ref(ref):
continue
try:
nid = new_id()
database.put_ref(ref, nid, source)
database.put_ref(ref, nid, source, urlref)
logging.info('Added ref ' + ref)
except database.IntegrityError:
continue
@ -163,7 +163,7 @@ def _update_current_story(item):
logging.info('Updating story: {}'.format(str(story['ref'])))
valid = feed.update_story(story)
valid = feed.update_story(story, urlref=item['urlref'])
if valid:
database.put_story(story)
search.put_story(story)

@ -13,15 +13,43 @@ NUM_TILDES = 5
NUM_SUBSTACK = 10
SITEMAP = {}
# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
# SITEMAP['nzherald'] = {
# 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
# 'count': 20,
# 'patterns': [
# r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
# ],
# 'excludes': [
# 'driven.co.nz',
# 'oneroof.co.nz',
# 'nzherald.co.nz/sponsored-stories',
# 'nzherald.co.nz/entertainment/',
# 'nzherald.co.nz/lifestyle/',
# 'nzherald.co.nz/travel/',
# 'nzherald.co.nz/sport/',
# 'nzherald.co.nz/promotions/',
# 'nzherald.co.nzhttp',
# 'herald-afternoon-quiz',
# 'herald-morning-quiz'
# ],
# }
SUBSTACK = {}
# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},
CATEGORY = {}
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
# CATEGORY['radionz'] = {
# 'url': "https://www.rnz.co.nz/news/",
# 'count': 20,
# 'patterns': [
# r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
# ],
# 'excludes': [
# 'rnz.co.nz/news/sport',
# 'rnz.co.nz/weather',
# ],
# }
SCRAPERS = ['browser', 'declutter', 'outline', 'local']

@ -39,6 +39,9 @@
if (matchDomain(["tvnz.co.nz"])) {
removeSelectors([".signup-container container"]);
}
if (matchDomain(["thespinoff.co.nz"])) {
removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
}
function matchDomain(domains) {
const hostname = window.location.hostname;

Loading…
Cancel
Save