12 changed files with 329 additions and 450 deletions
--- a/apiserver/database.py
+++ b/apiserver/database.py
@ -24,7 +24,6 @@ class Reflist(Base):

    rid = Column(Integer, primary_key=True)
    ref = Column(String(16), unique=True)
-    urlref = Column(String)
    sid = Column(String, ForeignKey('stories.sid'), unique=True)
    source = Column(String(16))

@ -76,7 +75,7 @@ def get_stories_by_url(url):
 def get_reflist():
    session = Session()
    q = session.query(Reflist).order_by(Reflist.rid.desc())
-    return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]

 def get_stories(maxage=60*60*24*2):
    time = datetime.now().timestamp() - maxage
@ -88,10 +87,10 @@ def get_stories(maxage=60*60*24*2):
            order_by(Story.meta['date'].desc())
    return [x[1] for x in q]

-def put_ref(ref, sid, source, urlref):
+def put_ref(ref, sid, source):
    try:
        session = Session()
-        r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
+        r = Reflist(ref=ref, sid=sid, source=source)
        session.add(r)
        session.commit()
    except:
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@ -9,9 +9,7 @@ from bs4 import BeautifulSoup
 import itertools

 import settings
-from feeds import hackernews, reddit, tildes, substack, manual
-from feeds.sitemap import Sitemap
-from feeds.category import Category
+from feeds import hackernews, reddit, tildes, substack, manual, news
 from scrapers import outline, declutter, browser, local

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@ -21,40 +19,40 @@ for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = Category(value)
+    categories[key] = news.Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = Sitemap(value)
+    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))

 def get_list():
    feeds = {}

    if settings.NUM_HACKERNEWS:
-        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

    if settings.NUM_REDDIT:
-        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]

    if settings.NUM_TILDES:
-        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]

    if settings.NUM_SUBSTACK:
-        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+        feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
        count = settings.SUBSTACK[key]['count']
-        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]
+        feeds[key] = [(x, key) for x in publication.feed()[:count]]

    for key, sites in categories.items():
        count = settings.CATEGORY[key].get('count') or 0
        excludes = settings.CATEGORY[key].get('excludes')
        tz = settings.CATEGORY[key].get('tz')
-        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
        count = settings.SITEMAP[key].get('count') or 0
        excludes = settings.SITEMAP[key].get('excludes')
-        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]

    values = feeds.values()
    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
@ -101,7 +99,7 @@ def get_content_type(url):
    except:
        return ''

-def update_story(story, is_manual=False, urlref=None):
+def update_story(story, is_manual=False):
    res = {}

    if story['source'] == 'hackernews':
@ -113,9 +111,9 @@ def update_story(story, is_manual=False, urlref=None):
    elif story['source'] == 'substack':
        res = substack.top.story(story['ref'])
    elif story['source'] in categories.keys():
-        res = categories[story['source']].story(story['ref'], urlref)
+        res = categories[story['source']].story(story['ref'])
    elif story['source'] in sitemaps.keys():
-        res = sitemaps[story['source']].story(story['ref'], urlref)
+        res = sitemaps[story['source']].story(story['ref'])
    elif story['source'] in substacks.keys():
        res = substacks[story['source']].story(story['ref'])
    elif story['source'] == 'manual':
--- a/apiserver/feeds/category.py
+++ b/apiserver/feeds/category.py
@ -1,72 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-from bs4 import BeautifulSoup
-
-import settings
-from utils import clean
-from misc.api import xml
-from misc.news import Base
-
-def _filter_links(links, category_url, excludes=None):
-    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
-    links = list(filter(None, [link if link != category_url else None for link in links]))
-    links = list(set(links))
-    if excludes:
-        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-    return links
-
-def _get_category(category_url, excludes=None):
-    base_url = '/'.join(category_url.split('/')[:3])
-    markup = xml(lambda x: category_url)
-    if not markup: return []
-    soup = BeautifulSoup(markup, features='html.parser')
-    links = soup.find_all('a', href=True)
-    links = [link.get('href') for link in links]
-    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
-    links = _filter_links(links, category_url, excludes)
-    return links
-
-class Category(Base):
-    def __init__(self, config):
-        self.config = config
-        self.category_url = config.get('url')
-        self.tz = config.get('tz')
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.category_url, str):
-            links += _get_category(self.category_url, excludes)
-        elif isinstance(self.category_url, list):
-            for url in self.category_url:
-                links += _get_category(url, excludes)
-        links = list(set(links))
-        return [(self.get_id(link), link) for link in links]
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Category: RadioNZ")
-    site = Category("https://www.rnz.co.nz/news/")
-    excludes = [
-        'rnz.co.nz/news/sport',
-        'rnz.co.nz/weather',
-        'rnz.co.nz/news/weather',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Category: Newsroom")
-    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-
--- a/apiserver/feeds/news.py
+++ b/apiserver/feeds/news.py
@ -0,0 +1,307 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+import requests
+from datetime import datetime
+from bs4 import BeautifulSoup
+from scrapers import declutter
+import dateutil.parser
+import extruct
+import pytz
+
+from utils import clean
+import settings
+
+# Extra timezone abbreviations handed to dateutil's parser in unix(), so that
+# 'NZDT' / 'NZST' suffixes resolve to the Pacific/Auckland zone.
+tzinfos = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+# User-Agent header sent by xml() with every outgoing request.
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
+# Alternative Googlebot User-Agent, currently disabled.
+#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+
+def unix(date_str, tz=None):
+    """Convert a date string into an integer Unix timestamp.
+
+    Args:
+        date_str: Date/time text in any format dateutil can parse.
+        tz: Optional timezone name (e.g. 'Pacific/Auckland') applied only
+            when the parsed value carries no timezone of its own.
+
+    Returns:
+        Seconds since the epoch as an int, or 0 when the value cannot be
+        parsed or converted.
+    """
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        # pytz's localize() raises ValueError for datetimes that already have
+        # a tzinfo, which used to turn every offset-carrying date into 0.
+        # Only fall back to the configured zone for naive values.
+        if tz and dt.tzinfo is None:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except Exception:
+        # Best effort: callers treat 0 as "no usable date".
+        return 0
+
+
+def xml(route, ref=None):
+    """Fetch the URL produced by route(ref) and return the response body.
+
+    Args:
+        route: Callable that receives `ref` and returns the URL to request.
+        ref: Value passed through to `route`.
+
+    Returns:
+        The response text on HTTP 200, otherwise False (the problem is
+        logged). KeyboardInterrupt is always propagated.
+    """
+    request_headers = {
+        'User-Agent': USER_AGENT,
+        'X-Forwarded-For': '66.249.66.1',
+    }
+    try:
+        response = requests.get(route(ref), headers=request_headers, timeout=5)
+        if response.status_code == 200:
+            return response.text
+        raise Exception('Bad response code ' + str(response.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as err:
+        logging.error('Problem hitting URL: {}'.format(str(err)))
+        return False
+
+
+def _apply_headline_and_dates(s, node):
+    """Copy headline and date fields from a schema.org article node into s."""
+    if node.get('headline'):
+        s['title'] = node['headline']
+    # Order matters: datePublished is applied last so it wins over dateModified.
+    for field in ('dateModified', 'datePublished'):
+        if node.get(field):
+            s['date'] = node[field]
+
+
+def _author_name(author, nested_key=None):
+    """Return the name of the (first) author node, or None if unavailable."""
+    if isinstance(author, list):
+        author = author[0] if author else None
+    if not isinstance(author, dict):
+        return None
+    if nested_key is not None:
+        # Microdata nests the author's fields under a 'properties' dict.
+        author = author.get(nested_key)
+        if not isinstance(author, dict):
+            return None
+    return author.get('name') or None
+
+
+def parse_extruct(s, data):
+    """Fill title, date and author on story dict `s` from extruct metadata.
+
+    Sources are applied in the order rdfa, opengraph, microdata, json-ld, so
+    later sources overwrite earlier ones. Optional fields that are missing
+    from the metadata are skipped instead of raising.
+
+    Args:
+        s: Story dict to update in place.
+        data: Mapping with 'rdfa', 'opengraph', 'microdata' and 'json-ld'
+            lists; a missing key is treated as an empty list.
+
+    Returns:
+        The same dict `s`.
+    """
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    for rdfa in data.get('rdfa', []):
+        for key, props in rdfa.items():
+            # Only mapping values can be looked up by property URI.
+            if not isinstance(props, dict):
+                continue
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    for values in props.get(prop, []):
+                        if isinstance(values, dict) and '@value' in values:
+                            s[attribute] = values['@value']
+
+    for og in data.get('opengraph', []):
+        pairs = og.get('properties', [])
+        titles = [value for key, value in pairs if 'og:title' in key and value]
+        modified = [value for key, value in pairs if 'article:modified_time' in key and value]
+        published = [value for key, value in pairs if 'article:published_time' in key and value]
+        if modified:
+            s['date'] = modified[0]
+        if published:
+            s['date'] = published[0]
+        if titles:
+            s['title'] = titles[0]
+
+    for md in data.get('microdata', []):
+        if md.get('type') in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
+            props = md.get('properties') or {}
+            _apply_headline_and_dates(s, props)
+            name = _author_name(props.get('author'), nested_key='properties')
+            if name:
+                s['author'] = name
+
+    for ld in data.get('json-ld', []):
+        if not isinstance(ld, dict):
+            continue
+        if ld.get('@type') in ['Article', 'NewsArticle']:
+            _apply_headline_and_dates(s, ld)
+            name = _author_name(ld.get('author'))
+            if name:
+                s['author'] = name
+        for gld in ld.get('@graph') or []:
+            if isinstance(gld, dict) and gld.get('@type') in ['Article', 'NewsArticle']:
+                _apply_headline_and_dates(s, gld)
+
+    return s
+
+def comment(i):
+    """Convert a raw comment node into the story comment format.
+
+    Returns False for nodes without an 'author' key; otherwise a dict with
+    author, score, date, text and the recursively converted child comments
+    (children that convert to a falsy value are dropped).
+    """
+    if 'author' not in i:
+        return False
+
+    c = {
+        'author': i.get('author', ''),
+        'score': i.get('points', 0),
+        'date': unix(i.get('date', 0)),
+        'text': clean(i.get('text', '') or ''),
+    }
+    replies = (comment(child) for child in i['children'])
+    c['comments'] = [reply for reply in replies if reply]
+    return c
+
+def comment_count(i):
+    """Count comments in the tree rooted at `i` that have a truthy author."""
+    total = 1 if i['author'] else 0
+    for child in i['comments']:
+        total += comment_count(child)
+    return total
+
+class _Base:
+    """Common behaviour for news sources that scrape article pages."""
+
+    def __init__(self, url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        """Return the list of story refs; subclasses override this."""
+        return []
+
+    def story(self, ref):
+        """Fetch the article at URL `ref` and build its story dict.
+
+        Returns:
+            A dict with link/url, icon (when found), title/date/author from
+            the page metadata and any comments, or False when the page could
+            not be fetched or no date could be determined.
+        """
+        from urllib.parse import urljoin
+
+        markup = xml(lambda x: ref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = 0
+
+        soup = BeautifulSoup(markup, features='html.parser')
+        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+        favicon = soup.find_all('link', rel="shortcut icon", href=True)
+        others = soup.find_all('link', rel="icon", href=True)
+        icons = icon32 + icon16 + favicon + others
+        # De-duplicate while keeping the priority order above; a set() would
+        # make the icon picked below arbitrary.
+        hrefs = list(dict.fromkeys(i.get('href') for i in icons))
+        # urljoin resolves relative and protocol-relative hrefs against the
+        # article URL and leaves absolute URLs untouched.
+        icons = [urljoin(ref, href) for href in hrefs if href]
+
+        if icons:
+            s['icon'] = icons[0]
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = list(filter(bool, declutter.get_comments(ref)))
+                # NOTE(review): assumes get_comments() yields a list of comment
+                # dicts with 'author' and 'comments' keys - confirm.
+                s['num_comments'] = sum(comment_count(c) for c in s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                # Comments are best effort; the story is still usable.
+                logging.error('Problem getting comments: {}'.format(str(e)))
+
+        if not s['date']:
+            return False
+        return s
+
+def get_sitemap_date(a):
+    """Return the date text of sitemap entry `a`, or '' when it has none.
+
+    Looks for <lastmod>, then <news:publication_date>, then
+    <ns2:publication_date>, and returns the text of the first one found.
+    """
+    for tag_name in ('lastmod', 'news:publication_date', 'ns2:publication_date'):
+        node = a.find(tag_name)
+        if node:
+            return node.text
+    return ''
+
+class Sitemap(_Base):
+    """News source that collects article URLs from XML sitemaps.
+
+    `url` may be a single sitemap URL or a list of them. Sitemap index files
+    are followed recursively.
+    """
+
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        """Return unique article URLs, newest first within each sitemap."""
+        links = []
+        if isinstance(self.sitemap_url, str):
+            links += self._get_sitemap(self.sitemap_url, excludes)
+        elif isinstance(self.sitemap_url, list):
+            for url in self.sitemap_url:
+                links += self._get_sitemap(url, excludes)
+        # Order-preserving de-duplication: callers slice the first N links,
+        # so the newest-first ordering must survive.
+        return list(dict.fromkeys(links))
+
+    def _filter_links(self, links, excludes=None):
+        """Reduce sitemap entries to recent, non-excluded <loc> URLs.
+
+        Entries without a date, or older than settings.MAX_STORY_AGE seconds,
+        are dropped. The result is sorted newest first.
+        """
+        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
+        dated = []
+        for entry in links:
+            date_str = get_sitemap_date(entry)
+            if not date_str:
+                continue
+            # Parse each date once instead of once per filter/sort pass.
+            timestamp = unix(date_str)
+            if timestamp > too_old:
+                dated.append((timestamp, entry))
+        dated.sort(key=lambda pair: pair[0], reverse=True)
+
+        urls = list(dict.fromkeys(entry.find('loc').text for _, entry in dated))
+        if excludes:
+            urls = [u for u in urls if u and not any(e in u for e in excludes)]
+        return urls
+
+    def _get_sitemap(self, feed_url, excludes=None):
+        """Fetch one sitemap (or sitemap index) and return its article URLs."""
+        markup = xml(lambda x: feed_url)
+        if not markup:
+            return []
+        soup = BeautifulSoup(markup, features='lxml')
+        links = []
+        feed_urls = []
+        index = soup.find('sitemapindex')
+        if index:
+            feed_urls = [a for a in index.find_all('sitemap') if a.find('loc')]
+        urlset = soup.find('urlset')
+        if urlset:
+            links = [a for a in urlset.find_all('url') if a.find('loc')]
+
+        feed_urls = self._filter_links(feed_urls, excludes)
+        links = self._filter_links(links, excludes)
+
+        # Child sitemaps listed by an index are fetched recursively.
+        for url in feed_urls:
+            links += self._get_sitemap(url, excludes)
+        return list(dict.fromkeys(links))
+
+class Category(_Base):
+    """News source that scrapes article links from category index pages."""
+
+    def __init__(self, url, tz=None):
+        # `url` is a single category URL or a list of them (see feed()).
+        self.category_url = url
+        self.tz = tz
+
+    def _filter_links(self, links, category_url, excludes=None):
+        """Keep unique links under `category_url`, minus the page itself.
+
+        Links containing any of the `excludes` substrings are dropped.
+        """
+        kept = [
+            link for link in links
+            if link.startswith(category_url) and link and link != category_url
+        ]
+        kept = list(set(kept))
+        if excludes:
+            kept = [link for link in kept if not any(e in link for e in excludes)]
+        return kept
+
+    def _get_category(self, category_url, excludes=None):
+        """Fetch one category page and return its filtered article links."""
+        site_root = '/'.join(category_url.split('/')[:3])
+        markup = xml(lambda x: category_url)
+        if not markup:
+            return []
+        soup = BeautifulSoup(markup, features='html.parser')
+        hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
+        # Site-relative hrefs are made absolute against the scheme and host.
+        absolute = [f"{site_root}{href}" if href.startswith('/') else href for href in hrefs]
+        return self._filter_links(absolute, category_url, excludes)
+
+    def feed(self, excludes=None):
+        """Return the unique article links of all configured category pages."""
+        urls = self.category_url
+        if isinstance(urls, str):
+            urls = [urls]
+        elif not isinstance(urls, list):
+            urls = []
+        links = []
+        for url in urls:
+            links.extend(self._get_category(url, excludes))
+        return list(set(links))
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: The Spinoff")
+    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    excludes = [
+        'thespinoff.co.nz/sitemap-misc.xml',
+        'thespinoff.co.nz/sitemap-authors.xml',
+        'thespinoff.co.nz/sitemap-tax-category.xml',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    # The feed can legitimately be empty (network error, nothing recent).
+    if posts:
+        print(site.story(posts[0]))
+
+    print("Sitemap: Newshub")
+    site = Sitemap([
+        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+    ])
+    posts = site.feed()
+    print(posts[:5])
+    if posts:
+        print(site.story(posts[0]))
+        # story() expects a single URL, so index the last post, not a slice.
+        print(site.story(posts[-1]))
+
--- a/apiserver/feeds/sitemap.py
+++ b/apiserver/feeds/sitemap.py
@ -1,99 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-from datetime import datetime
-from bs4 import BeautifulSoup
-
-import settings
-from utils import clean
-from misc.time import unix
-from misc.api import xml
-from misc.news import Base
-
-def _get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    if a.find('ns2:publication_date'):
-        return a.find('ns2:publication_date').text
-    return ''
-
-def _filter_links(links, excludes=None):
-    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
-    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
-    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
-    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
-
-    links = [x.find('loc').text for x in links] or []
-    links = list(set(links))
-    if excludes:
-        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-    return links
-
-def _get_sitemap(feed_url, excludes=None):
-    markup = xml(lambda x: feed_url)
-    if not markup: return []
-    soup = BeautifulSoup(markup, features='lxml')
-    links = []
-    feed_urls = []
-    if soup.find('sitemapindex'):
-        sitemap = soup.find('sitemapindex').findAll('sitemap')
-        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-    if soup.find('urlset'):
-        sitemap = soup.find('urlset').findAll('url')
-        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-
-    feed_urls = _filter_links(feed_urls, excludes)
-    links = _filter_links(links, excludes)
-
-    for url in feed_urls:
-        links += _get_sitemap(url, excludes)
-    return list(set(links))
-
-class Sitemap(Base):
-    def __init__(self, config):
-        self.config = config
-        self.sitemap_url = config.get('url')
-        self.tz = config.get('tz')
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.sitemap_url, str):
-            links += _get_sitemap(self.sitemap_url, excludes)
-        elif isinstance(self.sitemap_url, list):
-            for url in self.sitemap_url:
-                links += _get_sitemap(url, excludes)
-        links = list(set(links))
-        return [(self.get_id(link), link) for link in links]
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
-    excludes = [
-        'thespinoff.co.nz/sitemap-misc.xml',
-        'thespinoff.co.nz/sitemap-authors.xml',
-        'thespinoff.co.nz/sitemap-tax-category.xml',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
--- a/apiserver/misc/api.py
+++ b/apiserver/misc/api.py
@ -1,35 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-import requests
-
-USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-FORWARD_IP = '66.249.66.1'
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-def json(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
--- a/apiserver/misc/metadata.py
+++ b/apiserver/misc/metadata.py
@ -1,69 +0,0 @@
-
-def parse_extruct(s, data):
-    rdfa_keys = {
-        'title': [
-            'http://ogp.me/ns#title',
-            'https://ogp.me/ns#title',
-        ],
-        'date': [
-            'http://ogp.me/ns/article#modified_time',
-            'https://ogp.me/ns/article#modified_time',
-            'http://ogp.me/ns/article#published_time',
-            'https://ogp.me/ns/article#published_time',
-        ]
-    }
-    for rdfa in data['rdfa']:
-        for key, props in rdfa.items():
-            for attribute, properties in rdfa_keys.items():
-                for prop in properties:
-                    if prop in props:
-                        for values in props[prop]:
-                            s[attribute] = values['@value']
-
-    for og in data['opengraph']:
-        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
-        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
-        if len(modified):
-            s['date'] = modified[0]
-        if len(published):
-            s['date'] = published[0]
-        if len(titles):
-            s['title'] = titles[0]
-
-    for md in data['microdata']:
-        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
-            props = md['properties']
-            s['title'] = props['headline']
-            if props['dateModified']:
-                s['date'] = props['dateModified']
-            if props['datePublished']:
-                s['date'] = props['datePublished']
-            if 'author' in props and props['author']:
-                if 'properties' in props['author']:
-                    s['author'] = props['author']['properties']['name']
-                elif isinstance(props['author'], list):
-                    s['author'] = props['author'][0]['properties']['name']
-
-    for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
-            s['title'] = ld['headline']
-            if ld['dateModified']:
-                s['date'] = ld['dateModified']
-            if ld['datePublished']:
-                s['date'] = ld['datePublished']
-            if 'author' in ld and ld['author']:
-                if 'name' in ld['author']:
-                    s['author'] = ld['author']['name']
-                elif isinstance(ld['author'], list):
-                    s['author'] = ld['author'][0]['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']
-
-    return s
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@ -1,101 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-import re
-import requests
-from bs4 import BeautifulSoup
-from scrapers import declutter
-import extruct
-
-import settings
-from utils import clean
-from misc.metadata import parse_extruct
-from misc.time import unix
-from misc.api import xml
-
-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class Base:
-    def __init__(config):
-        self.config = config
-        self.url = config.get('url')
-        self.tz = config.get('tz')
-
-    def get_id(self, link):
-        patterns = self.config.get('patterns')
-        if not patterns:
-            return link
-        patterns = [re.compile(p) for p in patterns]
-        patterns = list(filter(None, [p.match(link) for p in patterns]))
-        patterns = list(set([':'.join(p.groups()) for p in patterns]))
-        if not patterns:
-            return link
-        return patterns[0]
-
-    def feed(self, excludes=None):
-        return []
-
-    def story(self, ref, urlref):
-        if urlref is None:
-            return False
-        markup = xml(lambda x: urlref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = urlref
-        s['url'] = urlref
-        s['date'] = 0
-
-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(urlref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
-        if icons:
-            s['icon'] = icons[0]
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(urlref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-        return s
--- a/apiserver/misc/time.py
+++ b/apiserver/misc/time.py
@ -1,18 +0,0 @@
-import pytz
-import dateutil.parser
-
-
-TZINFOS = {
-    'NZDT': pytz.timezone('Pacific/Auckland'),
-    'NZST': pytz.timezone('Pacific/Auckland')
-}
-
-def unix(date_str, tz=None, tzinfos=TZINFOS):
-    try:
-        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
--- a/apiserver/server.py
+++ b/apiserver/server.py
@ -145,12 +145,12 @@ def static_story(sid):
 http_server = WSGIServer(('', 33842), flask_app)

 def _add_new_refs():
-    for ref, source, urlref in feed.get_list():
+    for ref, source in feed.get_list():
        if database.get_story_by_ref(ref):
            continue
        try:
            nid = new_id()
-            database.put_ref(ref, nid, source, urlref)
+            database.put_ref(ref, nid, source)
            logging.info('Added ref ' + ref)
        except database.IntegrityError:
            continue
@ -163,7 +163,7 @@ def _update_current_story(item):

    logging.info('Updating story: {}'.format(str(story['ref'])))

-    valid = feed.update_story(story, urlref=item['urlref'])
+    valid = feed.update_story(story)
    if valid:
        database.put_story(story)
        search.put_story(story)
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@ -13,43 +13,15 @@ NUM_TILDES = 5
 NUM_SUBSTACK = 10

 SITEMAP = {}
-# SITEMAP['nzherald'] = {
-#     'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
-#     'count': 20,
-#     'patterns': [
-#         r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
-#     ],
-#     'excludes': [
-#         'driven.co.nz',
-#         'oneroof.co.nz',
-#         'nzherald.co.nz/sponsored-stories',
-#         'nzherald.co.nz/entertainment/',
-#         'nzherald.co.nz/lifestyle/',
-#         'nzherald.co.nz/travel/',
-#         'nzherald.co.nz/sport/',
-#         'nzherald.co.nz/promotions/',
-#         'nzherald.co.nzhttp',
-#         'herald-afternoon-quiz',
-#         'herald-morning-quiz'
-#     ],
-# }
+# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10}
+# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10}

 SUBSTACK = {}
 # SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
 # SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},

 CATEGORY = {}
-# CATEGORY['radionz'] = {
-#     'url': "https://www.rnz.co.nz/news/",
-#     'count': 20,
-#     'patterns': [
-#         r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
-#     ],
-#     'excludes': [
-#         'rnz.co.nz/news/sport',
-#         'rnz.co.nz/weather',
-#     ],
-# }
+# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10}

 SCRAPERS = ['browser', 'declutter', 'outline', 'local']

--- a/readerserver/scraper/browser/scripts/cosmetic-filters.js
+++ b/readerserver/scraper/browser/scripts/cosmetic-filters.js
@ -39,9 +39,6 @@
 	if (matchDomain(["tvnz.co.nz"])) {
 		removeSelectors([".signup-container container"]);
 	}
-	if (matchDomain(["thespinoff.co.nz"])) {
-		removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
-	}

 	function matchDomain(domains) {
 		const hostname = window.location.hostname;