fix mistake.

add regex to get a unique ref from each sitemap/category based article url.
cosmetic filters for the spinoff.
2020-11-17 12:54:54 +13:00 · 2020-11-17 12:38:28 +13:00 · 2020-11-16 16:49:39 +13:00 · 2020-11-16 15:41:09 +13:00 · 2020-11-16 15:30:33 +13:00
12 changed files with 450 additions and 329 deletions
--- a/apiserver/database.py
+++ b/apiserver/database.py
@@ -24,6 +24,7 @@ class Reflist(Base):

    rid = Column(Integer, primary_key=True)
    ref = Column(String(16), unique=True)
+    urlref = Column(String)
    sid = Column(String, ForeignKey('stories.sid'), unique=True)
    source = Column(String(16))

@@ -75,7 +76,7 @@ def get_stories_by_url(url):
 def get_reflist():
    session = Session()
    q = session.query(Reflist).order_by(Reflist.rid.desc())
-    return [dict(ref=x.ref, sid=x.sid, source=x.source) for x in q.all()]
+    return [dict(ref=x.ref, sid=x.sid, source=x.source, urlref=x.urlref) for x in q.all()]

 def get_stories(maxage=60*60*24*2):
    time = datetime.now().timestamp() - maxage
@@ -87,10 +88,10 @@ def get_stories(maxage=60*60*24*2):
            order_by(Story.meta['date'].desc())
    return [x[1] for x in q]

-def put_ref(ref, sid, source):
+def put_ref(ref, sid, source, urlref):
    try:
        session = Session()
-        r = Reflist(ref=ref, sid=sid, source=source)
+        r = Reflist(ref=ref, sid=sid, source=source, urlref=urlref)
        session.add(r)
        session.commit()
    except:
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -9,7 +9,9 @@ from bs4 import BeautifulSoup
 import itertools

 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, news
+from feeds import hackernews, reddit, tildes, substack, manual
+from feeds.sitemap import Sitemap
+from feeds.category import Category
 from scrapers import outline, declutter, browser, local

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -19,40 +21,40 @@ for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'], value.get('tz'))
+    categories[key] = Category(value)
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = Sitemap(value)

 def get_list():
    feeds = {}

    if settings.NUM_HACKERNEWS:
-        feeds['hackernews'] = [(x, 'hackernews') for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]
+        feeds['hackernews'] = [(x, 'hackernews', x) for x in hackernews.feed()[:settings.NUM_HACKERNEWS]]

    if settings.NUM_REDDIT:
-        feeds['reddit'] = [(x, 'reddit') for x in reddit.feed()[:settings.NUM_REDDIT]]
+        feeds['reddit'] = [(x, 'reddit', x) for x in reddit.feed()[:settings.NUM_REDDIT]]

    if settings.NUM_TILDES:
-        feeds['tildes'] = [(x, 'tildes') for x in tildes.feed()[:settings.NUM_TILDES]]
+        feeds['tildes'] = [(x, 'tildes', x) for x in tildes.feed()[:settings.NUM_TILDES]]

    if settings.NUM_SUBSTACK:
-        feeds['substack'] = [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
+        feeds['substack'] = [(x, 'substack', x) for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
        count = settings.SUBSTACK[key]['count']
-        feeds[key] = [(x, key) for x in publication.feed()[:count]]
+        feeds[key] = [(x, key, x) for x in publication.feed()[:count]]

    for key, sites in categories.items():
        count = settings.CATEGORY[key].get('count') or 0
        excludes = settings.CATEGORY[key].get('excludes')
        tz = settings.CATEGORY[key].get('tz')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    for key, sites in sitemaps.items():
        count = settings.SITEMAP[key].get('count') or 0
        excludes = settings.SITEMAP[key].get('excludes')
-        feeds[key] = [(x, key) for x in sites.feed(excludes)[:count]]
+        feeds[key] = [(x, key, u) for x, u in sites.feed(excludes)[:count]]

    values = feeds.values()
    feed = itertools.chain.from_iterable(itertools.zip_longest(*values, fillvalue=None))
@@ -99,7 +101,7 @@ def get_content_type(url):
    except:
        return ''

-def update_story(story, is_manual=False):
+def update_story(story, is_manual=False, urlref=None):
    res = {}

    if story['source'] == 'hackernews':
@@ -111,9 +113,9 @@ def update_story(story, is_manual=False):
    elif story['source'] == 'substack':
        res = substack.top.story(story['ref'])
    elif story['source'] in categories.keys():
-        res = categories[story['source']].story(story['ref'])
+        res = categories[story['source']].story(story['ref'], urlref)
    elif story['source'] in sitemaps.keys():
-        res = sitemaps[story['source']].story(story['ref'])
+        res = sitemaps[story['source']].story(story['ref'], urlref)
    elif story['source'] in substacks.keys():
        res = substacks[story['source']].story(story['ref'])
    elif story['source'] == 'manual':
--- a/apiserver/feeds/category.py
+++ b/apiserver/feeds/category.py
@@ -0,0 +1,72 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.api import xml
+from misc.news import Base
+
+def _filter_links(links, category_url, excludes=None):
+    """Keep only the unique article links that live under a category page.
+
+    A link is kept when it starts with `category_url` without being the
+    category page itself, and contains none of the `excludes` substrings.
+    The order of the returned list is unspecified (a set is used to dedupe).
+    """
+    candidates = {
+        link for link in links
+        if link.startswith(category_url) and link != category_url
+    }
+    if not excludes:
+        return list(candidates)
+    return [link for link in candidates if not any(e in link for e in excludes)]
+
+def _get_category(category_url, excludes=None):
+    """Download a category page and return the article links found on it.
+
+    Site-relative hrefs (starting with '/') are made absolute using the
+    scheme and host of `category_url`; the result is then narrowed by
+    _filter_links(). Returns [] when the page could not be fetched.
+    """
+    # "https://host/some/path" -> "https://host"
+    origin = '/'.join(category_url.split('/')[:3])
+    markup = xml(lambda x: category_url)
+    if not markup:
+        return []
+    hrefs = []
+    for anchor in BeautifulSoup(markup, features='html.parser').find_all('a', href=True):
+        href = anchor.get('href')
+        hrefs.append(f"{origin}{href}" if href.startswith('/') else href)
+    return _filter_links(hrefs, category_url, excludes)
+
+class Category(Base):
+    """Feed of article links scraped from one or more category pages."""
+
+    def __init__(self, config):
+        # config is the per-source dict from settings.CATEGORY
+        self.config = config
+        self.tz = config.get('tz')
+        self.category_url = config.get('url')
+
+    def feed(self, excludes=None):
+        """Return a list of (ref, url) pairs for every unique article link.
+
+        The configured 'url' may be a single string or a list of strings;
+        any other value yields an empty feed.
+        """
+        configured = self.category_url
+        if isinstance(configured, str):
+            pages = [configured]
+        elif isinstance(configured, list):
+            pages = configured
+        else:
+            pages = []
+        found = set()
+        for page in pages:
+            found.update(_get_category(page, excludes))
+        return [(self.get_id(link), link) for link in found]
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Category: RadioNZ")
+    # Category takes a settings-style config dict, not a bare url
+    site = Category({'url': "https://www.rnz.co.nz/news/"})
+    excludes = [
+        'rnz.co.nz/news/sport',
+        'rnz.co.nz/weather',
+        'rnz.co.nz/news/weather',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    if posts:
+        # feed() yields (ref, url) pairs and story() needs both
+        print(site.story(*posts[0]))
+
+    print("Category: Newsroom")
+    site = Category({'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
+    posts = site.feed()
+    print(posts[:5])
+    if posts:
+        print(site.story(*posts[0]))
+
+
--- a/apiserver/feeds/news.py
+++ b/apiserver/feeds/news.py
@@ -1,307 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-import requests
-from datetime import datetime
-from bs4 import BeautifulSoup
-from scrapers import declutter
-import dateutil.parser
-import extruct
-import pytz
-
-from utils import clean
-import settings
-
-tzinfos = {
-    'NZDT': pytz.timezone('Pacific/Auckland'),
-    'NZST': pytz.timezone('Pacific/Auckland')
-}
-
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
-#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-def unix(date_str, tz=None):
-    try:
-        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
-
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-
-def parse_extruct(s, data):
-    rdfa_keys = {
-        'title': [
-            'http://ogp.me/ns#title',
-            'https://ogp.me/ns#title',
-        ],
-        'date': [
-            'http://ogp.me/ns/article#modified_time',
-            'https://ogp.me/ns/article#modified_time',
-            'http://ogp.me/ns/article#published_time',
-            'https://ogp.me/ns/article#published_time',
-        ]
-    }
-    for rdfa in data['rdfa']:
-        for key, props in rdfa.items():
-            for attribute, properties in rdfa_keys.items():
-                for prop in properties:
-                    if prop in props:
-                        for values in props[prop]:
-                            s[attribute] = values['@value']
-
-    for og in data['opengraph']:
-        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
-        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
-        if len(modified):
-            s['date'] = modified[0]
-        if len(published):
-            s['date'] = published[0]
-        if len(titles):
-            s['title'] = titles[0]
-
-    for md in data['microdata']:
-        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
-            props = md['properties']
-            s['title'] = props['headline']
-            if props['dateModified']:
-                s['date'] = props['dateModified']
-            if props['datePublished']:
-                s['date'] = props['datePublished']
-            if 'author' in props and props['author']:
-                if 'properties' in props['author']:
-                    s['author'] = props['author']['properties']['name']
-                elif isinstance(props['author'], list):
-                    s['author'] = props['author'][0]['properties']['name']
-
-    for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
-            s['title'] = ld['headline']
-            if ld['dateModified']:
-                s['date'] = ld['dateModified']
-            if ld['datePublished']:
-                s['date'] = ld['datePublished']
-            if 'author' in ld and ld['author']:
-                if 'name' in ld['author']:
-                    s['author'] = ld['author']['name']
-                elif isinstance(ld['author'], list):
-                    s['author'] = ld['author'][0]['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']
-
-    return s
-
-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class _Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
-
-    def feed(self, excludes=None):
-        return []
-
-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(ref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
-        if icons:
-            s['icon'] = icons[0]
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(ref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-        return s
-
-def get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    if a.find('ns2:publication_date'):
-        return a.find('ns2:publication_date').text
-    return ''
-
-class Sitemap(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.sitemap_url, str):
-            links += self._get_sitemap(self.sitemap_url, excludes)
-        elif isinstance(self.sitemap_url, list):
-            for url in self.sitemap_url:
-                links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-    def _filter_links(self, links, excludes=None):
-        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
-        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
-        links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
-        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
-
-        links = [x.find('loc').text for x in links] or []
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_sitemap(self, feed_url, excludes=None):
-        markup = xml(lambda x: feed_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        links = []
-        feed_urls = []
-        if soup.find('sitemapindex'):
-            sitemap = soup.find('sitemapindex').findAll('sitemap')
-            feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-        if soup.find('urlset'):
-            sitemap = soup.find('urlset').findAll('url')
-            links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-
-        feed_urls = self._filter_links(feed_urls, excludes)
-        links = self._filter_links(links, excludes)
-
-        for url in feed_urls:
-            links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-class Category(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.category_url = url
-
-    def _filter_links(self, links, category_url, excludes=None):
-        links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
-        links = list(filter(None, [link if link != category_url else None for link in links]))
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_category(self, category_url, excludes=None):
-        base_url = '/'.join(category_url.split('/')[:3])
-        markup = xml(lambda x: category_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='html.parser')
-        links = soup.find_all('a', href=True)
-        links = [link.get('href') for link in links]
-        links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
-        links = self._filter_links(links, category_url, excludes)
-        return links
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.category_url, str):
-            links += self._get_category(self.category_url, excludes)
-        elif isinstance(self.category_url, list):
-            for url in self.category_url:
-                links += self._get_category(url, excludes)
-        return list(set(links))
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
-    excludes = [
-        'thespinoff.co.nz/sitemap-misc.xml',
-        'thespinoff.co.nz/sitemap-authors.xml',
-        'thespinoff.co.nz/sitemap-tax-category.xml',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
-
--- a/apiserver/feeds/sitemap.py
+++ b/apiserver/feeds/sitemap.py
@@ -0,0 +1,99 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.time import unix
+from misc.api import xml
+from misc.news import Base
+
+def _get_sitemap_date(a):
+    """Return the text of the first date element found in a sitemap entry.
+
+    Looks for <lastmod>, then <news:publication_date>, then
+    <ns2:publication_date>; returns '' when the entry has none of them.
+    """
+    for tag_name in ('lastmod', 'news:publication_date', 'ns2:publication_date'):
+        node = a.find(tag_name)
+        if node:
+            return node.text
+    return ''
+
+def _filter_links(links, excludes=None):
+    """Turn sitemap entries into a list of recent, unique urls, newest first.
+
+    `links` are parsed <url> or <sitemap> elements. Entries without a date,
+    or whose date is older than settings.MAX_STORY_AGE seconds, are dropped,
+    as are urls containing any of the `excludes` substrings.
+    """
+    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
+
+    # Parse every entry's date exactly once instead of once per pass.
+    dated = []
+    for entry in links:
+        date_str = _get_sitemap_date(entry)
+        if not date_str:
+            continue
+        timestamp = unix(date_str)
+        if timestamp > too_old:
+            dated.append((timestamp, entry))
+    dated.sort(key=lambda pair: pair[0], reverse=True)
+
+    urls = [entry.find('loc').text for _, entry in dated]
+    # dict.fromkeys() dedupes while keeping the newest-first order; going
+    # through set() here used to throw the sort away.
+    urls = list(dict.fromkeys(urls))
+    if excludes:
+        urls = [url for url in urls if not any(e in url for e in excludes)]
+    return urls
+
+def _get_sitemap(feed_url, excludes=None):
+    """Fetch a sitemap and return the unique article urls it lists.
+
+    A <sitemapindex> is followed recursively into each nested sitemap that
+    survives _filter_links(); a <urlset> contributes its own urls. Returns []
+    when the document could not be fetched.
+    """
+    markup = xml(lambda x: feed_url)
+    if not markup:
+        return []
+    soup = BeautifulSoup(markup, features='lxml')
+
+    def entries_with_loc(container_name, entry_name):
+        # children of the named container that carry a <loc> element
+        container = soup.find(container_name)
+        if not container:
+            return []
+        return [node for node in container.find_all(entry_name) if node.find('loc')]
+
+    nested_sitemaps = _filter_links(entries_with_loc('sitemapindex', 'sitemap'), excludes)
+    found = _filter_links(entries_with_loc('urlset', 'url'), excludes)
+
+    for nested_url in nested_sitemaps:
+        found += _get_sitemap(nested_url, excludes)
+    return list(set(found))
+
+class Sitemap(Base):
+    """Feed of article links collected from one or more XML sitemaps."""
+
+    def __init__(self, config):
+        # config is the per-source dict from settings.SITEMAP
+        self.config = config
+        self.tz = config.get('tz')
+        self.sitemap_url = config.get('url')
+
+    def feed(self, excludes=None):
+        """Return a list of (ref, url) pairs for every unique article link.
+
+        The configured 'url' may be a single string or a list of strings;
+        any other value yields an empty feed.
+        """
+        configured = self.sitemap_url
+        if isinstance(configured, str):
+            sitemaps = [configured]
+        elif isinstance(configured, list):
+            sitemaps = configured
+        else:
+            sitemaps = []
+        found = set()
+        for sitemap in sitemaps:
+            found.update(_get_sitemap(sitemap, excludes))
+        return [(self.get_id(link), link) for link in found]
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: The Spinoff")
+    # Sitemap takes a settings-style config dict, not a bare url
+    site = Sitemap({'url': "https://thespinoff.co.nz/sitemap.xml"})
+    excludes = [
+        'thespinoff.co.nz/sitemap-misc.xml',
+        'thespinoff.co.nz/sitemap-authors.xml',
+        'thespinoff.co.nz/sitemap-tax-category.xml',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    if posts:
+        # feed() yields (ref, url) pairs and story() needs both
+        print(site.story(*posts[0]))
+
+    print("Sitemap: Newshub")
+    site = Sitemap({'url': [
+        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+    ]})
+    posts = site.feed()
+    print(posts[:5])
+    if posts:
+        print(site.story(*posts[0]))
+        # last post, not the posts[:-1] slice (a list is not a ref)
+        print(site.story(*posts[-1]))
--- a/apiserver/misc/api.py
+++ b/apiserver/misc/api.py
@@ -0,0 +1,35 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+FORWARD_IP = '66.249.66.1'
+
+def xml(route, ref=None):
+    """GET the url produced by route(ref) and return the response body text.
+
+    Any failure, including a non-200 status, is logged and reported by
+    returning False; only KeyboardInterrupt is allowed to propagate.
+    """
+    try:
+        request_headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        response = requests.get(route(ref), headers=request_headers, timeout=5)
+        if response.status_code == 200:
+            return response.text
+        raise Exception('Bad response code ' + str(response.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as err:
+        logging.error('Problem hitting URL: {}'.format(str(err)))
+        return False
+
+def json(route, ref=None):
+    """GET the url produced by route(ref) and return the decoded JSON body.
+
+    Any failure, including a non-200 status or an undecodable body, is
+    logged and reported by returning False; only KeyboardInterrupt is
+    allowed to propagate.
+    """
+    try:
+        request_headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        response = requests.get(route(ref), headers=request_headers, timeout=5)
+        if response.status_code == 200:
+            return response.json()
+        raise Exception('Bad response code ' + str(response.status_code))
+    except KeyboardInterrupt:
+        raise
+    except BaseException as err:
+        logging.error('Problem hitting URL: {}'.format(str(err)))
+        return False
--- a/apiserver/misc/metadata.py
+++ b/apiserver/misc/metadata.py
@@ -0,0 +1,69 @@
+
+def _first_og_value(properties, name):
+    """Return the first non-empty opengraph value whose key contains `name`."""
+    return next((value for key, value in properties if name in key and value), None)
+
+
+def _author_name(author, wrapper=None):
+    """Return the name of the (first) author entry, or None if unavailable.
+
+    `author` may be a single mapping or a list of mappings. When `wrapper`
+    is given the name is read from author[wrapper]['name'] (microdata nests
+    values under 'properties'), otherwise from author['name'].
+    """
+    if isinstance(author, list):
+        author = author[0] if author else None
+    if not isinstance(author, dict):
+        return None
+    if wrapper is not None:
+        author = author.get(wrapper)
+        if not isinstance(author, dict):
+            return None
+    return author.get('name')
+
+
+def _apply_article(s, node):
+    """Copy the headline and dates of an article-like mapping into `s`."""
+    if node.get('headline'):
+        s['title'] = node['headline']
+    # datePublished is applied last so it wins over dateModified.
+    for key in ('dateModified', 'datePublished'):
+        if node.get(key):
+            s['date'] = node[key]
+
+
+def parse_extruct(s, data):
+    """Fill story dict `s` with title/date/author found in extracted metadata.
+
+    `data` is a mapping with optional 'rdfa', 'opengraph', 'microdata' and
+    'json-ld' lists. The sections are applied in that order, so later
+    sections override earlier ones. Missing sections or missing fields are
+    skipped instead of raising KeyError. Returns `s` (mutated in place).
+    """
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    article_types = ('Article', 'NewsArticle')
+    microdata_types = ('https://schema.org/NewsArticle', 'http://schema.org/NewsArticle')
+
+    for rdfa in data.get('rdfa') or []:
+        for props in rdfa.values():
+            if not isinstance(props, dict):
+                continue
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    for values in props.get(prop) or []:
+                        if isinstance(values, dict) and '@value' in values:
+                            s[attribute] = values['@value']
+
+    for og in data.get('opengraph') or []:
+        properties = og.get('properties') or []
+        # published time takes precedence over modified time
+        date = (_first_og_value(properties, 'article:published_time')
+                or _first_og_value(properties, 'article:modified_time'))
+        title = _first_og_value(properties, 'og:title')
+        if date:
+            s['date'] = date
+        if title:
+            s['title'] = title
+
+    for md in data.get('microdata') or []:
+        if md.get('type') not in microdata_types:
+            continue
+        props = md.get('properties') or {}
+        _apply_article(s, props)
+        name = _author_name(props.get('author'), wrapper='properties')
+        if name is not None:
+            s['author'] = name
+
+    for ld in data.get('json-ld') or []:
+        if not isinstance(ld, dict):
+            continue
+        if ld.get('@type') in article_types:
+            _apply_article(s, ld)
+            name = _author_name(ld.get('author'))
+            if name is not None:
+                s['author'] = name
+        for gld in ld.get('@graph') or []:
+            if isinstance(gld, dict) and gld.get('@type') in article_types:
+                _apply_article(s, gld)
+
+    return s
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -0,0 +1,101 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import re
+import requests
+from bs4 import BeautifulSoup
+from scrapers import declutter
+import extruct
+
+import settings
+from utils import clean
+from misc.metadata import parse_extruct
+from misc.time import unix
+from misc.api import xml
+
+def comment(i):
+    """Convert a raw comment mapping into the story comment format.
+
+    Returns False when `i` has no 'author' key; otherwise a dict with
+    author, score, date (unix timestamp), cleaned text and the converted
+    children, with unconvertible children dropped.
+    """
+    if 'author' not in i:
+        return False
+
+    return {
+        'author': i.get('author', ''),
+        'score': i.get('points', 0),
+        'date': unix(i.get('date', 0)),
+        'text': clean(i.get('text', '') or ''),
+        'comments': [reply for reply in map(comment, i['children']) if reply],
+    }
+
+def comment_count(i):
+    """Count this comment (if it has an author) plus all nested replies."""
+    total = 1 if i['author'] else 0
+    for reply in i['comments']:
+        total += comment_count(reply)
+    return total
+
+class Base:
+    """Shared behaviour for news sources that are discovered by url.
+
+    Subclasses provide feed(); this class derives a stable ref from an
+    article url (get_id) and scrapes an article page into a story dict.
+    """
+
+    def __init__(self, config):
+        # `self` was missing from the signature, so Base(config) raised.
+        self.config = config
+        self.url = config.get('url')
+        self.tz = config.get('tz')
+
+    def get_id(self, link):
+        """Return a short unique ref for `link`.
+
+        Each regex in config['patterns'] is tried in order with re.match();
+        the first one that matches and captures something yields its groups
+        joined with ':'. Falls back to `link` itself when nothing matches.
+        """
+        for pattern in self.config.get('patterns') or []:
+            found = re.match(pattern, link)
+            if not found:
+                continue
+            # optional groups that did not participate are None
+            groups = [group for group in found.groups() if group is not None]
+            if groups:
+                return ':'.join(groups)
+        return link
+
+    def feed(self, excludes=None):
+        """Return the list of (ref, url) pairs; overridden by subclasses."""
+        return []
+
+    @staticmethod
+    def _find_icon(soup, page_url):
+        """Return the preferred favicon url of a parsed page, or None.
+
+        Preference order: 32x32 icon, 16x16 icon, shortcut icon, any icon.
+        """
+        from urllib.parse import urljoin
+
+        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+        favicon = soup.find_all('link', rel="shortcut icon", href=True)
+        others = soup.find_all('link', rel="icon", href=True)
+        # dict.fromkeys() dedupes without losing the preference order
+        hrefs = list(dict.fromkeys(
+            i.get('href') for i in icon32 + icon16 + favicon + others))
+        if not hrefs:
+            return None
+        # urljoin copes with absolute, site-relative, page-relative and
+        # protocol-relative hrefs alike
+        return urljoin(page_url, hrefs[0])
+
+    def story(self, ref, urlref):
+        """Scrape the article at `urlref` into a story dict.
+
+        Returns False when `urlref` is None, the page cannot be fetched, or
+        no publication date could be determined. `ref` is accepted so all
+        sources share one call signature; the page is always read from
+        `urlref`.
+        """
+        if urlref is None:
+            return False
+        markup = xml(lambda x: urlref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = urlref
+        s['url'] = urlref
+        s['date'] = 0
+
+        soup = BeautifulSoup(markup, features='html.parser')
+        icon = self._find_icon(soup, urlref)
+        if icon:
+            s['icon'] = icon
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            # Comments are best-effort: a failure must not lose the story.
+            try:
+                comments = list(filter(bool, declutter.get_comments(urlref) or []))
+                # comment_count() takes a single comment, so sum over the list.
+                # NOTE(review): assumes each comment carries 'author' and
+                # 'comments' keys - confirm against declutter.get_comments.
+                num_comments = sum(comment_count(c) for c in comments)
+                s['comments'] = comments
+                s['num_comments'] = num_comments
+            except KeyboardInterrupt:
+                raise
+            except BaseException as e:
+                logging.error('Problem getting comments: {}'.format(str(e)))
+
+        if not s['date']:
+            return False
+        return s
--- a/apiserver/misc/time.py
+++ b/apiserver/misc/time.py
@@ -0,0 +1,18 @@
+import pytz
+import dateutil.parser
+
+
+# Maps timezone abbreviations that may appear in date strings to the
+# Pacific/Auckland zone; used as the default `tzinfos` for unix().
+# NOTE(review): pytz zones normally need localize() rather than being attached
+# to a datetime directly - confirm the offsets the parser produces from these.
+TZINFOS = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+def unix(date_str, tz=None, tzinfos=TZINFOS):
+    """Parse a date string and return it as an integer unix timestamp.
+
+    `tz` names the timezone to assume when the string carries no timezone
+    of its own; it is ignored for strings that already specify one.
+    `tzinfos` maps timezone abbreviations to tzinfo objects for the parser.
+    Returns 0 when the value cannot be parsed or converted.
+    """
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        if dt.tzinfo is None:
+            if tz:
+                dt = pytz.timezone(tz).localize(dt)
+        elif hasattr(dt.tzinfo, 'localize'):
+            # A pytz zone taken from `tzinfos` is attached by the parser
+            # without localize(), which leaves it on the zone's LMT offset;
+            # re-localize the wall-clock time to get the real offset.
+            dt = dt.tzinfo.localize(dt.replace(tzinfo=None))
+        # Calling localize() on an already-aware datetime raises ValueError,
+        # which previously turned every such date into 0.
+        return int(dt.timestamp())
+    except Exception:
+        return 0
--- a/apiserver/server.py
+++ b/apiserver/server.py
@@ -145,12 +145,12 @@ def static_story(sid):
 http_server = WSGIServer(('', 33842), flask_app)

 def _add_new_refs():
-    for ref, source in feed.get_list():
+    for ref, source, urlref in feed.get_list():
        if database.get_story_by_ref(ref):
            continue
        try:
            nid = new_id()
-            database.put_ref(ref, nid, source)
+            database.put_ref(ref, nid, source, urlref)
            logging.info('Added ref ' + ref)
        except database.IntegrityError:
            continue
@@ -163,7 +163,7 @@ def _update_current_story(item):

    logging.info('Updating story: {}'.format(str(story['ref'])))

-    valid = feed.update_story(story)
+    valid = feed.update_story(story, urlref=item['urlref'])
    if valid:
        database.put_story(story)
        search.put_story(story)
--- a/apiserver/settings.py.example
+++ b/apiserver/settings.py.example
@@ -13,15 +13,43 @@ NUM_TILDES = 5
 NUM_SUBSTACK = 10

 SITEMAP = {}
-# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10},
-# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10},
+# SITEMAP['nzherald'] = {
+#     'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
+#     'count': 20,
+#     'patterns': [
+#         r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
+#     ],
+#     'excludes': [
+#         'driven.co.nz',
+#         'oneroof.co.nz',
+#         'nzherald.co.nz/sponsored-stories',
+#         'nzherald.co.nz/entertainment/',
+#         'nzherald.co.nz/lifestyle/',
+#         'nzherald.co.nz/travel/',
+#         'nzherald.co.nz/sport/',
+#         'nzherald.co.nz/promotions/',
+#         'nzherald.co.nzhttp',
+#         'herald-afternoon-quiz',
+#         'herald-morning-quiz'
+#     ],
+# }

 SUBSTACK = {}
 # SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10},
 # SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10},

 CATEGORY = {}
-# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10},
+# CATEGORY['radionz'] = {
+#     'url': "https://www.rnz.co.nz/news/",
+#     'count': 20,
+#     'patterns': [
+#         r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
+#     ],
+#     'excludes': [
+#         'rnz.co.nz/news/sport',
+#         'rnz.co.nz/weather',
+#     ],
+# }

 SCRAPERS = ['browser', 'declutter', 'outline', 'local']

--- a/readerserver/scraper/browser/scripts/cosmetic-filters.js
+++ b/readerserver/scraper/browser/scripts/cosmetic-filters.js
@@ -39,6 +39,9 @@
 	if (matchDomain(["tvnz.co.nz"])) {
 		removeSelectors([".signup-container container"]);
 	}
+	if (matchDomain(["thespinoff.co.nz"])) {
+		removeSelectors([".the-spinoff-club-interruptive", ".bulletin-signup"]);
+	}

 	function matchDomain(domains) {
 		const hostname = window.location.hostname;
Author	SHA1	Message	Date
Jason Schwarzenberger	5668fa5dbc	fix mistake.	2020-11-17 12:54:54 +13:00
Jason Schwarzenberger	b771b52501	add regex to get a unique ref from each sitemap/category based article url.	2020-11-17 12:38:28 +13:00
Jason Schwarzenberger	f5c7a658ba	cosmetic filters for the spinoff.	2020-11-16 16:49:39 +13:00
Jason Schwarzenberger	f5ccd844da	fix import error.	2020-11-16 15:41:09 +13:00
Jason Schwarzenberger	6a91b9402f	split categories, sitemap and other crap out of news.py	2020-11-16 15:30:33 +13:00