forked from tanner/qotnews
Compare commits
5 Commits
b80c1a5cb5 ... 5668fa5dbc
Author | SHA1 | Date |
---|---|---|
Jason Schwarzenberger | 5668fa5dbc | 4 years ago |
Jason Schwarzenberger | b771b52501 | 4 years ago |
Jason Schwarzenberger | f5c7a658ba | 4 years ago |
Jason Schwarzenberger | f5ccd844da | 4 years ago |
Jason Schwarzenberger | 6a91b9402f | 4 years ago |
12 changed files with 450 additions and 329 deletions
@@ -0,0 +1,72 @@
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from bs4 import BeautifulSoup

import settings
from utils import clean
from misc.api import xml
from misc.news import Base


def _filter_links(links, category_url, excludes=None):
    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
    links = list(filter(None, [link if link != category_url else None for link in links]))
    links = list(set(links))
    if excludes:
        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
    return links


def _get_category(category_url, excludes=None):
    base_url = '/'.join(category_url.split('/')[:3])
    markup = xml(lambda x: category_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='html.parser')
    links = soup.find_all('a', href=True)
    links = [link.get('href') for link in links]
    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
    links = _filter_links(links, category_url, excludes)
    return links


class Category(Base):
    def __init__(self, config):
        self.config = config
        self.category_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        links = []
        if isinstance(self.category_url, str):
            links += _get_category(self.category_url, excludes)
        elif isinstance(self.category_url, list):
            for url in self.category_url:
                links += _get_category(url, excludes)
        links = list(set(links))
        return [(self.get_id(link), link) for link in links]


# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Category: RadioNZ")
    # Category now takes a config dict, and feed() returns (id, link) tuples
    site = Category({'url': "https://www.rnz.co.nz/news/"})
    excludes = [
        'rnz.co.nz/news/sport',
        'rnz.co.nz/weather',
        'rnz.co.nz/news/weather',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    print(site.story(*posts[0]))

    print("Category: Newsroom")
    site = Category({'url': "https://www.newsroom.co.nz/news/", 'tz': 'Pacific/Auckland'})
    posts = site.feed()
    print(posts[:5])
    print(site.story(*posts[0]))
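For reference, a minimal usage sketch of the refactored Category class, assuming the module above is importable; the config keys mirror what Category and the misc.news Base read, and the 'patterns' regex is a hypothetical example, not a real site config:

# Usage sketch (hypothetical config values).
config = {
    'url': 'https://www.rnz.co.nz/news/',                   # one category URL, or a list of them
    'tz': 'Pacific/Auckland',                                # optional timezone hint for date parsing
    'patterns': [r'https://www\.rnz\.co\.nz/news/[a-z-]+/(\d+)/'],  # hypothetical id-extraction regex
}
site = Category(config)
posts = site.feed(excludes=['rnz.co.nz/news/sport'])         # [(story_id, url), ...]
ref, urlref = posts[0]
story = site.story(ref, urlref)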
@@ -1,307 +0,0 @@
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

import requests
from datetime import datetime
from bs4 import BeautifulSoup
from scrapers import declutter
import dateutil.parser
import extruct
import pytz

from utils import clean
import settings

tzinfos = {
    'NZDT': pytz.timezone('Pacific/Auckland'),
    'NZST': pytz.timezone('Pacific/Auckland')
}

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"


def unix(date_str, tz=None):
    try:
        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
        if tz:
            dt = pytz.timezone(tz).localize(dt)
        return int(dt.timestamp())
    except:
        pass
    return 0


def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False


def parse_extruct(s, data):
    rdfa_keys = {
        'title': [
            'http://ogp.me/ns#title',
            'https://ogp.me/ns#title',
        ],
        'date': [
            'http://ogp.me/ns/article#modified_time',
            'https://ogp.me/ns/article#modified_time',
            'http://ogp.me/ns/article#published_time',
            'https://ogp.me/ns/article#published_time',
        ]
    }
    for rdfa in data['rdfa']:
        for key, props in rdfa.items():
            for attribute, properties in rdfa_keys.items():
                for prop in properties:
                    if prop in props:
                        for values in props[prop]:
                            s[attribute] = values['@value']

    for og in data['opengraph']:
        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
        if len(modified):
            s['date'] = modified[0]
        if len(published):
            s['date'] = published[0]
        if len(titles):
            s['title'] = titles[0]

    for md in data['microdata']:
        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
            props = md['properties']
            s['title'] = props['headline']
            if props['dateModified']:
                s['date'] = props['dateModified']
            if props['datePublished']:
                s['date'] = props['datePublished']
            if 'author' in props and props['author']:
                if 'properties' in props['author']:
                    s['author'] = props['author']['properties']['name']
                elif isinstance(props['author'], list):
                    s['author'] = props['author'][0]['properties']['name']

    for ld in data['json-ld']:
        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
            s['title'] = ld['headline']
            if ld['dateModified']:
                s['date'] = ld['dateModified']
            if ld['datePublished']:
                s['date'] = ld['datePublished']
            if 'author' in ld and ld['author']:
                if 'name' in ld['author']:
                    s['author'] = ld['author']['name']
                elif isinstance(ld['author'], list):
                    s['author'] = ld['author'][0]['name']
        if '@graph' in ld:
            for gld in ld['@graph']:
                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
                    s['title'] = gld['headline']
                    if gld['dateModified']:
                        s['date'] = gld['dateModified']
                    if gld['datePublished']:
                        s['date'] = gld['datePublished']

    return s


def comment(i):
    if 'author' not in i:
        return False

    c = {}
    c['author'] = i.get('author', '')
    c['score'] = i.get('points', 0)
    c['date'] = unix(i.get('date', 0))
    c['text'] = clean(i.get('text', '') or '')
    c['comments'] = [comment(j) for j in i['children']]
    c['comments'] = list(filter(bool, c['comments']))
    return c


def comment_count(i):
    alive = 1 if i['author'] else 0
    return sum([comment_count(c) for c in i['comments']]) + alive


class _Base:
    def __init__(url, tz=None):
        self.url = url
        self.tz = tz

    def feed(self, excludes=None):
        return []

    def story(self, ref):
        markup = xml(lambda x: ref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = ref
        s['url'] = ref
        s['date'] = 0

        soup = BeautifulSoup(markup, features='html.parser')
        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
        favicon = soup.find_all('link', rel="shortcut icon", href=True)
        others = soup.find_all('link', rel="icon", href=True)
        icons = icon32 + icon16 + favicon + others
        base_url = '/'.join(ref.split('/')[:3])
        icons = list(set([i.get('href') for i in icons]))
        icons = [i if i.startswith('http') else base_url + i for i in icons]

        if icons:
            s['icon'] = icons[0]

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(ref)
                c['comments'] = list(filter(bool, c['comments']))
                s['num_comments'] = comment_count(s['comments'])
            except KeyboardInterrupt:
                raise
            except:
                pass

        if not s['date']:
            return False
        return s


def get_sitemap_date(a):
    if a.find('lastmod'):
        return a.find('lastmod').text
    if a.find('news:publication_date'):
        return a.find('news:publication_date').text
    if a.find('ns2:publication_date'):
        return a.find('ns2:publication_date').text
    return ''


class Sitemap(_Base):
    def __init__(self, url, tz=None):
        self.tz = tz
        self.sitemap_url = url

    def feed(self, excludes=None):
        links = []
        if isinstance(self.sitemap_url, str):
            links += self._get_sitemap(self.sitemap_url, excludes)
        elif isinstance(self.sitemap_url, list):
            for url in self.sitemap_url:
                links += self._get_sitemap(url, excludes)
        return list(set(links))

    def _filter_links(self, links, excludes=None):
        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
        links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)

        links = [x.find('loc').text for x in links] or []
        links = list(set(links))
        if excludes:
            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
        return links

    def _get_sitemap(self, feed_url, excludes=None):
        markup = xml(lambda x: feed_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='lxml')
        links = []
        feed_urls = []
        if soup.find('sitemapindex'):
            sitemap = soup.find('sitemapindex').findAll('sitemap')
            feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
        if soup.find('urlset'):
            sitemap = soup.find('urlset').findAll('url')
            links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))

        feed_urls = self._filter_links(feed_urls, excludes)
        links = self._filter_links(links, excludes)

        for url in feed_urls:
            links += self._get_sitemap(url, excludes)
        return list(set(links))


class Category(_Base):
    def __init__(self, url, tz=None):
        self.tz = tz
        self.category_url = url

    def _filter_links(self, links, category_url, excludes=None):
        links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
        links = list(filter(None, [link if link != category_url else None for link in links]))
        links = list(set(links))
        if excludes:
            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
        return links

    def _get_category(self, category_url, excludes=None):
        base_url = '/'.join(category_url.split('/')[:3])
        markup = xml(lambda x: category_url)
        if not markup: return []
        soup = BeautifulSoup(markup, features='html.parser')
        links = soup.find_all('a', href=True)
        links = [link.get('href') for link in links]
        links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
        links = self._filter_links(links, category_url, excludes)
        return links

    def feed(self, excludes=None):
        links = []
        if isinstance(self.category_url, str):
            links += self._get_category(self.category_url, excludes)
        elif isinstance(self.category_url, list):
            for url in self.category_url:
                links += self._get_category(url, excludes)
        return list(set(links))


# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: The Spinoff")
    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
    excludes = [
        'thespinoff.co.nz/sitemap-misc.xml',
        'thespinoff.co.nz/sitemap-authors.xml',
        'thespinoff.co.nz/sitemap-tax-category.xml',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    print(site.story(posts[0]))

    print("Sitemap: Newshub")
    site = Sitemap([
        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
    ])
    posts = site.feed()
    print(posts[:5])
    print(site.story(posts[0]))
    print(site.story(posts[:-1]))
@@ -0,0 +1,99 @@
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

if __name__ == '__main__':
    import sys
    sys.path.insert(0,'.')

from datetime import datetime
from bs4 import BeautifulSoup

import settings
from utils import clean
from misc.time import unix
from misc.api import xml
from misc.news import Base


def _get_sitemap_date(a):
    if a.find('lastmod'):
        return a.find('lastmod').text
    if a.find('news:publication_date'):
        return a.find('news:publication_date').text
    if a.find('ns2:publication_date'):
        return a.find('ns2:publication_date').text
    return ''


def _filter_links(links, excludes=None):
    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)

    links = [x.find('loc').text for x in links] or []
    links = list(set(links))
    if excludes:
        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
    return links


def _get_sitemap(feed_url, excludes=None):
    markup = xml(lambda x: feed_url)
    if not markup: return []
    soup = BeautifulSoup(markup, features='lxml')
    links = []
    feed_urls = []
    if soup.find('sitemapindex'):
        sitemap = soup.find('sitemapindex').findAll('sitemap')
        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
    if soup.find('urlset'):
        sitemap = soup.find('urlset').findAll('url')
        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))

    feed_urls = _filter_links(feed_urls, excludes)
    links = _filter_links(links, excludes)

    for url in feed_urls:
        links += _get_sitemap(url, excludes)
    return list(set(links))


class Sitemap(Base):
    def __init__(self, config):
        self.config = config
        self.sitemap_url = config.get('url')
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        links = []
        if isinstance(self.sitemap_url, str):
            links += _get_sitemap(self.sitemap_url, excludes)
        elif isinstance(self.sitemap_url, list):
            for url in self.sitemap_url:
                links += _get_sitemap(url, excludes)
        links = list(set(links))
        return [(self.get_id(link), link) for link in links]


# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
    print("Sitemap: The Spinoff")
    # Sitemap now takes a config dict, and feed() returns (id, link) tuples
    site = Sitemap({'url': "https://thespinoff.co.nz/sitemap.xml"})
    excludes = [
        'thespinoff.co.nz/sitemap-misc.xml',
        'thespinoff.co.nz/sitemap-authors.xml',
        'thespinoff.co.nz/sitemap-tax-category.xml',
    ]
    posts = site.feed(excludes)
    print(posts[:5])
    print(site.story(*posts[0]))

    print("Sitemap: Newshub")
    site = Sitemap({'url': [
        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
    ]})
    posts = site.feed()
    print(posts[:5])
    print(site.story(*posts[0]))
    print(site.story(*posts[-1]))
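As an illustration of the input these helpers expect, here is a made-up minimal urlset; in a real run _get_sitemap fetches this markup via misc.api.xml, and a sitemapindex of nested sitemaps is followed recursively before the per-story URLs are filtered by MAX_STORY_AGE:

# Illustration only: the shape of sitemap XML the helpers above work with.
from bs4 import BeautifulSoup

sample = """
<urlset>
  <url>
    <loc>https://example.com/news/story-1</loc>
    <lastmod>2020-11-05T09:00:00+13:00</lastmod>
  </url>
</urlset>
"""
soup = BeautifulSoup(sample, features='lxml')
entries = soup.find('urlset').findAll('url')
print(_get_sitemap_date(entries[0]))           # '2020-11-05T09:00:00+13:00'
print([e.find('loc').text for e in entries])   # links kept only if newer than MAX_STORY_AGE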
@@ -0,0 +1,35 @@
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

import requests

USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
FORWARD_IP = '66.249.66.1'


def xml(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False


def json(route, ref=None):
    try:
        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
        r = requests.get(route(ref), headers=headers, timeout=5)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
    except KeyboardInterrupt:
        raise
    except BaseException as e:
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
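Both helpers take a route callable rather than a plain URL, so callers can close over a fixed address or build one from a reference, and a failed request logs the error and returns False instead of raising. A usage sketch (the example.com endpoints are placeholders, not real feeds):

markup = xml(lambda x: 'https://example.com/sitemap.xml')
data = json(lambda ref: 'https://example.com/api/item/{}.json'.format(ref), ref='12345')
if markup is False:
    print('fetch failed')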
@@ -0,0 +1,69 @@
def parse_extruct(s, data):
    rdfa_keys = {
        'title': [
            'http://ogp.me/ns#title',
            'https://ogp.me/ns#title',
        ],
        'date': [
            'http://ogp.me/ns/article#modified_time',
            'https://ogp.me/ns/article#modified_time',
            'http://ogp.me/ns/article#published_time',
            'https://ogp.me/ns/article#published_time',
        ]
    }
    for rdfa in data['rdfa']:
        for key, props in rdfa.items():
            for attribute, properties in rdfa_keys.items():
                for prop in properties:
                    if prop in props:
                        for values in props[prop]:
                            s[attribute] = values['@value']

    for og in data['opengraph']:
        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
        if len(modified):
            s['date'] = modified[0]
        if len(published):
            s['date'] = published[0]
        if len(titles):
            s['title'] = titles[0]

    for md in data['microdata']:
        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
            props = md['properties']
            s['title'] = props['headline']
            if props['dateModified']:
                s['date'] = props['dateModified']
            if props['datePublished']:
                s['date'] = props['datePublished']
            if 'author' in props and props['author']:
                if 'properties' in props['author']:
                    s['author'] = props['author']['properties']['name']
                elif isinstance(props['author'], list):
                    s['author'] = props['author'][0]['properties']['name']

    for ld in data['json-ld']:
        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
            s['title'] = ld['headline']
            if ld['dateModified']:
                s['date'] = ld['dateModified']
            if ld['datePublished']:
                s['date'] = ld['datePublished']
            if 'author' in ld and ld['author']:
                if 'name' in ld['author']:
                    s['author'] = ld['author']['name']
                elif isinstance(ld['author'], list):
                    s['author'] = ld['author'][0]['name']
        if '@graph' in ld:
            for gld in ld['@graph']:
                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
                    s['title'] = gld['headline']
                    if gld['dateModified']:
                        s['date'] = gld['dateModified']
                    if gld['datePublished']:
                        s['date'] = gld['datePublished']

    return s
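A sketch of how this is driven from misc.news (Base.story): extruct pulls the RDFa, OpenGraph, microdata and JSON-LD out of the fetched page, and parse_extruct folds whichever title, date and author it finds into the story dict. The inline JSON-LD below is a made-up example:

import extruct

markup = '''<html><head><script type="application/ld+json">
{"@type": "NewsArticle", "headline": "Example headline",
 "datePublished": "2020-11-05T09:00:00+13:00",
 "dateModified": "2020-11-05T10:00:00+13:00",
 "author": {"name": "Example Author"}}
</script></head><body></body></html>'''

data = extruct.extract(markup)
s = parse_extruct({'title': '', 'date': 0, 'author': ''}, data)
print(s['title'], s['date'], s['author'])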
@@ -0,0 +1,101 @@
import logging
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG)

import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct

import settings
from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml


def comment(i):
    if 'author' not in i:
        return False

    c = {}
    c['author'] = i.get('author', '')
    c['score'] = i.get('points', 0)
    c['date'] = unix(i.get('date', 0))
    c['text'] = clean(i.get('text', '') or '')
    c['comments'] = [comment(j) for j in i['children']]
    c['comments'] = list(filter(bool, c['comments']))
    return c


def comment_count(i):
    alive = 1 if i['author'] else 0
    return sum([comment_count(c) for c in i['comments']]) + alive


class Base:
    def __init__(self, config):
        self.config = config
        self.url = config.get('url')
        self.tz = config.get('tz')

    def get_id(self, link):
        patterns = self.config.get('patterns')
        if not patterns:
            return link
        patterns = [re.compile(p) for p in patterns]
        patterns = list(filter(None, [p.match(link) for p in patterns]))
        patterns = list(set([':'.join(p.groups()) for p in patterns]))
        if not patterns:
            return link
        return patterns[0]

    def feed(self, excludes=None):
        return []

    def story(self, ref, urlref):
        if urlref is None:
            return False
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0

        soup = BeautifulSoup(markup, features='html.parser')
        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
        favicon = soup.find_all('link', rel="shortcut icon", href=True)
        others = soup.find_all('link', rel="icon", href=True)
        icons = icon32 + icon16 + favicon + others
        base_url = '/'.join(urlref.split('/')[:3])
        icons = list(set([i.get('href') for i in icons]))
        icons = [i if i.startswith('http') else base_url + i for i in icons]

        if icons:
            s['icon'] = icons[0]

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(urlref)
                s['comments'] = list(filter(bool, s['comments']))
                s['num_comments'] = comment_count(s['comments'])
            except KeyboardInterrupt:
                raise
            except:
                pass

        if not s['date']:
            return False
        return s
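The get_id mechanism above is what makes feed() return stable (id, link) tuples: any 'patterns' regexes in a site's config are matched against each link and their captured groups, joined with ':', become the story id, so the same article reached via different URLs dedupes to one reference. A small sketch with a hypothetical pattern and link:

import re

link = 'https://www.example.co.nz/news/national/431234/some-headline'
patterns = [r'https://www\.example\.co\.nz/news/[a-z-]+/(\d+)/']
matches = list(filter(None, [re.compile(p).match(link) for p in patterns]))
print(':'.join(matches[0].groups()))   # '431234'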
@@ -0,0 +1,18 @@
import pytz
import dateutil.parser


TZINFOS = {
    'NZDT': pytz.timezone('Pacific/Auckland'),
    'NZST': pytz.timezone('Pacific/Auckland')
}


def unix(date_str, tz=None, tzinfos=TZINFOS):
    try:
        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
        if tz:
            dt = pytz.timezone(tz).localize(dt)
        return int(dt.timestamp())
    except:
        pass
    return 0
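Usage sketch: NZDT/NZST abbreviations resolve through TZINFOS, a naive timestamp can be localized with an explicit tz name, and anything unparseable falls back to 0 (the example dates are arbitrary):

print(unix('Thu, 05 Nov 2020 09:00:00 NZDT'))            # epoch seconds
print(unix('2020-11-05 09:00', tz='Pacific/Auckland'))   # localized, then converted
print(unix('not a date'))                                 # 0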