split categories, sitemap and other crap out of news.py

2020-11-16 15:30:33 +13:00
parent b80c1a5cb5
commit 6a91b9402f
8 changed files with 384 additions and 310 deletions
@@ -9,7 +9,9 @@ from bs4 import BeautifulSoup
 import itertools

 import settings
-from feeds import hackernews, reddit, tildes, substack, manual, news
+from feeds import hackernews, reddit, tildes, substack, manual
+from feeds.sitemap import Sitemap
+from feeds.category import Category
 from scrapers import outline, declutter, browser, local

 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -19,10 +21,10 @@ for key, value in settings.SUBSTACK.items():
    substacks[key] = substack.Publication(value['url'])
 categories = {}
 for key, value in settings.CATEGORY.items():
-    categories[key] = news.Category(value['url'], value.get('tz'))
+    categories[key] = Category(value['url'], value.get('tz'))
 sitemaps = {}
 for key, value in settings.SITEMAP.items():
-    sitemaps[key] = news.Sitemap(value['url'], value.get('tz'))
+    sitemaps[key] = Sitemap(value['url'], value.get('tz'))

 def get_list():
    feeds = {}
@@ -0,0 +1,90 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+import requests
+from bs4 import BeautifulSoup
+from scrapers import declutter
+import extruct
+
+import settings
+from utils import clean
+from misc.metadata import parse_extruct
+from misc.time import unix
+from misc.api import xml
+
+def comment(i):
+    if 'author' not in i:
+        return False
+
+    c = {}
+    c['author'] = i.get('author', '')
+    c['score'] = i.get('points', 0)
+    c['date'] = unix(i.get('date', 0))
+    c['text'] = clean(i.get('text', '') or '')
+    c['comments'] = [comment(j) for j in i['children']]
+    c['comments'] = list(filter(bool, c['comments']))
+    return c
+
+def comment_count(i):
+    alive = 1 if i['author'] else 0
+    return sum([comment_count(c) for c in i['comments']]) + alive
+
+class Base:
+    def __init__(url, tz=None):
+        self.url = url
+        self.tz = tz
+
+    def feed(self, excludes=None):
+        return []
+
+    def story(self, ref):
+        markup = xml(lambda x: ref)
+        if not markup:
+            return False
+
+        s = {}
+        s['author_link'] = ''
+        s['score'] = 0
+        s['comments'] = []
+        s['num_comments'] = 0
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = 0
+
+        soup = BeautifulSoup(markup, features='html.parser')
+        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+        favicon = soup.find_all('link', rel="shortcut icon", href=True)
+        others = soup.find_all('link', rel="icon", href=True)
+        icons = icon32 + icon16 + favicon + others
+        base_url = '/'.join(ref.split('/')[:3])
+        icons = list(set([i.get('href') for i in icons]))
+        icons = [i if i.startswith('http') else base_url + i for i in icons]
+
+        if icons:
+            s['icon'] = icons[0]
+
+        data = extruct.extract(markup)
+        s = parse_extruct(s, data)
+        if s['date']:
+            s['date'] = unix(s['date'], tz=self.tz)
+
+        if 'disqus' in markup:
+            try:
+                s['comments'] = declutter.get_comments(ref)
+                c['comments'] = list(filter(bool, c['comments']))
+                s['num_comments'] = comment_count(s['comments'])
+            except KeyboardInterrupt:
+                raise
+            except:
+                pass
+
+        if not s['date']:
+            return False
+        return s
@@ -0,0 +1,70 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.api import xml
+from _news import Base
+
+def _filter_links(links, category_url, excludes=None):
+    links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
+    links = list(filter(None, [link if link != category_url else None for link in links]))
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_category(category_url, excludes=None):
+    base_url = '/'.join(category_url.split('/')[:3])
+    markup = xml(lambda x: category_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='html.parser')
+    links = soup.find_all('a', href=True)
+    links = [link.get('href') for link in links]
+    links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
+    links = _filter_links(links, category_url, excludes)
+    return links
+
+class Category(Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.category_url = url
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.category_url, str):
+            links += _get_category(self.category_url, excludes)
+        elif isinstance(self.category_url, list):
+            for url in self.category_url:
+                links += _get_category(url, excludes)
+        return list(set(links))
+
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Category: RadioNZ")
+    site = Category("https://www.rnz.co.nz/news/")
+    excludes = [
+        'rnz.co.nz/news/sport',
+        'rnz.co.nz/weather',
+        'rnz.co.nz/news/weather',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Category: Newsroom")
+    site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+
@@ -1,307 +0,0 @@
-import logging
-logging.basicConfig(
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
-
-if __name__ == '__main__':
-    import sys
-    sys.path.insert(0,'.')
-
-import requests
-from datetime import datetime
-from bs4 import BeautifulSoup
-from scrapers import declutter
-import dateutil.parser
-import extruct
-import pytz
-
-from utils import clean
-import settings
-
-tzinfos = {
-    'NZDT': pytz.timezone('Pacific/Auckland'),
-    'NZST': pytz.timezone('Pacific/Auckland')
-}
-
-USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
-#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-def unix(date_str, tz=None):
-    try:
-        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
-        if tz:
-            dt = pytz.timezone(tz).localize(dt)
-        return int(dt.timestamp())
-    except:
-        pass
-    return 0
-
-
-def xml(route, ref=None):
-    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': '66.249.66.1'}
-        r = requests.get(route(ref), headers=headers, timeout=5)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.text
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem hitting URL: {}'.format(str(e)))
-        return False
-
-
-def parse_extruct(s, data):
-    rdfa_keys = {
-        'title': [
-            'http://ogp.me/ns#title',
-            'https://ogp.me/ns#title',
-        ],
-        'date': [
-            'http://ogp.me/ns/article#modified_time',
-            'https://ogp.me/ns/article#modified_time',
-            'http://ogp.me/ns/article#published_time',
-            'https://ogp.me/ns/article#published_time',
-        ]
-    }
-    for rdfa in data['rdfa']:
-        for key, props in rdfa.items():
-            for attribute, properties in rdfa_keys.items():
-                for prop in properties:
-                    if prop in props:
-                        for values in props[prop]:
-                            s[attribute] = values['@value']
-
-    for og in data['opengraph']:
-        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
-        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
-        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
-        if len(modified):
-            s['date'] = modified[0]
-        if len(published):
-            s['date'] = published[0]
-        if len(titles):
-            s['title'] = titles[0]
-
-    for md in data['microdata']:
-        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
-            props = md['properties']
-            s['title'] = props['headline']
-            if props['dateModified']:
-                s['date'] = props['dateModified']
-            if props['datePublished']:
-                s['date'] = props['datePublished']
-            if 'author' in props and props['author']:
-                if 'properties' in props['author']:
-                    s['author'] = props['author']['properties']['name']
-                elif isinstance(props['author'], list):
-                    s['author'] = props['author'][0]['properties']['name']
-
-    for ld in data['json-ld']:
-        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
-            s['title'] = ld['headline']
-            if ld['dateModified']:
-                s['date'] = ld['dateModified']
-            if ld['datePublished']:
-                s['date'] = ld['datePublished']
-            if 'author' in ld and ld['author']:
-                if 'name' in ld['author']:
-                    s['author'] = ld['author']['name']
-                elif isinstance(ld['author'], list):
-                    s['author'] = ld['author'][0]['name']
-        if '@graph' in ld:
-            for gld in ld['@graph']:
-                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
-                    s['title'] = gld['headline']
-                    if gld['dateModified']:
-                        s['date'] = gld['dateModified']
-                    if gld['datePublished']:
-                        s['date'] = gld['datePublished']
-
-    return s
-
-def comment(i):
-    if 'author' not in i:
-        return False
-
-    c = {}
-    c['author'] = i.get('author', '')
-    c['score'] = i.get('points', 0)
-    c['date'] = unix(i.get('date', 0))
-    c['text'] = clean(i.get('text', '') or '')
-    c['comments'] = [comment(j) for j in i['children']]
-    c['comments'] = list(filter(bool, c['comments']))
-    return c
-
-def comment_count(i):
-    alive = 1 if i['author'] else 0
-    return sum([comment_count(c) for c in i['comments']]) + alive
-
-class _Base:
-    def __init__(url, tz=None):
-        self.url = url
-        self.tz = tz
-
-    def feed(self, excludes=None):
-        return []
-
-    def story(self, ref):
-        markup = xml(lambda x: ref)
-        if not markup:
-            return False
-
-        s = {}
-        s['author_link'] = ''
-        s['score'] = 0
-        s['comments'] = []
-        s['num_comments'] = 0
-        s['link'] = ref
-        s['url'] = ref
-        s['date'] = 0
-
-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(ref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
-        if icons:
-            s['icon'] = icons[0]
-
-        data = extruct.extract(markup)
-        s = parse_extruct(s, data)
-        if s['date']:
-            s['date'] = unix(s['date'], tz=self.tz)
-
-        if 'disqus' in markup:
-            try:
-                s['comments'] = declutter.get_comments(ref)
-                c['comments'] = list(filter(bool, c['comments']))
-                s['num_comments'] = comment_count(s['comments'])
-            except KeyboardInterrupt:
-                raise
-            except:
-                pass
-
-        if not s['date']:
-            return False
-        return s
-
-def get_sitemap_date(a):
-    if a.find('lastmod'):
-        return a.find('lastmod').text
-    if a.find('news:publication_date'):
-        return a.find('news:publication_date').text
-    if a.find('ns2:publication_date'):
-        return a.find('ns2:publication_date').text
-    return ''
-
-class Sitemap(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.sitemap_url = url
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.sitemap_url, str):
-            links += self._get_sitemap(self.sitemap_url, excludes)
-        elif isinstance(self.sitemap_url, list):
-            for url in self.sitemap_url:
-                links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-    def _filter_links(self, links, excludes=None):
-        too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
-        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
-        links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links]))
-        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
-
-        links = [x.find('loc').text for x in links] or []
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_sitemap(self, feed_url, excludes=None):
-        markup = xml(lambda x: feed_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='lxml')
-        links = []
-        feed_urls = []
-        if soup.find('sitemapindex'):
-            sitemap = soup.find('sitemapindex').findAll('sitemap')
-            feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-        if soup.find('urlset'):
-            sitemap = soup.find('urlset').findAll('url')
-            links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
-
-        feed_urls = self._filter_links(feed_urls, excludes)
-        links = self._filter_links(links, excludes)
-
-        for url in feed_urls:
-            links += self._get_sitemap(url, excludes)
-        return list(set(links))
-
-class Category(_Base):
-    def __init__(self, url, tz=None):
-        self.tz = tz
-        self.category_url = url
-
-    def _filter_links(self, links, category_url, excludes=None):
-        links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
-        links = list(filter(None, [link if link != category_url else None for link in links]))
-        links = list(set(links))
-        if excludes:
-            links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
-        return links
-
-    def _get_category(self, category_url, excludes=None):
-        base_url = '/'.join(category_url.split('/')[:3])
-        markup = xml(lambda x: category_url)
-        if not markup: return []
-        soup = BeautifulSoup(markup, features='html.parser')
-        links = soup.find_all('a', href=True)
-        links = [link.get('href') for link in links]
-        links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
-        links = self._filter_links(links, category_url, excludes)
-        return links
-
-    def feed(self, excludes=None):
-        links = []
-        if isinstance(self.category_url, str):
-            links += self._get_category(self.category_url, excludes)
-        elif isinstance(self.category_url, list):
-            for url in self.category_url:
-                links += self._get_category(url, excludes)
-        return list(set(links))
-
-
-# scratchpad so I can quickly develop the parser
-if __name__ == '__main__':
-    print("Sitemap: The Spinoff")
-    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
-    excludes = [
-        'thespinoff.co.nz/sitemap-misc.xml',
-        'thespinoff.co.nz/sitemap-authors.xml',
-        'thespinoff.co.nz/sitemap-tax-category.xml',
-    ]
-    posts = site.feed(excludes)
-    print(posts[:5])
-    print(site.story(posts[0]))
-
-    print("Sitemap: Newshub")
-    site = Sitemap([
-        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
-        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
-    ])
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-    print(site.story(posts[:-1]))
-
@@ -0,0 +1,97 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
+
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import settings
+from utils import clean
+from misc.time import unix
+from misc.api import xml
+from _news import Base
+
+def _get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    if a.find('ns2:publication_date'):
+        return a.find('ns2:publication_date').text
+    return ''
+
+def _filter_links(links, excludes=None):
+    too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE
+    links = list(filter(None, [a if _get_sitemap_date(a) else None for a in links]))
+    links = list(filter(None, [a if unix(_get_sitemap_date(a)) > too_old else None for a in links]))
+    links.sort(key=lambda a: unix(_get_sitemap_date(a)), reverse=True)
+
+    links = [x.find('loc').text for x in links] or []
+    links = list(set(links))
+    if excludes:
+        links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
+    return links
+
+def _get_sitemap(feed_url, excludes=None):
+    markup = xml(lambda x: feed_url)
+    if not markup: return []
+    soup = BeautifulSoup(markup, features='lxml')
+    links = []
+    feed_urls = []
+    if soup.find('sitemapindex'):
+        sitemap = soup.find('sitemapindex').findAll('sitemap')
+        feed_urls = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+    if soup.find('urlset'):
+        sitemap = soup.find('urlset').findAll('url')
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+
+    feed_urls = _filter_links(feed_urls, excludes)
+    links = _filter_links(links, excludes)
+
+    for url in feed_urls:
+        links += _get_sitemap(url, excludes)
+    return list(set(links))
+
+class Sitemap(Base):
+    def __init__(self, url, tz=None):
+        self.tz = tz
+        self.sitemap_url = url
+
+    def feed(self, excludes=None):
+        links = []
+        if isinstance(self.sitemap_url, str):
+            links += _get_sitemap(self.sitemap_url, excludes)
+        elif isinstance(self.sitemap_url, list):
+            for url in self.sitemap_url:
+                links += _get_sitemap(url, excludes)
+        return list(set(links))
+
+# scratchpad so I can quickly develop the parser
+if __name__ == '__main__':
+    print("Sitemap: The Spinoff")
+    site = Sitemap("https://thespinoff.co.nz/sitemap.xml")
+    excludes = [
+        'thespinoff.co.nz/sitemap-misc.xml',
+        'thespinoff.co.nz/sitemap-authors.xml',
+        'thespinoff.co.nz/sitemap-tax-category.xml',
+    ]
+    posts = site.feed(excludes)
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Sitemap: Newshub")
+    site = Sitemap([
+        'https://www.newshub.co.nz/home/politics.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/new-zealand.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/world.gnewssitemap.xml',
+        'https://www.newshub.co.nz/home/money.gnewssitemap.xml',
+    ])
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+    print(site.story(posts[:-1]))
@@ -0,0 +1,35 @@
+import logging
+logging.basicConfig(
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        level=logging.DEBUG)
+
+import requests
+
+USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+FORWARD_IP = '66.249.66.1'
+
+def xml(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.text
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
+
+def json(route, ref=None):
+    try:
+        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        r = requests.get(route(ref), headers=headers, timeout=5)
+        if r.status_code != 200:
+            raise Exception('Bad response code ' + str(r.status_code))
+        return r.json()
+    except KeyboardInterrupt:
+        raise
+    except BaseException as e:
+        logging.error('Problem hitting URL: {}'.format(str(e)))
+        return False
@@ -0,0 +1,69 @@
+
+def parse_extruct(s, data):
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
+    for rdfa in data['rdfa']:
+        for key, props in rdfa.items():
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    if prop in props:
+                        for values in props[prop]:
+                            s[attribute] = values['@value']
+
+    for og in data['opengraph']:
+        titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+        modified = list(filter(None, [value if 'article:modified_time' in key else None for key, value in og['properties']]))
+        published = list(filter(None, [value if 'article:published_time' in key else None for key, value in og['properties']]))
+        if len(modified):
+            s['date'] = modified[0]
+        if len(published):
+            s['date'] = published[0]
+        if len(titles):
+            s['title'] = titles[0]
+
+    for md in data['microdata']:
+        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
+            props = md['properties']
+            s['title'] = props['headline']
+            if props['dateModified']:
+                s['date'] = props['dateModified']
+            if props['datePublished']:
+                s['date'] = props['datePublished']
+            if 'author' in props and props['author']:
+                if 'properties' in props['author']:
+                    s['author'] = props['author']['properties']['name']
+                elif isinstance(props['author'], list):
+                    s['author'] = props['author'][0]['properties']['name']
+
+    for ld in data['json-ld']:
+        if '@type' in ld and ld['@type'] in ['Article', 'NewsArticle']:
+            s['title'] = ld['headline']
+            if ld['dateModified']:
+                s['date'] = ld['dateModified']
+            if ld['datePublished']:
+                s['date'] = ld['datePublished']
+            if 'author' in ld and ld['author']:
+                if 'name' in ld['author']:
+                    s['author'] = ld['author']['name']
+                elif isinstance(ld['author'], list):
+                    s['author'] = ld['author'][0]['name']
+        if '@graph' in ld:
+            for gld in ld['@graph']:
+                if '@type' in gld and gld['@type'] in ['Article', 'NewsArticle']:
+                    s['title'] = gld['headline']
+                    if gld['dateModified']:
+                        s['date'] = gld['dateModified']
+                    if gld['datePublished']:
+                        s['date'] = gld['datePublished']
+
+    return s
@@ -0,0 +1,18 @@
+import pytz
+import dateutil.parser
+
+
+TZINFOS = {
+    'NZDT': pytz.timezone('Pacific/Auckland'),
+    'NZST': pytz.timezone('Pacific/Auckland')
+}
+
+def unix(date_str, tz=None, tzinfos=TZINFOS):
+    try:
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
+    return 0