From 408e2870b26e4d8bd9fe3d061b51347848a30d79 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger <jason@credisense.io>
Date: Tue, 10 Nov 2020 16:51:27 +1300
Subject: [PATCH] tzinfo and microdata schema urls.

---
 apiserver/feeds/news.py | 53 +++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py
index 9bf1a84..f29da14 100644
--- a/apiserver/feeds/news.py
+++ b/apiserver/feeds/news.py
@@ -17,12 +17,17 @@ import pytz
 
 from utils import clean
 
+tzinfos = {  # tz abbreviation -> fixed UTC offset in seconds (dateutil accepts ints)
+    'NZDT': 13 * 60 * 60,  # not a bare pytz zone: replace(tzinfo=...) would use its LMT offset
+    'NZST': 12 * 60 * 60
+}
+
 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'
 #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
 
 def unix(date_str, tz=None):
     try:
-        dt = dateutil.parser.parse(date_str)
+        dt = dateutil.parser.parse(date_str, tzinfos=tzinfos)
         if tz:
             dt = pytz.timezone(tz).localize(dt)
         return int(dt.timestamp())
@@ -46,17 +51,25 @@ def xml(route, ref=None):
 
 
 def parse_extruct(s, data):
+    rdfa_keys = {
+        'title': [
+            'http://ogp.me/ns#title',
+            'https://ogp.me/ns#title',
+        ],
+        'date': [
+            'http://ogp.me/ns/article#modified_time',
+            'https://ogp.me/ns/article#modified_time',
+            'http://ogp.me/ns/article#published_time',
+            'https://ogp.me/ns/article#published_time',
+        ]
+    }
     for rdfa in data['rdfa']:
-            for key, props in rdfa.items():
-                if 'http://ogp.me/ns#title' in props:
-                    for values in props['http://ogp.me/ns#title']:
-                        s['title'] = values['@value']
-                if 'http://ogp.me/ns/article#modified_time' in props:
-                    for values in props['http://ogp.me/ns/article#modified_time']:
-                        s['date'] = values['@value']
-                if 'http://ogp.me/ns/article#published_time' in props:
-                    for values in props['http://ogp.me/ns/article#published_time']:
-                        s['date'] = values['@value']
+        for key, props in rdfa.items():
+            for attribute, properties in rdfa_keys.items():
+                for prop in properties:
+                    if prop in props:
+                        for values in props[prop]:
+                            s[attribute] = values['@value']
 
     for og in data['opengraph']:
         titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
@@ -70,7 +83,7 @@ def parse_extruct(s, data):
             s['title'] = titles[0]
 
     for md in data['microdata']:
-        if md['type'] == 'https://schema.org/NewsArticle':
+        if md['type'] in ['https://schema.org/NewsArticle', 'http://schema.org/NewsArticle']:
             props = md['properties']
             s['title'] = props['headline']
             if props['dateModified']:
@@ -224,20 +237,20 @@ class Category(_Base):
 
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
-    posts = site.feed()
-    print(posts[:5])
-    print(site.story(posts[0]))
-
     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
     print(posts[:5])
     print(site.story(posts[0]))
 
-    print("Sitemap: Newsroom")
-    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml")
+    print("Sitemap: tvnz")
+    site = Sitemap("https://www.tvnz.co.nz/system/tvnz/sitemap.xml")
+    posts = site.feed()
+    print(posts[:5])
+    print(site.story(posts[0]))
+
+    print("Sitemap: Newsroom")
+    site = Sitemap("https://www.newsroom.co.nz/sitemap.xml", tz='Pacific/Auckland')
     posts = site.feed()
     print(posts[:5])
     print(site.story(posts[0]))