From 1658346aa91272839bd51c03afdf98802de57541 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Fri, 6 Nov 2020 10:37:43 +1300
Subject: [PATCH] fix news.py feed.

---
 apiserver/feeds/news.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py
index fd57c80..42bcb26 100644
--- a/apiserver/feeds/news.py
+++ b/apiserver/feeds/news.py
@@ -1,7 +1,7 @@
 import logging
 logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        level=logging.DEBUG)
+        level=logging.ERROR)
 
 if __name__ == '__main__':
     import sys
@@ -22,7 +22,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101
 
 def unix(date_str, tz=None):
     try:
-        dt = dateutil.parser.parse(date_str, f)
+        dt = dateutil.parser.parse(date_str)
         if tz:
             dt = pytz.timezone(tz).localize(dt)
         return int(dt.timestamp())
@@ -158,6 +158,13 @@ class _Base:
             return False
         return s
 
+def get_sitemap_date(a):
+    if a.find('lastmod'):
+        return a.find('lastmod').text
+    if a.find('news:publication_date'):
+        return a.find('news:publication_date').text
+    return ''
+
 class Sitemap(_Base):
     def __init__(self, url, tz=None):
         self.tz = tz
@@ -167,13 +174,12 @@ class Sitemap(_Base):
         markup = xml(lambda x: self.sitemap_url)
         if not markup: return []
         soup = BeautifulSoup(markup, features='lxml')
-        articles = soup.find('urlset').findAll('url')
-        news = list(filter(None, [a if a.find('news:news') else None for a in articles]))
-        news = list(filter(None, [a if a.find('news:publication_date') else None for a in news]))
-        articles = list(filter(None, [a if a.find('lastmod') else None for a in articles]))
-        links = articles + news
-        links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
-        links = [x.find('loc').text for x in articles] or []
+        sitemap = soup.find('urlset').findAll('url')
+
+        links = list(filter(None, [a if a.find('loc') else None for a in sitemap]))
+        links = list(filter(None, [a if get_sitemap_date(a) else None for a in links]))
+        links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
+        links = [x.find('loc').text for x in links] or []
         links = list(set(links))
         if excludes:
             links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
@@ -204,15 +210,15 @@ class Category(_Base):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
     print("Sitemap: Stuff")
-    site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
+    site = Sitemap("https://www.stuff.co.nz/sitemap/news/sitemap.xml")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))
 
     print("Category: RadioNZ Te Ao Māori")
     site = Category("https://www.rnz.co.nz/news/te-manu-korihi/")
     posts = site.feed()
-    print(posts[:1])
+    print(posts[:5])
     print(site.story(posts[0]))
 
     print("Sitemap: Newsroom")
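
Note (not part of the patch): a minimal, self-contained sketch for sanity-checking the new
get_sitemap_date() fallback and the date-based sorting in Sitemap.feed(). The sitemap fragment,
URLs and dates below are invented, and unix() here is a simplified stand-in for the module's
version (no tz handling, no network fetch, no exclude filtering):

    from bs4 import BeautifulSoup
    import dateutil.parser

    SITEMAP_XML = """
    <urlset>
      <url>
        <loc>https://example.com/story-one</loc>
        <lastmod>2020-11-05T09:00:00+13:00</lastmod>
      </url>
      <url>
        <loc>https://example.com/story-two</loc>
        <news:news>
          <news:publication_date>2020-11-06T08:30:00+13:00</news:publication_date>
        </news:news>
      </url>
    </urlset>
    """

    def get_sitemap_date(a):
        # prefer <lastmod>, fall back to <news:publication_date>, as in the patch
        if a.find('lastmod'):
            return a.find('lastmod').text
        if a.find('news:publication_date'):
            return a.find('news:publication_date').text
        return ''

    def unix(date_str):
        # simplified news.unix(): parse to a unix timestamp, 0 on failure
        try:
            return int(dateutil.parser.parse(date_str).timestamp())
        except Exception:
            return 0

    soup = BeautifulSoup(SITEMAP_XML, features='lxml')
    urls = soup.find('urlset').findAll('url')
    # keep entries that have both a <loc> and a usable date, then sort newest first
    urls = [a for a in urls if a.find('loc') and get_sitemap_date(a)]
    urls.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True)
    print([a.find('loc').text for a in urls])
    # expected: ['https://example.com/story-two', 'https://example.com/story-one']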