From 2dbc702b407aa6ad7f7c59e9da551c67faa85dce Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Fri, 6 Nov 2020 10:02:39 +1300
Subject: [PATCH] switch to python-dateutil for parser, reverse sort xml feeds.

---
Review notes (this section is ignored when the patch is applied):
- unix(): the call to dateutil.parser.parse() no longer passes `f`. That name
  was the loop variable of the strptime loop this patch removes, so the call
  raised NameError, the bare `except:` swallowed it, and unix() returned 0 for
  every input.
- Sitemap: the list sorted by `links.sort(...)` is replaced on the next
  (context) line by a list built from `articles`, and then passed through
  `set()`, so the reverse sort does not affect the returned links. Left
  unchanged here; fixing it requires touching the context lines.

 apiserver/feeds/news.py    | 35 +++++++++++++----------------------
 apiserver/requirements.txt |  1 +
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py
index b788530..fd57c80 100644
--- a/apiserver/feeds/news.py
+++ b/apiserver/feeds/news.py
@@ -11,6 +11,7 @@ import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
 from scrapers import declutter
+import dateutil.parser
 import extruct
 import pytz
 
@@ -20,26 +21,13 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101
 #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
 
 def unix(date_str, tz=None):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f']
-    formats = formats + [f.replace("T%H", " %H") for f in formats]
-    for f in formats:
-        try:
-            dt = datetime.strptime(date_str, f)
-            if tz:
-                dt = pytz.timezone(tz).localize(dt)
-            return int(dt.timestamp())
-        except:
-            pass
-        try:
-            dt = datetime.strptime(date_tzfix, f)
-            if tz:
-                dt = pytz.timezone(tz).localize(dt)
-            return int(dt.timestamp())
-        except:
-            pass
+    try:
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
+    except:
+        pass
     return 0
 
 
@@ -180,8 +168,11 @@ class Sitemap(_Base):
         if not markup: return []
         soup = BeautifulSoup(markup, features='lxml')
         articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        articles.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
+        news = list(filter(None, [a if a.find('news:news') else None for a in articles]))
+        news = list(filter(None, [a if a.find('news:publication_date') else None for a in news]))
+        articles = list(filter(None, [a if a.find('lastmod') else None for a in articles]))
+        links = articles + news
+        links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
         links = [x.find('loc').text for x in articles] or []
         links = list(set(links))
         if excludes:
diff --git a/apiserver/requirements.txt b/apiserver/requirements.txt
index 1490660..d809c87 100644
--- a/apiserver/requirements.txt
+++ b/apiserver/requirements.txt
@@ -30,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1