Switch to python-dateutil for date parsing; reverse-sort XML feeds (newest first).

This commit is contained in:
Jason Schwarzenberger 2020-11-06 10:02:39 +13:00
parent 1c4764e67d
commit 2dbc702b40
2 changed files with 14 additions and 22 deletions

View File

@ -11,6 +11,7 @@ import requests
from datetime import datetime from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from scrapers import declutter from scrapers import declutter
import dateutil.parser
import extruct import extruct
import pytz import pytz
@ -20,21 +21,8 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101
#USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
def unix(date_str, tz=None): def unix(date_str, tz=None):
date_tzfix = date_str
if ":" == date_tzfix[-3]:
date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f']
formats = formats + [f.replace("T%H", " %H") for f in formats]
for f in formats:
try: try:
dt = datetime.strptime(date_str, f) dt = dateutil.parser.parse(date_str, f)
if tz:
dt = pytz.timezone(tz).localize(dt)
return int(dt.timestamp())
except:
pass
try:
dt = datetime.strptime(date_tzfix, f)
if tz: if tz:
dt = pytz.timezone(tz).localize(dt) dt = pytz.timezone(tz).localize(dt)
return int(dt.timestamp()) return int(dt.timestamp())
@ -180,8 +168,11 @@ class Sitemap(_Base):
if not markup: return [] if not markup: return []
soup = BeautifulSoup(markup, features='lxml') soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url') articles = soup.find('urlset').findAll('url')
articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles])) news = list(filter(None, [a if a.find('news:news') else None for a in articles]))
articles.sort(key=lambda a: unix(a.find('lastmod')), reverse=True) news = list(filter(None, [a if a.find('news:publication_date') else None for a in news]))
articles = list(filter(None, [a if a.find('lastmod') else None for a in articles]))
links = articles + news
links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
links = [x.find('loc').text for x in articles] or [] links = [x.find('loc').text for x in articles] or []
links = list(set(links)) links = list(set(links))
if excludes: if excludes:

View File

@ -30,3 +30,4 @@ websocket-client==0.57.0
Werkzeug==1.0.1 Werkzeug==1.0.1
zope.event==4.4 zope.event==4.4
zope.interface==5.1.0 zope.interface==5.1.0
python-dateutil==2.8.1