switch to python-dateutil for parser, reverse sort xml feeds.
parent 1c4764e67d
commit 2dbc702b40
@@ -11,6 +11,7 @@ import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
 from scrapers import declutter
+import dateutil.parser
 import extruct
 import pytz
 
@@ -20,21 +21,8 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101
 #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
 
 def unix(date_str, tz=None):
-    date_tzfix = date_str
-    if ":" == date_tzfix[-3]:
-        date_tzfix = date_tzfix[:-3]+date_tzfix[-2:]
-    formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f']
-    formats = formats + [f.replace("T%H", " %H") for f in formats]
-    for f in formats:
-        try:
-            dt = datetime.strptime(date_str, f)
-            if tz:
-                dt = pytz.timezone(tz).localize(dt)
-            return int(dt.timestamp())
-        except:
-            pass
-        try:
-            dt = datetime.strptime(date_tzfix, f)
-            if tz:
-                dt = pytz.timezone(tz).localize(dt)
-            return int(dt.timestamp())
+    try:
+        dt = dateutil.parser.parse(date_str)
+        if tz:
+            dt = pytz.timezone(tz).localize(dt)
+        return int(dt.timestamp())
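A quick, self-contained sketch of what the dateutil-based unix() buys over the hand-maintained strptime format list: dateutil.parser.parse() accepts the ISO-8601 variants the old formats list enumerated, with or without a timezone suffix. The sample date strings below are made up for illustration, and the original's try/except handling is omitted.

# Sketch only; sample inputs are illustrative, not from the repo.
import dateutil.parser
import pytz

def unix(date_str, tz=None):
    dt = dateutil.parser.parse(date_str)
    if tz:
        dt = pytz.timezone(tz).localize(dt)
    return int(dt.timestamp())

print(unix("2020-06-01T12:30:00Z"))                        # 'Z' suffix handled directly
print(unix("2020-06-01T12:30:00+10:00"))                   # offset with a colon (old code had to strip it)
print(unix("2020-06-01 12:30:00.123456"))                  # space separator, microseconds
print(unix("2020-06-01T12:30:00", tz="Australia/Sydney"))  # naive input localized explicitly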
@@ -180,8 +168,11 @@ class Sitemap(_Base):
         if not markup: return []
         soup = BeautifulSoup(markup, features='lxml')
         articles = soup.find('urlset').findAll('url')
-        articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles]))
-        articles.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
+        news = list(filter(None, [a if a.find('news:news') else None for a in articles]))
+        news = list(filter(None, [a if a.find('news:publication_date') else None for a in news]))
+        articles = list(filter(None, [a if a.find('lastmod') else None for a in articles]))
+        links = articles + news
+        links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True)
         links = [x.find('loc').text for x in articles] or []
         links = list(set(links))
         if excludes:
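A rough, self-contained sketch of the reworked sitemap feed logic: entries carrying a lastmod stamp are kept and the list is sorted newest-first (the "reverse sort xml feeds" in the commit message); the news:news handling is left out for brevity. The sample XML and the .text access in the sort key are illustrative assumptions, not code from the repo.

# Sketch only; the sitemap below is a made-up example.
from bs4 import BeautifulSoup
import dateutil.parser

markup = """<urlset>
  <url><loc>https://example.com/a</loc><lastmod>2020-06-01T10:00:00Z</lastmod></url>
  <url><loc>https://example.com/b</loc><lastmod>2020-06-02T10:00:00Z</lastmod></url>
  <url><loc>https://example.com/c</loc></url>
</urlset>"""

def unix(date_str):
    return int(dateutil.parser.parse(date_str).timestamp())

soup = BeautifulSoup(markup, features='lxml')
articles = soup.find('urlset').findAll('url')
articles = [a for a in articles if a.find('lastmod')]                    # keep dated entries only
articles.sort(key=lambda a: unix(a.find('lastmod').text), reverse=True)  # newest first
print([a.find('loc').text for a in articles])
# ['https://example.com/b', 'https://example.com/a']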
@@ -30,3 +30,4 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 zope.event==4.4
 zope.interface==5.1.0
+python-dateutil==2.8.1