Switch to python-dateutil for date parsing; reverse-sort XML feeds.
This commit is contained in:
		| @@ -11,6 +11,7 @@ import requests | |||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from bs4 import BeautifulSoup | from bs4 import BeautifulSoup | ||||||
| from scrapers import declutter | from scrapers import declutter | ||||||
|  | import dateutil.parser | ||||||
| import extruct | import extruct | ||||||
| import pytz | import pytz | ||||||
|  |  | ||||||
| @@ -20,21 +21,8 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 | |||||||
| #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" | #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" | ||||||
|  |  | ||||||
| def unix(date_str, tz=None): | def unix(date_str, tz=None): | ||||||
|     date_tzfix = date_str |  | ||||||
|     if ":" == date_tzfix[-3]: |  | ||||||
|         date_tzfix = date_tzfix[:-3]+date_tzfix[-2:] |  | ||||||
|     formats = ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f%z', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M:%S.%f'] |  | ||||||
|     formats = formats + [f.replace("T%H", " %H") for f in formats] |  | ||||||
|     for f in formats: |  | ||||||
|     try: |     try: | ||||||
|             dt = datetime.strptime(date_str, f) |         dt = dateutil.parser.parse(date_str, f) | ||||||
|             if tz: |  | ||||||
|                 dt = pytz.timezone(tz).localize(dt) |  | ||||||
|             return int(dt.timestamp()) |  | ||||||
|         except: |  | ||||||
|             pass |  | ||||||
|         try: |  | ||||||
|             dt = datetime.strptime(date_tzfix, f) |  | ||||||
|         if tz: |         if tz: | ||||||
|             dt = pytz.timezone(tz).localize(dt) |             dt = pytz.timezone(tz).localize(dt) | ||||||
|         return int(dt.timestamp()) |         return int(dt.timestamp()) | ||||||
| @@ -180,8 +168,11 @@ class Sitemap(_Base): | |||||||
|         if not markup: return [] |         if not markup: return [] | ||||||
|         soup = BeautifulSoup(markup, features='lxml') |         soup = BeautifulSoup(markup, features='lxml') | ||||||
|         articles = soup.find('urlset').findAll('url') |         articles = soup.find('urlset').findAll('url') | ||||||
|         articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles])) |         news = list(filter(None, [a if a.find('news:news') else None for a in articles])) | ||||||
|         articles.sort(key=lambda a: unix(a.find('lastmod')), reverse=True) |         news = list(filter(None, [a if a.find('news:publication_date') else None for a in news])) | ||||||
|  |         articles = list(filter(None, [a if a.find('lastmod') else None for a in articles])) | ||||||
|  |         links = articles + news | ||||||
|  |         links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True) | ||||||
|         links = [x.find('loc').text for x in articles] or [] |         links = [x.find('loc').text for x in articles] or [] | ||||||
|         links = list(set(links)) |         links = list(set(links)) | ||||||
|         if excludes: |         if excludes: | ||||||
|   | |||||||
| @@ -30,3 +30,4 @@ websocket-client==0.57.0 | |||||||
| Werkzeug==1.0.1 | Werkzeug==1.0.1 | ||||||
| zope.event==4.4 | zope.event==4.4 | ||||||
| zope.interface==5.1.0 | zope.interface==5.1.0 | ||||||
|  | python-dateutil==2.8.1 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user