forked from tanner/qotnews
		
	switch to python-dateutil for parser, reverse sort xml feeds.
This commit is contained in:
		| @@ -11,6 +11,7 @@ import requests | ||||
| from datetime import datetime | ||||
| from bs4 import BeautifulSoup | ||||
| from scrapers import declutter | ||||
| import dateutil.parser | ||||
| import extruct | ||||
| import pytz | ||||
|  | ||||
| @@ -20,26 +21,13 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 | ||||
| #USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" | ||||
|  | ||||
def unix(date_str, tz=None):
    """Convert a date/time string to an integer Unix timestamp.

    The string is parsed with ``dateutil.parser.parse``, replacing the
    previous hand-maintained list of ``strptime`` formats.

    Args:
        date_str: Date/time text to parse.
        tz: Optional timezone name understood by ``pytz.timezone``. It is
            applied only when the parsed value carries no UTC offset of its
            own; a string with an explicit offset keeps that offset.

    Returns:
        Seconds since the epoch as an ``int``, or ``0`` when the input cannot
        be parsed or converted (best-effort: this function never raises for
        bad input).
    """
    try:
        # NOTE: no second positional argument here -- it is ``parserinfo``,
        # not a strptime-style format string.
        dt = dateutil.parser.parse(date_str)
    except (TypeError, ValueError, OverflowError):
        # TypeError: input is not a string/stream; ValueError (including
        # dateutil's ParserError): unrecognised text; OverflowError: date
        # is out of the platform's representable range.
        return 0

    if tz and dt.tzinfo is None:
        # pytz's localize() rejects datetimes that already have tzinfo, so
        # only attach the fallback zone to naive values.
        try:
            dt = pytz.timezone(tz).localize(dt)
        except pytz.UnknownTimeZoneError:
            return 0

    try:
        return int(dt.timestamp())
    except (OverflowError, OSError, ValueError):
        # Extreme dates can fail the platform time conversion.
        return 0
|  | ||||
|  | ||||
| @@ -180,8 +168,11 @@ class Sitemap(_Base): | ||||
|         if not markup: return [] | ||||
|         soup = BeautifulSoup(markup, features='lxml') | ||||
|         articles = soup.find('urlset').findAll('url') | ||||
|         articles = list(filter(None, [a if a.find('lastmod') is not None else None for a in articles])) | ||||
|         articles.sort(key=lambda a: unix(a.find('lastmod')), reverse=True) | ||||
|         news = list(filter(None, [a if a.find('news:news') else None for a in articles])) | ||||
|         news = list(filter(None, [a if a.find('news:publication_date') else None for a in news])) | ||||
|         articles = list(filter(None, [a if a.find('lastmod') else None for a in articles])) | ||||
|         links = articles + news | ||||
|         links.sort(key=lambda a: unix(a.find('lastmod')), reverse=True) | ||||
|         links = [x.find('loc').text for x in articles] or [] | ||||
|         links = list(set(links)) | ||||
|         if excludes: | ||||
|   | ||||
| @@ -30,3 +30,4 @@ websocket-client==0.57.0 | ||||
| Werkzeug==1.0.1 | ||||
| zope.event==4.4 | ||||
| zope.interface==5.1.0 | ||||
| python-dateutil==2.8.1 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user