From 587b10c438877cbf944dfc3a12d1f8bbed1e0145 Mon Sep 17 00:00:00 2001 From: Jason Schwarzenberger Date: Thu, 12 Nov 2020 14:51:53 +1300 Subject: [PATCH] recursive sitemaps (sitemap indexes) --- apiserver/feeds/news.py | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/apiserver/feeds/news.py b/apiserver/feeds/news.py index f29da14..7591cf5 100644 --- a/apiserver/feeds/news.py +++ b/apiserver/feeds/news.py @@ -16,6 +16,7 @@ import extruct import pytz from utils import clean +import settings tzinfos = { 'NZDT': pytz.timezone('Pacific/Auckland'), @@ -198,20 +199,35 @@ class Sitemap(_Base): self.tz = tz self.sitemap_url = url - def feed(self, excludes=None): - markup = xml(lambda x: self.sitemap_url) + def _feed(self, feed_url, excludes=None): + too_old = datetime.now().timestamp() - settings.MAX_STORY_AGE + markup = xml(lambda x: feed_url) if not markup: return [] soup = BeautifulSoup(markup, features='lxml') - sitemap = soup.find('urlset').findAll('url') + if soup.find('sitemapindex'): + sitemap = soup.find('sitemapindex').findAll('sitemap') + else: + sitemap = soup.find('urlset').findAll('url') links = list(filter(None, [a if a.find('loc') else None for a in sitemap])) links = list(filter(None, [a if get_sitemap_date(a) else None for a in links])) + links = list(filter(None, [a if unix(get_sitemap_date(a)) > too_old else None for a in links])) links.sort(key=lambda a: unix(get_sitemap_date(a)), reverse=True) + links = [x.find('loc').text for x in links] or [] links = list(set(links)) if excludes: links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links])) - return links + + feed_urls = list(filter(None, [l if l.endswith(".xml") else None for l in links])) + urls = list(set(links) - set(feed_urls)) + + for url in feed_urls: + urls += self._feed(url, excludes) + return urls + + def feed(self, excludes=None): + return self._feed(self.sitemap_url, excludes) class Category(_Base): @@ -237,20 +253,8 @@ class Category(_Base): # scratchpad so I can quickly develop the parser if __name__ == '__main__': - print("Category: RadioNZ Te Ao Māori") - site = Category("https://www.rnz.co.nz/news/te-manu-korihi/") - posts = site.feed() - print(posts[:5]) - print(site.story(posts[0])) - - print("Sitemap: tvnz") - site = Sitemap("https://www.tvnz.co.nz/system/tvnz/sitemap.xml") - posts = site.feed() - print(posts[:5]) - print(site.story(posts[0])) - - print("Sitemap: Newsroom") - site = Sitemap("https://www.newsroom.co.nz/sitemap.xml", tz='Pacific/Auckland') + print("Sitemap: The Spinoff") + site = Sitemap("https://thespinoff.co.nz/sitemap.xml") posts = site.feed() print(posts[:5]) print(site.story(posts[0]))