Fix sitemap: handle a missing 'author' key in microdata and JSON-LD metadata

This commit is contained in:
Jason 2020-11-03 10:53:40 +00:00
parent b759f46582
commit abf8589e02

View File

@@ -108,20 +108,20 @@ class Sitemap:
if md['type'] == 'https://schema.org/NewsArticle': if md['type'] == 'https://schema.org/NewsArticle':
props = md['properties'] props = md['properties']
s['title'] = props['headline'] s['title'] = props['headline']
if props['author']: if 'author' in props and props['author']:
s['author'] = props['author']['properties']['name'] s['author'] = props['author']['properties']['name']
for ld in data['json-ld']: for ld in data['json-ld']:
if ld['@type'] == 'Article': if ld['@type'] == 'Article':
s['title'] = ld['headline'] s['title'] = ld['headline']
if ld['author']: if 'author' in ld and ld['author']:
s['author'] = ld['author']['name'] s['author'] = ld['author']['name']
return s return s
# scratchpad so I can quickly develop the parser # scratchpad so I can quickly develop the parser
if __name__ == '__main__': if __name__ == '__main__':
# site = Sitemap("https://www.stuff.co.nz/sitemap.xml") #site = Sitemap("https://www.stuff.co.nz/sitemap.xml")
site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/") site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
posts = site.feed() posts = site.feed()
print(posts[:1]) print(posts[:1])