forked from tanner/qotnews
		
	Use extruct to extract OpenGraph/JSON-LD/microdata metadata from articles
This commit is contained in:
		| @@ -38,11 +38,11 @@ def list(): | ||||
|         feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]] | ||||
|  | ||||
|     for key, publication in substacks.items(): | ||||
|         count = settings.SUBSTACK[key].count | ||||
|         count = settings.SUBSTACK[key]['count'] | ||||
|         feed += [(x, key) for x in publication.feed()[:count]] | ||||
|  | ||||
|     for key, sites in sitemaps.items(): | ||||
|         count = settings.SITEMAP[key].count | ||||
|         count = settings.SITEMAP[key]['count'] | ||||
|         feed += [(x, key) for x in sites.feed()[:count]] | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -10,6 +10,7 @@ if __name__ == '__main__': | ||||
| import requests | ||||
| from datetime import datetime | ||||
| from bs4 import BeautifulSoup | ||||
| import extruct | ||||
|  | ||||
| from utils import clean | ||||
|  | ||||
| @@ -81,23 +82,40 @@ class Sitemap: | ||||
|         if not r: | ||||
|             return False | ||||
|  | ||||
|         (data, method) = get_article_details(ref) | ||||
|         if not data: | ||||
|             return False | ||||
|         if 'outline' not in method: | ||||
|         html = xml(lambda x: ref) | ||||
|  | ||||
|         if not html: | ||||
|             return False | ||||
|  | ||||
|         data = extruct.extract(html) | ||||
|  | ||||
|         s = {} | ||||
|         s['author'] = data['author'] | ||||
|         s['author_link'] = '' | ||||
|         s['date'] = unix(r.find('lastmod').text) | ||||
|         s['score'] = 0 | ||||
|         s['title'] = data['title'] | ||||
|         s['link'] = data['article_url'] | ||||
|         s['url'] = data['article_url'] | ||||
|         s['score'] = '' | ||||
|         s['comments'] = [] | ||||
|         s['num_comments'] = 0 | ||||
|         s['text'] = data['html'] | ||||
|          | ||||
|         s['link'] = ref | ||||
|         s['url'] = ref | ||||
|         s['date'] = unix(r.find('lastmod').text) | ||||
|  | ||||
|         for og in data['opengraph']: | ||||
|            titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']])) | ||||
|            if len(titles): | ||||
|                s['title'] = titles[0] | ||||
|  | ||||
|  | ||||
|         for md in data['microdata']: | ||||
|             if md['type'] == 'https://schema.org/NewsArticle': | ||||
|                 props = md['properties'] | ||||
|                 s['title'] = props['headline'] | ||||
|                 if props['author']: | ||||
|                     s['author'] = props['author']['properties']['name'] | ||||
|  | ||||
|         for ld in data['json-ld']: | ||||
|             if ld['@type'] == 'Article': | ||||
|                 s['title'] = ld['headline'] | ||||
|                 if ld['author']: | ||||
|                     s['author'] = ld['author']['name'] | ||||
|         return s | ||||
|  | ||||
|  | ||||
| @@ -107,4 +125,4 @@ if __name__ == '__main__': | ||||
|     site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/") | ||||
|     posts = site.feed() | ||||
|     print(posts[:1]) | ||||
|     print(site.story(posts[0])) | ||||
|     print(site.story(posts[0])) | ||||
|   | ||||
| @@ -157,4 +157,4 @@ if __name__ == '__main__': | ||||
|     webworm = Publication("https://www.webworm.co/") | ||||
|     posts = webworm.feed() | ||||
|     print(posts[:1]) | ||||
|     print(webworm.story(posts[0])) | ||||
|     print(webworm.story(posts[0])) | ||||
|   | ||||
| @@ -4,6 +4,7 @@ certifi==2020.6.20 | ||||
| chardet==3.0.4 | ||||
| click==7.1.2 | ||||
| commonmark==0.9.1 | ||||
| extruct==0.10.0 | ||||
| Flask==1.1.2 | ||||
| Flask-Cors==3.0.8 | ||||
| gevent==20.6.2 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user