use extruct for opengraph/json-ld/microdata of articles

Jason 2020-11-03 10:31:36 +00:00
parent 736cdc8576
commit b759f46582
4 changed files with 35 additions and 16 deletions
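For orientation: extruct pulls OpenGraph, schema.org microdata and JSON-LD metadata out of an article's raw HTML, which is what the sitemap scraper below switches to. A minimal sketch of the shapes it returns, with invented headline/author values purely for illustration (the nesting matches what the new story() code indexes into):

import extruct

# Illustrative article markup; the headline and author values are made up.
html = """
<html prefix="og: http://ogp.me/ns#">
<head>
  <meta property="og:title" content="Example headline" />
  <script type="application/ld+json">
    {"@type": "Article", "headline": "Example headline",
     "author": {"@type": "Person", "name": "Example Author"}}
  </script>
</head>
<body>
  <article itemscope itemtype="https://schema.org/NewsArticle">
    <h1 itemprop="headline">Example headline</h1>
  </article>
</body>
</html>
"""

data = extruct.extract(html, syntaxes=['opengraph', 'microdata', 'json-ld'])

# Roughly what comes back:
# data['opengraph'] -> [{'namespace': {...}, 'properties': [('og:title', 'Example headline')]}]
# data['microdata'] -> [{'type': 'https://schema.org/NewsArticle',
#                        'properties': {'headline': 'Example headline'}}]
# data['json-ld']   -> [{'@type': 'Article', 'headline': 'Example headline',
#                        'author': {'@type': 'Person', 'name': 'Example Author'}}]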


@@ -38,11 +38,11 @@ def list():
     feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
     for key, publication in substacks.items():
-        count = settings.SUBSTACK[key].count
+        count = settings.SUBSTACK[key]['count']
         feed += [(x, key) for x in publication.feed()[:count]]
     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key].count
+        count = settings.SITEMAP[key]['count']
         feed += [(x, key) for x in sites.feed()[:count]]
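The hunk above changes the per-publication lookups from attribute access to plain dict subscripting, so the SUBSTACK and SITEMAP entries in settings are evidently dicts carrying a 'count'. A hypothetical settings fragment consistent with that shape (the keys, counts and structure here are assumptions, not taken from the repo; the URLs are the ones used in the test blocks further down):

# Hypothetical settings.py fragment — only the dict-of-dicts shape with a
# 'count' per entry is implied by the diff; everything else is illustrative.
NUM_SUBSTACK = 10

SUBSTACK = {
    'webworm': {'url': 'https://www.webworm.co', 'count': 10},
}

SITEMAP = {
    'nzherald': {'url': 'https://www.nzherald.co.nz/arcio/news-sitemap/', 'count': 10},
}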


@@ -10,6 +10,7 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+import extruct
 from utils import clean
@@ -81,23 +82,40 @@ class Sitemap:
         if not r:
             return False
-        (data, method) = get_article_details(ref)
-        if not data:
-            return False
-        if 'outline' not in method:
-            return False
+        html = xml(lambda x: ref)
+        if not html:
+            return False
+        data = extruct.extract(html)
         s = {}
-        s['author'] = data['author']
         s['author_link'] = ''
-        s['date'] = unix(r.find('lastmod').text)
-        s['score'] = 0
-        s['title'] = data['title']
-        s['link'] = data['article_url']
-        s['url'] = data['article_url']
+        s['score'] = ''
         s['comments'] = []
         s['num_comments'] = 0
-        s['text'] = data['html']
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = unix(r.find('lastmod').text)
+        for og in data['opengraph']:
+            titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+            if len(titles):
+                s['title'] = titles[0]
+        for md in data['microdata']:
+            if md['type'] == 'https://schema.org/NewsArticle':
+                props = md['properties']
+                s['title'] = props['headline']
+                if props['author']:
+                    s['author'] = props['author']['properties']['name']
+        for ld in data['json-ld']:
+            if ld['@type'] == 'Article':
+                s['title'] = ld['headline']
+                if ld['author']:
+                    s['author'] = ld['author']['name']
         return s
@@ -107,4 +125,4 @@ if __name__ == '__main__':
     site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
     posts = site.feed()
     print(posts[:1])
     print(site.story(posts[0]))
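The rewritten story() above fills title and author purely from extruct output: og:title first, then a schema.org NewsArticle microdata item, then a JSON-LD Article, with the later sources overriding the earlier ones. A standalone sketch of the same precedence, fetching with requests instead of the repo's xml() helper (that helper is not part of this diff) and using .get() lookups so pages missing a syntax fall through quietly:

import requests
import extruct

def article_meta(url):
    # Fetch the article HTML directly; the committed code goes through
    # the project's own xml() helper instead.
    html = requests.get(url, timeout=10).text
    data = extruct.extract(html, syntaxes=['opengraph', 'microdata', 'json-ld'])

    meta = {'title': '', 'author': ''}

    # OpenGraph: 'properties' is a list of (key, value) pairs.
    for og in data['opengraph']:
        for key, value in og['properties']:
            if key == 'og:title' and value:
                meta['title'] = value

    # schema.org microdata: NewsArticle items carry headline/author.
    for md in data['microdata']:
        if md.get('type') == 'https://schema.org/NewsArticle':
            props = md.get('properties', {})
            meta['title'] = props.get('headline') or meta['title']
            author = props.get('author')
            if isinstance(author, dict):
                meta['author'] = author.get('properties', {}).get('name', '') or meta['author']

    # JSON-LD: Article objects expose headline/author directly.
    for ld in data['json-ld']:
        if ld.get('@type') == 'Article':
            meta['title'] = ld.get('headline') or meta['title']
            author = ld.get('author')
            if isinstance(author, dict):
                meta['author'] = author.get('name', '') or meta['author']

    return meta

Relative to the committed code, the only liberties taken are the defensive .get() calls and the isinstance() checks; error handling for unreachable pages is assumed to live elsewhere in the pipeline.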


@@ -157,4 +157,4 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
     print(posts[:1])
     print(webworm.story(posts[0]))


@@ -4,6 +4,7 @@ certifi==2020.6.20
 chardet==3.0.4
 click==7.1.2
 commonmark==0.9.1
+extruct==0.10.0
 Flask==1.1.2
 Flask-Cors==3.0.8
 gevent==20.6.2