Use extruct to extract OpenGraph/JSON-LD/microdata metadata for sitemap articles

2020-11-03 10:31:36 +00:00
parent 736cdc8576
commit b759f46582
4 changed files with 35 additions and 16 deletions
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -38,11 +38,11 @@ def list():
        feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]

    for key, publication in substacks.items():
-        count = settings.SUBSTACK[key].count
+        count = settings.SUBSTACK[key]['count']
        feed += [(x, key) for x in publication.feed()[:count]]

    for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key].count
+        count = settings.SITEMAP[key]['count']
        feed += [(x, key) for x in sites.feed()[:count]]


--- a/apiserver/feeds/sitemap.py
+++ b/apiserver/feeds/sitemap.py
@@ -10,6 +10,7 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+import extruct

 from utils import clean

@@ -81,23 +82,40 @@ class Sitemap:
        if not r:
            return False

-        (data, method) = get_article_details(ref)
-        if not data:
-            return False
-        if 'outline' not in method:
+        html = xml(lambda x: ref)
+
+        if not html:
            return False
+
+        data = extruct.extract(html)
+
        s = {}
-        s['author'] = data['author']
        s['author_link'] = ''
-        s['date'] = unix(r.find('lastmod').text)
-        s['score'] = 0
-        s['title'] = data['title']
-        s['link'] = data['article_url']
-        s['url'] = data['article_url']
+        s['score'] = ''
        s['comments'] = []
        s['num_comments'] = 0
-        s['text'] = data['html']
-        
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = unix(r.find('lastmod').text)
+
+        for og in data['opengraph']:
+           titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+           if len(titles):
+               s['title'] = titles[0]
+
+
+        for md in data['microdata']:
+            if md['type'] == 'https://schema.org/NewsArticle':
+                props = md['properties']
+                s['title'] = props['headline']
+                if props['author']:
+                    s['author'] = props['author']['properties']['name']
+
+        for ld in data['json-ld']:
+            if ld['@type'] == 'Article':
+                s['title'] = ld['headline']
+                if ld['author']:
+                    s['author'] = ld['author']['name']
        return s


@@ -107,4 +125,4 @@ if __name__ == '__main__':
    site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
    posts = site.feed()
    print(posts[:1])
-    print(site.story(posts[0]))
+    print(site.story(posts[0]))
--- a/apiserver/feeds/substack.py
+++ b/apiserver/feeds/substack.py
@@ -157,4 +157,4 @@ if __name__ == '__main__':
    webworm = Publication("https://www.webworm.co/")
    posts = webworm.feed()
    print(posts[:1])
-    print(webworm.story(posts[0]))
+    print(webworm.story(posts[0]))
--- a/apiserver/requirements.txt
+++ b/apiserver/requirements.txt
@@ -4,6 +4,7 @@ certifi==2020.6.20
 chardet==3.0.4
 click==7.1.2
 commonmark==0.9.1
+extruct==0.10.0
 Flask==1.1.2
 Flask-Cors==3.0.8
 gevent==20.6.2