use extruct for opengraph/json-ld/microdata of articles

Jason 2020-11-03 10:31:36 +00:00
parent 736cdc8576
commit b759f46582
4 changed files with 35 additions and 16 deletions
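For orientation: extruct pulls OpenGraph, schema.org microdata and JSON-LD metadata out of an article's raw HTML, which is what the sitemap scraper below switches to. A minimal sketch of the shapes it returns, with invented headline/author values purely for illustration (the nesting matches what the new story() code indexes into):

import extruct

# Illustrative article markup; the headline and author values are made up.
html = """
<html prefix="og: http://ogp.me/ns#">
<head>
  <meta property="og:title" content="Example headline" />
  <script type="application/ld+json">
    {"@type": "Article", "headline": "Example headline",
     "author": {"@type": "Person", "name": "Example Author"}}
  </script>
</head>
<body>
  <article itemscope itemtype="https://schema.org/NewsArticle">
    <h1 itemprop="headline">Example headline</h1>
  </article>
</body>
</html>
"""

data = extruct.extract(html, syntaxes=['opengraph', 'microdata', 'json-ld'])

# Roughly what comes back:
# data['opengraph'] -> [{'namespace': {...}, 'properties': [('og:title', 'Example headline')]}]
# data['microdata'] -> [{'type': 'https://schema.org/NewsArticle',
#                        'properties': {'headline': 'Example headline'}}]
# data['json-ld']   -> [{'@type': 'Article', 'headline': 'Example headline',
#                        'author': {'@type': 'Person', 'name': 'Example Author'}}]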


@@ -38,11 +38,11 @@ def list():
     feed += [(x, 'substack') for x in substack.top.feed()[:settings.NUM_SUBSTACK]]
     for key, publication in substacks.items():
-        count = settings.SUBSTACK[key].count
+        count = settings.SUBSTACK[key]['count']
         feed += [(x, key) for x in publication.feed()[:count]]
     for key, sites in sitemaps.items():
-        count = settings.SITEMAP[key].count
+        count = settings.SITEMAP[key]['count']
         feed += [(x, key) for x in sites.feed()[:count]]
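The hunk above changes the per-publication lookups from attribute access to plain dict subscripting, so the SUBSTACK and SITEMAP entries in settings are evidently dicts carrying a 'count'. A hypothetical settings fragment consistent with that shape (the keys, counts and structure here are assumptions, not taken from the repo; the URLs are the ones used in the test blocks further down):

# Hypothetical settings.py fragment — only the dict-of-dicts shape with a
# 'count' per entry is implied by the diff; everything else is illustrative.
NUM_SUBSTACK = 10

SUBSTACK = {
    'webworm': {'url': 'https://www.webworm.co', 'count': 10},
}

SITEMAP = {
    'nzherald': {'url': 'https://www.nzherald.co.nz/arcio/news-sitemap/', 'count': 10},
}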


@@ -10,6 +10,7 @@ if __name__ == '__main__':
 import requests
 from datetime import datetime
 from bs4 import BeautifulSoup
+import extruct
 from utils import clean
@@ -81,23 +82,40 @@ class Sitemap:
         if not r:
             return False
-        (data, method) = get_article_details(ref)
-        if not data:
-            return False
-        if 'outline' not in method:
-            return False
+        html = xml(lambda x: ref)
+        if not html:
+            return False
+        data = extruct.extract(html)
         s = {}
-        s['author'] = data['author']
         s['author_link'] = ''
-        s['date'] = unix(r.find('lastmod').text)
-        s['score'] = 0
-        s['title'] = data['title']
-        s['link'] = data['article_url']
-        s['url'] = data['article_url']
+        s['score'] = ''
         s['comments'] = []
         s['num_comments'] = 0
-        s['text'] = data['html']
+        s['link'] = ref
+        s['url'] = ref
+        s['date'] = unix(r.find('lastmod').text)
+        for og in data['opengraph']:
+            titles = list(filter(None, [value if 'og:title' in key else None for key, value in og['properties']]))
+            if len(titles):
+                s['title'] = titles[0]
+        for md in data['microdata']:
+            if md['type'] == 'https://schema.org/NewsArticle':
+                props = md['properties']
+                s['title'] = props['headline']
+                if props['author']:
+                    s['author'] = props['author']['properties']['name']
+        for ld in data['json-ld']:
+            if ld['@type'] == 'Article':
+                s['title'] = ld['headline']
+                if ld['author']:
+                    s['author'] = ld['author']['name']
         return s
@@ -107,4 +125,4 @@ if __name__ == '__main__':
     site = Sitemap("https://www.nzherald.co.nz/arcio/news-sitemap/")
     posts = site.feed()
     print(posts[:1])
     print(site.story(posts[0]))
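The rewritten story() above fills title and author purely from extruct output: og:title first, then a schema.org NewsArticle microdata item, then a JSON-LD Article, with the later sources overriding the earlier ones. A standalone sketch of the same precedence, fetching with requests instead of the repo's xml() helper (that helper is not part of this diff) and using .get() lookups so pages missing a syntax fall through quietly:

import requests
import extruct

def article_meta(url):
    # Fetch the article HTML directly; the committed code goes through
    # the project's own xml() helper instead.
    html = requests.get(url, timeout=10).text
    data = extruct.extract(html, syntaxes=['opengraph', 'microdata', 'json-ld'])

    meta = {'title': '', 'author': ''}

    # OpenGraph: 'properties' is a list of (key, value) pairs.
    for og in data['opengraph']:
        for key, value in og['properties']:
            if key == 'og:title' and value:
                meta['title'] = value

    # schema.org microdata: NewsArticle items carry headline/author.
    for md in data['microdata']:
        if md.get('type') == 'https://schema.org/NewsArticle':
            props = md.get('properties', {})
            meta['title'] = props.get('headline') or meta['title']
            author = props.get('author')
            if isinstance(author, dict):
                meta['author'] = author.get('properties', {}).get('name', '') or meta['author']

    # JSON-LD: Article objects expose headline/author directly.
    for ld in data['json-ld']:
        if ld.get('@type') == 'Article':
            meta['title'] = ld.get('headline') or meta['title']
            author = ld.get('author')
            if isinstance(author, dict):
                meta['author'] = author.get('name', '') or meta['author']

    return meta

Relative to the committed code, the only liberties taken are the defensive .get() calls and the isinstance() checks; error handling for unreachable pages is assumed to live elsewhere in the pipeline.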


@@ -157,4 +157,4 @@ if __name__ == '__main__':
     webworm = Publication("https://www.webworm.co/")
     posts = webworm.feed()
     print(posts[:1])
     print(webworm.story(posts[0]))


@@ -4,6 +4,7 @@ certifi==2020.6.20
 chardet==3.0.4
 click==7.1.2
 commonmark==0.9.1
+extruct==0.10.0
 Flask==1.1.2
 Flask-Cors==3.0.8
 gevent==20.6.2