From 2a2bf4d671fe87c9e396f1d223ebeb0e813b48fd Mon Sep 17 00:00:00 2001 From: Jason Schwarzenberger Date: Thu, 3 Dec 2020 16:41:27 +1300 Subject: [PATCH] add excerpt and scraper details. --- apiserver/feed.py | 15 ++++++++----- apiserver/scrapers/outline.py | 30 ++++++++++++++++++++++++-- webapp/src/components/Article.svelte | 10 ++++----- webapp/src/components/StoryList.svelte | 4 ++-- webapp/src/components/StoryMeta.svelte | 30 ++++++++++++++++++++++++++ webapp/src/routes/[id].svelte | 1 + 6 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 webapp/src/components/StoryMeta.svelte diff --git a/apiserver/feed.py b/apiserver/feed.py index 1e8c6eb..1e0166a 100644 --- a/apiserver/feed.py +++ b/apiserver/feed.py @@ -76,14 +76,14 @@ def get_article(url): if scraper not in scrapers.keys(): continue try: - html = scrapers[scraper].get_html(url) - if html: - return html + details = scrapers[scraper].get_details(url) + if details and details.get('content'): + return details, scraper except KeyboardInterrupt: raise except: pass - return '' + return None, None def get_content_type(url): try: @@ -143,7 +143,12 @@ def update_story(story, is_manual=False, urlref=None): return False logging.info('Getting article ' + story['url']) - story['text'] = get_article(story['url']) + details, scraper = get_article(story['url']) + if not details: return False + story['text'] = details.get('content', '') + story['excerpt'] = details.get('excerpt', '') + story['scraper'] = scraper + story['scraper_link'] = details.get('scraper_link', '') if not story['text']: return False return True diff --git a/apiserver/scrapers/outline.py b/apiserver/scrapers/outline.py index f96cd2a..07b14da 100644 --- a/apiserver/scrapers/outline.py +++ b/apiserver/scrapers/outline.py @@ -12,9 +12,28 @@ def get_html(url): details = get_details(url) if not details: return '' - return details['html'] + return details['content'] def get_details(url): + outline = _get_outline(url) + if not outline: + return None + return as_readable(outline) + +def as_readable(details): + readable = { + 'title': details['title'], + 'byline': details['author'], + 'content': details['html'], + 'excerpt': _excerpt(details), + 'siteName': details['site_name'], + 'url': details['article_url'], + 'publisher': details['site_name'], + 'scraper_link': 'https://outline.com/' + details['short_code'] + } + return readable + +def _get_outline(url): try: logging.info(f"Outline Scraper: {url}") params = {'source_url': url} @@ -34,4 +53,11 @@ def get_details(url): raise except BaseException as e: logging.error('Problem outlining article: {}'.format(str(e))) - return None \ No newline at end of file + return None + +def _excerpt(details): + meta = details.get('meta') + if not meta: return '' + if meta.get('description'): return meta.get('description', '') + if not meta.get('og'): return '' + return meta.get('og').get('og:description', '') \ No newline at end of file diff --git a/webapp/src/components/Article.svelte b/webapp/src/components/Article.svelte index 9cbc8bc..6ad5367 100644 --- a/webapp/src/components/Article.svelte +++ b/webapp/src/components/Article.svelte @@ -1,9 +1,8 @@ + + diff --git a/webapp/src/routes/[id].svelte b/webapp/src/routes/[id].svelte index 957e463..e8a1f9f 100644 --- a/webapp/src/routes/[id].svelte +++ b/webapp/src/routes/[id].svelte @@ -42,6 +42,7 @@ property="article:published_time" content={fromUnixTime(story.date).toISOString()} /> +