From da7f6330bf8016770ec196ffecaa099d1340f983 Mon Sep 17 00:00:00 2001
From: Jason Schwarzenberger
Date: Fri, 4 Dec 2020 12:46:46 +1300
Subject: [PATCH] improve meta data scraping.

---
 apiserver/feed.py               | 15 ++++--
 apiserver/misc/news.py          |  2 +-
 apiserver/scrapers/declutter.py | 85 +++++++++++++++++++++------------
 apiserver/scrapers/headless.py  | 41 ----------------
 apiserver/scrapers/outline.py   |  7 +--
 apiserver/scrapers/simple.py    | 28 -----------
 readerserver                    |  2 +-
 webapp/src/routes/[id].svelte   |  3 ++
 8 files changed, 74 insertions(+), 109 deletions(-)
 delete mode 100644 apiserver/scrapers/headless.py
 delete mode 100644 apiserver/scrapers/simple.py

diff --git a/apiserver/feed.py b/apiserver/feed.py
index 1e0166a..8b0c8f6 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -12,7 +12,8 @@ import settings
 from feeds import hackernews, reddit, tildes, substack, manual
 from feeds.sitemap import Sitemap
 from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline
+from scrapers.declutter import declutter, headless, simple
 
 INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
 
@@ -145,11 +146,17 @@ def update_story(story, is_manual=False, urlref=None):
         logging.info('Getting article ' + story['url'])
         details, scraper = get_article(story['url'])
         if not details: return False
-        story['text'] = details.get('content', '')
-        story['excerpt'] = details.get('excerpt', '')
         story['scraper'] = scraper
-        story['scraper_link'] = details.get('scraper_link', '')
+        story['text'] = details.get('content', '')
         if not story['text']: return False
+        story['excerpt'] = details.get('excerpt', '')
+        story['scraper_link'] = details.get('scraper_link', '')
+        meta = details.get('meta')
+        if meta:
+            og = meta.get('og')
+            story['image'] = meta.get('image', '')
+            if og:
+                story['image'] = og.get('og:image', meta.get('image', ''))
 
     return True
 
diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py
index 49a6ed5..ef271c2 100644
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -6,7 +6,7 @@ logging.basicConfig(
 import re
 import requests
 from bs4 import BeautifulSoup
-from scrapers import declutter
+from scrapers.declutter import declutter, headless
 import extruct
 
 import settings
diff --git a/apiserver/scrapers/declutter.py b/apiserver/scrapers/declutter.py
index 84500ec..c3a56a5 100644
--- a/apiserver/scrapers/declutter.py
+++ b/apiserver/scrapers/declutter.py
@@ -4,38 +4,61 @@ logging.basicConfig(
     level=logging.DEBUG)
 import requests
 
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
+
+class Simple:
+    def __init__(self, host, name, internal=True, timeout=20):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'simple'
+
+    def as_readable(self, details):
+        if not self.internal:
+            details['scraper_link'] = self.host
+        return details
+
+    def get_html(self, url):
+        details = self.get_details(url)
+        if not details:
+            return ''
+        return details['content']
+
+    def get_details(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+        if not details: return None
+        return self.as_readable(details)
 
-def get_html(url):
-    logging.info(f"Declutter Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
+    def _json(self, url, data, adjective):
+        try:
+            r = requests.post(url, data=data, timeout=self.timeout)
+            if r.status_code != 200:
+                raise Exception('Bad response code ' + str(r.status_code))
+            return r.json()
+        except KeyboardInterrupt:
+            raise
+        except BaseException as e:
+            logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+            return None
 
-def get_details(url):
-    try:
-        r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem decluttering article: {}'.format(str(e)))
-        return None
 
-def get_comments(url):
-    try:
-        r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
\ No newline at end of file
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        self.host = host
+        self.name = name
+        self.internal = internal
+        self.timeout = timeout
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        comments = self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+        if not comments: return None
+        return comments
+
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
\ No newline at end of file
diff --git a/apiserver/scrapers/headless.py b/apiserver/scrapers/headless.py
deleted file mode 100644
index 7d32f82..0000000
--- a/apiserver/scrapers/headless.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
-    logging.info(f"Headless Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem scraping article: {}'.format(str(e)))
-        return None
-
-def get_comments(url):
-    try:
-        r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting comments for article: {}'.format(str(e)))
-        return None
diff --git a/apiserver/scrapers/outline.py b/apiserver/scrapers/outline.py
index 07b14da..15a21f7 100644
--- a/apiserver/scrapers/outline.py
+++ b/apiserver/scrapers/outline.py
@@ -29,8 +29,10 @@ def as_readable(details):
         'siteName': details['site_name'],
         'url': details['article_url'],
         'publisher': details['site_name'],
-        'scraper_link': 'https://outline.com/' + details['short_code']
+        'scraper_link': 'https://outline.com/' + details['short_code'],
+        'meta': {}
     }
+    readable['meta'].update(details['meta'])
     return readable
 
 def _get_outline(url):
@@ -40,8 +42,7 @@ def _get_outline(url):
         headers = {'Referer': OUTLINE_REFERER}
         r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
         if r.status_code == 429:
-            logging.info('Rate limited by outline, sleeping 30s and skipping...')
-            time.sleep(30)
+            logging.info('Rate limited by outline, skipping...')
             return None
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
diff --git a/apiserver/scrapers/simple.py b/apiserver/scrapers/simple.py
deleted file mode 100644
index 433ba75..0000000
--- a/apiserver/scrapers/simple.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    level=logging.DEBUG)
-import requests
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
-    logging.info(f"Simple Scraper: {url}")
-    details = get_details(url)
-    if not details:
-        return ''
-    return details['content']
-
-def get_details(url):
-    try:
-        r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
-        if r.status_code != 200:
-            raise Exception('Bad response code ' + str(r.status_code))
-        return r.json()
-    except KeyboardInterrupt:
-        raise
-    except BaseException as e:
-        logging.error('Problem getting article: {}'.format(str(e)))
-        return None
\ No newline at end of file
diff --git a/readerserver b/readerserver
index 1a81bc1..006be62 160000
--- a/readerserver
+++ b/readerserver
@@ -1 +1 @@
-Subproject commit 1a81bc139f6e5f2fcb021ff47921e9c47eb3f6da
+Subproject commit 006be62214d89f33e1e99cd0d6af4e4e5e53e3b2
diff --git a/webapp/src/routes/[id].svelte b/webapp/src/routes/[id].svelte
index e8a1f9f..8d01bef 100644
--- a/webapp/src/routes/[id].svelte
+++ b/webapp/src/routes/[id].svelte
@@ -43,6 +43,9 @@
 			content={fromUnixTime(story.date).toISOString()} />
 
+		{#if story.image}
+			<meta property="og:image" content={story.image} />
+		{/if}
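
---

Reviewer note (not part of the patch): a minimal sketch of how the consolidated scraper interface reads after this change. The article URL below is hypothetical, and the port fallbacks simply mirror the `or 33843` defaults in the patch; this is an illustration, not code from the commit.

```python
# Illustrative sketch: exercises the module-level scraper instances
# that apiserver/scrapers/declutter.py defines after this patch.
from scrapers.declutter import declutter, headless, simple

url = 'https://example.com/article'  # hypothetical URL

# Every scraper shares the Simple interface: get_html() and get_details().
details = declutter.get_details(url)
if details:
    text = details['content']             # readable article body
    excerpt = details.get('excerpt', '')  # optional summary
    # feed.py now derives story['image'] from the scraped metadata,
    # preferring og:image over the plain image field:
    meta = details.get('meta') or {}
    og = meta.get('og') or {}
    image = og.get('og:image', meta.get('image', ''))

# Comment scraping is only available on Headless instances:
comments = declutter.get_comments(url)

# The local readers POST to /simple/details and /headless/details on the
# readerserver ports configured in settings:
html = simple.get_html(url)
html = headless.get_html(url)
```

Because `declutter` is constructed with `internal=False`, `as_readable` stamps its results with a `scraper_link` pointing back at the public instance, while the local `headless` and `simple` readers leave that field unset.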