diff --git a/apiserver/feed.py b/apiserver/feed.py
index 1e0166a..8b0c8f6 100644
--- a/apiserver/feed.py
+++ b/apiserver/feed.py
@@ -12,7 +12,8 @@ import settings
from feeds import hackernews, reddit, tildes, substack, manual
from feeds.sitemap import Sitemap
from feeds.category import Category
-from scrapers import outline, declutter, headless, simple
+from scrapers import outline
+from scrapers.declutter import declutter, headless, simple
INVALID_DOMAINS = ['youtube.com', 'bloomberg.com', 'wsj.com']
@@ -145,11 +146,16 @@ def update_story(story, is_manual=False, urlref=None):
logging.info('Getting article ' + story['url'])
details, scraper = get_article(story['url'])
if not details: return False
- story['text'] = details.get('content', '')
- story['excerpt'] = details.get('excerpt', '')
story['scraper'] = scraper
- story['scraper_link'] = details.get('scraper_link', '')
+ story['text'] = details.get('content', '')
if not story['text']: return False
+ story['excerpt'] = details.get('excerpt', '')
+ story['scraper_link'] = details.get('scraper_link', '')
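+    # prefer the page's og:image from scraped metadata, falling back to a plain meta image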
+    meta = details.get('meta')
+    if meta:
+        og = meta.get('og') or {}
+        story['image'] = og.get('og:image', meta.get('image', ''))
return True
diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py
index 49a6ed5..ef271c2 100644
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -6,7 +6,7 @@ logging.basicConfig(
import re
import requests
from bs4 import BeautifulSoup
-from scrapers import declutter
+from scrapers.declutter import declutter, headless
import extruct
import settings
diff --git a/apiserver/scrapers/declutter.py b/apiserver/scrapers/declutter.py
index 84500ec..c3a56a5 100644
--- a/apiserver/scrapers/declutter.py
+++ b/apiserver/scrapers/declutter.py
@@ -4,38 +4,61 @@ logging.basicConfig(
level=logging.DEBUG)
import requests
-DECLUTTER_API = 'https://declutter.1j.nz/headless/details'
-DECLUTTER_COMMENT_API = 'https://declutter.1j.nz/headless/comments'
-TIMEOUT = 90
+from settings import HEADLESS_READER_PORT, SIMPLE_READER_PORT
+
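+# Client for a reader service's /<variant>/details endpoint; subclasses set the variant.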
+class Simple:
+ def __init__(self, host, name, internal=True, timeout=20):
+ self.host = host
+ self.name = name
+ self.internal = internal
+ self.timeout = timeout
+ self.variant = 'simple'
+
+ def as_readable(self, details):
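+        # only external services get credited with a scraper_link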
+ if not self.internal:
+ details['scraper_link'] = self.host
+ return details
+
+ def get_html(self, url):
+ details = self.get_details(url)
+ if not details:
+ return ''
+ return details['content']
+
+ def get_details(self, url):
+ logging.info(f"{self.name} Scraper: {url}")
+ details = self._json(f"{self.host}/{self.variant}/details", dict(url=url), "article")
+ if not details: return None
+ return self.as_readable(details)
-def get_html(url):
- logging.info(f"Declutter Scraper: {url}")
- details = get_details(url)
- if not details:
- return ''
- return details['content']
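+    # POST the target url to the reader service and return parsed JSON, or None on failure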
+ def _json(self, url, data, adjective):
+ try:
+ r = requests.post(url, data=data, timeout=self.timeout)
+ if r.status_code != 200:
+ raise Exception('Bad response code ' + str(r.status_code))
+ return r.json()
+ except KeyboardInterrupt:
+ raise
+ except BaseException as e:
+ logging.error('{}: Problem scraping {}: {}'.format(self.name, adjective, str(e)))
+ return None
-def get_details(url):
- try:
- r = requests.post(DECLUTTER_API, data=dict(url=url), timeout=TIMEOUT)
- if r.status_code != 200:
- raise Exception('Bad response code ' + str(r.status_code))
- return r.json()
- except KeyboardInterrupt:
- raise
- except BaseException as e:
- logging.error('Problem decluttering article: {}'.format(str(e)))
- return None
-def get_comments(url):
- try:
- r = requests.post(DECLUTTER_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
- if r.status_code != 200:
- raise Exception('Bad response code ' + str(r.status_code))
- return r.json()
- except KeyboardInterrupt:
- raise
- except BaseException as e:
- logging.error('Problem getting comments for article: {}'.format(str(e)))
- return None
\ No newline at end of file
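+# Browser-backed reader; same details API as Simple, plus comment scraping.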
+class Headless(Simple):
+    def __init__(self, host, name, internal=True, timeout=90):
+        super().__init__(host, name, internal=internal, timeout=timeout)
+        self.variant = 'headless'
+
+    def get_comments(self, url):
+        logging.info(f"{self.name} Scraper: {url}")
+        return self._json(f"{self.host}/{self.variant}/comments", dict(url=url), "comments")
+
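+# declutter is an external fallback; headless and simple target the local readerserver.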
+declutter = Headless('https://declutter.1j.nz', 'Declutter scraper', internal=False)
+headless = Headless(f"http://127.0.0.1:{HEADLESS_READER_PORT or 33843}", 'Headless scraper')
+simple = Simple(f"http://127.0.0.1:{SIMPLE_READER_PORT or 33843}", 'Simple scraper')
\ No newline at end of file
diff --git a/apiserver/scrapers/headless.py b/apiserver/scrapers/headless.py
deleted file mode 100644
index 7d32f82..0000000
--- a/apiserver/scrapers/headless.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import logging
-logging.basicConfig(
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- level=logging.DEBUG)
-import requests
-from settings import HEADLESS_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/headless/details'.format(HEADLESS_READER_PORT or 33843)
-READ_COMMENT__API = 'http://127.0.0.1:{}/headless/comments'.format(HEADLESS_READER_PORT or 33843)
-TIMEOUT = 90
-
-def get_html(url):
- logging.info(f"Headless Scraper: {url}")
- details = get_details(url)
- if not details:
- return ''
- return details['content']
-
-def get_details(url):
- try:
- r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
- if r.status_code != 200:
- raise Exception('Bad response code ' + str(r.status_code))
- return r.json()
- except KeyboardInterrupt:
- raise
- except BaseException as e:
- logging.error('Problem scraping article: {}'.format(str(e)))
- return None
-
-def get_comments(url):
- try:
- r = requests.post(READ_COMMENT_API, data=dict(url=url), timeout=TIMEOUT)
- if r.status_code != 200:
- raise Exception('Bad response code ' + str(r.status_code))
- return r.json()
- except KeyboardInterrupt:
- raise
- except BaseException as e:
- logging.error('Problem getting comments for article: {}'.format(str(e)))
- return None
diff --git a/apiserver/scrapers/outline.py b/apiserver/scrapers/outline.py
index 07b14da..15a21f7 100644
--- a/apiserver/scrapers/outline.py
+++ b/apiserver/scrapers/outline.py
@@ -29,8 +29,11 @@ def as_readable(details):
'siteName': details['site_name'],
'url': details['article_url'],
'publisher': details['site_name'],
- 'scraper_link': 'https://outline.com/' + details['short_code']
+ 'scraper_link': 'https://outline.com/' + details['short_code'],
+ 'meta': {}
}
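+    # pass through any page metadata included in the outline response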
+    readable['meta'].update(details.get('meta') or {})
return readable
def _get_outline(url):
@@ -40,8 +42,7 @@ def _get_outline(url):
headers = {'Referer': OUTLINE_REFERER}
r = requests.get(OUTLINE_API, params=params, headers=headers, timeout=TIMEOUT)
if r.status_code == 429:
- logging.info('Rate limited by outline, sleeping 30s and skipping...')
- time.sleep(30)
+ logging.info('Rate limited by outline, skipping...')
return None
if r.status_code != 200:
raise Exception('Bad response code ' + str(r.status_code))
diff --git a/apiserver/scrapers/simple.py b/apiserver/scrapers/simple.py
deleted file mode 100644
index 433ba75..0000000
--- a/apiserver/scrapers/simple.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import logging
-logging.basicConfig(
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- level=logging.DEBUG)
-import requests
-from settings import SIMPLE_READER_PORT
-
-READ_API = 'http://127.0.0.1:{}/simple/details'.format(SIMPLE_READER_PORT or 33843)
-TIMEOUT = 20
-
-def get_html(url):
- logging.info(f"Simple Scraper: {url}")
- details = get_details(url)
- if not details:
- return ''
- return details['content']
-
-def get_details(url):
- try:
- r = requests.post(READ_API, data=dict(url=url), timeout=TIMEOUT)
- if r.status_code != 200:
- raise Exception('Bad response code ' + str(r.status_code))
- return r.json()
- except KeyboardInterrupt:
- raise
- except BaseException as e:
- logging.error('Problem getting article: {}'.format(str(e)))
- return None
\ No newline at end of file
diff --git a/readerserver b/readerserver
index 1a81bc1..006be62 160000
--- a/readerserver
+++ b/readerserver
@@ -1 +1 @@
-Subproject commit 1a81bc139f6e5f2fcb021ff47921e9c47eb3f6da
+Subproject commit 006be62214d89f33e1e99cd0d6af4e4e5e53e3b2
diff --git a/webapp/src/routes/[id].svelte b/webapp/src/routes/[id].svelte
index e8a1f9f..8d01bef 100644
--- a/webapp/src/routes/[id].svelte
+++ b/webapp/src/routes/[id].svelte
@@ -43,6 +43,9 @@
content={fromUnixTime(story.date).toISOString()} />
+ {#if story.image}
+    <meta property="og:image" content={story.image} />
+ {/if}