From fe01ea52e5a70f4f97d6ff7730f975a7bee6a4bd Mon Sep 17 00:00:00 2001 From: Jason Schwarzenberger Date: Tue, 24 Nov 2020 10:36:31 +1300 Subject: [PATCH] get favicons for custom substack publications. --- apiserver/feeds/substack.py | 43 ++++++++++++------------------------- apiserver/misc/api.py | 21 +++++++++++------- apiserver/misc/icons.py | 14 ++++++++++++ apiserver/misc/metadata.py | 15 +++++++++++++ apiserver/misc/news.py | 13 ++--------- 5 files changed, 58 insertions(+), 48 deletions(-) create mode 100644 apiserver/misc/icons.py diff --git a/apiserver/feeds/substack.py b/apiserver/feeds/substack.py index e3efe87..1f8a9af 100644 --- a/apiserver/feeds/substack.py +++ b/apiserver/feeds/substack.py @@ -11,6 +11,8 @@ import requests from datetime import datetime from misc.time import unix +from misc.metadata import get_icons +from misc.api import xml, json from utils import clean SUBSTACK_REFERER = 'https://substack.com' @@ -23,29 +25,6 @@ def api_comments(post_id, base_url): def api_stories(x, base_url): return f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100" -def api(route, ref=None, referer=None): - headers = {'Referer': referer} if referer else None - try: - r = requests.get(route(ref), headers=headers, timeout=10) - if r.status_code != 200: - raise Exception('Bad response code ' + str(r.status_code)) - return r.json() - except KeyboardInterrupt: - raise - except BaseException as e: - logging.error('Problem hitting Substack API: {}, trying again'.format(str(e))) - - try: - r = requests.get(route(ref), headers=headers, timeout=20) - if r.status_code != 200: - raise Exception('Bad response code ' + str(r.status_code)) - return r.json() - except KeyboardInterrupt: - raise - except BaseException as e: - logging.error('Problem hitting Substack API: {}'.format(str(e))) - return False - def comment(i): if 'body' not in i: return False @@ -71,14 +50,14 @@ class Publication: return ref.replace(f"{self.BASE_DOMAIN}/#id:", '') def feed(self): - stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN) + stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN}) if not stories: return [] stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories])) return [self.ref_prefix(str(i.get("id"))) for i in stories or []] def story(self, ref): ref = self.strip_ref_prefix(ref) - stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN) + stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN}) if not stories: return False stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories])) stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories])) @@ -99,7 +78,7 @@ class Publication: s['title'] = r.get('title', '') s['link'] = r.get('canonical_url', '') s['url'] = r.get('canonical_url', '') - comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN) + comments = json(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), headers={'Referer': self.BASE_DOMAIN}) s['comments'] = [comment(i) for i in comments.get('comments')] s['comments'] = list(filter(bool, s['comments'])) s['num_comments'] = r.get('comment_count', 0) @@ -109,6 +88,12 @@ class Publication: s['author'] = authors[0].get('name') s['author_link'] = authors[0].get('link') + markup = xml(lambda x: s['link']) + if markup: + icons = get_icons(markup, url=s['link']) + if icons: + s['icon'] = icons[0] + return s def _bylines(self, b): @@ -131,7 +116,7 @@ class Top: return ref def feed(self): - stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER) + stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER}) if not stories: return [] stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories])) stories = [dict(id=i.get('id'), base_url=i.get("pub", { 'base_url': '' }).get("base_url")) for i in stories or []] @@ -139,7 +124,7 @@ class Top: def story(self, ref): ref = self.strip_ref_prefix(ref) - stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER) + stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER}) if not stories: return False stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories])) stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories])) @@ -162,7 +147,7 @@ class Top: s['title'] = r.get('title', '') s['link'] = r.get('canonical_url', '') s['url'] = r.get('canonical_url', '') - comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER) + comments = json(lambda x: api_comments(x, base_url), r.get('id'), headers={'Referer': SUBSTACK_REFERER}) s['comments'] = [comment(i) for i in comments.get('comments')] s['comments'] = list(filter(bool, s['comments'])) s['num_comments'] = r.get('comment_count', 0) diff --git a/apiserver/misc/api.py b/apiserver/misc/api.py index 9353375..e2e4226 100644 --- a/apiserver/misc/api.py +++ b/apiserver/misc/api.py @@ -5,13 +5,16 @@ logging.basicConfig( import requests -USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" -FORWARD_IP = '66.249.66.1' +GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +GOOGLEBOT_IP = '66.249.66.1' +TIMEOUT = 30 -def xml(route, ref=None): +def xml(route, ref=None, headers=dict(), use_googlebot=True): try: - headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP} - r = requests.get(route(ref), headers=headers, timeout=5) + if use_googlebot: + headers['User-Agent'] = GOOGLEBOT_USER_AGENT + headers['X-Forwarded-For'] = GOOGLEBOT_IP + r = requests.get(route(ref), headers=headers, timeout=TIMEOUT) if r.status_code != 200: raise Exception('Bad response code ' + str(r.status_code)) return r.text @@ -21,10 +24,12 @@ def xml(route, ref=None): logging.error('Problem hitting URL: {}'.format(str(e))) return False -def json(route, ref=None): +def json(route, ref=None, headers=dict(), use_googlebot=True): try: - headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP} - r = requests.get(route(ref), headers=headers, timeout=5) + if use_googlebot: + headers['User-Agent'] = GOOGLEBOT_USER_AGENT + headers['X-Forwarded-For'] = GOOGLEBOT_IP + r = requests.get(route(ref), headers=headers, timeout=TIMEOUT) if r.status_code != 200: raise Exception('Bad response code ' + str(r.status_code)) return r.json() diff --git a/apiserver/misc/icons.py b/apiserver/misc/icons.py new file mode 100644 index 0000000..7bf6b26 --- /dev/null +++ b/apiserver/misc/icons.py @@ -0,0 +1,14 @@ +from bs4 import BeautifulSoup + +def get_icons(markup): + soup = BeautifulSoup(markup, features='html.parser') + icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32") + icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16") + favicon = soup.find_all('link', rel="shortcut icon", href=True) + others = soup.find_all('link', rel="icon", href=True) + icons = icon32 + icon16 + favicon + others + base_url = '/'.join(urlref.split('/')[:3]) + icons = list(set([i.get('href') for i in icons])) + icons = [i if i.startswith('http') else base_url + i for i in icons] + + return icons \ No newline at end of file diff --git a/apiserver/misc/metadata.py b/apiserver/misc/metadata.py index c705ab9..9a6d73a 100644 --- a/apiserver/misc/metadata.py +++ b/apiserver/misc/metadata.py @@ -1,4 +1,19 @@ +from bs4 import BeautifulSoup + +def get_icons(markup, url): + soup = BeautifulSoup(markup, features='html.parser') + icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32") + icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16") + favicon = soup.find_all('link', rel="shortcut icon", href=True) + others = soup.find_all('link', rel="icon", href=True) + icons = icon32 + icon16 + favicon + others + base_url = '/'.join(url.split('/')[:3]) + icons = list(set([i.get('href') for i in icons])) + icons = [i if i.startswith('http') else base_url + i for i in icons] + + return icons + def parse_extruct(s, data): rdfa_keys = { 'title': [ diff --git a/apiserver/misc/news.py b/apiserver/misc/news.py index 1594574..5498266 100644 --- a/apiserver/misc/news.py +++ b/apiserver/misc/news.py @@ -11,7 +11,7 @@ import extruct import settings from utils import clean -from misc.metadata import parse_extruct +from misc.metadata import parse_extruct, get_icons from misc.time import unix from misc.api import xml import misc.stuff as stuff @@ -69,16 +69,7 @@ class Base: s['url'] = urlref s['date'] = 0 - soup = BeautifulSoup(markup, features='html.parser') - icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32") - icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16") - favicon = soup.find_all('link', rel="shortcut icon", href=True) - others = soup.find_all('link', rel="icon", href=True) - icons = icon32 + icon16 + favicon + others - base_url = '/'.join(urlref.split('/')[:3]) - icons = list(set([i.get('href') for i in icons])) - icons = [i if i.startswith('http') else base_url + i for i in icons] - + icons = get_icons(markup, url=urlref) if icons: s['icon'] = icons[0]