Get favicons for custom Substack publications.

2020-11-24 10:36:31 +13:00
parent 3daae5fa1b
commit fe01ea52e5
5 changed files with 58 additions and 48 deletions
--- a/apiserver/feeds/substack.py
+++ b/apiserver/feeds/substack.py
@@ -11,6 +11,8 @@ import requests
 from datetime import datetime
 from misc.time import unix
 from misc.metadata import get_icons
 from misc.api import xml, json
 from utils import clean
 SUBSTACK_REFERER = 'https://substack.com'
@@ -23,29 +25,6 @@ def api_comments(post_id, base_url):
 def api_stories(x, base_url):
    """Build the archive-listing URL for a Substack publication.

    `x` is accepted only so the function can be used where a
    one-argument route callable is expected; its value is ignored.
    `base_url` is the publication's root URL, without a trailing slash.
    """
    archive_url = f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100"
    return archive_url
 def api(route, ref=None, referer=None):
    """Fetch JSON from a Substack endpoint, retrying once on failure.

    `route` is called with `ref` to produce the URL. When `referer` is
    truthy it is sent as the Referer header; otherwise no extra headers
    are passed. The first attempt uses a 10 second timeout and the
    second a 20 second timeout.

    Returns the decoded JSON body of the first attempt that answers with
    status 200, or False when both attempts fail. KeyboardInterrupt is
    always re-raised; every other exception is logged and swallowed.
    """
    headers = None
    if referer:
        headers = {'Referer': referer}

    # One (timeout, log message template) pair per attempt, in order.
    attempts = (
        (10, 'Problem hitting Substack API: {}, trying again'),
        (20, 'Problem hitting Substack API: {}'),
    )
    for timeout, message in attempts:
        try:
            response = requests.get(route(ref), headers=headers, timeout=timeout)
            if response.status_code != 200:
                raise Exception('Bad response code ' + str(response.status_code))
            return response.json()
        except KeyboardInterrupt:
            raise
        except BaseException as err:
            logging.error(message.format(str(err)))
    # Both attempts failed.
    return False
 def comment(i):
    if 'body' not in i:
        return False
@@ -71,14 +50,14 @@ class Publication:
        return ref.replace(f"{self.BASE_DOMAIN}/#id:", '')
    def feed(self):
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        return [self.ref_prefix(str(i.get("id"))) for i in stories or []]
    def story(self, ref):
        ref = self.strip_ref_prefix(ref)
-        stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
+        stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -99,7 +78,7 @@ class Publication:
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
+        comments = json(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), headers={'Referer': self.BASE_DOMAIN})
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)
@@ -109,6 +88,12 @@ class Publication:
            s['author'] = authors[0].get('name')
            s['author_link'] = authors[0].get('link')
        markup = xml(lambda x: s['link'])
        if markup:
            icons = get_icons(markup, url=s['link'])
            if icons:
                s['icon'] = icons[0]
        return s
    def _bylines(self, b):
@@ -131,7 +116,7 @@ class Top:
        return ref
    def feed(self):
-        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
        if not stories: return []
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = [dict(id=i.get('id'), base_url=i.get("pub", { 'base_url': '' }).get("base_url")) for i in stories or []]
@@ -139,7 +124,7 @@ class Top:
    def story(self, ref):
        ref = self.strip_ref_prefix(ref)
-        stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
+        stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
        if not stories: return False
        stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
        stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
@@ -162,7 +147,7 @@ class Top:
        s['title'] = r.get('title', '')
        s['link'] = r.get('canonical_url', '')
        s['url'] = r.get('canonical_url', '')
-        comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
+        comments = json(lambda x: api_comments(x, base_url), r.get('id'), headers={'Referer': SUBSTACK_REFERER})
        s['comments'] = [comment(i) for i in comments.get('comments')]
        s['comments'] = list(filter(bool, s['comments']))
        s['num_comments'] = r.get('comment_count', 0)
--- a/apiserver/misc/api.py
+++ b/apiserver/misc/api.py
@@ -5,13 +5,16 @@ logging.basicConfig(
 import requests
-USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-FORWARD_IP = '66.249.66.1'
+GOOGLEBOT_IP = '66.249.66.1'
 TIMEOUT = 30
-def xml(route, ref=None):
+def xml(route, ref=None, headers=dict(), use_googlebot=True):
    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        if use_googlebot:
-        r = requests.get(route(ref), headers=headers, timeout=5)
+            headers['User-Agent'] = GOOGLEBOT_USER_AGENT
            headers['X-Forwarded-For'] = GOOGLEBOT_IP
        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.text
@@ -21,10 +24,12 @@ def xml(route, ref=None):
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False
-def json(route, ref=None):
+def json(route, ref=None, headers=dict(), use_googlebot=True):
    try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
+        if use_googlebot:
-        r = requests.get(route(ref), headers=headers, timeout=5)
+            headers['User-Agent'] = GOOGLEBOT_USER_AGENT
            headers['X-Forwarded-For'] = GOOGLEBOT_IP
        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
        if r.status_code != 200:
            raise Exception('Bad response code ' + str(r.status_code))
        return r.json()
--- a/apiserver/misc/icons.py
+++ b/apiserver/misc/icons.py
@@ -0,0 +1,14 @@
 from bs4 import BeautifulSoup
 def get_icons(markup, url=''):
    """Return the favicon URLs declared in an HTML document.

    `markup` is the HTML text to scan for <link> icon tags. `url` is the
    address the markup was fetched from and is used to resolve relative
    hrefs; when it is empty, hrefs are returned as written.

    The result is a de-duplicated list ordered by preference: 32x32
    icons, then 16x16 icons, then "shortcut icon" links, then any other
    rel="icon" link. The list is empty when no icon link is found.
    """
    # Local import keeps this block self-contained within the module.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    # Most-preferred candidates first, so index 0 is the best icon.
    candidates = (
        soup.find_all('link', rel="icon", href=True, sizes="32x32")
        + soup.find_all('link', rel="icon", href=True, sizes="16x16")
        + soup.find_all('link', rel="shortcut icon", href=True)
        + soup.find_all('link', rel="icon", href=True)
    )
    # urljoin resolves root-relative, path-relative and protocol-relative
    # hrefs against the page URL and leaves absolute URLs untouched.
    resolved = (urljoin(url, tag.get('href')) for tag in candidates)
    # dict.fromkeys drops duplicates while keeping first-seen order; a
    # set would discard the preference ordering built above.
    return list(dict.fromkeys(resolved))
--- a/apiserver/misc/metadata.py
+++ b/apiserver/misc/metadata.py
@@ -1,4 +1,19 @@
 from bs4 import BeautifulSoup
 def get_icons(markup, url):
    """Return the favicon URLs declared in an HTML document.

    `markup` is the HTML text to scan for <link> icon tags. `url` is the
    address the markup was fetched from and is used to resolve relative
    hrefs.

    The result is a de-duplicated list ordered by preference: 32x32
    icons, then 16x16 icons, then "shortcut icon" links, then any other
    rel="icon" link. The list is empty when no icon link is found.
    """
    # Local import keeps this block self-contained within the module.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    # Most-preferred candidates first, so index 0 is the best icon.
    candidates = (
        soup.find_all('link', rel="icon", href=True, sizes="32x32")
        + soup.find_all('link', rel="icon", href=True, sizes="16x16")
        + soup.find_all('link', rel="shortcut icon", href=True)
        + soup.find_all('link', rel="icon", href=True)
    )
    # urljoin resolves root-relative, path-relative and protocol-relative
    # hrefs against the page URL and leaves absolute URLs untouched.
    resolved = (urljoin(url, tag.get('href')) for tag in candidates)
    # dict.fromkeys drops duplicates while keeping first-seen order; a
    # set would discard the preference ordering built above.
    return list(dict.fromkeys(resolved))
 def parse_extruct(s, data):
    rdfa_keys = {
        'title': [
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -11,7 +11,7 @@ import extruct
 import settings
 from utils import clean
-from misc.metadata import parse_extruct
+from misc.metadata import parse_extruct, get_icons
 from misc.time import unix
 from misc.api import xml
 import misc.stuff as stuff
@@ -69,16 +69,7 @@ class Base:
        s['url'] = urlref
        s['date'] = 0
-        soup = BeautifulSoup(markup, features='html.parser')
+        icons = get_icons(markup, url=urlref)
        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
        favicon = soup.find_all('link', rel="shortcut icon", href=True)
        others = soup.find_all('link', rel="icon", href=True)
        icons = icon32 + icon16 + favicon + others
        base_url = '/'.join(urlref.split('/')[:3])
        icons = list(set([i.get('href') for i in icons]))
        icons = [i if i.startswith('http') else base_url + i for i in icons]
        if icons:
            s['icon'] = icons[0]