get favicons for custom substack publications.

2020-11-24 10:36:31 +13:00
parent 3daae5fa1b
commit fe01ea52e5
5 changed files with 58 additions and 48 deletions
--- a/apiserver/misc/api.py
+++ b/apiserver/misc/api.py
@@ -5,13 +5,16 @@ logging.basicConfig(

 import requests

-USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-FORWARD_IP = '66.249.66.1'
+GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"  # sent as User-Agent when use_googlebot is set
+GOOGLEBOT_IP = '66.249.66.1'  # sent as X-Forwarded-For when use_googlebot is set
+TIMEOUT = 30  # passed to requests.get(timeout=...), i.e. seconds

-def xml(route, ref=None):
+def xml(route, ref=None, headers=None, use_googlebot=True):  # GET route(ref) and return the body text; `headers` are extra request headers
     try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
-        r = requests.get(route(ref), headers=headers, timeout=5)
+        headers = dict(headers or {})  # work on a copy: never mutate the caller's dict or a shared default
+        if use_googlebot:
+            headers.update({'User-Agent': GOOGLEBOT_USER_AGENT, 'X-Forwarded-For': GOOGLEBOT_IP})
+        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.text
@@ -21,10 +24,12 @@ def xml(route, ref=None):
        logging.error('Problem hitting URL: {}'.format(str(e)))
        return False

-def json(route, ref=None):
+def json(route, ref=None, headers=None, use_googlebot=True):  # GET route(ref) and return the decoded JSON; `headers` are extra request headers
     try:
-        headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
-        r = requests.get(route(ref), headers=headers, timeout=5)
+        headers = dict(headers or {})  # work on a copy: never mutate the caller's dict or a shared default
+        if use_googlebot:
+            headers.update({'User-Agent': GOOGLEBOT_USER_AGENT, 'X-Forwarded-For': GOOGLEBOT_IP})
+        r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
         if r.status_code != 200:
             raise Exception('Bad response code ' + str(r.status_code))
         return r.json()
--- a/apiserver/misc/icons.py
+++ b/apiserver/misc/icons.py
@@ -0,0 +1,14 @@
+from bs4 import BeautifulSoup
+
+def get_icons(markup, url=''):
+    """Return de-duplicated icon URLs found in `markup`, best candidates first, resolved against `url` when given."""
+    from urllib.parse import urljoin  # function-scope import keeps this change self-contained
+    soup = BeautifulSoup(markup, features='html.parser')
+    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+    favicon = soup.find_all('link', rel="shortcut icon", href=True)
+    others = soup.find_all('link', rel="icon", href=True)
+    icons = icon32 + icon16 + favicon + others
+    # dict.fromkeys drops duplicates but keeps the priority order above; set() would scramble it
+    hrefs = list(dict.fromkeys(i.get('href') for i in icons))
+    return [urljoin(url, h) for h in hrefs]
--- a/apiserver/misc/metadata.py
+++ b/apiserver/misc/metadata.py
@@ -1,4 +1,19 @@

+from bs4 import BeautifulSoup
+
+def get_icons(markup, url):
+    """Return de-duplicated absolute icon URLs found in `markup`, best candidates first, resolved against `url`."""
+    from urllib.parse import urljoin  # function-scope import keeps this change self-contained
+    soup = BeautifulSoup(markup, features='html.parser')
+    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
+    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
+    favicon = soup.find_all('link', rel="shortcut icon", href=True)
+    others = soup.find_all('link', rel="icon", href=True)
+    icons = icon32 + icon16 + favicon + others
+    # dict.fromkeys drops duplicates but keeps the priority order above; set() would scramble it
+    hrefs = list(dict.fromkeys(i.get('href') for i in icons))
+    return [urljoin(url, h) for h in hrefs]
+
 def parse_extruct(s, data):
    rdfa_keys = {
        'title': [
--- a/apiserver/misc/news.py
+++ b/apiserver/misc/news.py
@@ -11,7 +11,7 @@ import extruct

 import settings
 from utils import clean
-from misc.metadata import parse_extruct
+from misc.metadata import parse_extruct, get_icons
 from misc.time import unix
 from misc.api import xml
 import misc.stuff as stuff
@@ -69,16 +69,7 @@ class Base:
        s['url'] = urlref
        s['date'] = 0

-        soup = BeautifulSoup(markup, features='html.parser')
-        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
-        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
-        favicon = soup.find_all('link', rel="shortcut icon", href=True)
-        others = soup.find_all('link', rel="icon", href=True)
-        icons = icon32 + icon16 + favicon + others
-        base_url = '/'.join(urlref.split('/')[:3])
-        icons = list(set([i.get('href') for i in icons]))
-        icons = [i if i.startswith('http') else base_url + i for i in icons]
-
+        icons = get_icons(markup, url=urlref)
        if icons:
            s['icon'] = icons[0]