get favicons for custom substack publications.
This commit is contained in:
parent
3daae5fa1b
commit
fe01ea52e5
|
@ -11,6 +11,8 @@ import requests
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from misc.time import unix
|
from misc.time import unix
|
||||||
|
from misc.metadata import get_icons
|
||||||
|
from misc.api import xml, json
|
||||||
from utils import clean
|
from utils import clean
|
||||||
|
|
||||||
SUBSTACK_REFERER = 'https://substack.com'
|
SUBSTACK_REFERER = 'https://substack.com'
|
||||||
|
@ -23,29 +25,6 @@ def api_comments(post_id, base_url):
|
||||||
def api_stories(x, base_url):
|
def api_stories(x, base_url):
|
||||||
return f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100"
|
return f"{base_url}/api/v1/archive?sort=new&search=&offset=0&limit=100"
|
||||||
|
|
||||||
def api(route, ref=None, referer=None):
    """Fetch a Substack API route and return its decoded JSON body.

    The request is attempted twice: first with a 10 second timeout, then
    once more with a 20 second timeout if the first attempt fails.

    Args:
        route: Callable that takes ``ref`` and returns the URL to request.
        ref: Value passed to ``route`` to build the URL.
        referer: Optional value for the ``Referer`` request header; no
            custom headers are sent when it is falsy.

    Returns:
        The parsed JSON response, or ``False`` if both attempts failed
        (network error, non-200 status, or undecodable body).
    """
    headers = {'Referer': referer} if referer else None
    # Short timeout first, a more patient one for the single retry.
    timeouts = (10, 20)

    for attempt, timeout in enumerate(timeouts, start=1):
        try:
            r = requests.get(route(ref), headers=headers, timeout=timeout)
            if r.status_code != 200:
                raise Exception('Bad response code ' + str(r.status_code))
            return r.json()
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit propagate instead of being logged and retried.
            suffix = ', trying again' if attempt < len(timeouts) else ''
            logging.error('Problem hitting Substack API: {}{}'.format(str(e), suffix))

    return False
|
|
||||||
|
|
||||||
def comment(i):
|
def comment(i):
|
||||||
if 'body' not in i:
|
if 'body' not in i:
|
||||||
return False
|
return False
|
||||||
|
@ -71,14 +50,14 @@ class Publication:
|
||||||
return ref.replace(f"{self.BASE_DOMAIN}/#id:", '')
|
return ref.replace(f"{self.BASE_DOMAIN}/#id:", '')
|
||||||
|
|
||||||
def feed(self):
|
def feed(self):
|
||||||
stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
|
stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
|
||||||
if not stories: return []
|
if not stories: return []
|
||||||
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
||||||
return [self.ref_prefix(str(i.get("id"))) for i in stories or []]
|
return [self.ref_prefix(str(i.get("id"))) for i in stories or []]
|
||||||
|
|
||||||
def story(self, ref):
|
def story(self, ref):
|
||||||
ref = self.strip_ref_prefix(ref)
|
ref = self.strip_ref_prefix(ref)
|
||||||
stories = api(lambda x: api_stories(x, self.BASE_DOMAIN), referer=self.BASE_DOMAIN)
|
stories = json(lambda x: api_stories(x, self.BASE_DOMAIN), headers={'Referer': self.BASE_DOMAIN})
|
||||||
if not stories: return False
|
if not stories: return False
|
||||||
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
||||||
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
|
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
|
||||||
|
@ -99,7 +78,7 @@ class Publication:
|
||||||
s['title'] = r.get('title', '')
|
s['title'] = r.get('title', '')
|
||||||
s['link'] = r.get('canonical_url', '')
|
s['link'] = r.get('canonical_url', '')
|
||||||
s['url'] = r.get('canonical_url', '')
|
s['url'] = r.get('canonical_url', '')
|
||||||
comments = api(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), referer=self.BASE_DOMAIN)
|
comments = json(lambda x: api_comments(x, self.BASE_DOMAIN), r.get('id'), headers={'Referer': self.BASE_DOMAIN})
|
||||||
s['comments'] = [comment(i) for i in comments.get('comments')]
|
s['comments'] = [comment(i) for i in comments.get('comments')]
|
||||||
s['comments'] = list(filter(bool, s['comments']))
|
s['comments'] = list(filter(bool, s['comments']))
|
||||||
s['num_comments'] = r.get('comment_count', 0)
|
s['num_comments'] = r.get('comment_count', 0)
|
||||||
|
@ -109,6 +88,12 @@ class Publication:
|
||||||
s['author'] = authors[0].get('name')
|
s['author'] = authors[0].get('name')
|
||||||
s['author_link'] = authors[0].get('link')
|
s['author_link'] = authors[0].get('link')
|
||||||
|
|
||||||
|
markup = xml(lambda x: s['link'])
|
||||||
|
if markup:
|
||||||
|
icons = get_icons(markup, url=s['link'])
|
||||||
|
if icons:
|
||||||
|
s['icon'] = icons[0]
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
def _bylines(self, b):
|
def _bylines(self, b):
|
||||||
|
@ -131,7 +116,7 @@ class Top:
|
||||||
return ref
|
return ref
|
||||||
|
|
||||||
def feed(self):
|
def feed(self):
|
||||||
stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
|
stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
|
||||||
if not stories: return []
|
if not stories: return []
|
||||||
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
||||||
stories = [dict(id=i.get('id'), base_url=i.get("pub", { 'base_url': '' }).get("base_url")) for i in stories or []]
|
stories = [dict(id=i.get('id'), base_url=i.get("pub", { 'base_url': '' }).get("base_url")) for i in stories or []]
|
||||||
|
@ -139,7 +124,7 @@ class Top:
|
||||||
|
|
||||||
def story(self, ref):
|
def story(self, ref):
|
||||||
ref = self.strip_ref_prefix(ref)
|
ref = self.strip_ref_prefix(ref)
|
||||||
stories = api(SUBSTACK_API_TOP_POSTS, referer=SUBSTACK_REFERER)
|
stories = json(SUBSTACK_API_TOP_POSTS, headers={'Referer': SUBSTACK_REFERER})
|
||||||
if not stories: return False
|
if not stories: return False
|
||||||
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
stories = list(filter(None, [i if i.get("audience") == "everyone" else None for i in stories]))
|
||||||
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
|
stories = list(filter(None, [i if str(i.get('id')) == ref else None for i in stories]))
|
||||||
|
@ -162,7 +147,7 @@ class Top:
|
||||||
s['title'] = r.get('title', '')
|
s['title'] = r.get('title', '')
|
||||||
s['link'] = r.get('canonical_url', '')
|
s['link'] = r.get('canonical_url', '')
|
||||||
s['url'] = r.get('canonical_url', '')
|
s['url'] = r.get('canonical_url', '')
|
||||||
comments = api(lambda x: api_comments(x, base_url), r.get('id'), referer=SUBSTACK_REFERER)
|
comments = json(lambda x: api_comments(x, base_url), r.get('id'), headers={'Referer': SUBSTACK_REFERER})
|
||||||
s['comments'] = [comment(i) for i in comments.get('comments')]
|
s['comments'] = [comment(i) for i in comments.get('comments')]
|
||||||
s['comments'] = list(filter(bool, s['comments']))
|
s['comments'] = list(filter(bool, s['comments']))
|
||||||
s['num_comments'] = r.get('comment_count', 0)
|
s['num_comments'] = r.get('comment_count', 0)
|
||||||
|
|
|
@ -5,13 +5,16 @@ logging.basicConfig(
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||||
FORWARD_IP = '66.249.66.1'
|
GOOGLEBOT_IP = '66.249.66.1'
|
||||||
|
TIMEOUT = 30
|
||||||
|
|
||||||
def xml(route, ref=None):
|
def xml(route, ref=None, headers=dict(), use_googlebot=True):
|
||||||
try:
|
try:
|
||||||
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
|
if use_googlebot:
|
||||||
r = requests.get(route(ref), headers=headers, timeout=5)
|
headers['User-Agent'] = GOOGLEBOT_USER_AGENT
|
||||||
|
headers['X-Forwarded-For'] = GOOGLEBOT_IP
|
||||||
|
r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise Exception('Bad response code ' + str(r.status_code))
|
raise Exception('Bad response code ' + str(r.status_code))
|
||||||
return r.text
|
return r.text
|
||||||
|
@ -21,10 +24,12 @@ def xml(route, ref=None):
|
||||||
logging.error('Problem hitting URL: {}'.format(str(e)))
|
logging.error('Problem hitting URL: {}'.format(str(e)))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def json(route, ref=None):
|
def json(route, ref=None, headers=dict(), use_googlebot=True):
|
||||||
try:
|
try:
|
||||||
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
|
if use_googlebot:
|
||||||
r = requests.get(route(ref), headers=headers, timeout=5)
|
headers['User-Agent'] = GOOGLEBOT_USER_AGENT
|
||||||
|
headers['X-Forwarded-For'] = GOOGLEBOT_IP
|
||||||
|
r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise Exception('Bad response code ' + str(r.status_code))
|
raise Exception('Bad response code ' + str(r.status_code))
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
14
apiserver/misc/icons.py
Normal file
14
apiserver/misc/icons.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
def get_icons(markup, url=''):
    """Return the favicon URLs declared in an HTML document, best match first.

    Candidate ``<link>`` tags are collected in priority order: 32x32 icons,
    16x16 icons, ``rel="shortcut icon"`` links, then any other ``rel="icon"``
    link. Duplicates are removed without disturbing that order, so the
    first element is the preferred icon.

    Args:
        markup: HTML source of the page.
        url: URL of the page the markup came from; relative hrefs are
            resolved against it. When left empty, hrefs are returned as
            written in the document.

    Returns:
        A list of icon URLs, possibly empty.
    """
    # Function-scope import: urljoin is stdlib and only needed here.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    candidates = (
        soup.find_all('link', rel="icon", href=True, sizes="32x32")
        + soup.find_all('link', rel="icon", href=True, sizes="16x16")
        + soup.find_all('link', rel="shortcut icon", href=True)
        + soup.find_all('link', rel="icon", href=True)
    )

    # urljoin handles absolute, root-relative, path-relative and
    # protocol-relative ("//cdn...") hrefs; empty hrefs are skipped.
    resolved = [urljoin(url, tag.get('href')) for tag in candidates if tag.get('href')]

    # dict.fromkeys de-duplicates while keeping the priority order above
    # (a set would return the icons in arbitrary order).
    return list(dict.fromkeys(resolved))
|
|
@ -1,4 +1,19 @@
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
def get_icons(markup, url):
    """Return the favicon URLs declared in an HTML document, best match first.

    Candidate ``<link>`` tags are collected in priority order: 32x32 icons,
    16x16 icons, ``rel="shortcut icon"`` links, then any other ``rel="icon"``
    link. Duplicates are removed without disturbing that order, so the
    first element is the preferred icon.

    Args:
        markup: HTML source of the page.
        url: URL of the page the markup came from; relative hrefs are
            resolved against it.

    Returns:
        A list of icon URLs, possibly empty.
    """
    # Function-scope import: urljoin is stdlib and only needed here.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    candidates = (
        soup.find_all('link', rel="icon", href=True, sizes="32x32")
        + soup.find_all('link', rel="icon", href=True, sizes="16x16")
        + soup.find_all('link', rel="shortcut icon", href=True)
        + soup.find_all('link', rel="icon", href=True)
    )

    # urljoin handles absolute, root-relative, path-relative and
    # protocol-relative ("//cdn...") hrefs; plain string concatenation with
    # the origin produced broken URLs for the latter two. Empty hrefs are
    # skipped.
    resolved = [urljoin(url, tag.get('href')) for tag in candidates if tag.get('href')]

    # dict.fromkeys de-duplicates while keeping the priority order above
    # (a set would return the icons in arbitrary order).
    return list(dict.fromkeys(resolved))
|
||||||
|
|
||||||
def parse_extruct(s, data):
|
def parse_extruct(s, data):
|
||||||
rdfa_keys = {
|
rdfa_keys = {
|
||||||
'title': [
|
'title': [
|
||||||
|
|
|
@ -11,7 +11,7 @@ import extruct
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
from utils import clean
|
from utils import clean
|
||||||
from misc.metadata import parse_extruct
|
from misc.metadata import parse_extruct, get_icons
|
||||||
from misc.time import unix
|
from misc.time import unix
|
||||||
from misc.api import xml
|
from misc.api import xml
|
||||||
import misc.stuff as stuff
|
import misc.stuff as stuff
|
||||||
|
@ -69,16 +69,7 @@ class Base:
|
||||||
s['url'] = urlref
|
s['url'] = urlref
|
||||||
s['date'] = 0
|
s['date'] = 0
|
||||||
|
|
||||||
soup = BeautifulSoup(markup, features='html.parser')
|
icons = get_icons(markup, url=urlref)
|
||||||
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
|
|
||||||
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
|
|
||||||
favicon = soup.find_all('link', rel="shortcut icon", href=True)
|
|
||||||
others = soup.find_all('link', rel="icon", href=True)
|
|
||||||
icons = icon32 + icon16 + favicon + others
|
|
||||||
base_url = '/'.join(urlref.split('/')[:3])
|
|
||||||
icons = list(set([i.get('href') for i in icons]))
|
|
||||||
icons = [i if i.startswith('http') else base_url + i for i in icons]
|
|
||||||
|
|
||||||
if icons:
|
if icons:
|
||||||
s['icon'] = icons[0]
|
s['icon'] = icons[0]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user