forked from tanner/qotnews
Get favicons for custom Substack publications.
This commit is contained in:
@@ -5,13 +5,16 @@ logging.basicConfig(
|
||||
|
||||
import requests
|
||||
|
||||
USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||
FORWARD_IP = '66.249.66.1'
|
||||
GOOGLEBOT_USER_AGENT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||
GOOGLEBOT_IP = '66.249.66.1'
|
||||
TIMEOUT = 30
|
||||
|
||||
def xml(route, ref=None):
|
||||
def xml(route, ref=None, headers=dict(), use_googlebot=True):
|
||||
try:
|
||||
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
|
||||
r = requests.get(route(ref), headers=headers, timeout=5)
|
||||
if use_googlebot:
|
||||
headers['User-Agent'] = GOOGLEBOT_USER_AGENT
|
||||
headers['X-Forwarded-For'] = GOOGLEBOT_IP
|
||||
r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
|
||||
if r.status_code != 200:
|
||||
raise Exception('Bad response code ' + str(r.status_code))
|
||||
return r.text
|
||||
@@ -21,10 +24,12 @@ def xml(route, ref=None):
|
||||
logging.error('Problem hitting URL: {}'.format(str(e)))
|
||||
return False
|
||||
|
||||
def json(route, ref=None):
|
||||
def json(route, ref=None, headers=dict(), use_googlebot=True):
|
||||
try:
|
||||
headers = {'User-Agent': USER_AGENT, 'X-Forwarded-For': FORWARD_IP}
|
||||
r = requests.get(route(ref), headers=headers, timeout=5)
|
||||
if use_googlebot:
|
||||
headers['User-Agent'] = GOOGLEBOT_USER_AGENT
|
||||
headers['X-Forwarded-For'] = GOOGLEBOT_IP
|
||||
r = requests.get(route(ref), headers=headers, timeout=TIMEOUT)
|
||||
if r.status_code != 200:
|
||||
raise Exception('Bad response code ' + str(r.status_code))
|
||||
return r.json()
|
||||
|
14
apiserver/misc/icons.py
Normal file
14
apiserver/misc/icons.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def get_icons(markup, url=''):
    """Return candidate favicon URLs found in an HTML document.

    Args:
        markup: HTML source of the page to scan for ``<link>`` icon tags.
        url: URL the markup was fetched from; relative icon hrefs are
            resolved against it. Defaults to ``''``, in which case relative
            hrefs are returned unresolved.

    Returns:
        A list of unique icon URLs ordered by preference: 32x32 icons,
        then 16x16 icons, then "shortcut icon" links, then any other
        ``rel="icon"`` link. The list is empty when no icon link is found.
    """
    # Local import keeps this block self-contained; the module itself only
    # imports BeautifulSoup.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
    favicon = soup.find_all('link', rel="shortcut icon", href=True)
    others = soup.find_all('link', rel="icon", href=True)
    candidates = icon32 + icon16 + favicon + others

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # preference order built above survives (a set would scramble it).
    # href=True only requires the attribute to exist, so empty values are
    # dropped here.
    hrefs = dict.fromkeys(
        tag.get('href') for tag in candidates if tag.get('href')
    )

    # urljoin leaves absolute URLs untouched and correctly resolves
    # root-relative ("/x.png"), path-relative ("x.png") and
    # protocol-relative ("//cdn/x.png") references.
    return [urljoin(url, href) for href in hrefs]
|
@@ -1,4 +1,19 @@
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def get_icons(markup, url):
    """Return candidate favicon URLs found in an HTML document.

    Args:
        markup: HTML source of the page to scan for ``<link>`` icon tags.
        url: URL the markup was fetched from; relative icon hrefs are
            resolved against it.

    Returns:
        A list of unique icon URLs ordered by preference: 32x32 icons,
        then 16x16 icons, then "shortcut icon" links, then any other
        ``rel="icon"`` link. Callers that take the first element therefore
        get the best available icon. The list is empty when no icon link
        is found.
    """
    # Local import keeps this block self-contained; the module-level
    # imports visible here only provide BeautifulSoup.
    from urllib.parse import urljoin

    soup = BeautifulSoup(markup, features='html.parser')
    icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
    icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
    favicon = soup.find_all('link', rel="shortcut icon", href=True)
    others = soup.find_all('link', rel="icon", href=True)
    candidates = icon32 + icon16 + favicon + others

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # preference order built above survives (a set would scramble it).
    # href=True only requires the attribute to exist, so empty values are
    # dropped here.
    hrefs = dict.fromkeys(
        tag.get('href') for tag in candidates if tag.get('href')
    )

    # urljoin leaves absolute URLs untouched and correctly resolves
    # root-relative ("/x.png"), path-relative ("x.png") and
    # protocol-relative ("//cdn/x.png") references.
    return [urljoin(url, href) for href in hrefs]
|
||||
|
||||
def parse_extruct(s, data):
|
||||
rdfa_keys = {
|
||||
'title': [
|
||||
|
@@ -11,7 +11,7 @@ import extruct
|
||||
|
||||
import settings
|
||||
from utils import clean
|
||||
from misc.metadata import parse_extruct
|
||||
from misc.metadata import parse_extruct, get_icons
|
||||
from misc.time import unix
|
||||
from misc.api import xml
|
||||
import misc.stuff as stuff
|
||||
@@ -69,16 +69,7 @@ class Base:
|
||||
s['url'] = urlref
|
||||
s['date'] = 0
|
||||
|
||||
soup = BeautifulSoup(markup, features='html.parser')
|
||||
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
|
||||
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
|
||||
favicon = soup.find_all('link', rel="shortcut icon", href=True)
|
||||
others = soup.find_all('link', rel="icon", href=True)
|
||||
icons = icon32 + icon16 + favicon + others
|
||||
base_url = '/'.join(urlref.split('/')[:3])
|
||||
icons = list(set([i.get('href') for i in icons]))
|
||||
icons = [i if i.startswith('http') else base_url + i for i in icons]
|
||||
|
||||
icons = get_icons(markup, url=urlref)
|
||||
if icons:
|
||||
s['icon'] = icons[0]
|
||||
|
||||
|
Reference in New Issue
Block a user