From ebcbf1b62456e960e490ec43143f2724ecccc3c1 Mon Sep 17 00:00:00 2001 From: Tanner Collin Date: Sun, 1 Dec 2019 22:18:41 +0000 Subject: [PATCH] Sanitize html --- apiserver/feeds/hackernews.py | 10 ++++++++-- apiserver/feeds/reddit.py | 14 +++++++------- apiserver/feeds/tildes.py | 10 ++++++++-- apiserver/utils.py | 27 +++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 11 deletions(-) diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py index 2670886..3cdc752 100644 --- a/apiserver/feeds/hackernews.py +++ b/apiserver/feeds/hackernews.py @@ -3,8 +3,14 @@ logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.DEBUG) +if __name__ == '__main__': + import sys + sys.path.insert(0,'.') + import requests +from utils import clean + API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json' API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x) @@ -34,7 +40,7 @@ def comment(i): c['author'] = i.get('author', '') c['score'] = i.get('points', 0) c['date'] = i.get('created_at_i', 0) - c['text'] = i.get('text', '') + c['text'] = clean(i.get('text', '') or '') c['comments'] = [comment(j) for j in i['children']] c['comments'] = list(filter(bool, c['comments'])) return c @@ -65,7 +71,7 @@ def story(ref): s['num_comments'] = comment_count(s) - 1 if 'text' in r and r['text']: - s['text'] = r['text'] + s['text'] = clean(r['text'] or '') return s diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py index eb05f0c..7162ab0 100644 --- a/apiserver/feeds/reddit.py +++ b/apiserver/feeds/reddit.py @@ -12,7 +12,7 @@ from praw.exceptions import PRAWException from praw.models import MoreComments from prawcore.exceptions import PrawcoreException -from utils import render_md +from utils import render_md, clean SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy' @@ -45,7 +45,7 @@ def comment(i): c['author'] = i.author.name if i.author else '[Deleted]' c['score'] = i.score c['date'] = i.created_utc - c['text'] = render_md(i.body) + c['text'] = render_md(clean(i.body)) c['comments'] = [comment(j) for j in i.replies] c['comments'] = list(filter(bool, c['comments'])) return c @@ -68,7 +68,7 @@ def story(ref): s['num_comments'] = r.num_comments if r.selftext: - s['text'] = render_md(r.selftext) + s['text'] = render_md(clean(r.selftext)) return s @@ -83,7 +83,7 @@ def story(ref): # scratchpad so I can quickly develop the parser if __name__ == '__main__': - print(feed()) - print(reddit.submission(feed()[0]).permalink) - print() - print(story('cuozg4')) + #print(feed()) + #print(reddit.submission(feed()[0]).permalink) + #print() + print(story('e4asnp')) diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py index cab4395..db5c028 100644 --- a/apiserver/feeds/tildes.py +++ b/apiserver/feeds/tildes.py @@ -3,10 +3,16 @@ logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.DEBUG) +if __name__ == '__main__': + import sys + sys.path.insert(0,'.') + import requests from bs4 import BeautifulSoup from datetime import datetime +from utils import clean + # cache the topic groups to prevent redirects group_lookup = {} @@ -52,7 +58,7 @@ def comment(i): c['author'] = str(lu.string if lu else 'unknown user') c['score'] = 1 c['date'] = unix(i.find('time')['datetime']) - c['text'] = i.find('div', class_='comment-text').encode_contents().decode() + c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '') ct = i.find('ol', class_='comment-tree') c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else [] c['comments'] = list(filter(bool, c['comments'])) @@ -99,7 +105,7 @@ def story(ref): td = a.find('div', class_='topic-full-text') if td: - s['text'] = td.encode_contents().decode() + s['text'] = clean(td.encode_contents().decode() or '') return s diff --git a/apiserver/utils.py b/apiserver/utils.py index ac1cf59..5b3cc38 100644 --- a/apiserver/utils.py +++ b/apiserver/utils.py @@ -7,6 +7,8 @@ import commonmark import random import string +from bleach.sanitizer import Cleaner + def gen_rand_id(): return ''.join(random.choice(string.ascii_uppercase) for _ in range(4)) @@ -15,3 +17,28 @@ def render_md(md): return commonmark.commonmark(md) else: return '' + +ALLOWED_TAGS = [ + 'a', + 'abbr', + 'acronym', + 'b', + 'blockquote', + 'code', + 'em', + 'i', + 'li', + 'ol', + 'strong', + 'ul', + 'p', + 'hr', + 'small', + 'ins', + 'sup', + 'sub', + 'details', + 'summary', + ] + +clean = Cleaner(tags=ALLOWED_TAGS).clean