From ebcbf1b62456e960e490ec43143f2724ecccc3c1 Mon Sep 17 00:00:00 2001
From: Tanner Collin <git@tannercollin.com>
Date: Sun, 1 Dec 2019 22:18:41 +0000
Subject: [PATCH] Sanitize html

---
 apiserver/feeds/hackernews.py | 10 ++++++++--
 apiserver/feeds/reddit.py     | 14 +++++++-------
 apiserver/feeds/tildes.py     | 10 ++++++++--
 apiserver/utils.py            | 27 +++++++++++++++++++++++++++
 4 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/apiserver/feeds/hackernews.py b/apiserver/feeds/hackernews.py
index 2670886..3cdc752 100644
--- a/apiserver/feeds/hackernews.py
+++ b/apiserver/feeds/hackernews.py
@@ -3,8 +3,14 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
 
+if __name__ == '__main__':  # only when this feed module is executed directly as a script
+    import sys
+    sys.path.insert(0,'.')  # search the working directory first; presumably so `from utils import clean` below resolves when run from apiserver/ -- confirm
+
 import requests
 
+from utils import clean
+
 API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
 API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
 
@@ -34,7 +40,7 @@ def comment(i):
     c['author'] = i.get('author', '')
     c['score'] = i.get('points', 0)
     c['date'] = i.get('created_at_i', 0)
-    c['text'] = i.get('text', '')
+    c['text'] = clean(i.get('text', '') or '')
     c['comments'] = [comment(j) for j in i['children']]
     c['comments'] = list(filter(bool, c['comments']))
     return c
@@ -65,7 +71,7 @@ def story(ref):
     s['num_comments'] = comment_count(s) - 1
 
     if 'text' in r and r['text']:
-        s['text'] = r['text']
+        s['text'] = clean(r['text'] or '')
 
     return s
 
diff --git a/apiserver/feeds/reddit.py b/apiserver/feeds/reddit.py
index eb05f0c..7162ab0 100644
--- a/apiserver/feeds/reddit.py
+++ b/apiserver/feeds/reddit.py
@@ -12,7 +12,7 @@ from praw.exceptions import PRAWException
 from praw.models import MoreComments
 from prawcore.exceptions import PrawcoreException
 
-from utils import render_md
+from utils import render_md, clean
 
 SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
 
@@ -45,7 +45,7 @@ def comment(i):
     c['author'] = i.author.name if i.author else '[Deleted]'
     c['score'] = i.score
     c['date'] = i.created_utc
-    c['text'] = render_md(i.body)
+    c['text'] = clean(render_md(i.body))  # sanitize the rendered HTML, not the Markdown source (Markdown can still produce unsafe links)
     c['comments'] = [comment(j) for j in i.replies]
     c['comments'] = list(filter(bool, c['comments']))
     return c
@@ -68,7 +68,7 @@ def story(ref):
         s['num_comments'] = r.num_comments
 
         if r.selftext:
-            s['text'] = render_md(r.selftext)
+            s['text'] = clean(render_md(r.selftext))  # sanitize the rendered HTML, not the Markdown source
 
         return s
 
@@ -83,7 +83,7 @@ def story(ref):
 
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print(feed())
-    print(reddit.submission(feed()[0]).permalink)
-    print()
-    print(story('cuozg4'))
+    #print(feed())
+    #print(reddit.submission(feed()[0]).permalink)
+    #print()
+    print(story('e4asnp'))
diff --git a/apiserver/feeds/tildes.py b/apiserver/feeds/tildes.py
index cab4395..db5c028 100644
--- a/apiserver/feeds/tildes.py
+++ b/apiserver/feeds/tildes.py
@@ -3,10 +3,16 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
 
+if __name__ == '__main__':  # only when this feed module is executed directly as a script
+    import sys
+    sys.path.insert(0,'.')  # search the working directory first; presumably so `from utils import clean` below resolves when run from apiserver/ -- confirm
+
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 
+from utils import clean
+
 # cache the topic groups to prevent redirects
 group_lookup = {}
 
@@ -52,7 +58,7 @@ def comment(i):
     c['author'] = str(lu.string if lu else 'unknown user')
     c['score'] = 1
     c['date'] = unix(i.find('time')['datetime'])
-    c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
+    c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '')
     ct = i.find('ol', class_='comment-tree')
     c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
     c['comments'] = list(filter(bool, c['comments']))
@@ -99,7 +105,7 @@ def story(ref):
 
     td = a.find('div', class_='topic-full-text')
     if td:
-        s['text'] = td.encode_contents().decode()
+        s['text'] = clean(td.encode_contents().decode() or '')
 
     return s
 
diff --git a/apiserver/utils.py b/apiserver/utils.py
index ac1cf59..5b3cc38 100644
--- a/apiserver/utils.py
+++ b/apiserver/utils.py
@@ -7,6 +7,8 @@ import commonmark
 import random
 import string
 
+from bleach.sanitizer import Cleaner
+
 def gen_rand_id():
     return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
 
@@ -15,3 +17,28 @@ def render_md(md):
         return commonmark.commonmark(md)
     else:
         return ''
+
+# Allowlist handed to bleach's Cleaner below.  Only `tags` is overridden, so
+# attribute and protocol filtering follow bleach's defaults; per the bleach
+# docs, tags outside this list are escaped rather than removed, so a missing
+# tag shows up to readers as literal "<tag>" text.
+ALLOWED_TAGS = [
+        # links and annotations
+        'a', 'abbr', 'acronym',
+        # inline emphasis
+        'b', 'em', 'i', 'ins', 'small', 'strong', 'sub', 'sup',
+        # code; 'pre' is what Markdown renderers wrap code blocks in
+        'code', 'pre',
+        # paragraphs, hard line breaks, rules and quotes
+        'blockquote', 'br', 'hr', 'p',
+        # lists
+        'li', 'ol', 'ul',
+        # disclosure widgets
+        'details', 'summary',
+        ]
+
+# One module-level sanitizer, shared by the feed modules through
+# `from utils import clean`: clean(text) returns the sanitized markup.
+# NOTE(review): the bleach docs warn that a Cleaner is not thread-safe;
+# give each thread its own instance if feeds are ever fetched concurrently.
+clean = Cleaner(tags=ALLOWED_TAGS).clean