Sanitize html

Tanner Collin 2019-12-01 22:18:41 +00:00
parent e231cd5c31
commit ebcbf1b624
4 changed files with 50 additions and 11 deletions

View File

@@ -3,8 +3,14 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
 import requests
+from utils import clean
 API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
 API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
@@ -34,7 +40,7 @@ def comment(i):
     c['author'] = i.get('author', '')
     c['score'] = i.get('points', 0)
     c['date'] = i.get('created_at_i', 0)
-    c['text'] = i.get('text', '')
+    c['text'] = clean(i.get('text', '') or '')
     c['comments'] = [comment(j) for j in i['children']]
     c['comments'] = list(filter(bool, c['comments']))
     return c
@@ -65,7 +71,7 @@ def story(ref):
     s['num_comments'] = comment_count(s) - 1
     if 'text' in r and r['text']:
-        s['text'] = r['text']
+        s['text'] = clean(r['text'] or '')
     return s

View File

@@ -12,7 +12,7 @@ from praw.exceptions import PRAWException
 from praw.models import MoreComments
 from prawcore.exceptions import PrawcoreException
-from utils import render_md
+from utils import render_md, clean
 SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
@@ -45,7 +45,7 @@ def comment(i):
     c['author'] = i.author.name if i.author else '[Deleted]'
     c['score'] = i.score
     c['date'] = i.created_utc
-    c['text'] = render_md(i.body)
+    c['text'] = render_md(clean(i.body))
     c['comments'] = [comment(j) for j in i.replies]
     c['comments'] = list(filter(bool, c['comments']))
     return c
@@ -68,7 +68,7 @@ def story(ref):
     s['num_comments'] = r.num_comments
     if r.selftext:
-        s['text'] = render_md(r.selftext)
+        s['text'] = render_md(clean(r.selftext))
     return s
@@ -83,7 +83,7 @@ def story(ref):
 # scratchpad so I can quickly develop the parser
 if __name__ == '__main__':
-    print(feed())
-    print(reddit.submission(feed()[0]).permalink)
-    print()
-    print(story('cuozg4'))
+    #print(feed())
+    #print(reddit.submission(feed()[0]).permalink)
+    #print()
+    print(story('e4asnp'))

View File

@@ -3,10 +3,16 @@ logging.basicConfig(
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         level=logging.DEBUG)
+if __name__ == '__main__':
+    import sys
+    sys.path.insert(0,'.')
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
+from utils import clean
 # cache the topic groups to prevent redirects
 group_lookup = {}
@@ -52,7 +58,7 @@ def comment(i):
     c['author'] = str(lu.string if lu else 'unknown user')
     c['score'] = 1
     c['date'] = unix(i.find('time')['datetime'])
-    c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
+    c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '')
     ct = i.find('ol', class_='comment-tree')
     c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
     c['comments'] = list(filter(bool, c['comments']))
@@ -99,7 +105,7 @@ def story(ref):
     td = a.find('div', class_='topic-full-text')
     if td:
-        s['text'] = td.encode_contents().decode()
+        s['text'] = clean(td.encode_contents().decode() or '')
     return s

View File

@@ -7,6 +7,8 @@ import commonmark
 import random
 import string
+from bleach.sanitizer import Cleaner
 def gen_rand_id():
     return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
@@ -15,3 +17,28 @@ def render_md(md):
         return commonmark.commonmark(md)
     else:
         return ''
+
+ALLOWED_TAGS = [
+    'a',
+    'abbr',
+    'acronym',
+    'b',
+    'blockquote',
+    'code',
+    'em',
+    'i',
+    'li',
+    'ol',
+    'strong',
+    'ul',
+    'p',
+    'hr',
+    'small',
+    'ins',
+    'sup',
+    'sub',
+    'details',
+    'summary',
+]
+clean = Cleaner(tags=ALLOWED_TAGS).clean
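
For reference, a minimal sketch of what the new clean() helper does to untrusted markup. This is not part of the commit: the shortened tag list and the sample HTML below are made up for illustration, and it assumes the bleach 3.x API that was current at the time.

from bleach.sanitizer import Cleaner

# Illustration only: a Cleaner like the one defined in utils.py, with a shortened tag list.
clean = Cleaner(tags=['a', 'b', 'code', 'p']).clean

# Disallowed tags are escaped (bleach's default strip=False), and attributes outside
# bleach's default whitelist, such as onclick, are dropped.
dirty = '<p onclick="steal()">hi <script>alert(1)</script> <b>bold</b></p>'
print(clean(dirty))
# <p>hi &lt;script&gt;alert(1)&lt;/script&gt; <b>bold</b></p>

Because strip defaults to False, markup outside ALLOWED_TAGS survives as visible escaped text rather than silently disappearing, while scripts and event handlers are neutralized.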