Sanitize HTML
This commit is contained in:
parent
e231cd5c31
commit
ebcbf1b624
|
@ -3,8 +3,14 @@ logging.basicConfig(
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0,'.')
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from utils import clean
|
||||||
|
|
||||||
API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
|
API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
|
||||||
API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
|
API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
|
||||||
|
|
||||||
|
@ -34,7 +40,7 @@ def comment(i):
|
||||||
c['author'] = i.get('author', '')
|
c['author'] = i.get('author', '')
|
||||||
c['score'] = i.get('points', 0)
|
c['score'] = i.get('points', 0)
|
||||||
c['date'] = i.get('created_at_i', 0)
|
c['date'] = i.get('created_at_i', 0)
|
||||||
c['text'] = i.get('text', '')
|
c['text'] = clean(i.get('text', '') or '')
|
||||||
c['comments'] = [comment(j) for j in i['children']]
|
c['comments'] = [comment(j) for j in i['children']]
|
||||||
c['comments'] = list(filter(bool, c['comments']))
|
c['comments'] = list(filter(bool, c['comments']))
|
||||||
return c
|
return c
|
||||||
|
@ -65,7 +71,7 @@ def story(ref):
|
||||||
s['num_comments'] = comment_count(s) - 1
|
s['num_comments'] = comment_count(s) - 1
|
||||||
|
|
||||||
if 'text' in r and r['text']:
|
if 'text' in r and r['text']:
|
||||||
s['text'] = r['text']
|
s['text'] = clean(r['text'] or '')
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ from praw.exceptions import PRAWException
|
||||||
from praw.models import MoreComments
|
from praw.models import MoreComments
|
||||||
from prawcore.exceptions import PrawcoreException
|
from prawcore.exceptions import PrawcoreException
|
||||||
|
|
||||||
from utils import render_md
|
from utils import render_md, clean
|
||||||
|
|
||||||
SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
|
SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ def comment(i):
|
||||||
c['author'] = i.author.name if i.author else '[Deleted]'
|
c['author'] = i.author.name if i.author else '[Deleted]'
|
||||||
c['score'] = i.score
|
c['score'] = i.score
|
||||||
c['date'] = i.created_utc
|
c['date'] = i.created_utc
|
||||||
c['text'] = render_md(i.body)
|
c['text'] = render_md(clean(i.body))
|
||||||
c['comments'] = [comment(j) for j in i.replies]
|
c['comments'] = [comment(j) for j in i.replies]
|
||||||
c['comments'] = list(filter(bool, c['comments']))
|
c['comments'] = list(filter(bool, c['comments']))
|
||||||
return c
|
return c
|
||||||
|
@ -68,7 +68,7 @@ def story(ref):
|
||||||
s['num_comments'] = r.num_comments
|
s['num_comments'] = r.num_comments
|
||||||
|
|
||||||
if r.selftext:
|
if r.selftext:
|
||||||
s['text'] = render_md(r.selftext)
|
s['text'] = render_md(clean(r.selftext))
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
@ -83,7 +83,7 @@ def story(ref):
|
||||||
|
|
||||||
# scratchpad so I can quickly develop the parser
|
# scratchpad so I can quickly develop the parser
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(feed())
|
#print(feed())
|
||||||
print(reddit.submission(feed()[0]).permalink)
|
#print(reddit.submission(feed()[0]).permalink)
|
||||||
print()
|
#print()
|
||||||
print(story('cuozg4'))
|
print(story('e4asnp'))
|
||||||
|
|
|
@ -3,10 +3,16 @@ logging.basicConfig(
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0,'.')
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
from utils import clean
|
||||||
|
|
||||||
# cache the topic groups to prevent redirects
|
# cache the topic groups to prevent redirects
|
||||||
group_lookup = {}
|
group_lookup = {}
|
||||||
|
|
||||||
|
@ -52,7 +58,7 @@ def comment(i):
|
||||||
c['author'] = str(lu.string if lu else 'unknown user')
|
c['author'] = str(lu.string if lu else 'unknown user')
|
||||||
c['score'] = 1
|
c['score'] = 1
|
||||||
c['date'] = unix(i.find('time')['datetime'])
|
c['date'] = unix(i.find('time')['datetime'])
|
||||||
c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
|
c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '')
|
||||||
ct = i.find('ol', class_='comment-tree')
|
ct = i.find('ol', class_='comment-tree')
|
||||||
c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
|
c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
|
||||||
c['comments'] = list(filter(bool, c['comments']))
|
c['comments'] = list(filter(bool, c['comments']))
|
||||||
|
@ -99,7 +105,7 @@ def story(ref):
|
||||||
|
|
||||||
td = a.find('div', class_='topic-full-text')
|
td = a.find('div', class_='topic-full-text')
|
||||||
if td:
|
if td:
|
||||||
s['text'] = td.encode_contents().decode()
|
s['text'] = clean(td.encode_contents().decode() or '')
|
||||||
|
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,8 @@ import commonmark
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
|
|
||||||
|
from bleach.sanitizer import Cleaner
|
||||||
|
|
||||||
def gen_rand_id():
|
def gen_rand_id():
|
||||||
return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
|
return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
|
||||||
|
|
||||||
|
@ -15,3 +17,28 @@ def render_md(md):
|
||||||
return commonmark.commonmark(md)
|
return commonmark.commonmark(md)
|
||||||
else:
|
else:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
ALLOWED_TAGS = [
|
||||||
|
'a',
|
||||||
|
'abbr',
|
||||||
|
'acronym',
|
||||||
|
'b',
|
||||||
|
'blockquote',
|
||||||
|
'code',
|
||||||
|
'em',
|
||||||
|
'i',
|
||||||
|
'li',
|
||||||
|
'ol',
|
||||||
|
'strong',
|
||||||
|
'ul',
|
||||||
|
'p',
|
||||||
|
'hr',
|
||||||
|
'small',
|
||||||
|
'ins',
|
||||||
|
'sup',
|
||||||
|
'sub',
|
||||||
|
'details',
|
||||||
|
'summary',
|
||||||
|
]
|
||||||
|
|
||||||
|
clean = Cleaner(tags=ALLOWED_TAGS).clean
|
||||||
|
|
Loading…
Reference in New Issue
Block a user