forked from tanner/qotnews
		
	Sanitize html
This commit is contained in:
		@@ -3,8 +3,14 @@ logging.basicConfig(
 | 
				
			|||||||
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 | 
					        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 | 
				
			||||||
        level=logging.DEBUG)
 | 
					        level=logging.DEBUG)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					    import sys
 | 
				
			||||||
 | 
					    sys.path.insert(0,'.')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from utils import clean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
 | 
					API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json'
 | 
				
			||||||
API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
 | 
					API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -34,7 +40,7 @@ def comment(i):
 | 
				
			|||||||
    c['author'] = i.get('author', '')
 | 
					    c['author'] = i.get('author', '')
 | 
				
			||||||
    c['score'] = i.get('points', 0)
 | 
					    c['score'] = i.get('points', 0)
 | 
				
			||||||
    c['date'] = i.get('created_at_i', 0)
 | 
					    c['date'] = i.get('created_at_i', 0)
 | 
				
			||||||
    c['text'] = i.get('text', '')
 | 
					    c['text'] = clean(i.get('text', '') or '')
 | 
				
			||||||
    c['comments'] = [comment(j) for j in i['children']]
 | 
					    c['comments'] = [comment(j) for j in i['children']]
 | 
				
			||||||
    c['comments'] = list(filter(bool, c['comments']))
 | 
					    c['comments'] = list(filter(bool, c['comments']))
 | 
				
			||||||
    return c
 | 
					    return c
 | 
				
			||||||
@@ -65,7 +71,7 @@ def story(ref):
 | 
				
			|||||||
    s['num_comments'] = comment_count(s) - 1
 | 
					    s['num_comments'] = comment_count(s) - 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if 'text' in r and r['text']:
 | 
					    if 'text' in r and r['text']:
 | 
				
			||||||
        s['text'] = r['text']
 | 
					        s['text'] = clean(r['text'] or '')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return s
 | 
					    return s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -12,7 +12,7 @@ from praw.exceptions import PRAWException
 | 
				
			|||||||
from praw.models import MoreComments
 | 
					from praw.models import MoreComments
 | 
				
			||||||
from prawcore.exceptions import PrawcoreException
 | 
					from prawcore.exceptions import PrawcoreException
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from utils import render_md
 | 
					from utils import render_md, clean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
 | 
					SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -45,7 +45,7 @@ def comment(i):
 | 
				
			|||||||
    c['author'] = i.author.name if i.author else '[Deleted]'
 | 
					    c['author'] = i.author.name if i.author else '[Deleted]'
 | 
				
			||||||
    c['score'] = i.score
 | 
					    c['score'] = i.score
 | 
				
			||||||
    c['date'] = i.created_utc
 | 
					    c['date'] = i.created_utc
 | 
				
			||||||
    c['text'] = render_md(i.body)
 | 
					    c['text'] = render_md(clean(i.body))
 | 
				
			||||||
    c['comments'] = [comment(j) for j in i.replies]
 | 
					    c['comments'] = [comment(j) for j in i.replies]
 | 
				
			||||||
    c['comments'] = list(filter(bool, c['comments']))
 | 
					    c['comments'] = list(filter(bool, c['comments']))
 | 
				
			||||||
    return c
 | 
					    return c
 | 
				
			||||||
@@ -68,7 +68,7 @@ def story(ref):
 | 
				
			|||||||
        s['num_comments'] = r.num_comments
 | 
					        s['num_comments'] = r.num_comments
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if r.selftext:
 | 
					        if r.selftext:
 | 
				
			||||||
            s['text'] = render_md(r.selftext)
 | 
					            s['text'] = render_md(clean(r.selftext))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return s
 | 
					        return s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -83,7 +83,7 @@ def story(ref):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# scratchpad so I can quickly develop the parser
 | 
					# scratchpad so I can quickly develop the parser
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
    print(feed())
 | 
					    #print(feed())
 | 
				
			||||||
    print(reddit.submission(feed()[0]).permalink)
 | 
					    #print(reddit.submission(feed()[0]).permalink)
 | 
				
			||||||
    print()
 | 
					    #print()
 | 
				
			||||||
    print(story('cuozg4'))
 | 
					    print(story('e4asnp'))
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,10 +3,16 @@ logging.basicConfig(
 | 
				
			|||||||
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 | 
					        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 | 
				
			||||||
        level=logging.DEBUG)
 | 
					        level=logging.DEBUG)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					    import sys
 | 
				
			||||||
 | 
					    sys.path.insert(0,'.')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
from bs4 import BeautifulSoup
 | 
					from bs4 import BeautifulSoup
 | 
				
			||||||
from datetime import datetime
 | 
					from datetime import datetime
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from utils import clean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# cache the topic groups to prevent redirects
 | 
					# cache the topic groups to prevent redirects
 | 
				
			||||||
group_lookup = {}
 | 
					group_lookup = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -52,7 +58,7 @@ def comment(i):
 | 
				
			|||||||
    c['author'] = str(lu.string if lu else 'unknown user')
 | 
					    c['author'] = str(lu.string if lu else 'unknown user')
 | 
				
			||||||
    c['score'] = 1
 | 
					    c['score'] = 1
 | 
				
			||||||
    c['date'] = unix(i.find('time')['datetime'])
 | 
					    c['date'] = unix(i.find('time')['datetime'])
 | 
				
			||||||
    c['text'] = i.find('div', class_='comment-text').encode_contents().decode()
 | 
					    c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '')
 | 
				
			||||||
    ct = i.find('ol', class_='comment-tree')
 | 
					    ct = i.find('ol', class_='comment-tree')
 | 
				
			||||||
    c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
 | 
					    c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else []
 | 
				
			||||||
    c['comments'] = list(filter(bool, c['comments']))
 | 
					    c['comments'] = list(filter(bool, c['comments']))
 | 
				
			||||||
@@ -99,7 +105,7 @@ def story(ref):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    td = a.find('div', class_='topic-full-text')
 | 
					    td = a.find('div', class_='topic-full-text')
 | 
				
			||||||
    if td:
 | 
					    if td:
 | 
				
			||||||
        s['text'] = td.encode_contents().decode()
 | 
					        s['text'] = clean(td.encode_contents().decode() or '')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return s
 | 
					    return s
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -7,6 +7,8 @@ import commonmark
 | 
				
			|||||||
import random
 | 
					import random
 | 
				
			||||||
import string
 | 
					import string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from bleach.sanitizer import Cleaner
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def gen_rand_id():
 | 
					def gen_rand_id():
 | 
				
			||||||
    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
 | 
					    return ''.join(random.choice(string.ascii_uppercase) for _ in range(4))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -15,3 +17,28 @@ def render_md(md):
 | 
				
			|||||||
        return commonmark.commonmark(md)
 | 
					        return commonmark.commonmark(md)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        return ''
 | 
					        return ''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ALLOWED_TAGS = [
 | 
				
			||||||
 | 
					        'a',
 | 
				
			||||||
 | 
					        'abbr',
 | 
				
			||||||
 | 
					        'acronym',
 | 
				
			||||||
 | 
					        'b',
 | 
				
			||||||
 | 
					        'blockquote',
 | 
				
			||||||
 | 
					        'code',
 | 
				
			||||||
 | 
					        'em',
 | 
				
			||||||
 | 
					        'i',
 | 
				
			||||||
 | 
					        'li',
 | 
				
			||||||
 | 
					        'ol',
 | 
				
			||||||
 | 
					        'strong',
 | 
				
			||||||
 | 
					        'ul',
 | 
				
			||||||
 | 
					        'p',
 | 
				
			||||||
 | 
					        'hr',
 | 
				
			||||||
 | 
					        'small',
 | 
				
			||||||
 | 
					        'ins',
 | 
				
			||||||
 | 
					        'sup',
 | 
				
			||||||
 | 
					        'sub',
 | 
				
			||||||
 | 
					        'details',
 | 
				
			||||||
 | 
					        'summary',
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					clean = Cleaner(tags=ALLOWED_TAGS).clean
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user