Sanitize HTML
This commit is contained in:
		| @@ -3,8 +3,14 @@ logging.basicConfig( | ||||
|         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', | ||||
|         level=logging.DEBUG) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     import sys | ||||
|     sys.path.insert(0,'.') | ||||
|  | ||||
| import requests | ||||
|  | ||||
| from utils import clean | ||||
|  | ||||
| API_TOPSTORIES = lambda x: 'https://hacker-news.firebaseio.com/v0/topstories.json' | ||||
| API_ITEM = lambda x : 'https://hn.algolia.com/api/v1/items/{}'.format(x) | ||||
|  | ||||
| @@ -34,7 +40,7 @@ def comment(i): | ||||
|     c['author'] = i.get('author', '') | ||||
|     c['score'] = i.get('points', 0) | ||||
|     c['date'] = i.get('created_at_i', 0) | ||||
|     c['text'] = i.get('text', '') | ||||
|     c['text'] = clean(i.get('text', '') or '') | ||||
|     c['comments'] = [comment(j) for j in i['children']] | ||||
|     c['comments'] = list(filter(bool, c['comments'])) | ||||
|     return c | ||||
| @@ -65,7 +71,7 @@ def story(ref): | ||||
|     s['num_comments'] = comment_count(s) - 1 | ||||
|  | ||||
|     if 'text' in r and r['text']: | ||||
|         s['text'] = r['text'] | ||||
|         s['text'] = clean(r['text'] or '') | ||||
|  | ||||
|     return s | ||||
|  | ||||
|   | ||||
| @@ -12,7 +12,7 @@ from praw.exceptions import PRAWException | ||||
| from praw.models import MoreComments | ||||
| from prawcore.exceptions import PrawcoreException | ||||
|  | ||||
| from utils import render_md | ||||
| from utils import render_md, clean | ||||
|  | ||||
| SUBREDDITS = 'Economics+Foodforthought+TrueReddit+business+technology+privacy' | ||||
|  | ||||
| @@ -45,7 +45,7 @@ def comment(i): | ||||
|     c['author'] = i.author.name if i.author else '[Deleted]' | ||||
|     c['score'] = i.score | ||||
|     c['date'] = i.created_utc | ||||
|     c['text'] = render_md(i.body) | ||||
|     c['text'] = render_md(clean(i.body)) | ||||
|     c['comments'] = [comment(j) for j in i.replies] | ||||
|     c['comments'] = list(filter(bool, c['comments'])) | ||||
|     return c | ||||
| @@ -68,7 +68,7 @@ def story(ref): | ||||
|         s['num_comments'] = r.num_comments | ||||
|  | ||||
|         if r.selftext: | ||||
|             s['text'] = render_md(r.selftext) | ||||
|             s['text'] = render_md(clean(r.selftext)) | ||||
|  | ||||
|         return s | ||||
|  | ||||
| @@ -83,7 +83,7 @@ def story(ref): | ||||
|  | ||||
| # scratchpad so I can quickly develop the parser | ||||
| if __name__ == '__main__': | ||||
|     print(feed()) | ||||
|     print(reddit.submission(feed()[0]).permalink) | ||||
|     print() | ||||
|     print(story('cuozg4')) | ||||
|     #print(feed()) | ||||
|     #print(reddit.submission(feed()[0]).permalink) | ||||
|     #print() | ||||
|     print(story('e4asnp')) | ||||
|   | ||||
| @@ -3,10 +3,16 @@ logging.basicConfig( | ||||
|         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', | ||||
|         level=logging.DEBUG) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     import sys | ||||
|     sys.path.insert(0,'.') | ||||
|  | ||||
| import requests | ||||
| from bs4 import BeautifulSoup | ||||
| from datetime import datetime | ||||
|  | ||||
| from utils import clean | ||||
|  | ||||
| # cache the topic groups to prevent redirects | ||||
| group_lookup = {} | ||||
|  | ||||
| @@ -52,7 +58,7 @@ def comment(i): | ||||
|     c['author'] = str(lu.string if lu else 'unknown user') | ||||
|     c['score'] = 1 | ||||
|     c['date'] = unix(i.find('time')['datetime']) | ||||
|     c['text'] = i.find('div', class_='comment-text').encode_contents().decode() | ||||
|     c['text'] = clean(i.find('div', class_='comment-text').encode_contents().decode() or '') | ||||
|     ct = i.find('ol', class_='comment-tree') | ||||
|     c['comments'] = [comment(j) for j in ct.findAll('li', recursive=False)] if ct else [] | ||||
|     c['comments'] = list(filter(bool, c['comments'])) | ||||
| @@ -99,7 +105,7 @@ def story(ref): | ||||
|  | ||||
|     td = a.find('div', class_='topic-full-text') | ||||
|     if td: | ||||
|         s['text'] = td.encode_contents().decode() | ||||
|         s['text'] = clean(td.encode_contents().decode() or '') | ||||
|  | ||||
|     return s | ||||
|  | ||||
|   | ||||
| @@ -7,6 +7,8 @@ import commonmark | ||||
| import random | ||||
| import string | ||||
|  | ||||
| from bleach.sanitizer import Cleaner | ||||
|  | ||||
| def gen_rand_id(): | ||||
|     return ''.join(random.choice(string.ascii_uppercase) for _ in range(4)) | ||||
|  | ||||
| @@ -15,3 +17,28 @@ def render_md(md): | ||||
|         return commonmark.commonmark(md) | ||||
|     else: | ||||
|         return '' | ||||
|  | ||||
| ALLOWED_TAGS = [ | ||||
|         'a', | ||||
|         'abbr', | ||||
|         'acronym', | ||||
|         'b', | ||||
|         'blockquote', | ||||
|         'code', | ||||
|         'em', | ||||
|         'i', | ||||
|         'li', | ||||
|         'ol', | ||||
|         'strong', | ||||
|         'ul', | ||||
|         'p', | ||||
|         'hr', | ||||
|         'small', | ||||
|         'ins', | ||||
|         'sup', | ||||
|         'sub', | ||||
|         'details', | ||||
|         'summary', | ||||
|         ] | ||||
|  | ||||
| clean = Cleaner(tags=ALLOWED_TAGS).clean | ||||
|   | ||||
		Reference in New Issue
	
	Block a user