forked from tanner/qotnews
91 lines
2.5 KiB
Python
91 lines
2.5 KiB
Python
|
import logging
|
||
|
logging.basicConfig(
|
||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||
|
level=logging.DEBUG)
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
import sys
|
||
|
sys.path.insert(0,'.')
|
||
|
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
from scrapers import declutter
|
||
|
import extruct
|
||
|
|
||
|
import settings
|
||
|
from utils import clean
|
||
|
from misc.metadata import parse_extruct
|
||
|
from misc.time import unix
|
||
|
from misc.api import xml
|
||
|
|
||
|
def comment(i):
    """Convert one raw comment record into the internal comment dict.

    Returns ``False`` when the record has no ``'author'`` key, so callers
    can filter such entries out. Otherwise returns a dict with the keys
    ``author``, ``score`` (taken from ``'points'``), ``date`` (passed
    through ``unix``), ``text`` (passed through ``clean``; a missing or
    ``None`` text becomes ``''``) and ``comments`` (the converted
    ``'children'``, with falsy results dropped).
    """
    if 'author' not in i:
        return False

    # The literal is evaluated top to bottom, so the fields are read in the
    # same order as before and the resulting key order is unchanged.
    return {
        'author': i.get('author', ''),
        'score': i.get('points', 0),
        'date': unix(i.get('date', 0)),
        'text': clean(i.get('text', '') or ''),
        'comments': [reply for reply in map(comment, i['children']) if reply],
    }
|
||
|
|
||
|
def comment_count(i):
    """Count the comments in the tree rooted at ``i`` that have an author.

    ``i`` is a comment dict with ``'author'`` and ``'comments'`` keys. The
    node itself counts as one when its author is truthy; every nested reply
    is counted the same way, recursively.
    """
    own = 1 if i['author'] else 0

    nested = 0
    for reply in i['comments']:
        nested += comment_count(reply)

    return nested + own
|
||
|
|
||
|
class Base:
    """Generic article source that builds a story dict from a page's markup.

    The base ``feed`` returns no references. ``story`` fetches the page at
    a given reference and fills in icon, structured metadata, date and,
    when the page mentions Disqus, comments.
    """

    def __init__(self, url, tz=None):
        """Store the source ``url`` and an optional timezone ``tz``.

        ``tz`` is forwarded to ``unix`` when a story's date is converted.
        """
        self.url = url
        self.tz = tz

    def feed(self, excludes=None):
        """Return the story references for this source (none in the base)."""
        return []

    def story(self, ref):
        """Fetch the page at ``ref`` and return a story dict, or ``False``.

        Returns ``False`` when no markup could be fetched, or when no date
        could be determined for the story after metadata extraction.
        Failures while fetching comments are logged and otherwise ignored;
        the story is still returned without a comment count.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        markup = xml(lambda x: ref)
        if not markup:
            return False

        # Defaults; parse_extruct() below may overwrite any of them.
        s = {
            'author_link': '',
            'score': 0,
            'comments': [],
            'num_comments': 0,
            'link': ref,
            'url': ref,
            'date': 0,
        }

        soup = BeautifulSoup(markup, features='html.parser')
        # Icon candidates, most preferred first.
        icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
        icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
        favicon = soup.find_all('link', rel="shortcut icon", href=True)
        others = soup.find_all('link', rel="icon", href=True)
        hrefs = [tag.get('href') for tag in icon32 + icon16 + favicon + others]
        # dict.fromkeys() removes duplicates while keeping the preference
        # order; a set would make the chosen icon arbitrary. urljoin()
        # resolves relative and protocol-relative hrefs against the page
        # URL and leaves absolute URLs as they are.
        icons = [urljoin(ref, href) for href in dict.fromkeys(hrefs) if href]

        if icons:
            s['icon'] = icons[0]

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                comments = declutter.get_comments(ref) or []
                # Drop falsy entries (e.g. comments that failed to convert).
                s['comments'] = list(filter(bool, comments))
                # comment_count() takes a single comment dict, so sum it
                # over the top-level comments.
                s['num_comments'] = sum(
                    comment_count(c) for c in s['comments'])
            except Exception:
                # Comments are best-effort: keep the story, but leave a
                # trace instead of hiding the failure. KeyboardInterrupt is
                # not an Exception subclass, so it still propagates.
                logging.warning(
                    'Problem getting comments for %s', ref, exc_info=True)

        if not s['date']:
            return False
        return s
|