You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

101 lines
3.0 KiB

import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct
import settings
from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml
def comment(i):
if 'author' not in i:
return False
c = {}
c['author'] = i.get('author', '')
c['score'] = i.get('points', 0)
c['date'] = unix(i.get('date', 0))
c['text'] = clean(i.get('text', '') or '')
c['comments'] = [comment(j) for j in i['children']]
c['comments'] = list(filter(bool, c['comments']))
return c
def comment_count(i):
alive = 1 if i['author'] else 0
return sum([comment_count(c) for c in i['comments']]) + alive
class Base:
def __init__(config):
self.config = config
self.url = config.get('url')
self.tz = config.get('tz')
def get_id(self, link):
patterns = self.config.get('patterns')
if not patterns:
return link
patterns = [re.compile(p) for p in patterns]
patterns = list(filter(None, [p.match(link) for p in patterns]))
patterns = list(set([':'.join(p.groups()) for p in patterns]))
if not patterns:
return link
return patterns[0]
def feed(self, excludes=None):
return []
def story(self, ref, urlref):
if urlref is None:
return False
markup = xml(lambda x: urlref)
if not markup:
return False
s = {}
s['author_link'] = ''
s['score'] = 0
s['comments'] = []
s['num_comments'] = 0
s['link'] = urlref
s['url'] = urlref
s['date'] = 0
soup = BeautifulSoup(markup, features='html.parser')
icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")
icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")
favicon = soup.find_all('link', rel="shortcut icon", href=True)
others = soup.find_all('link', rel="icon", href=True)
icons = icon32 + icon16 + favicon + others
base_url = '/'.join(urlref.split('/')[:3])
icons = list(set([i.get('href') for i in icons]))
icons = [i if i.startswith('http') else base_url + i for i in icons]
if icons:
s['icon'] = icons[0]
data = extruct.extract(markup)
s = parse_extruct(s, data)
if s['date']:
s['date'] = unix(s['date'], tz=self.tz)
if 'disqus' in markup:
try:
s['comments'] = declutter.get_comments(urlref)
c['comments'] = list(filter(bool, c['comments']))
s['num_comments'] = comment_count(s['comments'])
except KeyboardInterrupt:
raise
except:
pass
if not s['date']:
return False
return s