2020-11-16 02:30:33 +00:00
|
|
|
import logging
|
|
|
|
logging.basicConfig(
|
|
|
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
|
|
level=logging.DEBUG)
|
|
|
|
|
2020-11-16 23:38:28 +00:00
|
|
|
import re
|
2020-11-16 02:30:33 +00:00
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from scrapers import declutter
|
|
|
|
import extruct
|
|
|
|
|
|
|
|
import settings
|
|
|
|
from utils import clean
|
2020-11-23 21:36:31 +00:00
|
|
|
from misc.metadata import parse_extruct, get_icons
|
2020-11-16 02:30:33 +00:00
|
|
|
from misc.time import unix
|
|
|
|
from misc.api import xml
|
2020-11-19 01:23:01 +00:00
|
|
|
import misc.stuff as stuff
|
2020-11-16 02:30:33 +00:00
|
|
|
|
|
|
|
def comment(i):
    """Normalize one raw comment dict into the site-wide comment shape.

    Returns False when the comment has no author key (treated as a
    deleted/dead comment), otherwise a dict with author, score, unix
    date, cleaned text, and recursively normalized child comments.
    """
    if 'author' not in i:
        return False

    c = {}
    c['author'] = i.get('author', '')
    c['score'] = i.get('points', 0)
    c['date'] = unix(i.get('date', 0))
    # `or ''` guards against an explicit None text value
    c['text'] = clean(i.get('text', '') or '')
    # tolerate a missing 'children' key instead of raising KeyError;
    # every other field here is already read defensively with .get()
    c['comments'] = [comment(j) for j in i.get('children', [])]
    # drop children that normalized to False (no author)
    c['comments'] = list(filter(bool, c['comments']))
    return c
|
|
|
|
|
|
|
|
def comment_count(i):
    """Recursively count comments in the tree rooted at *i*.

    A node contributes 1 only when its 'author' is truthy (deleted
    comments carry an empty author and are not counted).
    """
    total = 1 if i['author'] else 0
    for child in i['comments']:
        total += comment_count(child)
    return total
|
|
|
|
|
|
|
|
class Base:
    """Base class for site scrapers.

    Holds per-site config, maps story links to stable ids via configured
    regex patterns, and builds a normalized story dict from a page URL.
    Subclasses are expected to override feed().
    """

    def __init__(self, config):
        # BUGFIX: the original signature `def __init__(config):` was
        # missing `self`, making the class impossible to instantiate.
        self.config = config
        self.url = config.get('url')
        self.tz = config.get('tz')

    def get_id(self, link):
        """Map a story link to a stable id using config['patterns'].

        Joins the capture groups of each matching pattern with ':';
        falls back to the link itself when no patterns are configured
        or none match.
        """
        patterns = self.config.get('patterns')
        if not patterns:
            return link
        patterns = [re.compile(p) for p in patterns]
        patterns = list(filter(None, [p.match(link) for p in patterns]))
        patterns = list(set([':'.join(p.groups()) for p in patterns]))
        if not patterns:
            return link
        return patterns[0]

    def feed(self, excludes=None):
        """Return the list of story refs for this site; base returns []."""
        return []

    def story(self, ref, urlref):
        """Fetch and normalize the story at *urlref*.

        Returns a story dict, or False when urlref is missing, the page
        markup cannot be fetched, or no publish date was extracted.
        """
        if urlref is None:
            return False
        # xml() takes a URL-builder callable; presumably fetches the page
        # and returns its markup (falsy on failure) — TODO confirm
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0

        icons = get_icons(markup, url=urlref)
        if icons:
            s['icon'] = icons[0]

        # structured metadata (JSON-LD, microdata, ...) via extruct
        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            try:
                s['comments'] = declutter.get_comments(urlref)
                s['comments'] = list(filter(bool, s['comments']))
                # BUGFIX: comment_count() expects a single comment dict;
                # the original passed the whole list, which always raised
                # and was silently swallowed by the bare except below.
                s['num_comments'] = sum(
                    comment_count(c) for c in s['comments'])
            except KeyboardInterrupt:
                raise
            except Exception:
                # comments are best-effort; never fail the whole story
                pass

        if urlref.startswith('https://www.stuff.co.nz'):
            s['comments'] = stuff.get_comments(urlref)
            s['comments'] = list(filter(bool, s['comments']))
            s['num_comments'] = len(s['comments'])

        # a story without a date is considered unusable
        if not s['date']:
            return False
        return s
|