qotnews/apiserver/misc/news.py

import logging
logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG)

import re
import requests
from bs4 import BeautifulSoup
from scrapers import declutter
import extruct

import settings
from utils import clean
from misc.metadata import parse_extruct
from misc.time import unix
from misc.api import xml
import misc.stuff as stuff

def comment(i):
    """Convert one raw comment node into the internal comment dict.

    Returns False when the node has no 'author' key, so callers can drop
    such nodes with a truthiness filter. Otherwise returns a dict with the
    keys 'author', 'score', 'date', 'text' and 'comments'; the nested
    'comments' list holds the converted children with falsy results removed.

    The node must provide a 'children' iterable once it has an author.
    """
    if 'author' not in i:
        return False

    # Values are evaluated top to bottom, so the date is converted before
    # the text is cleaned and before the children are walked.
    return {
        'author': i.get('author', ''),
        'score': i.get('points', 0),
        'date': unix(i.get('date', 0)),
        # A None text is coerced to '' before cleaning.
        'text': clean(i.get('text', '') or ''),
        'comments': [reply for reply in map(comment, i['children']) if reply],
    }

def comment_count(i):
    """Count the comments in the tree rooted at ``i``.

    A node contributes 1 when its 'author' value is truthy and 0 otherwise;
    every node listed under 'comments' is counted recursively.
    """
    total = 0
    if i['author']:
        total = 1
    for reply in i['comments']:
        total += comment_count(reply)
    return total

class Base:
    """Base scraper configured by a per-site config mapping.

    The config is only read through ``.get`` for the keys 'url', 'tz' and
    'patterns'. ``feed`` returns an empty list here; presumably concrete
    scrapers override it (verify against the subclasses).
    """

    def __init__(self, config):
        """Keep the config and cache its 'url' and 'tz' entries.

        Args:
            config: mapping supporting ``.get``; 'url' and 'tz' default
                to None when absent.
        """
        self.config = config
        self.url = config.get('url')
        self.tz = config.get('tz')

    def get_id(self, link):
        """Derive a story ref from ``link`` using the configured patterns.

        Each regex in config['patterns'] is matched against the start of
        ``link``. The ref is the captured groups of the first pattern (in
        configured order) that matches, joined with ':'. When no patterns
        are configured or none match, ``link`` itself is returned.
        """
        patterns = self.config.get('patterns')
        if not patterns:
            return link
        compiled = [re.compile(p) for p in patterns]
        matches = [m for m in (p.match(link) for p in compiled) if m]
        # Keep the configured order so the same link always maps to the same
        # ref; going through a set made the winner depend on hash ordering.
        ids = [':'.join(m.groups()) for m in matches]
        if not ids:
            return link
        return ids[0]

    def feed(self, excludes=None):
        """Return the list of story refs; the base implementation has none."""
        return []

    @staticmethod
    def _find_icon(soup, urlref):
        """Return the preferred icon URL declared by the page, or None.

        Candidates are tried in priority order: 32x32 icon, 16x16 icon,
        'shortcut icon', then any 'icon' link. The href is resolved against
        the page URL so relative and protocol-relative values become
        absolute.
        """
        from urllib.parse import urljoin

        candidates = (
            soup.find_all('link', rel="icon", href=True, sizes="32x32")
            + soup.find_all('link', rel="icon", href=True, sizes="16x16")
            + soup.find_all('link', rel="shortcut icon", href=True)
            + soup.find_all('link', rel="icon", href=True)
        )
        for tag in candidates:
            href = tag.get('href')
            if href:  # href=True also matches an empty attribute
                return urljoin(urlref, href)
        return None

    def story(self, ref, urlref):
        """Fetch ``urlref`` and build a story dict from its markup.

        Args:
            ref: story ref; not used by this implementation.
            urlref: URL of the article page.

        Returns:
            A dict with at least 'author_link', 'score', 'comments',
            'num_comments', 'link', 'url' and 'date' (plus 'icon' when the
            page declares one, and whatever ``parse_extruct`` adds), or
            False when ``urlref`` is None, no markup could be fetched, or
            no date was found.
        """
        if urlref is None:
            return False
        markup = xml(lambda x: urlref)
        if not markup:
            return False

        s = {}
        s['author_link'] = ''
        s['score'] = 0
        s['comments'] = []
        s['num_comments'] = 0
        s['link'] = urlref
        s['url'] = urlref
        s['date'] = 0

        soup = BeautifulSoup(markup, features='html.parser')
        icon = self._find_icon(soup, urlref)
        if icon:
            s['icon'] = icon

        data = extruct.extract(markup)
        s = parse_extruct(s, data)
        if s['date']:
            s['date'] = unix(s['date'], tz=self.tz)

        if 'disqus' in markup:
            # Comments are best-effort: a failure here must not lose the story.
            try:
                comments = list(filter(bool, declutter.get_comments(urlref)))
                s['comments'] = comments
                # comment_count expects a single comment node, so sum over the
                # top-level comments instead of passing the list itself.
                s['num_comments'] = sum(comment_count(c) for c in comments)
            except Exception:
                logging.exception('Failed to load disqus comments for %s', urlref)

        if urlref.startswith('https://www.stuff.co.nz'):
            s['comments'] = stuff.get_comments(urlref)
            s['comments'] = list(filter(bool, s['comments']))
            s['num_comments'] = len(s['comments'])

        if not s['date']:
            return False
        return s
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`import logging`
			`logging.basicConfig(`
			`format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',`
			`level=logging.DEBUG)`

add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`import re`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`import requests`
			`from bs4 import BeautifulSoup`
			`from scrapers import declutter`
			`import extruct`

			`import settings`
			`from utils import clean`
			`from misc.metadata import parse_extruct`
			`from misc.time import unix`
			`from misc.api import xml`
stuff comments. 2020-11-19 01:23:01 +00:00			`import misc.stuff as stuff`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00
			`def comment(i):`
			`if 'author' not in i:`
			`return False`

			`c = {}`
			`c['author'] = i.get('author', '')`
			`c['score'] = i.get('points', 0)`
			`c['date'] = unix(i.get('date', 0))`
			`c['text'] = clean(i.get('text', '') or '')`
			`c['comments'] = [comment(j) for j in i['children']]`
			`c['comments'] = list(filter(bool, c['comments']))`
			`return c`

			`def comment_count(i):`
			`alive = 1 if i['author'] else 0`
			`return sum([comment_count(c) for c in i['comments']]) + alive`

			`class Base:`
add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`def __init__(config):`
			`self.config = config`
			`self.url = config.get('url')`
			`self.tz = config.get('tz')`

			`def get_id(self, link):`
			`patterns = self.config.get('patterns')`
			`if not patterns:`
			`return link`
			`patterns = [re.compile(p) for p in patterns]`
			`patterns = list(filter(None, [p.match(link) for p in patterns]))`
			`patterns = list(set([':'.join(p.groups()) for p in patterns]))`
			`if not patterns:`
			`return link`
			`return patterns[0]`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00
			`def feed(self, excludes=None):`
			`return []`

add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`def story(self, ref, urlref):`
			`if urlref is None:`
			`return False`
			`markup = xml(lambda x: urlref)`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`if not markup:`
			`return False`

			`s = {}`
			`s['author_link'] = ''`
			`s['score'] = 0`
			`s['comments'] = []`
			`s['num_comments'] = 0`
add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`s['link'] = urlref`
			`s['url'] = urlref`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`s['date'] = 0`

			`soup = BeautifulSoup(markup, features='html.parser')`
			`icon32 = soup.find_all('link', rel="icon", href=True, sizes="32x32")`
			`icon16 = soup.find_all('link', rel="icon", href=True, sizes="16x16")`
			`favicon = soup.find_all('link', rel="shortcut icon", href=True)`
			`others = soup.find_all('link', rel="icon", href=True)`
			`icons = icon32 + icon16 + favicon + others`
add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`base_url = '/'.join(urlref.split('/')[:3])`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`icons = list(set([i.get('href') for i in icons]))`
			`icons = [i if i.startswith('http') else base_url + i for i in icons]`

			`if icons:`
			`s['icon'] = icons[0]`

			`data = extruct.extract(markup)`
			`s = parse_extruct(s, data)`
			`if s['date']:`
			`s['date'] = unix(s['date'], tz=self.tz)`

			`if 'disqus' in markup:`
			`try:`
add regex to get a unique ref from each sitemap/category based article url. 2020-11-16 23:38:28 +00:00			`s['comments'] = declutter.get_comments(urlref)`
stuff comments. 2020-11-19 01:23:01 +00:00			`s['comments'] = list(filter(bool, s['comments']))`
split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`s['num_comments'] = comment_count(s['comments'])`
			`except KeyboardInterrupt:`
			`raise`
			`except:`
			`pass`

stuff comments. 2020-11-19 01:23:01 +00:00			`if urlref.startswith('https://www.stuff.co.nz'):`
			`s['comments'] = stuff.get_comments(urlref)`
			`s['comments'] = list(filter(bool, s['comments']))`
			`s['num_comments'] = len(s['comments'])`

split categories, sitemap and other crap out of news.py 2020-11-16 02:30:33 +00:00			`if not s['date']:`
			`return False`
			`return s`