# 2020-11-16 02:30:33 +00:00  (stray VCS timestamp from extraction; commented out so the file parses)
|
|
|
import logging
|
|
|
|
# Configure the root logger once at import time: timestamped records,
# everything from DEBUG up.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=_LOG_FORMAT)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Running as a script: put the project root on the import path so the
    # local packages (settings, utils, misc) resolve.
    import sys
    sys.path.insert(0, '.')
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import settings
|
|
|
|
from utils import clean
|
|
|
|
from misc.api import xml
|
# 2020-11-16 02:41:09 +00:00  (stray VCS timestamp from extraction; commented out so the file parses)
|
|
|
from misc.news import Base
|
# 2020-11-16 02:30:33 +00:00  (stray VCS timestamp from extraction; commented out so the file parses)
|
|
|
|
|
|
|
def _filter_links(links, category_url, excludes=None):
|
|
|
|
links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
|
|
|
|
links = list(filter(None, [link if link != category_url else None for link in links]))
|
|
|
|
links = list(set(links))
|
|
|
|
if excludes:
|
|
|
|
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
|
|
|
|
return links
|
|
|
|
|
|
|
|
def _get_category(category_url, excludes=None):
    """Fetch *category_url* and return the on-site article links found there.

    Relative hrefs are resolved against the scheme+host of *category_url*,
    then everything is narrowed down via ``_filter_links``.
    """
    # 'https://host/path/...' -> 'https://host' (scheme, empty, host).
    base_url = '/'.join(category_url.split('/')[:3])

    markup = xml(lambda x: category_url)
    if not markup:
        return []

    soup = BeautifulSoup(markup, features='html.parser')
    hrefs = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
    # Make every link absolute before filtering.
    absolute = [base_url + href if href.startswith('/') else href
                for href in hrefs]
    return _filter_links(absolute, category_url, excludes)
|
|
|
|
|
|
|
|
class Category(Base):
    """A news-site section whose articles are discovered by scraping the
    links on its category page(s)."""

    def __init__(self, config):
        # Full configuration dict, kept around for helpers/subclasses.
        self.config = config
        # A single category URL, or a list of them.
        self.category_url = config.get('url')
        # Optional timezone name used downstream when parsing dates.
        self.tz = config.get('tz')

    def feed(self, excludes=None):
        """Return ``(story_id, link)`` pairs for every article found.

        *excludes* is forwarded to the link filter; duplicate links across
        multiple category URLs are removed.
        """
        urls = self.category_url
        if isinstance(urls, str):
            urls = [urls]

        links = []
        if isinstance(urls, list):
            for url in urls:
                links.extend(_get_category(url, excludes))

        # De-duplicate before pairing each link with its id.
        return [(self.get_id(link), link) for link in set(links)]
|
# 2020-11-16 02:30:33 +00:00  (stray VCS timestamp from extraction; commented out so the file parses)
|
|
|
|
|
|
|
|
|
|
|
# scratchpad so I can quickly develop the parser
|
|
|
|
if __name__ == '__main__':

    def _demo(label, config, excludes=None):
        # Run one site end-to-end and print a sample of what it scrapes.
        print(f"Category: {label}")
        site = Category(config)
        posts = site.feed(excludes)
        print(posts[:5])
        print(site.story(posts[0][0], posts[0][1]))

    _demo(
        "RadioNZ",
        {'url': "https://www.rnz.co.nz/news/"},
        excludes=[
            'rnz.co.nz/news/sport',
            'rnz.co.nz/weather',
            'rnz.co.nz/news/weather',
        ],
    )

    _demo(
        "Newsroom",
        {'url': "https://www.newsroom.co.nz/news/",
         'tz': 'Pacific/Auckland'},
    )
|
# 2020-11-16 02:30:33 +00:00  (stray VCS timestamp from extraction; commented out so the file parses)
|
|
|
|
|
|
|
|