qotnews/apiserver/feeds/category.py
Jason Schwarzenberger f5ccd844da fix import error.
2020-11-16 15:41:09 +13:00

71 lines
2.2 KiB
Python

import logging
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.DEBUG)
if __name__ == '__main__':
import sys
sys.path.insert(0,'.')
from bs4 import BeautifulSoup
import settings
from utils import clean
from misc.api import xml
from misc.news import Base
def _filter_links(links, category_url, excludes=None):
links = list(filter(None, [link if link.startswith(category_url) else None for link in links]))
links = list(filter(None, [link if link != category_url else None for link in links]))
links = list(set(links))
if excludes:
links = list(filter(None, [None if any(e in link for e in excludes) else link for link in links]))
return links
def _get_category(category_url, excludes=None):
base_url = '/'.join(category_url.split('/')[:3])
markup = xml(lambda x: category_url)
if not markup: return []
soup = BeautifulSoup(markup, features='html.parser')
links = soup.find_all('a', href=True)
links = [link.get('href') for link in links]
links = [f"{base_url}{link}" if link.startswith('/') else link for link in links]
links = _filter_links(links, category_url, excludes)
return links
class Category(Base):
def __init__(self, url, tz=None):
self.tz = tz
self.category_url = url
def feed(self, excludes=None):
links = []
if isinstance(self.category_url, str):
links += _get_category(self.category_url, excludes)
elif isinstance(self.category_url, list):
for url in self.category_url:
links += _get_category(url, excludes)
return list(set(links))
# scratchpad so I can quickly develop the parser
if __name__ == '__main__':
print("Category: RadioNZ")
site = Category("https://www.rnz.co.nz/news/")
excludes = [
'rnz.co.nz/news/sport',
'rnz.co.nz/weather',
'rnz.co.nz/news/weather',
]
posts = site.feed(excludes)
print(posts[:5])
print(site.story(posts[0]))
print("Category: Newsroom")
site = Category("https://www.newsroom.co.nz/news/", tz='Pacific/Auckland')
posts = site.feed()
print(posts[:5])
print(site.story(posts[0]))