forked from tanner/qotnews
Add regex to extract a unique ref from each sitemap/category-based article URL.
This commit is contained in:
@@ -13,15 +13,43 @@ NUM_TILDES = 5
|
||||
NUM_SUBSTACK = 10
|
||||
|
||||
SITEMAP = {}
|
||||
# SITEMAP['nzherald'] = { 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/", 'count': 10}
|
||||
# SITEMAP['stuff'] = { 'url': "https://www.stuff.co.nz/sitemap.xml", 'count': 10}
|
||||
# SITEMAP['nzherald'] = {
|
||||
# 'url': "https://www.nzherald.co.nz/arcio/news-sitemap/",
|
||||
# 'count': 20,
|
||||
# 'patterns': [
|
||||
# r'^https:\/\/www\.(nzherald\.co\.nz)\/.*\/([^/]+)\/?$',
|
||||
# ],
|
||||
# 'excludes': [
|
||||
# 'driven.co.nz',
|
||||
# 'oneroof.co.nz',
|
||||
# 'nzherald.co.nz/sponsored-stories',
|
||||
# 'nzherald.co.nz/entertainment/',
|
||||
# 'nzherald.co.nz/lifestyle/',
|
||||
# 'nzherald.co.nz/travel/',
|
||||
# 'nzherald.co.nz/sport/',
|
||||
# 'nzherald.co.nz/promotions/',
|
||||
# 'nzherald.co.nzhttp',
|
||||
# 'herald-afternoon-quiz',
|
||||
# 'herald-morning-quiz'
|
||||
# ],
|
||||
# }
|
||||
|
||||
SUBSTACK = {}
|
||||
# SUBSTACK['webworm'] = { 'url': "https://www.webworm.co", 'count': 10}
|
||||
# SUBSTACK['the bulletin'] = { 'url': "https://thespinoff.substack.com", 'count': 10}
|
||||
|
||||
CATEGORY = {}
|
||||
# CATEGORY['rnz national'] = { 'url': "https://www.rnz.co.nz/news/national", 'count': 10}
|
||||
# CATEGORY['radionz'] = {
|
||||
# 'url': "https://www.rnz.co.nz/news/",
|
||||
# 'count': 20,
|
||||
# 'patterns': [
|
||||
# r'https:\/\/www\.(rnz\.co\.nz)\/news\/[^\/]+\/(\d+)\/[^\/]+\/?'
|
||||
# ],
|
||||
# 'excludes': [
|
||||
# 'rnz.co.nz/news/sport',
|
||||
# 'rnz.co.nz/weather',
|
||||
# ],
|
||||
# }
|
||||
|
||||
SCRAPERS = ['browser', 'declutter', 'outline', 'local']
|
||||
|
||||
|
Reference in New Issue
Block a user