feat: Add domain exclusion to smallweb list loading

This commit is contained in:
2025-12-04 22:18:19 +00:00
committed by Tanner Collin (aider)
parent 75779722c1
commit ed8ad1b6f6

View File

@@ -31,13 +31,17 @@ from flask_cors import CORS
smallweb_set = set()
def load_smallweb_list():
EXCLUDED = [
'github.com',
]
global smallweb_set
try:
url = 'https://raw.githubusercontent.com/kagisearch/smallweb/refs/heads/main/smallweb.txt'
with urllib.request.urlopen(url, timeout=10) as response:
urls = response.read().decode('utf-8').splitlines()
hosts = {urlparse(u).hostname for u in urls if u and urlparse(u).hostname}
smallweb_set = {h.replace('www.', '') for h in hosts}
smallweb_set = {h.replace('www.', '') for h in hosts if h not in EXCLUDED}
logging.info('Loaded {} smallweb domains.'.format(len(smallweb_set)))
except Exception as e:
logging.error('Failed to load smallweb list: {}'.format(e))