Keep the model loaded in memory; saves ~5 seconds per request

This commit is contained in:
Tanner Collin 2025-01-20 18:24:59 +00:00
parent 587bc67416
commit dcd8af2895

View File

@ -23,7 +23,7 @@ def controller_message(message):
return False
def llama(prompt):
-    data = dict(model='llama3.1', prompt=prompt, stream=False)
+    data = dict(model='llama3.1', prompt=prompt, stream=False, keep_alive=-1)
try:
r = requests.post(LLAMA_URL, json=data, timeout=20)
r.raise_for_status()