75 lines
2.1 KiB
Python
75 lines
2.1 KiB
Python
import asyncio
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
|
|
|
|
log = logging.getLogger("llamacpp_warmup")
|
|
|
|
|
|
def _is_loading_error(response: httpx.Response) -> bool:
    """Return True when *response* is a 503 whose JSON body reports the model is still loading.

    llama.cpp answers 503 with either ``{"error": {"message": ...}}`` or a
    flat ``{"message": ...}`` while the model is being loaded; anything else
    (non-503, non-JSON, non-dict payload) is not a loading error.
    """
    if response.status_code != 503:
        return False

    try:
        body = response.json()
    except Exception:
        # Body was not valid JSON — cannot be the structured loading error.
        return False

    if not isinstance(body, dict):
        return False

    err = body.get("error")
    if isinstance(err, dict):
        text = str(err.get("message") or "")
    else:
        text = str(body.get("message") or "")

    return "loading model" in text.lower()
|
|
|
|
|
|
def resolve_warmup_prompt(override: str | None, fallback_path: str) -> str:
|
|
if override:
|
|
prompt = override.strip()
|
|
if prompt:
|
|
return prompt
|
|
try:
|
|
prompt = Path(fallback_path).read_text(encoding="utf-8").strip()
|
|
if prompt:
|
|
return prompt
|
|
except Exception as exc:
|
|
log.warning("Failed to read warmup prompt from %s: %s", fallback_path, exc)
|
|
return "ok"
|
|
|
|
|
|
async def run_warmup(base_url: str, model_id: str, prompt: str, timeout_s: float) -> None:
    """Send one tiny chat completion to force llama.cpp to load the model.

    Raises RuntimeError when the server answers 503 with its "loading model"
    error body, and lets ``raise_for_status`` surface any other error status.
    """
    request_body = {
        "model": model_id,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 8,  # a few tokens are enough to prove the model is live
        "temperature": 0,
    }
    async with httpx.AsyncClient(base_url=base_url, timeout=timeout_s) as client:
        response = await client.post("/v1/chat/completions", json=request_body)
    if response.status_code == 503 and _is_loading_error(response):
        raise RuntimeError("llama.cpp still loading model")
    response.raise_for_status()
|
|
|
|
|
|
async def run_warmup_with_retry(
    base_url: str,
    model_id: str,
    prompt: str,
    timeout_s: float,
    interval_s: float = 3.0,
) -> None:
    """Retry :func:`run_warmup` until it succeeds or the time budget runs out.

    Args:
        base_url: Base URL of the llama.cpp server.
        model_id: Model identifier to warm up.
        prompt: Warmup prompt text.
        timeout_s: Overall retry budget in seconds; also reused as the
            per-request timeout of each individual attempt.
        interval_s: Pause between failed attempts.

    Raises:
        Exception: re-raises the last attempt's exception when no attempt
            succeeded before the deadline. Returns silently if no attempt
            was ever made (non-positive ``timeout_s``), matching the
            original behavior.
    """
    # Use a monotonic clock for the deadline: time.time() is wall-clock and
    # NTP/DST adjustments could stretch or collapse the retry window.
    deadline = time.monotonic() + timeout_s
    last_exc: Exception | None = None
    while time.monotonic() < deadline:
        try:
            await run_warmup(base_url, model_id, prompt, timeout_s=timeout_s)
            return
        except Exception as exc:
            last_exc = exc
            # Never sleep past the deadline; the previous version slept a
            # full interval even when the budget was already exhausted.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            await asyncio.sleep(min(interval_s, remaining))
    if last_exc:
        raise last_exc
|