import json
import os
import time
from datetime import datetime

import requests

# Endpoints and test knobs, all overridable via environment variables.
BASE = os.getenv("WRAPPER_BASE", "http://192.168.1.2:9000")
UPSTREAM = os.getenv("LLAMACPP_BASE", "http://192.168.1.2:8071")
RUNS = int(os.getenv("RUNS", "100"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "4"))
TIMEOUT = int(os.getenv("REQ_TIMEOUT", "300"))


def _now():
    # UTC timestamp with an explicit "Z" suffix.
    return datetime.utcnow().isoformat() + "Z"


def _get_loaded_model_id():
    # Poll the upstream server for up to 10 minutes until it reports a
    # loaded model, then return that model's id.
    deadline = time.time() + 600
    last_error = None
    while time.time() < deadline:
        try:
            resp = requests.get(UPSTREAM + "/v1/models", timeout=30)
            resp.raise_for_status()
            data = resp.json().get("data") or []
            if data:
                return data[0].get("id")
            last_error = "no models reported by upstream"
        except Exception as exc:
            last_error = str(exc)
        time.sleep(5)
    raise RuntimeError(f"upstream not ready: {last_error}")


def _stream_ok(resp):
    # Scan an SSE body: record whether any "data:" line arrived and whether
    # the stream terminated with the "data: [DONE]" sentinel.
    got_data = False
    got_done = False
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue
        if line.startswith("data:"):
            got_data = True
        if line.strip() == "data: [DONE]":
            got_done = True
            break
    return got_data, got_done


def run_suite(model_id, idx):
    results = {}

    # Models
    r = requests.get(BASE + "/v1/models", timeout=30)
    results["models"] = r.status_code
    r = requests.get(BASE + f"/v1/models/{model_id}", timeout=30)
    results["model_get"] = r.status_code

    # Chat completions non-stream
    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": f"Run {idx}: say ok."}],
        "max_tokens": MAX_TOKENS,
        "temperature": (idx % 5) / 10.0,
    }
    r = requests.post(BASE + "/v1/chat/completions", json=payload, timeout=TIMEOUT)
    results["chat"] = r.status_code

    # Chat completions stream
    payload_stream = dict(payload)
    payload_stream["stream"] = True
    r = requests.post(BASE + "/v1/chat/completions", json=payload_stream,
                      stream=True, timeout=TIMEOUT)
    ok_data, ok_done = _stream_ok(r)
    results["chat_stream"] = r.status_code
    results["chat_stream_ok"] = ok_data and ok_done

    # Responses non-stream
    payload_resp = {
        "model": model_id,
        "input": f"Run {idx}: say ok.",
        "max_output_tokens": MAX_TOKENS,
    }
    r = requests.post(BASE + "/v1/responses", json=payload_resp, timeout=TIMEOUT)
    results["responses"] = r.status_code

    # Responses stream
    payload_resp_stream = {
        "model": model_id,
        "input": f"Run {idx}: say ok.",
        "stream": True,
    }
    r = requests.post(BASE + "/v1/responses", json=payload_resp_stream,
                      stream=True, timeout=TIMEOUT)
    ok_data, ok_done = _stream_ok(r)
    results["responses_stream"] = r.status_code
    results["responses_stream_ok"] = ok_data and ok_done

    # Embeddings (best effort)
    payload_emb = {"model": model_id, "input": f"Run {idx}"}
    r = requests.post(BASE + "/v1/embeddings", json=payload_emb, timeout=TIMEOUT)
    results["embeddings"] = r.status_code

    # Proxy
    r = requests.post(BASE + "/proxy/llamacpp/v1/chat/completions", json=payload,
                      timeout=TIMEOUT)
    results["proxy"] = r.status_code

    return results


def main():
    summary = {
        "started_at": _now(),
        "base": BASE,
        "upstream": UPSTREAM,
        "runs": RUNS,
        "max_tokens": MAX_TOKENS,
        "results": [],
    }
    model_id = _get_loaded_model_id()
    summary["model_id"] = model_id
    for i in range(1, RUNS + 1):
        start = time.time()
        try:
            results = run_suite(model_id, i)
            # A run passes only if every required endpoint returned 200;
            # embeddings is best effort and deliberately excluded here.
            ok = all(
                results.get(key) == 200
                for key in ("models", "model_get", "chat", "chat_stream",
                            "responses", "responses_stream", "proxy")
            )
            stream_ok = results.get("chat_stream_ok") and results.get("responses_stream_ok")
            summary["results"].append({
                "run": i,
                "ok": ok and stream_ok,
                "stream_ok": stream_ok,
                "status": results,
                "elapsed_s": round(time.time() - start, 2),
            })
        except Exception as exc:
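            # Record the failure instead of aborting, so later runs still execute.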
summary["results"].append({ "run": i, "ok": False, "stream_ok": False, "error": str(exc), "elapsed_s": round(time.time() - start, 2), }) print(f"Run {i}/{RUNS} done") summary["finished_at"] = _now() os.makedirs("reports", exist_ok=True) out_path = os.path.join("reports", "remote_wrapper_test.json") with open(out_path, "w", encoding="utf-8") as f: json.dump(summary, f, indent=2) # Print a compact summary ok_count = sum(1 for r in summary["results"] if r.get("ok")) print(f"OK {ok_count}/{RUNS}") if __name__ == "__main__": main()