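"""Smoke test for an OpenAI-compatible wrapper running in front of a llama.cpp server.

Repeatedly exercises the wrapper's /v1/models, /v1/chat/completions (plain and
streaming), /v1/responses (plain and streaming), /v1/embeddings, and proxy
endpoints, recording HTTP status codes per run, then writes a JSON report to
reports/remote_wrapper_test.json.
"""
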
import json
import os
import time
from datetime import datetime

import requests

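# Configuration comes from environment variables; defaults point at a LAN host.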
BASE = os.getenv("WRAPPER_BASE", "http://192.168.1.2:9000")
UPSTREAM = os.getenv("LLAMACPP_BASE", "http://192.168.1.2:8071")
RUNS = int(os.getenv("RUNS", "100"))
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "4"))
TIMEOUT = int(os.getenv("REQ_TIMEOUT", "300"))


def _now():
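    """Return the current UTC time as an ISO 8601 string with a trailing "Z"."""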
    return datetime.utcnow().isoformat() + "Z"


def _get_loaded_model_id():
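    """Poll the upstream /v1/models endpoint until a model is reported.

    Retries every 5 seconds for up to 10 minutes and returns the first model id;
    raises RuntimeError with the last error if the upstream never becomes ready.
    """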
    deadline = time.time() + 600
    last_error = None
    while time.time() < deadline:
        try:
            resp = requests.get(UPSTREAM + "/v1/models", timeout=30)
            resp.raise_for_status()
            data = resp.json().get("data") or []
            if data:
                return data[0].get("id")
            last_error = "no models reported by upstream"
        except Exception as exc:
            last_error = str(exc)
        time.sleep(5)
    raise RuntimeError(f"upstream not ready: {last_error}")


def _stream_ok(resp):
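    """Consume an SSE response and return (saw_any_data_line, saw_done_sentinel)."""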
    got_data = False
    got_done = False
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue
        if line.startswith("data:"):
            got_data = True
        if line.strip() == "data: [DONE]":
            got_done = True
            break
    return got_data, got_done


def run_suite(model_id, idx):
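    """Hit each wrapper endpoint once and return a dict of HTTP status codes,
    plus boolean *_stream_ok flags for the streaming endpoints."""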
    results = {}

    # Models
    r = requests.get(BASE + "/v1/models", timeout=30)
    results["models"] = r.status_code

    r = requests.get(BASE + f"/v1/models/{model_id}", timeout=30)
    results["model_get"] = r.status_code

    # Chat completions non-stream
    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": f"Run {idx}: say ok."}],
        "max_tokens": MAX_TOKENS,
        "temperature": (idx % 5) / 10.0,
    }
    r = requests.post(BASE + "/v1/chat/completions", json=payload, timeout=TIMEOUT)
    results["chat"] = r.status_code

    # Chat completions stream
    payload_stream = dict(payload)
    payload_stream["stream"] = True
    r = requests.post(BASE + "/v1/chat/completions", json=payload_stream, stream=True, timeout=TIMEOUT)
    ok_data, ok_done = _stream_ok(r)
    results["chat_stream"] = r.status_code
    results["chat_stream_ok"] = ok_data and ok_done

    # Responses non-stream
    payload_resp = {
        "model": model_id,
        "input": f"Run {idx}: say ok.",
        "max_output_tokens": MAX_TOKENS,
    }
    r = requests.post(BASE + "/v1/responses", json=payload_resp, timeout=TIMEOUT)
    results["responses"] = r.status_code

    # Responses stream
    payload_resp_stream = {
        "model": model_id,
        "input": f"Run {idx}: say ok.",
        "stream": True,
    }
    r = requests.post(BASE + "/v1/responses", json=payload_resp_stream, stream=True, timeout=TIMEOUT)
    ok_data, ok_done = _stream_ok(r)
    results["responses_stream"] = r.status_code
    results["responses_stream_ok"] = ok_data and ok_done

    # Embeddings (best effort)
    payload_emb = {"model": model_id, "input": f"Run {idx}"}
    r = requests.post(BASE + "/v1/embeddings", json=payload_emb, timeout=TIMEOUT)
    results["embeddings"] = r.status_code

    # Proxy
    r = requests.post(BASE + "/proxy/llamacpp/v1/chat/completions", json=payload, timeout=TIMEOUT)
    results["proxy"] = r.status_code

    return results


def main():
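    """Run the endpoint suite RUNS times against the loaded model and write a JSON report."""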
    summary = {
        "started_at": _now(),
        "base": BASE,
        "upstream": UPSTREAM,
        "runs": RUNS,
        "max_tokens": MAX_TOKENS,
        "results": [],
    }

    model_id = _get_loaded_model_id()
    summary["model_id"] = model_id

    for i in range(1, RUNS + 1):
        start = time.time()
        try:
            results = run_suite(model_id, i)
            ok = all(
                results.get(key) == 200
                for key in ("models", "model_get", "chat", "chat_stream", "responses", "responses_stream", "proxy")
            )
            stream_ok = results.get("chat_stream_ok") and results.get("responses_stream_ok")
            summary["results"].append({
                "run": i,
                "ok": ok and stream_ok,
                "stream_ok": stream_ok,
                "status": results,
                "elapsed_s": round(time.time() - start, 2),
            })
        except Exception as exc:
            summary["results"].append({
                "run": i,
                "ok": False,
                "stream_ok": False,
                "error": str(exc),
                "elapsed_s": round(time.time() - start, 2),
            })
        print(f"Run {i}/{RUNS} done")

    summary["finished_at"] = _now()

    os.makedirs("reports", exist_ok=True)
    out_path = os.path.join("reports", "remote_wrapper_test.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    # Print a compact summary
    ok_count = sum(1 for r in summary["results"] if r.get("ok"))
    print(f"OK {ok_count}/{RUNS}")


if __name__ == "__main__":
    main()