progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/llama/smoke.sh
+++ b/pyinfra/framework/compose/llama/smoke.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Smoke test for a running llama-server (kyuz0 rocm-7.2.2): liveness via
+# /health, a tiny OpenAI-compatible chat completion, then a throughput
+# readout (eval_tps) for direct comparison with Ollama.
+set -e -u -o pipefail
+# Target host:port and model name; override via LLAMA_HOST / LLAMA_MODEL.
+HOST=${LLAMA_HOST:-127.0.0.1:8080}
+MODEL=${LLAMA_MODEL:-qwen3-coder}
+
+printf '%s\n' "[smoke] GET /health on $HOST"
+curl --fail --silent --show-error "http://$HOST/health" | python3 -m json.tool
+
+echo
+echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
+curl -fsS "http://$HOST/v1/chat/completions" \
+    -H 'Content-Type: application/json' \
+    -d "{
+        \"model\": \"$MODEL\",
+        \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
+        \"max_tokens\": 16,
+        \"temperature\": 0.0
+    }" | python3 -m json.tool
+
+echo
+echo "[smoke] perf measure — eval_tps and prompt_tps"
+# llama.cpp's native /completion endpoint returns a 'timings' object. Rates
+# print as '?' when a field is absent or non-numeric instead of crashing.
+curl -fsS "http://$HOST/completion" \
+    -H 'Content-Type: application/json' \
+    -d '{
+        "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
+        "n_predict": 200, "temperature": 0.0, "stream": false
+    }' | python3 -c "
+import json, sys
+t = json.load(sys.stdin).get('timings') or {}
+def rate(key):  # two-decimal rate, or '?' if missing / not a number
+    return f'{t[key]:.2f}' if isinstance(t.get(key), (int, float)) else '?'
+print(f'predicted_per_second:  {rate(\"predicted_per_second\")} tok/s')
+print(f'prompt_per_second:     {rate(\"prompt_per_second\")} tok/s')
+print(f'predicted_n:           {t.get(\"predicted_n\", \"?\")}')
+print(f'prompt_n:              {t.get(\"prompt_n\", \"?\")}')
+"
+
+echo
+echo "[smoke] passed"