Files
localgenai/pyinfra/framework/compose/ornith/smoke.sh

48 lines
1.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# Smoke-test the running ornith llama-server (port 8083). Hits /health
# for liveness, then a tiny OpenAI-compatible chat completion, then
# measures eval_tps via /completion. MoE 35B-A3B (~3B active) → expect
# ~80-100 tok/s, like the 30B-A3B workhorse (NOT a dense 27/31B).
set -euo pipefail
HOST="${ORNITH_HOST:-127.0.0.1:8083}"
MODEL="${ORNITH_MODEL:-ornith}"
echo "[smoke] GET /health on $HOST"
curl -fsS "http://$HOST/health" | python3 -m json.tool
echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
# Ornith opens with a <think> block; ask for a terse final answer.
curl -fsS "http://$HOST/v1/chat/completions" \
-H 'Content-Type: application/json' \
-d "{
\"model\": \"$MODEL\",
\"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
\"max_tokens\": 256,
\"temperature\": 0.0
}" | python3 -m json.tool
echo
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)"
curl -fsS "http://$HOST/completion" \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
"n_predict": 128,
"temperature": 0.0,
"stream": false
}' | python3 -c "
import json, sys
r = json.load(sys.stdin)
t = r.get('timings', {})
print(f'predicted_per_second: {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s')
print(f'prompt_per_second: {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s')
print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}')
print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}')
"
echo
echo "[smoke] passed — expected band ~80-100 tok/s decode (35B-A3B MoE Q4)."
echo " <30 tok/s = investigate arena (see qwen3-235b/README.md)."