49 lines
1.8 KiB
Bash
49 lines
1.8 KiB
Bash
|
|
#!/usr/bin/env bash
|
||
|
|
# Smoke-test the running qwen3-235b llama-server (port 8081). Hits
|
||
|
|
# /health for liveness, then a tiny OpenAI-compatible chat completion,
|
||
|
|
# then measures eval_tps via /completion. Generation is bigger than
|
||
|
|
# llama's smoke (n_predict=64) because at 5-10 tok/s the per-token
|
||
|
|
# noise floor swamps a 16-token sample.
|
||
|
|
set -euo pipefail
|
||
|
|
|
||
|
|
HOST="${QWEN235_HOST:-127.0.0.1:8081}"
|
||
|
|
MODEL="${QWEN235_MODEL:-qwen3-235b}"
|
||
|
|
|
||
|
|
echo "[smoke] GET /health on $HOST"
|
||
|
|
curl -fsS "http://$HOST/health" | python3 -m json.tool
|
||
|
|
|
||
|
|
echo
|
||
|
|
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
|
||
|
|
curl -fsS "http://$HOST/v1/chat/completions" \
|
||
|
|
-H 'Content-Type: application/json' \
|
||
|
|
-d "{
|
||
|
|
\"model\": \"$MODEL\",
|
||
|
|
\"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
|
||
|
|
\"max_tokens\": 16,
|
||
|
|
\"temperature\": 0.0
|
||
|
|
}" | python3 -m json.tool
|
||
|
|
|
||
|
|
echo
|
||
|
|
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=64)"
|
||
|
|
# Bigger sample than llama/smoke.sh — at ~7 tok/s the first few tokens'
|
||
|
|
# warmup dominates a 16-token measurement.
|
||
|
|
curl -fsS "http://$HOST/completion" \
|
||
|
|
-H 'Content-Type: application/json' \
|
||
|
|
-d '{
|
||
|
|
"prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
|
||
|
|
"n_predict": 64,
|
||
|
|
"temperature": 0.0,
|
||
|
|
"stream": false
|
||
|
|
}' | python3 -c "
|
||
|
|
import json, sys
|
||
|
|
r = json.load(sys.stdin)
|
||
|
|
t = r.get('timings', {})
|
||
|
|
print(f'predicted_per_second: {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s')
|
||
|
|
print(f'prompt_per_second: {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s')
|
||
|
|
print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}')
|
||
|
|
print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}')
|
||
|
|
"
|
||
|
|
|
||
|
|
echo
|
||
|
|
echo "[smoke] passed — expected band 5-10 tok/s decode; <3 tok/s = investigate"
|