pyinfra/framework/compose/qwen3-235b/smoke.sh

#!/usr/bin/env bash
# Smoke-test the running qwen3-235b llama-server (port 8081). Hits
# /health for liveness, then a tiny OpenAI-compatible chat completion,
# then measures eval_tps via /completion. Generation is bigger than
# llama's smoke (n_predict=64) because at 5-10 tok/s the per-token
# noise floor swamps a 16-token sample.
set -euo pipefail

HOST="${QWEN235_HOST:-127.0.0.1:8081}"
MODEL="${QWEN235_MODEL:-qwen3-235b}"

echo "[smoke] GET /health on $HOST"
curl -fsS "http://$HOST/health" | python3 -m json.tool

echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d "{
        \"model\": \"$MODEL\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
        \"max_tokens\": 16,
        \"temperature\": 0.0
    }" | python3 -m json.tool

echo
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=64)"
# Bigger sample than llama/smoke.sh — at ~7 tok/s the first few tokens'
# warmup dominates a 16-token measurement.
curl -fsS "http://$HOST/completion" \
    -H 'Content-Type: application/json' \
    -d '{
        "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
        "n_predict": 64,
        "temperature": 0.0,
        "stream": false
    }' | python3 -c "
import json, sys
r = json.load(sys.stdin)
t = r.get('timings', {})
print(f'predicted_per_second:  {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s')
print(f'prompt_per_second:     {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s')
print(f'predicted_n:           {t.get(\"predicted_n\", \"?\")}')
print(f'prompt_n:              {t.get(\"prompt_n\", \"?\")}')
"

echo
echo "[smoke] passed — expected band 5-10 tok/s decode; <3 tok/s = investigate"
progress 235b 2026-06-08 15:31:50 +01:00			`#!/usr/bin/env bash`
			`# Smoke-test the running qwen3-235b llama-server (port 8081). Hits`
			`# /health for liveness, then a tiny OpenAI-compatible chat completion,`
			`# then measures eval_tps via /completion. Generation is bigger than`
			`# llama's smoke (n_predict=64) because at 5-10 tok/s the per-token`
			`# noise floor swamps a 16-token sample.`
			`set -euo pipefail`

			`HOST="${QWEN235_HOST:-127.0.0.1:8081}"`
			`MODEL="${QWEN235_MODEL:-qwen3-235b}"`

			`echo "[smoke] GET /health on $HOST"`
			`curl -fsS "http://$HOST/health" \| python3 -m json.tool`

			`echo`
			`echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"`
			`curl -fsS "http://$HOST/v1/chat/completions" \`
			`-H 'Content-Type: application/json' \`
			`-d "{`
			`\"model\": \"$MODEL\",`
			`\"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],`
			`\"max_tokens\": 16,`
			`\"temperature\": 0.0`
			`}" \| python3 -m json.tool`

			`echo`
			`echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=64)"`
			`# Bigger sample than llama/smoke.sh — at ~7 tok/s the first few tokens'`
			`# warmup dominates a 16-token measurement.`
			`curl -fsS "http://$HOST/completion" \`
			`-H 'Content-Type: application/json' \`
			`-d '{`
			`"prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",`
			`"n_predict": 64,`
			`"temperature": 0.0,`
			`"stream": false`
			`}' \| python3 -c "`
			`import json, sys`
			`r = json.load(sys.stdin)`
			`t = r.get('timings', {})`
			`print(f'predicted_per_second: {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s')`
			`print(f'prompt_per_second: {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s')`
			`print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}')`
			`print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}')`
			`"`

			`echo`
			`echo "[smoke] passed — expected band 5-10 tok/s decode; <3 tok/s = investigate"`