#!/usr/bin/env bash # Smoke-test the running qwen3-235b llama-server (port 8081). Hits # /health for liveness, then a tiny OpenAI-compatible chat completion, # then measures eval_tps via /completion. Generation is bigger than # llama's smoke (n_predict=64) because at 5-10 tok/s the per-token # noise floor swamps a 16-token sample. set -euo pipefail HOST="${QWEN235_HOST:-127.0.0.1:8081}" MODEL="${QWEN235_MODEL:-qwen3-235b}" echo "[smoke] GET /health on $HOST" curl -fsS "http://$HOST/health" | python3 -m json.tool echo echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation" curl -fsS "http://$HOST/v1/chat/completions" \ -H 'Content-Type: application/json' \ -d "{ \"model\": \"$MODEL\", \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}], \"max_tokens\": 16, \"temperature\": 0.0 }" | python3 -m json.tool echo echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=64)" # Bigger sample than llama/smoke.sh — at ~7 tok/s the first few tokens' # warmup dominates a 16-token measurement. curl -fsS "http://$HOST/completion" \ -H 'Content-Type: application/json' \ -d '{ "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.", "n_predict": 64, "temperature": 0.0, "stream": false }' | python3 -c " import json, sys r = json.load(sys.stdin) t = r.get('timings', {}) print(f'predicted_per_second: {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s') print(f'prompt_per_second: {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s') print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}') print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}') " echo echo "[smoke] passed — expected band 5-10 tok/s decode; <3 tok/s = investigate"