#!/usr/bin/env bash
# Smoke-test the running qwable llama-server (port 8082). Hits /health
# for liveness, then a tiny OpenAI-compatible chat completion, then
# measures eval_tps via /completion. Dense 27B → expect ~10-15 tok/s.
set -euo pipefail

HOST="${QWABLE_HOST:-127.0.0.1:8082}"
MODEL="${QWABLE_MODEL:-qwable}"

echo "[smoke] GET /health on $HOST"
curl -fsS "http://$HOST/health" | python3 -m json.tool

echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d "{
        \"model\": \"$MODEL\",
        \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
        \"max_tokens\": 16,
        \"temperature\": 0.0
    }" | python3 -m json.tool

echo
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)"
# 128 tokens — at ~10-15 tok/s the per-token warmup noise still matters,
# but a dense 27B settles faster than the 235B so we don't need 64-only.
curl -fsS "http://$HOST/completion" \
    -H 'Content-Type: application/json' \
    -d '{
        "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
        "n_predict": 128,
        "temperature": 0.0,
        "stream": false
    }' | python3 -c "
import json, sys
r = json.load(sys.stdin)
t = r.get('timings', {})
print(f'predicted_per_second:  {t.get(\"predicted_per_second\", \"?\"):.2f} tok/s')
print(f'prompt_per_second:     {t.get(\"prompt_per_second\", \"?\"):.2f} tok/s')
print(f'predicted_n:           {t.get(\"predicted_n\", \"?\")}')
print(f'prompt_n:              {t.get(\"prompt_n\", \"?\")}')
"

echo
echo "[smoke] passed — expected band 10-15 tok/s decode (dense 27B Q4)"