#!/usr/bin/env bash
# bench-engines — compare decode/prefill throughput of Ollama vs
# llama.cpp (kyuz0 toolbox) on the SAME GGUF on gfx1151.
#
# Why this exists. The GGUF-tier consolidation decision (see the
# framework README "Inference engine consolidation") hinges on one
# hardware-specific unknown: how close is Ollama's bundled llama.cpp to
# the gfx1151-tuned kyuz0 build on *this* box? If decode t/s is within
# ~10-15 %, Ollama's convenience wins (it auto-swaps, so no llama-swap
# needed). If kyuz0's rocWMMA flash-attention lead is large, that argues
# for keeping llama.cpp behind llama-swap. This measures it.
#
# Method. Serves the identical GGUF on each engine in isolation (the
# other GGUF engine, qwen3-235b and kimi-linear are stopped so nothing
# competes for the arena), warms up, then runs BENCH_RUNS raw-completion
# trials at a fixed decode length. Reads each engine's own authoritative
# timing fields — no token-counting guesswork:
#   - llama.cpp /completion → .timings.{prompt,predicted}_per_second
#   - Ollama   /api/generate → {prompt_eval,eval}_{count,duration}
# Uses raw prompts (no chat template) on both for an apples-to-apples
# prompt-in / tokens-out measurement.
#
# Run ON THE BOX (hits localhost + docker). Requires jq.
#
# Usage:
#   bench-engines                 # bench the model llama.yml serves
#   bench-engines status          # show what's currently GPU-resident
#   BENCH_RUNS=5 bench-engines     # more trials (default 3)
#
# To bench a different model (e.g. Qwen3.6-27B): point compose/llama.yml
# at the new GGUF, set GGUF below (or export BENCH_GGUF) to match,
# redeploy, rerun.

set -euo pipefail

# Abort unless a tunable is a positive decimal integer (no sign, no
# leading zero, no fraction).
# Arguments: $1 - name shown in the error message
#            $2 - value to check
# Exits 1 with a message on stderr when the value is rejected.
require_positive_int() {
    if [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
        return 0
    fi
    echo "bench-engines: $1 must be a positive integer (got '$2')" >&2
    exit 1
}

COMPOSE_ROOT="/srv/docker"
RUNS="${BENCH_RUNS:-3}"
N_PREDICT="${BENCH_N_PREDICT:-256}"
WAIT_TIMEOUT="${BENCH_WAIT_TIMEOUT:-600}"

# Fail fast on bad tunables: RUNS is a divisor and loop bound, N_PREDICT
# is passed to jq as a JSON number, and WAIT_TIMEOUT is used in shell
# arithmetic. Catching a bad value here beats failing halfway through,
# after services have already been stopped.
require_positive_int BENCH_RUNS "$RUNS"
require_positive_int BENCH_N_PREDICT "$N_PREDICT"
require_positive_int BENCH_WAIT_TIMEOUT "$WAIT_TIMEOUT"

# Must match the GGUF that compose/llama.yml serves — this is the file
# registered into Ollama so both engines run identical weights. The
# path is the in-container path (/models is bind-mounted into both).
GGUF="${BENCH_GGUF:-/models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf}"
OLLAMA_BENCH_MODEL="bench-engines"

# A fixed, moderately long prompt so prefill is measurable. Decode is the
# number that actually decides the consolidation (bandwidth-bound).
# `read -d ''` hits EOF and returns non-zero by design, hence `|| true`.
read -r -d '' PROMPT <<'EOF' || true
You are a careful systems engineer. Explain, in detail and step by step,
how a unified-memory APU shares a single physical RAM pool between the
CPU and an integrated GPU, what a GTT aperture is, why demand paging
matters for large language model weights, and how this differs from a
discrete GPU with dedicated VRAM. Be thorough and precise.
EOF

LLAMA_URL="http://127.0.0.1:8080"
OLLAMA_URL="http://127.0.0.1:11434"

# Abort the script with a diagnostic on stderr unless the command named
# by $1 can be resolved by the shell.
need() {
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "bench-engines: missing '$1'" >&2
        exit 1
    fi
}
need jq
need curl
# Every mode, `status` included, shells out to docker. is_running hides
# docker's stderr, so without this check a missing docker binary would
# simply be reported as every container being "down".
need docker

# Succeeds when `docker inspect` reports .State.Running as true for the
# container named by $1. docker's stderr is discarded, so a container
# that does not exist simply counts as not running.
is_running() {
    local container="$1"
    docker inspect --format '{{.State.Running}}' "$container" 2>/dev/null \
        | grep -q true
}
# Succeeds when curl can fetch the URL in $1 within 5 seconds without an
# HTTP error status. All curl output (body and diagnostics) is discarded.
is_healthy() {
    local url="$1"
    curl --fail --silent --show-error --max-time 5 "$url" &>/dev/null
}

# Stop the compose project in $COMPOSE_ROOT/<name> if the container of
# the same name is running; do nothing (and succeed) otherwise.
# Arguments: $1 - service name (used as both container name and
#                 compose directory)
# Returns:   0 when already stopped or stopped cleanly, 1 when
#            `docker compose down` (or the cd into its directory) fails.
down() {
    local svc="$1" dir="$COMPOSE_ROOT/$1" log
    is_running "$svc" || return 0
    echo "  stopping $svc"
    # Capture compose output rather than discarding it: stay quiet on
    # success, but on failure show why instead of letting `set -e` kill
    # the run without a word.
    if ! log=$( { cd "$dir" && docker compose down; } 2>&1 ); then
        echo "bench-engines: 'docker compose down' failed for $svc:" >&2
        printf '%s\n' "$log" >&2
        return 1
    fi
}

# Start the compose project in $COMPOSE_ROOT/<name> and block until its
# health URL answers, polling every 5 seconds.
# Arguments: $1 - service name (compose directory and container name)
#            $2 - URL probed with is_healthy
# Globals:   COMPOSE_ROOT, WAIT_TIMEOUT (read)
# Returns:   0 once healthy; 1 if `docker compose up -d` fails.
# Exits:     1 after WAIT_TIMEOUT seconds without a healthy answer,
#            after dumping the container's last 20 log lines to stderr.
up_wait() {
    local svc="$1" health="$2" deadline=$(( SECONDS + WAIT_TIMEOUT )) log
    echo "  starting $svc"
    # Stay quiet on success, but keep compose's output so a failed start
    # is explained instead of aborting silently under `set -e`.
    if ! log=$( { cd "$COMPOSE_ROOT/$svc" && docker compose up -d; } 2>&1 ); then
        echo "bench-engines: 'docker compose up -d' failed for $svc:" >&2
        printf '%s\n' "$log" >&2
        return 1
    fi
    printf "    waiting for health"
    until is_healthy "$health"; do
        if (( SECONDS > deadline )); then
            echo " TIMEOUT"
            # Best effort: a failing `docker logs` must not change the
            # exit status of the timeout itself.
            docker logs --tail 20 "$svc" >&2 || true
            exit 1
        fi
        sleep 5
        printf "."
    done
    echo " ok"
}

# --- isolation: only the engine under test is GPU-resident ------------
# Stops every service that could compete with the engine named by $1:
# the other GGUF engine first, then qwen3-235b and kimi-linear. Any
# other argument is a no-op.
isolate_for() {
    local rival victim
    case "$1" in
        llama)  rival=ollama ;;
        ollama) rival=llama ;;
        *)      return 0 ;;
    esac
    for victim in "$rival" qwen3-235b kimi-linear; do
        down "$victim"
    done
}

# --- register the GGUF into Ollama (idempotent) -----------------------
# (Re)creates the Ollama model ${OLLAMA_BENCH_MODEL} from ${GGUF} on
# every run. It deliberately does NOT skip when the name already exists:
# a name-only check cannot tell which GGUF an earlier run registered, so
# after changing GGUF the benchmark would silently compare llama.cpp
# against stale weights, defeating the "same GGUF" premise.
# NOTE(review): relies on `ollama create` replacing an existing model of
# the same name and reusing an already-imported blob — confirm on the
# deployed Ollama version.
# Globals: OLLAMA_BENCH_MODEL, GGUF (read)
# Returns: the exit status of the `printf | docker exec` pipeline.
register_ollama_model() {
    echo "  registering ${OLLAMA_BENCH_MODEL} from ${GGUF}"
    # FROM the in-container GGUF path, with num_ctx pinned to 65536.
    # NOTE(review): the intent is to mirror llama.yml's context/KV
    # settings; only num_ctx is set here — verify against llama.yml.
    printf 'FROM %s\nPARAMETER num_ctx 65536\n' "$GGUF" \
        | docker exec -i ollama ollama create "${OLLAMA_BENCH_MODEL}" -f -
}

# --- one trial; echoes "prefill_tps decode_tps" -----------------------
# llama.cpp trial: POSTs the fixed prompt to $LLAMA_URL/completion with
# temperature 0 and prompt caching off, then prints the server-reported
# prompt and predicted tokens-per-second separated by one space.
# Globals: PROMPT, N_PREDICT, LLAMA_URL (read)
trial_llama() {
    local payload reply
    payload=$(jq -n --argjson n "$N_PREDICT" --arg p "$PROMPT" \
        '{prompt:$p, n_predict:$n, temperature:0, cache_prompt:false}')
    reply=$(curl -fsS --max-time 300 "$LLAMA_URL/completion" \
        -H 'Content-Type: application/json' -d "$payload")
    jq -r '.timings | "\(.prompt_per_second) \(.predicted_per_second)"' <<<"$reply"
}

# Ollama trial: POSTs the fixed prompt to $OLLAMA_URL/api/generate as a
# raw, non-streamed request at temperature 0 and prints
# "prefill_tps decode_tps" computed from the response's count/duration
# fields.
# Globals: OLLAMA_BENCH_MODEL, PROMPT, N_PREDICT, OLLAMA_URL (read)
#
# Prefill is reported as 0 when prompt_eval_count/prompt_eval_duration
# are absent or the duration is zero, instead of letting jq abort on a
# null or zero division and take the whole benchmark down with it.
# NOTE(review): presumably this happens when Ollama serves the repeated
# prompt from its cache after the warmup — confirm. Either way, Ollama
# prefill numbers after the first request may not be comparable with
# llama.cpp, which is called with cache_prompt:false.
# Decode stats are left unguarded on purpose: a response without them is
# unusable for the verdict and should fail loudly.
trial_ollama() {
    local body resp
    body=$(jq -n --arg m "$OLLAMA_BENCH_MODEL" --arg p "$PROMPT" --argjson n "$N_PREDICT" \
        '{model:$m, prompt:$p, raw:true, stream:false, options:{temperature:0, num_predict:$n}}')
    resp=$(curl -fsS --max-time 300 "$OLLAMA_URL/api/generate" \
        -H 'Content-Type: application/json' -d "$body")
    # durations are ns; t/s = count / (duration/1e9)
    jq -r '
        def tps(count; ns):
            if (ns // 0) > 0 then (count // 0) / (ns / 1e9) else 0 end;
        "\(tps(.prompt_eval_count; .prompt_eval_duration)) \(.eval_count / (.eval_duration/1e9))"
    ' <<<"$resp"
}

# --- run R trials, print per-trial + mean decode ----------------------
# Runs one discarded warmup trial, then RUNS measured trials, printing
# each result and the means.
# Arguments: $1 - engine label used in messages
#            $2 - name of a function that prints "prefill_tps decode_tps"
# Globals:   RUNS (read); MEAN_PP, MEAN_TG (written: mean prefill with
#            one decimal, mean decode with two, both rounded)
# Returns:   1 with a message on stderr when RUNS is not a positive
#            integer, or a trial fails or prints anything other than two
#            non-negative numbers.
bench() {
    local engine="$1" trialfn="$2"
    local i out pp tg samples=""
    local num_re='^[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'

    # RUNS is the divisor for the means; reject 0 and garbage up front.
    if ! [[ "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
        echo "bench-engines: BENCH_RUNS must be a positive integer (got '$RUNS')" >&2
        return 1
    fi

    # Called bare (not in a condition) so that under `set -e` a failing
    # request inside the trial function still aborts the run.
    echo "  warmup..."
    "$trialfn" >/dev/null

    for (( i = 1; i <= RUNS; i++ )); do
        out=$("$trialfn") || {
            echo "bench-engines: $engine trial $i failed" >&2
            return 1
        }
        pp="" tg=""
        read -r pp tg <<<"$out" || true
        # A failed request can yield empty output, and a response missing
        # its timing fields yields "null"; neither may reach the means.
        if ! [[ "$pp" =~ $num_re && "$tg" =~ $num_re ]]; then
            echo "bench-engines: $engine trial $i returned no usable timings: '$out'" >&2
            return 1
        fi
        # awk handles all float formatting: locale-independent, and it
        # accepts exponent notation that jq may emit.
        LC_ALL=C awk -v n="$i" -v pp="$pp" -v tg="$tg" \
            'BEGIN { printf "  trial %d: prefill %6.1f t/s   decode %6.2f t/s\n", n, pp, tg }'
        samples+="$pp $tg"$'\n'
    done

    read -r MEAN_PP MEAN_TG < <(
        printf '%s' "$samples" \
            | LC_ALL=C awk '{ pp += $1; tg += $2 }
                            END { printf "%.1f %.2f\n", pp / NR, tg / NR }'
    )
    printf "  %s mean: prefill %s t/s   decode %s t/s\n" "$engine" "$MEAN_PP" "$MEAN_TG"
}

# Print the supported invocations on stdout.
usage() {
    cat <<'EOF'
Usage:
  bench-engines                 # bench the model llama.yml serves
  bench-engines status          # show what's currently GPU-resident
  bench-engines -h | --help     # show this help
  BENCH_RUNS=5 bench-engines     # more trials (default 3)
EOF
}

# Dispatch on the first argument. Anything unrecognised is rejected
# rather than ignored: the default action stops running services, so a
# typo or `--help` must never fall through to the benchmark.
case "${1:-}" in
    "")
        ;;
    status)
        for c in ollama llama kimi-linear qwen3-235b; do
            if is_running "$c"; then
                echo "$c: up"
            else
                echo "$c: down"
            fi
        done
        exit 0
        ;;
    -h|--help)
        usage
        exit 0
        ;;
    *)
        echo "bench-engines: unknown argument '$1'" >&2
        usage >&2
        exit 2
        ;;
esac

# bc is only needed for the verdict arithmetic, so `status` works
# without it.
need bc

# NOTE(review): services stopped by isolate_for (qwen3-235b, kimi-linear)
# are not restarted below, and the run ends with llama down and ollama
# up — confirm that is the intended end state.
echo "== llama.cpp (kyuz0 ${GGUF##*/}) =="
isolate_for llama
up_wait llama "$LLAMA_URL/health"
bench "llama.cpp" trial_llama
LLAMA_TG="$MEAN_TG"
down llama

echo
echo "== Ollama (same GGUF) =="
isolate_for ollama
up_wait ollama "$OLLAMA_URL/api/tags"
register_ollama_model
bench "ollama" trial_ollama
OLLAMA_TG="$MEAN_TG"

echo
echo "== Verdict =="
# Ollama as % of llama.cpp decode throughput. An empty or all-zero
# llama.cpp mean would make bc divide by zero, so report n/a instead.
if [[ "$LLAMA_TG" =~ ^0*[.]?0*$ ]]; then
    PCT="n/a"
else
    PCT=$(echo "scale=1; 100 * $OLLAMA_TG / $LLAMA_TG" | bc -l)
fi
printf "  llama.cpp decode: %s t/s\n  ollama decode:    %s t/s  (%s%% of llama.cpp)\n" \
    "$LLAMA_TG" "$OLLAMA_TG" "$PCT"
echo
echo "  Guidance: Ollama >=85% of llama.cpp  -> option 1 (Ollama + vLLM,"
echo "            drop standalone llama.cpp; Ollama self-swaps, no llama-swap)."
echo "            Larger gap                  -> option 2 (keep llama.cpp"
echo "            behind llama-swap with coexistence groups; drop Ollama)."