added models, model-swap, ...

2026-06-26 08:13:33 -04:00
parent de1635872f
commit 224afbb3a6
18 changed files with 1659 additions and 243 deletions
--- a/pyinfra/framework/scripts/bench-engines
+++ b/pyinfra/framework/scripts/bench-engines
@@ -0,0 +1,177 @@
+#!/usr/bin/env bash
+# bench-engines — compare decode/prefill throughput of Ollama vs
+# llama.cpp (kyuz0 toolbox) on the SAME GGUF on gfx1151.
+#
+# Why this exists. The GGUF-tier consolidation decision (see the
+# framework README "Inference engine consolidation") hinges on one
+# hardware-specific unknown: how close is Ollama's bundled llama.cpp to
+# the gfx1151-tuned kyuz0 build on *this* box? If decode t/s is within
+# ~10-15 %, Ollama's convenience wins (it auto-swaps, so no llama-swap
+# needed). If kyuz0's rocWMMA flash-attention lead is large, that argues
+# for keeping llama.cpp behind llama-swap. This measures it.
+#
+# Method. Serves the identical GGUF on each engine in isolation (the
+# other GGUF engine, 235b and kimi-linear are stopped so nothing competes
+# for the arena), warms up, then runs R raw-completion trials at a fixed decode
+# length. Reads each engine's own authoritative timing fields — no
+# token-counting guesswork:
+#   - llama.cpp /completion → .timings.{prompt,predicted}_per_second
+#   - Ollama   /api/generate → {prompt_eval,eval}_{count,duration}
+# Uses raw prompts (no chat template) on both for an apples-to-apples
+# prompt-in / tokens-out measurement.
+#
+# Run ON THE BOX (hits localhost + docker). Requires jq, curl and bc.
+#
+# Usage:
+#   bench-engines                 # bench the model llama.yml serves
+#   bench-engines status          # show what's currently GPU-resident
+#   BENCH_RUNS=5 bench-engines     # more trials (default 3)
+#
+# To bench a different model (e.g. Qwen3.6-27B): point compose/llama.yml
+# at the new GGUF, set GGUF below to match, redeploy, rerun.
+
+# Strict mode: abort on unhandled errors, unset variables and failed
+# pipeline stages.
+set -euo pipefail
+
+# Each service's compose project lives in $COMPOSE_ROOT/<service>.
+COMPOSE_ROOT="/srv/docker"
+# Environment-overridable tunables:
+#   BENCH_RUNS          timed trials per engine, after one untimed warmup
+#   BENCH_N_PREDICT     tokens requested from the engine in each trial
+#   BENCH_WAIT_TIMEOUT  seconds to wait for an engine's health URL
+RUNS="${BENCH_RUNS:-3}"
+N_PREDICT="${BENCH_N_PREDICT:-256}"
+WAIT_TIMEOUT="${BENCH_WAIT_TIMEOUT:-600}"
+
+# Must match the GGUF that compose/llama.yml serves — this is the file
+# registered into Ollama so both engines run identical weights. The
+# path is the in-container path (/models is bind-mounted into both).
+# Override with BENCH_GGUF.
+GGUF="${BENCH_GGUF:-/models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf}"
+# Name under which the GGUF is registered inside Ollama.
+OLLAMA_BENCH_MODEL="bench-engines"
+
+# A fixed, moderately long prompt so prefill is measurable. Decode is the
+# number that actually decides the consolidation (bandwidth-bound).
+# `read -d ''` consumes the whole heredoc and returns non-zero because it
+# reaches EOF without seeing a NUL delimiter; `|| true` keeps that from
+# tripping `set -e`.
+read -r -d '' PROMPT <<'EOF' || true
+You are a careful systems engineer. Explain, in detail and step by step,
+how a unified-memory APU shares a single physical RAM pool between the
+CPU and an integrated GPU, what a GTT aperture is, why demand paging
+matters for large language model weights, and how this differs from a
+discrete GPU with dedicated VRAM. Be thorough and precise.
+EOF
+
+# Base URLs of the two engines under test (both reached over loopback).
+LLAMA_URL="http://127.0.0.1:8080"
+OLLAMA_URL="http://127.0.0.1:11434"
+
+need() { command -v "$1" >/dev/null 2>&1 || { echo "bench-engines: missing '$1'" >&2; exit 1; }; }
+need jq
+need curl
+
+is_running() { docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null | grep -q true; }
+is_healthy() { curl -fsS --max-time 5 "$1" >/dev/null 2>&1; }
+
+down() {
+    local dir="$COMPOSE_ROOT/$1"
+    is_running "$1" || return 0
+    echo "  stopping $1"
+    (cd "$dir" && docker compose down >/dev/null 2>&1)
+}
+
+up_wait() {
+    local svc="$1" health="$2" deadline=$(( SECONDS + WAIT_TIMEOUT ))
+    echo "  starting $svc"
+    (cd "$COMPOSE_ROOT/$svc" && docker compose up -d >/dev/null 2>&1)
+    printf "    waiting for health"
+    while ! is_healthy "$health"; do
+        (( SECONDS > deadline )) && { echo " TIMEOUT"; docker logs --tail 20 "$svc" >&2; exit 1; }
+        sleep 5; printf "."
+    done
+    echo " ok"
+}
+
+# --- isolation: only the engine under test is GPU-resident ------------
+isolate_for() {
+    case "$1" in
+        llama)  down ollama; down qwen3-235b; down kimi-linear ;;
+        ollama) down llama;  down qwen3-235b; down kimi-linear ;;
+    esac
+}
+
+# --- register the GGUF into Ollama (idempotent) -----------------------
+register_ollama_model() {
+    if docker exec ollama ollama list 2>/dev/null | grep -q "^${OLLAMA_BENCH_MODEL}"; then
+        echo "  ollama model '${OLLAMA_BENCH_MODEL}' already registered"
+        return 0
+    fi
+    echo "  registering ${OLLAMA_BENCH_MODEL} from ${GGUF}"
+    # FROM the in-container GGUF path; num_ctx/kv match llama.yml so the
+    # comparison stays fair.
+    printf 'FROM %s\nPARAMETER num_ctx 65536\n' "$GGUF" \
+        | docker exec -i ollama ollama create "${OLLAMA_BENCH_MODEL}" -f -
+}
+
+# --- one trial; echoes "prefill_tps decode_tps" -----------------------
+# One llama.cpp trial: POST the fixed prompt to /completion and echo
+# "prefill_tps decode_tps" taken from the response's .timings object.
+trial_llama() {
+    local payload reply
+    payload=$(jq -n --arg p "$PROMPT" --argjson n "$N_PREDICT" \
+        '{prompt:$p, n_predict:$n, temperature:0, cache_prompt:false}')
+    reply=$(curl -fsS --max-time 300 "$LLAMA_URL/completion" \
+        -H 'Content-Type: application/json' -d "$payload")
+    jq -r '.timings | "\(.prompt_per_second) \(.predicted_per_second)"' <<<"$reply"
+}
+
+trial_ollama() {
+    local body resp
+    body=$(jq -n --arg m "$OLLAMA_BENCH_MODEL" --arg p "$PROMPT" --argjson n "$N_PREDICT" \
+        '{model:$m, prompt:$p, raw:true, stream:false, options:{temperature:0, num_predict:$n}}')
+    resp=$(curl -fsS --max-time 300 "$OLLAMA_URL/api/generate" \
+        -H 'Content-Type: application/json' -d "$body")
+    # durations are ns; t/s = count / (duration/1e9)
+    echo "$resp" | jq -r '
+        "\(.prompt_eval_count / (.prompt_eval_duration/1e9)) \(.eval_count / (.eval_duration/1e9))"'
+}
+
+# --- run R trials, print per-trial + mean decode ----------------------
+bench() {
+    local engine="$1" trialfn="$2"
+    echo "  warmup..."; "$trialfn" >/dev/null
+    local sum_pp=0 sum_tg=0
+    for i in $(seq 1 "$RUNS"); do
+        read -r pp tg < <("$trialfn")
+        printf "  trial %d: prefill %6.1f t/s   decode %6.2f t/s\n" "$i" "$pp" "$tg"
+        sum_pp=$(echo "$sum_pp + $pp" | bc -l)
+        sum_tg=$(echo "$sum_tg + $tg" | bc -l)
+    done
+    MEAN_PP=$(echo "scale=1; $sum_pp / $RUNS" | bc -l)
+    MEAN_TG=$(echo "scale=2; $sum_tg / $RUNS" | bc -l)
+    printf "  %s mean: prefill %s t/s   decode %s t/s\n" "$engine" "$MEAN_PP" "$MEAN_TG"
+}
+
+# `bench-engines status`: report up/down for each inference container,
+# then exit without benchmarking.
+if [[ "${1:-}" == "status" ]]; then
+    for c in ollama llama kimi-linear qwen3-235b; do
+        if is_running "$c"; then
+            echo "$c: up"
+        else
+            echo "$c: down"
+        fi
+    done
+    exit 0
+fi
+
+need bc
+
+echo "== llama.cpp (kyuz0 ${GGUF##*/}) =="
+isolate_for llama
+up_wait llama "$LLAMA_URL/health"
+bench "llama.cpp" trial_llama
+LLAMA_TG="$MEAN_TG"
+down llama
+
+echo
+echo "== Ollama (same GGUF) =="
+isolate_for ollama
+up_wait ollama "$OLLAMA_URL/api/tags"
+register_ollama_model
+bench "ollama" trial_ollama
+OLLAMA_TG="$MEAN_TG"
+
+echo
+echo "== Verdict =="
+# Ollama as % of llama.cpp decode throughput.
+PCT=$(echo "scale=1; 100 * $OLLAMA_TG / $LLAMA_TG" | bc -l)
+printf "  llama.cpp decode: %s t/s\n  ollama decode:    %s t/s  (%s%% of llama.cpp)\n" \
+    "$LLAMA_TG" "$OLLAMA_TG" "$PCT"
+echo
+echo "  Guidance: Ollama >=85% of llama.cpp  -> option 1 (Ollama + vLLM,"
+echo "            drop standalone llama.cpp; Ollama self-swaps, no llama-swap)."
+echo "            Larger gap                  -> option 2 (keep llama.cpp"
+echo "            behind llama-swap with coexistence groups; drop Ollama)."
--- a/pyinfra/framework/scripts/swap-model
+++ b/pyinfra/framework/scripts/swap-model
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# swap-model — coordinate which inference container is GPU-resident on
+# the Strix Halo box.
+#
+# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
+# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
+# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
+# between consumers. So switching models means stop-then-start of
+# whole compose stacks. This script encodes the per-target conflict
+# table + per-service health probes so the swap is one command.
+#
+# Usage:
+#   swap-model coder        # Qwen3-Coder-30B via Ollama (interactive)
+#   swap-model 235b         # Qwen3-235B-A22B via llama.cpp (long-task)
+#   swap-model kimi         # Kimi-Linear-48B-A3B via vLLM (long-context)
+#   swap-model qwable       # Qwable-3.6-27B via llama.cpp (Fable-style)
+#   swap-model comfyui      # ComfyUI (image generation)
+#   swap-model none         # everything down — free the GPU
+#   swap-model status       # show what's currently up
+#
+# Env knobs:
+#   SWAP_WAIT_TIMEOUT       seconds to wait for /health after up; default 600
+#                           (235B's 88 GB cold load can take 3-5 min)
+#
+# Out of scope (deliberately):
+#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
+#     no GPU footprint, left alone.
+#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
+#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
+#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
+#     for now run swap-model twice if you want both.
+
+# Strict mode: abort on unhandled errors, unset variables and failed
+# pipeline stages.
+set -euo pipefail
+
+# Each service's compose project lives in $COMPOSE_ROOT/<compose dir>.
+COMPOSE_ROOT="/srv/docker"
+# Seconds wait_healthy polls before giving up; override with
+# SWAP_WAIT_TIMEOUT.
+WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"
+
+# --- Service table -----------------------------------------------------------
+# Map short name → compose dir (under $COMPOSE_ROOT) and health URL.
+# Container name == compose dir name in every case (intentional convention,
+# enforced in compose/*.yml's container_name fields).
+# Both tables must carry the same set of keys: every function below indexes
+# them with the same short name.
+declare -A SVC_DIR=(
+    [ollama]=ollama
+    [llama]=llama
+    [kimi]=kimi-linear
+    [235b]=qwen3-235b
+    [qwable]=qwable
+    [comfyui]=comfyui
+)
+# URL probed with curl to decide whether a service is healthy.
+declare -A SVC_HEALTH=(
+    [ollama]="http://127.0.0.1:11434/api/tags"
+    [llama]="http://127.0.0.1:8080/health"
+    [kimi]="http://127.0.0.1:8000/v1/models"
+    [235b]="http://127.0.0.1:8081/health"
+    [qwable]="http://127.0.0.1:8082/health"
+    [comfyui]="http://127.0.0.1:8188/"
+)
+
+# --- Target → plan -----------------------------------------------------------
+# UP   = services that should be running after the swap
+# DOWN = services that must be stopped to free the GPU arena
+# (anything not in either list is left untouched — e.g. switching to coder
+# leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.)
+# Fill the global arrays UP and DOWN for target $1.
+# Returns 1 (leaving both arrays empty) for an unknown target.
+# 235b and comfyui coexist with nothing, so each of those targets stops
+# every other GPU service. That includes ollama and llama for comfyui,
+# mirroring the coder target, which stops comfyui.
+plan() {
+    UP=() ; DOWN=()
+    case "$1" in
+        coder)   UP=(ollama)  ; DOWN=(235b comfyui) ;;
+        235b)    UP=(235b)    ; DOWN=(ollama llama kimi qwable comfyui) ;;
+        kimi)    UP=(kimi)    ; DOWN=(235b comfyui) ;;
+        qwable)  UP=(qwable)  ; DOWN=(235b comfyui) ;;
+        comfyui) UP=(comfyui) ; DOWN=(ollama llama kimi 235b qwable) ;;
+        none)    UP=()        ; DOWN=(ollama llama kimi 235b qwable comfyui) ;;
+        *)       return 1 ;;
+    esac
+}
+
+# --- Probes ------------------------------------------------------------------
+# Succeeds when docker reports container $1 as running; a container that
+# cannot be inspected counts as not running.
+is_running() {
+    local state
+    state=$(docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null) || return 1
+    [[ "$state" == *true* ]]
+}
+
+# Succeeds when URL $1 answers with a non-error HTTP status within 5 s.
+is_healthy() {
+    local url="$1"
+    curl --fail --silent --show-error --max-time 5 "$url" >/dev/null 2>&1
+}
+
+wait_healthy() {
+    local svc="$1" url="${SVC_HEALTH[$1]}" deadline=$(( SECONDS + WAIT_TIMEOUT ))
+    printf "    waiting for %s health (timeout %ss)" "$svc" "$WAIT_TIMEOUT"
+    while ! is_healthy "$url"; do
+        if (( SECONDS > deadline )); then
+            printf " TIMEOUT\n"
+            echo "    last 20 lines of container log:" >&2
+            docker logs --tail 20 "${SVC_DIR[$svc]}" 2>&1 | sed 's/^/      /' >&2
+            return 1
+        fi
+        sleep 5
+        printf "."
+    done
+    printf " ok\n"
+}
+
+# --- Actions -----------------------------------------------------------------
+down_svc() {
+    local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"
+    if ! is_running "${SVC_DIR[$svc]}"; then
+        echo "  $svc: already down"
+        return 0
+    fi
+    echo "  stopping $svc"
+    (cd "$dir" && docker compose down)
+}
+
+up_svc() {
+    local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"
+    if is_running "${SVC_DIR[$svc]}" && is_healthy "${SVC_HEALTH[$svc]}"; then
+        echo "  $svc: already up + healthy"
+        return 0
+    fi
+    if [[ ! -d "$dir" ]]; then
+        echo "  $svc: compose dir $dir missing — run pyinfra deploy first" >&2
+        return 1
+    fi
+    echo "  starting $svc"
+    (cd "$dir" && docker compose up -d)
+    wait_healthy "$svc"
+}
+
+show_status() {
+    echo "Inference services:"
+    for svc in ollama llama kimi 235b qwable comfyui; do
+        local container="${SVC_DIR[$svc]}" state="down" health=""
+        if is_running "$container"; then
+            state="up"
+            if is_healthy "${SVC_HEALTH[$svc]}"; then
+                health=" (healthy)"
+            else
+                health=" (starting/unhealthy)"
+            fi
+        fi
+        printf "  %-8s %s%s\n" "$svc" "$state" "$health"
+    done
+}
+
+# Print the help text to stdout. The heredoc delimiter is unquoted, so
+# ${WAIT_TIMEOUT} is expanded to the effective timeout at call time.
+usage() {
+    cat <<EOF
+swap-model — coordinate inference containers on Strix Halo.
+
+Usage:
+  swap-model coder        # Qwen3-Coder-30B  (Ollama, interactive daily-driver)
+  swap-model 235b         # Qwen3-235B-A22B  (llama.cpp, long-task, ~5-10 tok/s)
+  swap-model kimi         # Kimi-Linear-48B  (vLLM, long-context chat)
+  swap-model qwable       # Qwable-3.6-27B   (llama.cpp, Fable-style, ~10-15 tok/s)
+  swap-model comfyui      # ComfyUI          (image generation)
+  swap-model none         # everything down  (free the GPU arena)
+  swap-model status       # show current state
+
+Behaviour: stops conflicting services (frees the 110 GB GPU arena),
+starts the target, polls its /health until it returns 200. Wait timeout
+defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT.
+
+Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with
+each other. 235B and comfyui coexist with nothing. See
+compose/qwen3-235b/README.md for arena math.
+EOF
+}
+
+# --- Main --------------------------------------------------------------------
+# Dispatch on the first argument: swap targets fall through to the plan
+# below, `status` and help exit 0 immediately, anything else exits 2.
+TARGET="${1:-}"
+case "$TARGET" in
+    coder|235b|kimi|qwable|comfyui|none) ;;
+    status)         show_status ; exit 0 ;;
+    -h|--help|help|"") usage ; exit 0 ;;
+    *)
+        echo "swap-model: unknown target '$TARGET'" >&2
+        echo "Try: swap-model help" >&2
+        exit 2
+        ;;
+esac
+
+plan "$TARGET"
+echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"
+# Stop conflicting services before starting the target so the arena is
+# free. ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a
+# plain "${arr[@]}" aborts under `set -u` on bash older than 4.4, which
+# would break `swap-model none` (UP is empty).
+for svc in ${DOWN[@]+"${DOWN[@]}"}; do down_svc "$svc"; done
+for svc in ${UP[@]+"${UP[@]}"};     do up_svc   "$svc"; done
+
+echo
+show_status