added models, model-swap, ...
This commit is contained in:
177
pyinfra/framework/scripts/bench-engines
Normal file
177
pyinfra/framework/scripts/bench-engines
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env bash
# bench-engines — compare decode/prefill throughput of Ollama vs
# llama.cpp (kyuz0 toolbox) on the SAME GGUF on gfx1151.
#
# Why this exists. The GGUF-tier consolidation decision (see the
# framework README "Inference engine consolidation") hinges on one
# hardware-specific unknown: how close is Ollama's bundled llama.cpp to
# the gfx1151-tuned kyuz0 build on *this* box? If decode t/s is within
# ~10-15 %, Ollama's convenience wins (it auto-swaps, so no llama-swap
# needed). If kyuz0's rocWMMA flash-attention lead is large, that argues
# for keeping llama.cpp behind llama-swap. This measures it.
#
# Method. Serves the identical GGUF on each engine in isolation (the
# other GGUF engine + 235b are stopped so nothing competes for the
# arena), warms up, then runs R raw-completion trials at a fixed decode
# length. Reads each engine's own authoritative timing fields — no
# token-counting guesswork:
#   - llama.cpp /completion   → .timings.{prompt,predicted}_per_second
#   - Ollama    /api/generate → {prompt_eval,eval}_{count,duration}
# Uses raw prompts (no chat template) on both for an apples-to-apples
# prompt-in / tokens-out measurement.
#
# Run ON THE BOX (hits localhost + docker). Requires jq, curl, bc and
# docker.
#
# Usage:
#   bench-engines                # bench the model llama.yml serves
#   bench-engines status         # show what's currently GPU-resident
#   BENCH_RUNS=5 bench-engines   # more trials (default 3)
#
# To bench a different model (e.g. Qwen3.6-27B): point compose/llama.yml
# at the new GGUF, set GGUF below (or the BENCH_GGUF environment
# variable) to match, redeploy, rerun.
|
||||
|
||||
set -euo pipefail

# Abort unless a setting holds a positive decimal integer.
#   $1 - name of the setting (only used in the error message)
#   $2 - value to check
# Exits 1 with a message on stderr when the value is not acceptable.
require_positive_int() {
  if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
    echo "bench-engines: $1 must be a positive integer (got '$2')" >&2
    exit 1
  fi
}

# Directory holding one compose project directory per service.
COMPOSE_ROOT="/srv/docker"
# Number of timed trials per engine (one extra untimed warmup precedes them).
RUNS="${BENCH_RUNS:-3}"
# Fixed decode length, in tokens, requested from both engines.
N_PREDICT="${BENCH_N_PREDICT:-256}"
# Seconds to wait for an engine's health endpoint after starting it.
WAIT_TIMEOUT="${BENCH_WAIT_TIMEOUT:-600}"

# Validate up front: RUNS is later used as a divisor, N_PREDICT is passed
# to jq as a JSON number, and WAIT_TIMEOUT is used in shell arithmetic.
require_positive_int BENCH_RUNS "$RUNS"
require_positive_int BENCH_N_PREDICT "$N_PREDICT"
require_positive_int BENCH_WAIT_TIMEOUT "$WAIT_TIMEOUT"

# Must match the GGUF that compose/llama.yml serves — this is the file
# registered into Ollama so both engines run identical weights. The
# path is the in-container path (/models is bind-mounted into both).
GGUF="${BENCH_GGUF:-/models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf}"
OLLAMA_BENCH_MODEL="bench-engines"

# A fixed, moderately long prompt so prefill is measurable. Decode is the
# number that actually decides the consolidation (bandwidth-bound).
# `read -d ''` hits end-of-input and returns non-zero, hence `|| true`.
read -r -d '' PROMPT <<'EOF' || true
You are a careful systems engineer. Explain, in detail and step by step,
how a unified-memory APU shares a single physical RAM pool between the
CPU and an integrated GPU, what a GTT aperture is, why demand paging
matters for large language model weights, and how this differs from a
discrete GPU with dedicated VRAM. Be thorough and precise.
EOF

# Base URLs of the two engines under test.
LLAMA_URL="http://127.0.0.1:8080"
OLLAMA_URL="http://127.0.0.1:11434"
|
||||
|
||||
# Abort the script unless the command named by $1 is available.
# Prints "bench-engines: missing '<cmd>'" on stderr and exits 1 otherwise.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "bench-engines: missing '$1'" >&2
    exit 1
  fi
}
|
||||
# Fail fast on missing tools. docker is checked as well: the container
# probe discards docker's stderr, so without this check a missing docker
# binary would make every container look "down" instead of failing.
need jq
need curl
need docker
|
||||
|
||||
# Succeeds when `docker inspect` reports State.Running as true for the
# container named $1; fails for a stopped or unknown container.
is_running() {
  local running
  running=$(docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null) || return 1
  [[ "$running" == *true* ]]
}
|
||||
# Succeeds when an HTTP GET of URL $1 completes successfully within five
# seconds (curl treats HTTP error statuses as failure); prints nothing.
is_healthy() {
  curl --fail --silent --show-error --max-time 5 "$1" &>/dev/null
}
|
||||
|
||||
# Stop the compose project of service $1 if its container is running.
# The container name and the compose directory under $COMPOSE_ROOT are
# both "$1".
# Returns 0 when nothing was running or the stop succeeded. Returns 1
# with a message on stderr when `docker compose down` (or the cd into the
# compose directory) fails: compose output is discarded, so without the
# message the script would die silently, and benchmarking without
# isolation would give meaningless numbers.
down() {
  local svc="$1"
  local dir="$COMPOSE_ROOT/$svc"
  if ! is_running "$svc"; then
    return 0
  fi
  echo " stopping $svc"
  if ! (cd "$dir" && docker compose down >/dev/null 2>&1); then
    echo "bench-engines: failed to stop $svc (compose dir $dir)" >&2
    return 1
  fi
}
|
||||
|
||||
# Start the compose project of a service and block until it is healthy.
#   $1 - service name (compose directory under $COMPOSE_ROOT, and the
#        container name used for the log dump)
#   $2 - health URL polled via is_healthy every 5 seconds
# Exits the script with status 1 when compose fails to start the service
# or when the URL is still unhealthy after $WAIT_TIMEOUT seconds (the
# last 20 container log lines are then written to stderr).
up_wait() {
  local svc="$1" health="$2"
  local deadline=$(( SECONDS + WAIT_TIMEOUT ))
  echo " starting $svc"
  # Compose output is discarded, so report a failure explicitly instead
  # of letting `set -e` end the script without any message.
  if ! (cd "$COMPOSE_ROOT/$svc" && docker compose up -d >/dev/null 2>&1); then
    echo "bench-engines: 'docker compose up -d' failed for $svc" >&2
    exit 1
  fi
  printf " waiting for health"
  until is_healthy "$health"; do
    if (( SECONDS > deadline )); then
      echo " TIMEOUT"
      # Best effort: a failing log dump must not change the exit status.
      docker logs --tail 20 "$svc" >&2 || true
      exit 1
    fi
    sleep 5
    printf "."
  done
  echo " ok"
}
|
||||
|
||||
# --- isolation: only the engine under test is GPU-resident ------------
# Stop every other GPU-resident service so that the engine named by $1
# ("llama" or "ollama") has the arena to itself. Besides the other GGUF
# engine, qwen3-235b and kimi-linear, this also stops qwable and comfyui,
# which swap-model manages as GPU services too. `down` is a no-op for
# services that are not running.
# Returns 1 with a message for an unknown engine rather than silently
# isolating nothing.
isolate_for() {
  local keep="$1" svc
  case "$keep" in
    llama | ollama) ;;
    *)
      echo "bench-engines: isolate_for: unknown engine '$keep'" >&2
      return 1
      ;;
  esac
  for svc in llama ollama qwen3-235b kimi-linear qwable comfyui; do
    if [[ "$svc" != "$keep" ]]; then
      down "$svc"
    fi
  done
}
|
||||
|
||||
# --- register the GGUF into Ollama -------------------------------------
# (Re)create the Ollama model $OLLAMA_BENCH_MODEL from $GGUF.
# The model is created on every run instead of being skipped when the
# name already exists: a name-only check keeps serving a previously
# registered GGUF after $GGUF is changed, so the two engines would no
# longer run identical weights.
# NOTE(review): assumes `ollama create` replaces an existing model of the
# same name — confirm against the deployed Ollama version.
# Returns the exit status of `ollama create`.
register_ollama_model() {
  echo " registering ${OLLAMA_BENCH_MODEL} from ${GGUF}"
  # FROM the in-container GGUF path; num_ctx is meant to match llama.yml
  # so the comparison stays fair (llama.yml is not visible from here).
  printf 'FROM %s\nPARAMETER num_ctx 65536\n' "$GGUF" \
    | docker exec -i ollama ollama create "${OLLAMA_BENCH_MODEL}" -f -
}
|
||||
|
||||
# --- one trial; echoes "prefill_tps decode_tps" -----------------------
|
||||
trial_llama() {
|
||||
local body resp
|
||||
body=$(jq -n --arg p "$PROMPT" --argjson n "$N_PREDICT" \
|
||||
'{prompt:$p, n_predict:$n, temperature:0, cache_prompt:false}')
|
||||
resp=$(curl -fsS --max-time 300 "$LLAMA_URL/completion" \
|
||||
-H 'Content-Type: application/json' -d "$body")
|
||||
echo "$resp" | jq -r '"\(.timings.prompt_per_second) \(.timings.predicted_per_second)"'
|
||||
}
|
||||
|
||||
# Run a single raw, non-streamed generation against Ollama's
# /api/generate with $PROMPT and $N_PREDICT tokens (temperature 0) and
# print "prefill_tps decode_tps".
# Durations are reported in ns; t/s = count / (duration/1e9).
# Prefill is printed as 0 when prompt_eval_count/prompt_eval_duration are
# missing or the duration is not positive (presumably when the prompt is
# served from Ollama's cache — TODO confirm), instead of aborting on a
# jq division error. Decode is the deciding figure, so missing or zero
# eval_* fields make the trial fail.
trial_ollama() {
  local body resp
  body=$(jq -n --arg m "$OLLAMA_BENCH_MODEL" --arg p "$PROMPT" --argjson n "$N_PREDICT" \
    '{model:$m, prompt:$p, raw:true, stream:false, options:{temperature:0, num_predict:$n}}')
  resp=$(curl -fsS --max-time 300 "$OLLAMA_URL/api/generate" \
    -H 'Content-Type: application/json' -d "$body")
  jq -r '
    def tps(count; dur):
      if (count | type) == "number" and (dur | type) == "number" and dur > 0
      then count / (dur / 1e9)
      else null
      end;
    tps(.eval_count; .eval_duration) as $decode
    | if $decode == null
      then error("ollama response has no usable eval_count/eval_duration")
      else "\(tps(.prompt_eval_count; .prompt_eval_duration) // 0) \($decode)"
      end
  ' <<<"$resp"
}
|
||||
|
||||
# --- run R trials, print per-trial + mean decode ----------------------
# Benchmark one engine.
#   $1 - engine label used in the summary line
#   $2 - name of a trial function that prints "prefill_tps decode_tps"
# Runs one untimed warmup, then $RUNS timed trials, printing each trial
# and the means.
# Globals written: MEAN_PP, MEAN_TG (mean prefill / decode t/s, via bc).
# Returns 1 with a message when the warmup or a trial fails, or when a
# trial prints something other than two plain decimal numbers.
bench() {
  local engine="$1" trialfn="$2"
  local i out pp tg sum_pp=0 sum_tg=0
  # bc cannot parse "null" or exponent notation, so accept plain decimals only.
  local num_re='^[0-9]+(\.[0-9]+)?$'

  echo " warmup..."
  if ! "$trialfn" >/dev/null; then
    echo "bench-engines: $engine warmup failed" >&2
    return 1
  fi

  for (( i = 1; i <= RUNS; i++ )); do
    # Capture via $(...) so that a failing trial is actually detected; the
    # exit status of a process substitution is not visible to `read`.
    if ! out=$("$trialfn"); then
      echo "bench-engines: $engine trial $i failed" >&2
      return 1
    fi
    read -r pp tg <<<"$out" || true
    if [[ ! "$pp" =~ $num_re || ! "$tg" =~ $num_re ]]; then
      echo "bench-engines: $engine trial $i returned non-numeric result: '$out'" >&2
      return 1
    fi
    printf " trial %d: prefill %6.1f t/s decode %6.2f t/s\n" "$i" "$pp" "$tg"
    sum_pp=$(echo "$sum_pp + $pp" | bc -l)
    sum_tg=$(echo "$sum_tg + $tg" | bc -l)
  done

  MEAN_PP=$(echo "scale=1; $sum_pp / $RUNS" | bc -l)
  MEAN_TG=$(echo "scale=2; $sum_tg / $RUNS" | bc -l)
  printf " %s mean: prefill %s t/s decode %s t/s\n" "$engine" "$MEAN_PP" "$MEAN_TG"
}
|
||||
|
||||
# `status` sub-command: report which GPU-resident containers are running,
# including qwable and comfyui, which swap-model also manages.
if [[ "${1:-}" == "status" ]]; then
  for c in ollama llama kimi-linear qwen3-235b qwable comfyui; do
    if is_running "$c"; then
      echo "$c: up"
    else
      echo "$c: down"
    fi
  done
  exit 0
fi

# bc is only needed for the benchmark itself, not for `status`.
need bc

echo "== llama.cpp (kyuz0 ${GGUF##*/}) =="
isolate_for llama
up_wait llama "$LLAMA_URL/health"
bench "llama.cpp" trial_llama
LLAMA_TG="$MEAN_TG"
# Free the arena before the second engine is measured.
down llama

echo
echo "== Ollama (same GGUF) =="
isolate_for ollama
up_wait ollama "$OLLAMA_URL/api/tags"
register_ollama_model
bench "ollama" trial_ollama
OLLAMA_TG="$MEAN_TG"

echo
echo "== Verdict =="
# A zero baseline would make the ratio below a division by zero in bc.
if [[ "$(echo "$LLAMA_TG > 0" | bc -l)" != "1" ]]; then
  echo "bench-engines: llama.cpp decode is '$LLAMA_TG' t/s; cannot compute a ratio" >&2
  exit 1
fi
# Ollama as % of llama.cpp decode throughput.
PCT=$(echo "scale=1; 100 * $OLLAMA_TG / $LLAMA_TG" | bc -l)
printf " llama.cpp decode: %s t/s\n ollama decode: %s t/s (%s%% of llama.cpp)\n" \
  "$LLAMA_TG" "$OLLAMA_TG" "$PCT"
echo
echo " Guidance: Ollama >=85% of llama.cpp -> option 1 (Ollama + vLLM,"
echo " drop standalone llama.cpp; Ollama self-swaps, no llama-swap)."
echo " Larger gap -> option 2 (keep llama.cpp"
echo " behind llama-swap with coexistence groups; drop Ollama)."
|
||||
186
pyinfra/framework/scripts/swap-model
Executable file
186
pyinfra/framework/scripts/swap-model
Executable file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env bash
# swap-model — coordinate which inference container is GPU-resident on
# the Strix Halo box.
#
# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
# between consumers. So switching models means stop-then-start of
# whole compose stacks. This script encodes the per-target conflict
# table + per-service health probes so the swap is one command.
#
# Usage:
#   swap-model coder     # Qwen3-Coder-30B via Ollama (interactive)
#   swap-model 235b      # Qwen3-235B-A22B via llama.cpp (long-task)
#   swap-model kimi      # Kimi-Linear-48B-A3B via vLLM (long-context)
#   swap-model qwable    # Qwable-3.6-27B via llama.cpp (Fable-style)
#   swap-model comfyui   # ComfyUI (image generation)
#   swap-model none      # everything down — free the GPU
#   swap-model status    # show what's currently up
#
# Env knobs:
#   SWAP_WAIT_TIMEOUT   seconds to wait for /health after up; default 600
#                       (235B's 88 GB cold load can take 3-5 min)
#
# Out of scope (deliberately):
#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
#     no GPU footprint, left alone.
#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
#     for now run swap-model twice if you want both.
|
||||
|
||||
set -euo pipefail

# Directory holding one compose project directory per service.
COMPOSE_ROOT="/srv/docker"
# Seconds to wait for a service's health URL after starting it.
WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"
# Reject bad values here; otherwise they only surface later as a shell
# arithmetic error in the middle of a swap, after services were stopped.
if [[ ! "$WAIT_TIMEOUT" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "swap-model: SWAP_WAIT_TIMEOUT must be a non-negative integer (got '$WAIT_TIMEOUT')" >&2
  exit 2
fi

# --- Service table -----------------------------------------------------------
# Map short name → compose dir (under $COMPOSE_ROOT) and health URL.
# Container name == compose dir name in every case (intentional convention,
# enforced in compose/*.yml's container_name fields).
declare -A SVC_DIR=(
  [ollama]=ollama
  [llama]=llama
  [kimi]=kimi-linear
  [235b]=qwen3-235b
  [qwable]=qwable
  [comfyui]=comfyui
)
declare -A SVC_HEALTH=(
  [ollama]="http://127.0.0.1:11434/api/tags"
  [llama]="http://127.0.0.1:8080/health"
  [kimi]="http://127.0.0.1:8000/v1/models"
  [235b]="http://127.0.0.1:8081/health"
  [qwable]="http://127.0.0.1:8082/health"
  [comfyui]="http://127.0.0.1:8188/"
)
|
||||
|
||||
# --- Target → plan -----------------------------------------------------------
# UP   = services that should be running after the swap
# DOWN = services that must be stopped to free the GPU arena
# (anything not in either list is left untouched — e.g. switching to coder
# leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.)
#
# Sets the global arrays UP and DOWN for target $1; returns 1 for an
# unknown target (both arrays are then empty).
# comfyui stops every other GPU service: the usage text states that it
# coexists with nothing, and the coder/kimi/qwable plans already stop
# comfyui, so leaving ollama/llama up here was inconsistent.
plan() {
  UP=()
  DOWN=()
  case "$1" in
    coder)   UP=(ollama)  ; DOWN=(235b comfyui) ;;
    235b)    UP=(235b)    ; DOWN=(ollama llama kimi qwable comfyui) ;;
    kimi)    UP=(kimi)    ; DOWN=(235b comfyui) ;;
    qwable)  UP=(qwable)  ; DOWN=(235b comfyui) ;;
    comfyui) UP=(comfyui) ; DOWN=(ollama llama kimi 235b qwable) ;;
    none)    UP=()        ; DOWN=(ollama llama kimi 235b qwable comfyui) ;;
    *)       return 1 ;;
  esac
}
|
||||
|
||||
# --- Probes ------------------------------------------------------------------
# Succeeds when `docker inspect` reports State.Running as true for the
# container named $1; fails for a stopped or unknown container.
# The output is captured instead of piped into `grep -q`, so the result
# does not depend on pipeline exit-status handling under `pipefail`.
is_running() {
  local running
  running=$(docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null) || return 1
  [[ "$running" == *true* ]]
}
|
||||
|
||||
# Succeeds when an HTTP GET of URL $1 completes successfully within five
# seconds (curl -f treats HTTP error statuses as failure); prints nothing.
is_healthy() {
  curl -fsS --max-time 5 "$1" >/dev/null 2>&1
}
|
||||
|
||||
# Poll the health URL of service $1 (a key of SVC_HEALTH) every 5 seconds
# until it answers or $WAIT_TIMEOUT seconds have elapsed.
# Prints a progress line on stdout. On timeout, writes the last 20 lines
# of the container log to stderr and returns 1; returns 0 once healthy.
wait_healthy() {
  local svc="$1"
  local url="${SVC_HEALTH[$svc]}"
  local deadline=$(( SECONDS + WAIT_TIMEOUT ))
  printf " waiting for %s health (timeout %ss)" "$svc" "$WAIT_TIMEOUT"
  until is_healthy "$url"; do
    if (( SECONDS > deadline )); then
      printf " TIMEOUT\n"
      echo " last 20 lines of container log:" >&2
      # Best effort: under `set -e` + pipefail a failing `docker logs`
      # would otherwise abort here instead of reaching `return 1`.
      { docker logs --tail 20 "${SVC_DIR[$svc]}" 2>&1 | sed 's/^/ /' >&2; } || true
      return 1
    fi
    sleep 5
    printf "."
  done
  printf " ok\n"
}
|
||||
|
||||
# --- Actions -----------------------------------------------------------------
# Stop the compose project of service $1 (a key of SVC_DIR).
# Prints "<svc>: already down" and returns 0 when its container is not
# running; otherwise runs `docker compose down` in the service's compose
# directory and returns that command's status.
down_svc() {
  local svc="$1"
  local container="${SVC_DIR[$svc]}"
  local dir="$COMPOSE_ROOT/$container"
  if ! is_running "$container"; then
    echo " $svc: already down"
    return 0
  fi
  echo " stopping $svc"
  (cd "$dir" && docker compose down)
}
|
||||
|
||||
# Start service $1 (a key of SVC_DIR / SVC_HEALTH) and wait until healthy.
# Returns 0 immediately when the container is already running and its
# health URL answers. Returns 1 when the compose directory is missing;
# otherwise runs `docker compose up -d` there and returns the result of
# wait_healthy.
up_svc() {
  local svc="$1"
  local container="${SVC_DIR[$svc]}"
  local dir="$COMPOSE_ROOT/$container"
  if is_running "$container" && is_healthy "${SVC_HEALTH[$svc]}"; then
    echo " $svc: already up + healthy"
    return 0
  fi
  if [[ ! -d "$dir" ]]; then
    echo " $svc: compose dir $dir missing — run pyinfra deploy first" >&2
    return 1
  fi
  echo " starting $svc"
  (cd "$dir" && docker compose up -d)
  wait_healthy "$svc"
}
|
||||
|
||||
# Print one line per managed service: "up"/"down" based on is_running,
# and for running containers whether the health URL currently answers.
# All working variables are local, so the loop variable no longer leaks
# into (or overwrites) a global `svc`.
show_status() {
  local svc container state health
  echo "Inference services:"
  for svc in ollama llama kimi 235b qwable comfyui; do
    container="${SVC_DIR[$svc]}"
    state="down"
    health=""
    if is_running "$container"; then
      state="up"
      if is_healthy "${SVC_HEALTH[$svc]}"; then
        health=" (healthy)"
      else
        health=" (starting/unhealthy)"
      fi
    fi
    printf " %-8s %s%s\n" "$svc" "$state" "$health"
  done
}
|
||||
|
||||
# Print the help text on stdout. The here-doc delimiter is unquoted on
# purpose so that ${WAIT_TIMEOUT} expands to the effective timeout.
usage() {
  cat <<EOF
swap-model — coordinate inference containers on Strix Halo.

Usage:
swap-model coder # Qwen3-Coder-30B (Ollama, interactive daily-driver)
swap-model 235b # Qwen3-235B-A22B (llama.cpp, long-task, ~5-10 tok/s)
swap-model kimi # Kimi-Linear-48B (vLLM, long-context chat)
swap-model qwable # Qwable-3.6-27B (llama.cpp, Fable-style, ~10-15 tok/s)
swap-model comfyui # ComfyUI (image generation)
swap-model none # everything down (free the GPU arena)
swap-model status # show current state

Behaviour: stops conflicting services (frees the 110 GB GPU arena),
starts the target, polls its /health until it returns 200. Wait timeout
defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT.

Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with
each other. 235B and comfyui coexist with nothing. See
compose/qwen3-235b/README.md for arena math.
EOF
}
|
||||
|
||||
# --- Main --------------------------------------------------------------------
# Dispatch on the first argument: a swap target, `status`, or help.
# No argument prints the usage text and exits 0; an unknown target exits 2.
TARGET="${1:-}"
case "$TARGET" in
  coder | 235b | kimi | qwable | comfyui | none) ;;
  status)
    show_status
    exit 0
    ;;
  -h | --help | help | "")
    usage
    exit 0
    ;;
  *)
    echo "swap-model: unknown target '$TARGET'" >&2
    echo "Try: swap-model help" >&2
    exit 2
    ;;
esac

plan "$TARGET"
echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A bare
# "${UP[@]}" is an "unbound variable" error under `set -u` on bash older
# than 4.4, which would break the `none` target (UP is empty there).
for svc in ${DOWN[@]+"${DOWN[@]}"}; do
  down_svc "$svc"
done
for svc in ${UP[@]+"${UP[@]}"}; do
  up_svc "$svc"
done

echo
show_status
|
||||
Reference in New Issue
Block a user