187 lines
6.6 KiB
Bash
Executable File
187 lines
6.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# swap-model — coordinate which inference container is GPU-resident on
# the Strix Halo box.
#
# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
# between consumers. So switching models means stop-then-start of
# whole compose stacks. This script encodes the per-target conflict
# table + per-service health probes so the swap is one command.
#
# Usage:
#   swap-model coder    # Qwen3-Coder-30B via Ollama (interactive)
#   swap-model 235b     # Qwen3-235B-A22B via llama.cpp (long-task)
#   swap-model kimi     # Kimi-Linear-48B-A3B via vLLM (long-context)
#   swap-model qwable   # Qwable-3.6-27B via llama.cpp (Fable-style)
#   swap-model comfyui  # ComfyUI (image generation)
#   swap-model none     # everything down — free the GPU
#   swap-model status   # show what's currently up
#
# Env knobs:
#   SWAP_WAIT_TIMEOUT   seconds to wait for /health after up; default 600
#                       (235B's 88 GB cold load can take 3-5 min)
#
# Out of scope (deliberately):
#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
#     no GPU footprint, left alone.
#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
#     for now run swap-model twice if you want both.
|
|
|
|
# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Parent directory that holds one compose project directory per service.
COMPOSE_ROOT="/srv/docker"

# Seconds to wait for a freshly started service to answer its health probe.
# The value is validated here, before anything is stopped or started, because
# it is later used in shell arithmetic: a non-numeric override such as "10m"
# would otherwise only blow up after containers had already been touched.
WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"
if [[ ! "$WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
  printf "swap-model: SWAP_WAIT_TIMEOUT must be a whole number of seconds, got '%s'\n" \
    "$WAIT_TIMEOUT" >&2
  exit 2
fi
# Force base 10 so a zero-padded value such as "090" is not parsed as octal.
WAIT_TIMEOUT=$(( 10#$WAIT_TIMEOUT ))
|
|
|
|
# --- Service table -----------------------------------------------------------
# Map short name → compose dir (under $COMPOSE_ROOT) and health URL.
# Container name == compose dir name in every case (intentional convention,
# enforced in compose/*.yml's container_name fields).
|
|
# Short service name -> compose project directory under $COMPOSE_ROOT.
# The same string is also passed to `docker inspect` / `docker logs` as the
# container name by the functions in this script.
declare -A SVC_DIR=()
SVC_DIR[ollama]="ollama"
SVC_DIR[llama]="llama"
SVC_DIR[kimi]="kimi-linear"
SVC_DIR[235b]="qwen3-235b"
SVC_DIR[qwable]="qwable"
SVC_DIR[comfyui]="comfyui"
|
|
# Short service name -> local HTTP URL polled to decide whether the service
# is ready (any successful response from curl --fail counts as healthy).
declare -A SVC_HEALTH=()
SVC_HEALTH[ollama]="http://127.0.0.1:11434/api/tags"
SVC_HEALTH[llama]="http://127.0.0.1:8080/health"
SVC_HEALTH[kimi]="http://127.0.0.1:8000/v1/models"
SVC_HEALTH[235b]="http://127.0.0.1:8081/health"
SVC_HEALTH[qwable]="http://127.0.0.1:8082/health"
SVC_HEALTH[comfyui]="http://127.0.0.1:8188/"
|
|
|
|
# --- Target → plan -----------------------------------------------------------
# UP   = services that should be running after the swap
# DOWN = services that must be stopped to free the GPU arena
# (anything not in either list is left untouched — e.g. switching to coder
# leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.)
|
|
# plan TARGET
#   Translate a target name into two global arrays read by the caller:
#     UP   - services that should be running after the swap
#     DOWN - services that must be stopped first to free the GPU arena
#   Services that appear in neither array are left untouched.
#   Returns 0 for a known target, 1 (with both arrays empty) otherwise.
#
#   The conflict table is kept symmetric: if bringing up A stops B, then
#   bringing up B must stop A. `coder` stops comfyui, so `comfyui` has to
#   stop ollama as well; like 235b, comfyui shares the arena with nothing,
#   so it also stops llama.
plan() {
  UP=()
  DOWN=()
  case "${1:-}" in
    coder)   UP=(ollama);  DOWN=(235b comfyui) ;;
    235b)    UP=(235b);    DOWN=(ollama llama kimi qwable comfyui) ;;
    kimi)    UP=(kimi);    DOWN=(235b comfyui) ;;
    qwable)  UP=(qwable);  DOWN=(235b comfyui) ;;
    comfyui) UP=(comfyui); DOWN=(ollama llama kimi 235b qwable) ;;
    none)    DOWN=(ollama llama kimi 235b qwable comfyui) ;;
    *)       return 1 ;;
  esac
}
|
|
|
|
# --- Probes ------------------------------------------------------------------
|
|
# is_running CONTAINER
#   Succeeds when `docker inspect` reports the container's State.Running as
#   true. Any inspect failure (stderr is discarded) counts as "not running".
is_running() {
  local running
  running="$(docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null)" || return 1
  [[ "$running" == *true* ]]
}
|
|
|
|
# is_healthy URL
#   Succeeds when curl can fetch URL within 5 seconds without an HTTP error
#   (--fail). All curl output, including error text, is discarded.
is_healthy() {
  local url="$1"
  curl --fail --silent --show-error --max-time 5 "$url" &>/dev/null
}
|
|
|
|
# wait_healthy SERVICE
#   Poll SERVICE's health URL (from SVC_HEALTH) every 5 seconds, printing a
#   progress dot per retry, until it answers or WAIT_TIMEOUT seconds elapse.
#   On timeout the last 20 log lines of the service's container are written
#   to stderr and the function returns 1; on success it returns 0.
wait_healthy() {
  local name="$1"
  local probe_url="${SVC_HEALTH[$name]}"
  local give_up_at=$(( SECONDS + WAIT_TIMEOUT ))

  printf " waiting for %s health (timeout %ss)" "$name" "$WAIT_TIMEOUT"
  until is_healthy "$probe_url"; do
    if (( SECONDS > give_up_at )); then
      printf " TIMEOUT\n"
      # Diagnostics go to stderr; the container's own stderr is folded into
      # the piped stream so it gets the same prefix.
      {
        echo " last 20 lines of container log:"
        docker logs --tail 20 "${SVC_DIR[$name]}" 2>&1 | sed 's/^/ /'
      } >&2
      return 1
    fi
    sleep 5
    printf "."
  done
  printf " ok\n"
}
|
|
|
|
# --- Actions -----------------------------------------------------------------
|
|
# down_svc SERVICE
#   Stop SERVICE's compose stack with `docker compose down`, run from its
#   compose directory. A service whose container is not running is reported
#   as already down and left alone (returns 0).
#   Returns 1 with a diagnostic on stderr when the container is running but
#   its compose directory is missing, mirroring the check in up_svc instead
#   of surfacing a bare `cd` failure.
down_svc() {
  local svc="$1"
  local container="${SVC_DIR[$svc]}"
  local dir="$COMPOSE_ROOT/$container"

  if ! is_running "$container"; then
    echo " $svc: already down"
    return 0
  fi

  if [[ ! -d "$dir" ]]; then
    echo " $svc: compose dir $dir missing — cannot run docker compose down" >&2
    return 1
  fi

  echo " stopping $svc"
  # Subshell keeps the cd from leaking into the caller.
  (cd "$dir" && docker compose down)
}
|
|
|
|
# up_svc SERVICE
#   Ensure SERVICE is running and answering its health probe.
#   - Already running and healthy: report it and return 0.
#   - Compose directory missing: report on stderr and return 1.
#   - Otherwise: `docker compose up -d` from the compose directory, then
#     block in wait_healthy and return its status.
up_svc() {
  local name="$1"
  local container="${SVC_DIR[$name]}"
  local compose_dir="$COMPOSE_ROOT/$container"

  if is_running "$container"; then
    if is_healthy "${SVC_HEALTH[$name]}"; then
      echo " $name: already up + healthy"
      return 0
    fi
  fi

  [[ -d "$compose_dir" ]] || {
    echo " $name: compose dir $compose_dir missing — run pyinfra deploy first" >&2
    return 1
  }

  echo " starting $name"
  # Subshell keeps the cd from leaking into the caller.
  (cd "$compose_dir" && docker compose up -d)
  wait_healthy "$name"
}
|
|
|
|
# show_status
#   Print one line per inference service: its short name, whether its
#   container is running ("up"/"down"), and - for running containers - whether
#   the health URL currently answers.
#   The service list is spelled out so the output order is fixed.
show_status() {
  # Everything is local, including the loop variable, so callers that keep
  # their own `svc` are not clobbered.
  local svc container state health

  echo "Inference services:"
  for svc in ollama llama kimi 235b qwable comfyui; do
    container="${SVC_DIR[$svc]}"
    state="down"
    health=""
    if is_running "$container"; then
      state="up"
      if is_healthy "${SVC_HEALTH[$svc]}"; then
        health=" (healthy)"
      else
        health=" (starting/unhealthy)"
      fi
    fi
    printf " %-8s %s%s\n" "$svc" "$state" "$health"
  done
}
|
|
|
|
# usage
#   Print the help text to stdout. The heredoc delimiter is unquoted on
#   purpose so the effective wait timeout is interpolated into the text.
usage() {
  cat <<USAGE
swap-model — coordinate inference containers on Strix Halo.

Usage:
swap-model coder # Qwen3-Coder-30B (Ollama, interactive daily-driver)
swap-model 235b # Qwen3-235B-A22B (llama.cpp, long-task, ~5-10 tok/s)
swap-model kimi # Kimi-Linear-48B (vLLM, long-context chat)
swap-model qwable # Qwable-3.6-27B (llama.cpp, Fable-style, ~10-15 tok/s)
swap-model comfyui # ComfyUI (image generation)
swap-model none # everything down (free the GPU arena)
swap-model status # show current state

Behaviour: stops conflicting services (frees the 110 GB GPU arena),
starts the target, polls its /health until it returns 200. Wait timeout
defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT.

Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with
each other. 235B and comfyui coexist with nothing. See
compose/qwen3-235b/README.md for arena math.
USAGE
}
|
|
|
|
# --- Main --------------------------------------------------------------------
|
|
# main [TARGET]
#   Entry point. Validates TARGET, then:
#     status            -> print current state and exit 0
#     help / -h / ""    -> print usage and exit 0
#     unknown           -> complain on stderr and exit 2
#     anything else     -> compute the plan, stop every DOWN service, start
#                          every UP service, then print the resulting state.
#   With `set -e` active, the first failing stop/start aborts the script.
main() {
  local target="${1:-}" svc

  case "$target" in
    coder|235b|kimi|qwable|comfyui|none)
      ;;
    status)
      show_status
      exit 0
      ;;
    -h|--help|help|"")
      usage
      exit 0
      ;;
    *)
      echo "swap-model: unknown target '$target'" >&2
      echo "Try: swap-model help" >&2
      exit 2
      ;;
  esac

  plan "$target"
  echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"

  # ${arr[@]+...} expands to nothing for an empty array. A bare "${arr[@]}"
  # is an "unbound variable" error under `set -u` on bash older than 4.4,
  # which would abort `swap-model none` (UP is empty) after the stops.
  for svc in ${DOWN[@]+"${DOWN[@]}"}; do
    down_svc "$svc"
  done
  for svc in ${UP[@]+"${UP[@]}"}; do
    up_svc "$svc"
  done

  echo
  show_status
}

main "$@"
|