#!/usr/bin/env bash
# swap-model — coordinate which inference container is GPU-resident on
# the Strix Halo box.
#
# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
# between consumers. So switching models means stop-then-start of
# whole compose stacks. This script encodes the per-target conflict
# table + per-service health probes so the swap is one command.
#
# Usage:
#   swap-model coder     # Qwen3-Coder-30B via Ollama (interactive)
#   swap-model 235b      # Qwen3-235B-A22B via llama.cpp (long-task)
#   swap-model kimi      # Kimi-Linear-48B-A3B via vLLM (long-context)
#   swap-model qwable    # Qwable-3.6-27B via llama.cpp (Fable-style)
#   swap-model ornith    # Ornith-1.0-35B via llama.cpp (agentic coding)
#   swap-model comfyui   # ComfyUI (image generation)
#   swap-model none      # everything down — free the GPU
#   swap-model status    # show what's currently up
#
# Env knobs:
#   SWAP_WAIT_TIMEOUT    seconds to wait for /health after up; default 600
#                        (235B's 88 GB cold load can take 3-5 min)
#
# Out of scope (deliberately):
#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
#     no GPU footprint, left alone.
#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
#     for now run swap-model twice if you want both.

set -euo pipefail

readonly COMPOSE_ROOT="/srv/docker"

# The timeout is used in shell arithmetic below, so reject anything that is
# not a plain non-negative integer instead of letting bash evaluate it.
WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"
if [[ ! "$WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "swap-model: SWAP_WAIT_TIMEOUT must be a whole number of seconds (got '$WAIT_TIMEOUT')" >&2
  exit 2
fi
readonly WAIT_TIMEOUT

# --- Service table -----------------------------------------------------------
# Map short name → compose dir (under $COMPOSE_ROOT) and health URL.
# Container name == compose dir name in every case (intentional convention,
# enforced in compose/*.yml's container_name fields).
declare -A SVC_DIR=(
  [ollama]=ollama
  [llama]=llama
  [kimi]=kimi-linear
  [235b]=qwen3-235b
  [qwable]=qwable
  [ornith]=ornith
  [comfyui]=comfyui
)

declare -A SVC_HEALTH=(
  [ollama]="http://127.0.0.1:11434/api/tags"
  [llama]="http://127.0.0.1:8080/health"
  [kimi]="http://127.0.0.1:8000/v1/models"
  [235b]="http://127.0.0.1:8081/health"
  [qwable]="http://127.0.0.1:8082/health"
  [ornith]="http://127.0.0.1:8083/health"
  [comfyui]="http://127.0.0.1:8188/"
)

# Every managed service, in display/stop order. Associative arrays have no
# stable iteration order, so the order lives here and is shared by `none`
# and `status`.
ALL_SERVICES=(ollama llama kimi 235b qwable ornith comfyui)
readonly ALL_SERVICES

# --- Target → plan -----------------------------------------------------------
# UP   = services that should be running after the swap
# DOWN = services that must be stopped to free the GPU arena
# (anything not in either list is left untouched — e.g. switching to coder
# leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.)
#
# Arguments: $1 - target name
# Globals:   UP, DOWN (overwritten)
# Returns:   0 if the target is known, 1 otherwise (UP/DOWN left empty)
plan() {
  UP=()
  DOWN=()
  case "$1" in
    coder)   UP=(ollama)  ; DOWN=(235b comfyui) ;;
    # 235b needs the arena to itself, so every other GPU consumer goes
    # down — including ornith, which the list previously omitted.
    235b)    UP=(235b)    ; DOWN=(ollama llama kimi qwable ornith comfyui) ;;
    kimi)    UP=(kimi)    ; DOWN=(235b comfyui) ;;
    qwable)  UP=(qwable)  ; DOWN=(235b comfyui) ;;
    ornith)  UP=(ornith)  ; DOWN=(235b comfyui) ;;
    comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable ornith) ;;
    none)    DOWN=("${ALL_SERVICES[@]}") ;;
    *)       return 1 ;;
  esac
}

# --- Probes ------------------------------------------------------------------

# Succeeds when `docker inspect` reports the container $1 as running.
# A missing container (inspect fails) counts as not running.
is_running() {
  docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null | grep -q true
}

# Succeeds when URL $1 answers with a non-error HTTP status within 5 s.
is_healthy() {
  curl -fsS --max-time 5 "$1" >/dev/null 2>&1
}

# Poll the health URL of service $1 every 5 s until it answers or
# WAIT_TIMEOUT seconds have elapsed.
# Returns: 0 once healthy; 1 on timeout, after dumping the log tail to stderr.
wait_healthy() {
  local svc="$1" url="${SVC_HEALTH[$1]}"
  local deadline=$(( SECONDS + WAIT_TIMEOUT ))

  printf " waiting for %s health (timeout %ss)" "$svc" "$WAIT_TIMEOUT"
  while ! is_healthy "$url"; do
    if (( SECONDS > deadline )); then
      printf " TIMEOUT\n"
      echo " last 20 lines of container log:" >&2
      # Best-effort diagnostics: a failing `docker logs` must not pre-empt
      # the timeout status under set -e / pipefail.
      docker logs --tail 20 "${SVC_DIR[$svc]}" 2>&1 | sed 's/^/ /' >&2 || true
      return 1
    fi
    sleep 5
    printf "."
  done
  printf " ok\n"
}

# --- Actions -----------------------------------------------------------------

# Stop the compose stack of service $1 if its container is running.
# Returns: 0 if already down or stopped; non-zero if the compose dir is
#          missing or `docker compose down` fails.
down_svc() {
  local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"

  if ! is_running "${SVC_DIR[$svc]}"; then
    echo " $svc: already down"
    return 0
  fi
  # Mirror up_svc: say what is wrong instead of dying on a bare `cd` error.
  if [[ ! -d "$dir" ]]; then
    echo " $svc: compose dir $dir missing — cannot run 'docker compose down'" >&2
    return 1
  fi
  echo " stopping $svc"
  (cd "$dir" && docker compose down)
}

# Start the compose stack of service $1 and wait for its health probe.
# Returns: 0 if already up + healthy or once healthy; non-zero if the
#          compose dir is missing, compose fails, or the wait times out.
up_svc() {
  local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"

  if is_running "${SVC_DIR[$svc]}" && is_healthy "${SVC_HEALTH[$svc]}"; then
    echo " $svc: already up + healthy"
    return 0
  fi
  if [[ ! -d "$dir" ]]; then
    echo " $svc: compose dir $dir missing — run pyinfra deploy first" >&2
    return 1
  fi
  echo " starting $svc"
  (cd "$dir" && docker compose up -d)
  wait_healthy "$svc"
}

# Print one line per managed service: up/down, plus a health note when up.
show_status() {
  local svc container state health

  echo "Inference services:"
  for svc in "${ALL_SERVICES[@]}"; do
    container="${SVC_DIR[$svc]}"
    state="down"
    health=""
    if is_running "$container"; then
      state="up"
      if is_healthy "${SVC_HEALTH[$svc]}"; then
        health=" (healthy)"
      else
        health=" (starting/unhealthy)"
      fi
    fi
    printf " %-8s %s%s\n" "$svc" "$state" "$health"
  done
}

# Print the command summary.
# NOTE(review): the original heredoc body was not recoverable from the
# reviewed copy; this text is rebuilt from the header comment — confirm.
usage() {
  cat <<'EOF'
Usage: swap-model <target>

Targets:
  coder     Qwen3-Coder-30B via Ollama (interactive)
  235b      Qwen3-235B-A22B via llama.cpp (long-task)
  kimi      Kimi-Linear-48B-A3B via vLLM (long-context)
  qwable    Qwable-3.6-27B via llama.cpp (Fable-style)
  ornith    Ornith-1.0-35B via llama.cpp (agentic coding)
  comfyui   ComfyUI (image generation)
  none      everything down — free the GPU
  status    show what's currently up
  help      show this message

Env:
  SWAP_WAIT_TIMEOUT   seconds to wait for health after up (default 600)
EOF
}

# --- Main --------------------------------------------------------------------

# Dispatch on the target in $1, then stop the DOWN set before starting the
# UP set so the arena is free before the new model loads.
# NOTE(review): the head of the original dispatch was not recoverable; the
# no-argument behaviour (show usage, exit 0) and the unknown-target wording
# are assumptions — confirm against the deployed script.
# Returns: 0 on success, 2 for an unknown target; any failing stop/start
#          aborts the script via set -e.
main() {
  local target="${1:-}" svc

  case "$target" in
    status)
      show_status
      return 0
      ;;
    help|-h|--help|"")
      usage
      return 0
      ;;
  esac

  # plan() is the single source of truth for which targets exist.
  if ! plan "$target"; then
    echo "swap-model: unknown target '$target'" >&2
    echo "Try: swap-model help" >&2
    return 2
  fi

  echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"
  # The ${arr[@]+...} form keeps empty arrays (e.g. UP for `none`) from
  # tripping `set -u` on bash older than 4.4.
  for svc in ${DOWN[@]+"${DOWN[@]}"}; do
    down_svc "$svc"
  done
  for svc in ${UP[@]+"${UP[@]}"}; do
    up_svc "$svc"
  done
  echo
  show_status
}

main "$@"