#!/usr/bin/env bash
# Source path: localgenai/pyinfra/framework/scripts/swap-model
#
# swap-model — coordinate which inference container is GPU-resident on
# the Strix Halo box.
#
# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
# between consumers. So switching models means stop-then-start of
# whole compose stacks. This script encodes the per-target conflict
# table + per-service health probes so the swap is one command.
#
# Usage:
#   swap-model coder        # Qwen3-Coder-30B via Ollama (interactive)
#   swap-model 235b         # Qwen3-235B-A22B via llama.cpp (long-task)
#   swap-model kimi         # Kimi-Linear-48B-A3B via vLLM (long-context)
#   swap-model qwable       # Qwable-3.6-27B via llama.cpp (Fable-style)
#   swap-model comfyui      # ComfyUI (image generation)
#   swap-model none         # everything down — free the GPU
#   swap-model status       # show what's currently up
#
# Env knobs:
#   SWAP_WAIT_TIMEOUT       seconds to wait for the target's health URL after
#                           up; default 600 (235B's 88 GB cold load can take
#                           3-5 min)
#
# Out of scope (deliberately):
#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
#     no GPU footprint, left alone.
#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
#     for now run swap-model twice if you want both.

# Strict mode: abort on unhandled command failures, on expansion of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Parent directory that holds one compose project directory per service.
COMPOSE_ROOT="/srv/docker"

# Health-wait budget in seconds (overridable via SWAP_WAIT_TIMEOUT).
# The value is later used in shell arithmetic to compute the wait deadline,
# so reject anything that is not a plain whole number right here — before any
# container has been stopped or started — instead of failing mid-swap.
WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"
if [[ ! "$WAIT_TIMEOUT" =~ ^[0-9]+$ ]]; then
    echo "swap-model: SWAP_WAIT_TIMEOUT must be a whole number of seconds (got '$WAIT_TIMEOUT')" >&2
    exit 2
fi
# Force base 10 so a value with leading zeros (e.g. 090) is not parsed as
# octal by later arithmetic.
WAIT_TIMEOUT=$(( 10#$WAIT_TIMEOUT ))

# --- Service table -----------------------------------------------------------
# Two lookup tables keyed by the short service name:
#   SVC_DIR     compose directory name (relative to $COMPOSE_ROOT)
#   SVC_HEALTH  URL probed to decide whether the service is healthy
# The container name equals the compose directory name in every case
# (intentional convention, enforced by the container_name fields in
# compose/*.yml).
declare -A SVC_DIR SVC_HEALTH

SVC_DIR[ollama]=ollama
SVC_HEALTH[ollama]="http://127.0.0.1:11434/api/tags"

SVC_DIR[llama]=llama
SVC_HEALTH[llama]="http://127.0.0.1:8080/health"

SVC_DIR[kimi]=kimi-linear
SVC_HEALTH[kimi]="http://127.0.0.1:8000/v1/models"

SVC_DIR[235b]=qwen3-235b
SVC_HEALTH[235b]="http://127.0.0.1:8081/health"

SVC_DIR[qwable]=qwable
SVC_HEALTH[qwable]="http://127.0.0.1:8082/health"

SVC_DIR[comfyui]=comfyui
SVC_HEALTH[comfyui]="http://127.0.0.1:8188/"

# --- Target → plan -----------------------------------------------------------
# plan TARGET
#   Fills two global arrays for TARGET:
#     UP   = services that should be running after the swap
#     DOWN = services that must be stopped to free the GPU arena
#   Anything in neither list is left untouched — e.g. switching to coder
#   leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.
#   Returns 1, with both arrays left empty, when TARGET is not a known target.
plan() {
    UP=() ; DOWN=()
    case "$1" in
        coder)   UP=(ollama)  ; DOWN=(235b comfyui) ;;
        235b)    UP=(235b)    ; DOWN=(ollama llama kimi qwable comfyui) ;;
        kimi)    UP=(kimi)    ; DOWN=(235b comfyui) ;;
        qwable)  UP=(qwable)  ; DOWN=(235b comfyui) ;;
        # comfyui coexists with nothing, so every other GPU service goes
        # down — including ollama and llama. This keeps the table symmetric
        # with `coder`, which stops comfyui.
        comfyui) UP=(comfyui) ; DOWN=(ollama llama kimi 235b qwable) ;;
        none)    UP=()        ; DOWN=(ollama llama kimi 235b qwable comfyui) ;;
        *)       return 1 ;;
    esac
}

# --- Probes ------------------------------------------------------------------
# is_running CONTAINER
#   Succeeds when `docker inspect` reports State.Running as true for the
#   named container. Fails otherwise, including when the inspect call itself
#   fails; inspect's stderr is discarded.
is_running() {
    local reported
    reported="$(docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null)" || return 1
    [[ "$reported" == *true* ]]
}

# is_healthy URL
#   Probes URL with curl, giving up after 5 seconds; --fail makes HTTP error
#   responses count as failure. All curl output is discarded — only the exit
#   status (curl's own) is reported to the caller.
is_healthy() {
    local url="$1"
    curl --fail --silent --show-error --max-time 5 "$url" &>/dev/null
}

# wait_healthy SVC
#   Polls SVC's health URL (from SVC_HEALTH) until it answers, printing a
#   progress dot after each 5-second pause.
#   Returns 0 once the probe succeeds. If the probe is still failing after
#   more than WAIT_TIMEOUT seconds have elapsed, prints TIMEOUT, dumps the
#   last 20 lines of the container's log (indented) to stderr and returns 1.
wait_healthy() {
    local name="$1"
    local probe="${SVC_HEALTH[$name]}"
    local cutoff=$(( SECONDS + WAIT_TIMEOUT ))

    printf "    waiting for %s health (timeout %ss)" "$name" "$WAIT_TIMEOUT"
    while true; do
        if is_healthy "$probe"; then
            printf " ok\n"
            return 0
        fi
        # The deadline is only consulted after a failed probe, so a service
        # that is already healthy never times out.
        if (( SECONDS > cutoff )); then
            printf " TIMEOUT\n"
            {
                echo "    last 20 lines of container log:"
                docker logs --tail 20 "${SVC_DIR[$name]}" 2>&1 | sed 's/^/      /'
            } >&2
            return 1
        fi
        sleep 5
        printf "."
    done
}

# --- Actions -----------------------------------------------------------------
# down_svc SVC
#   Stops SVC's compose stack if its container is currently running.
#   Returns 0 when the container was already down or `docker compose down`
#   succeeded. Returns 1 — without invoking docker compose — when the
#   container is running but its compose directory does not exist, mirroring
#   the missing-directory guard in up_svc instead of dying on a bare `cd`
#   error.
down_svc() {
    local svc="$1"
    local container="${SVC_DIR[$svc]}"
    local dir="$COMPOSE_ROOT/$container"

    if ! is_running "$container"; then
        echo "  $svc: already down"
        return 0
    fi
    if [[ ! -d "$dir" ]]; then
        echo "  $svc: container $container is running but compose dir $dir missing — stop it manually (docker stop $container)" >&2
        return 1
    fi
    echo "  stopping $svc"
    # Subshell keeps the cd from leaking into the caller's working directory.
    (cd "$dir" && docker compose down)
}

# up_svc SVC
#   Brings SVC's compose stack up and waits for it to become healthy.
#   Returns 0 straight away when the container is already running and its
#   health URL answers. Returns 1 when the compose directory is missing.
#   Otherwise runs `docker compose up -d` inside the compose directory and
#   hands over to wait_healthy, whose status becomes the return value.
up_svc() {
    local name="$1"
    local project="$COMPOSE_ROOT/${SVC_DIR[$name]}"

    if is_running "${SVC_DIR[$name]}" && is_healthy "${SVC_HEALTH[$name]}"; then
        echo "  $name: already up + healthy"
        return 0
    fi

    [[ -d "$project" ]] || {
        echo "  $name: compose dir $project missing — run pyinfra deploy first" >&2
        return 1
    }

    echo "  starting $name"
    # Subshell so the directory change does not affect the caller.
    ( cd "$project" && docker compose up -d )
    wait_healthy "$name"
}

# show_status
#   Prints one line per inference service: "down", or "up" followed by
#   "(healthy)" / "(starting/unhealthy)" depending on the health probe.
#   The probe is only attempted for containers that are running.
show_status() {
    # The loop variable is local too, so a caller's own `svc` is not
    # clobbered by calling this function.
    local svc container state health

    echo "Inference services:"
    for svc in ollama llama kimi 235b qwable comfyui; do
        container="${SVC_DIR[$svc]}"
        state="down"
        health=""
        if is_running "$container"; then
            state="up"
            if is_healthy "${SVC_HEALTH[$svc]}"; then
                health=" (healthy)"
            else
                health=" (starting/unhealthy)"
            fi
        fi
        printf "  %-8s %s%s\n" "$svc" "$state" "$health"
    done
}

# usage
#   Writes the help text to stdout, one printf argument per output line.
#   The only dynamic part is the current WAIT_TIMEOUT value.
usage() {
    printf '%s\n' \
        'swap-model — coordinate inference containers on Strix Halo.' \
        '' \
        'Usage:' \
        '  swap-model coder        # Qwen3-Coder-30B  (Ollama, interactive daily-driver)' \
        '  swap-model 235b         # Qwen3-235B-A22B  (llama.cpp, long-task, ~5-10 tok/s)' \
        '  swap-model kimi         # Kimi-Linear-48B  (vLLM, long-context chat)' \
        '  swap-model qwable       # Qwable-3.6-27B   (llama.cpp, Fable-style, ~10-15 tok/s)' \
        '  swap-model comfyui      # ComfyUI          (image generation)' \
        '  swap-model none         # everything down  (free the GPU arena)' \
        '  swap-model status       # show current state' \
        '' \
        'Behaviour: stops conflicting services (frees the 110 GB GPU arena),' \
        'starts the target, polls its /health until it returns 200. Wait timeout' \
        "defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT." \
        '' \
        'Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with' \
        'each other. 235B and comfyui coexist with nothing. See' \
        'compose/qwen3-235b/README.md for arena math.'
}

# --- Main --------------------------------------------------------------------
# Dispatch on the first argument. `status` and the help spellings (including
# no argument at all) are handled here and exit immediately; swap targets
# fall through to the plan/execute steps below; anything else is a usage
# error (exit status 2).
TARGET="${1:-}"
case "$TARGET" in
    coder|235b|kimi|qwable|comfyui|none) ;;
    status)         show_status ; exit 0 ;;
    -h|--help|help|"") usage ; exit 0 ;;
    *)
        echo "swap-model: unknown target '$TARGET'" >&2
        echo "Try: swap-model help" >&2
        exit 2
        ;;
esac

# Resolve the target into the UP / DOWN arrays, then stop the conflicting
# services before starting the target.
plan "$TARGET"
echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. On bash older
# than 4.4, a bare "${arr[@]}" of an empty array is an unbound-variable error
# under `set -u`, which would abort e.g. `swap-model none` (empty UP).
for svc in ${DOWN[@]+"${DOWN[@]}"}; do down_svc "$svc"; done
for svc in ${UP[@]+"${UP[@]}"};     do up_svc   "$svc"; done

echo
show_status