progress 235b
This commit is contained in:
77
pyinfra/framework/compose/comfyui.yml
Normal file
77
pyinfra/framework/compose/comfyui.yml
Normal file
@@ -0,0 +1,77 @@
|
||||
# ComfyUI on Strix Halo gfx1151 via kyuz0/amd-strix-halo-comfyui.
#
# Toolbox-style image (Fedora rawhide + ROCm) with /bin/bash as CMD.
# We override the entrypoint to launch ComfyUI's main.py with the flag
# set gfx1151 needs:
#   --disable-mmap  mmap >64 GB is slow on ROCm
#   --bf16-vae      avoids VAE OOM
#   --cache-none    keeps unified-memory pressure manageable
# NOTE(review): --gpu-only and --disable-smart-memory are also passed
# below, but their rationale is not recorded anywhere in this file —
# document them here or drop them.
#
# Coexistence with other services. ComfyUI competes for GPU with
# kimi-linear (always-resident) and ollama (loads-on-demand). To avoid
# silent contention this stack is NOT set to restart automatically —
# bring it up manually (`docker compose up -d`) when you need image gen,
# and `docker compose down` after. Mid-term we'll add a
# load-shed/coordination layer; until then this comment is the only
# guard.
#
# Pin: kyuz0/amd-strix-halo-comfyui:20260213-143435 (sha-7242b4d). Bump
# deliberately after re-validating Flux/HiDream/LTX2 still work.
services:
  comfyui:
    image: kyuz0/amd-strix-halo-comfyui:20260213-143435
    container_name: comfyui
    # Explicit no auto-restart — see header note about GPU contention.
    # Quoted on purpose: a bare `no` is parsed as a YAML boolean.
    restart: "no"
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the Fedora-rawhide base, but GIDs need to match
    # the host for /dev/kfd + /dev/dri access.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's IPC
    # namespace (and its /dev/shm), so shm_size probably has no effect —
    # confirm and drop whichever of the two is not needed.
    shm_size: 16g
    ipc: host
    environment:
      # Same unified-memory recipe as kimi-linear.yml: BIOS UMA=0.5 GB +
      # ttm.pages_limit=33554432 cmdline + this triple. Without these,
      # PyTorch's HIP allocator only sees the tiny 0.5 GB UMA pool and
      # can't reach GTT. The kyuz0 image is built against native gfx1151
      # so HSA_OVERRIDE_GFX_VERSION isn't needed.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9
    volumes:
      # All ComfyUI state lives under /srv/docker/comfyui/ on the host.
      # Image's $HOME is /root (Fedora rawhide). Models go in subdirs
      # under comfy-models/ (text_encoders/, vae/, checkpoints/,
      # diffusion_models/, unet/, loras/, clip_vision/) — kyuz0's image
      # populates extra_model_paths.yaml pointing at $HOME/comfy-models.
      - /srv/docker/comfyui/models:/root/comfy-models
      - /srv/docker/comfyui/output:/root/comfy-outputs
      - /srv/docker/comfyui/custom_nodes:/opt/ComfyUI/custom_nodes
      - /srv/docker/comfyui/workflows:/opt/ComfyUI/user/default/workflows
    ports:
      # 8188 = standard ComfyUI port. kyuz0's banner alias uses 8000 but
      # that would collide with vLLM (compose/kimi-linear.yml).
      - "8188:8188"
    # bash -lc loads /etc/profile.d/01-rocm-envs.sh (TORCH_ROCM_AOTRITON,
    # TORCH_BLAS_PREFER_HIPBLASLT) — without a login shell those don't
    # apply and ROCm perf regresses.
    entrypoint: ["/bin/bash", "-lc"]
    # set_extra_paths.sh writes /opt/ComfyUI/extra_model_paths.yaml so
    # ComfyUI finds models under $HOME/comfy-models. Idempotent — safe
    # to run every start. Without it, model dropdowns in the UI are
    # empty and templates report "missing model".
    #
    # The whole script is ONE list item because `bash -lc` takes the
    # script as a single argument. `>-` folds the lines below into one
    # space-separated line and strips the trailing newline; keep every
    # continuation line at the same indent, otherwise folding stops and
    # the flags end up on separate shell lines.
    command:
      - >-
        /opt/set_extra_paths.sh &&
        cd /opt/ComfyUI && python main.py
        --listen 0.0.0.0 --port 8188
        --output-directory /root/comfy-outputs
        --disable-mmap --gpu-only --disable-smart-memory
        --cache-none --bf16-vae
|
||||
121
pyinfra/framework/compose/comfyui/README.md
Normal file
121
pyinfra/framework/compose/comfyui/README.md
Normal file
@@ -0,0 +1,121 @@
|
||||
# comfyui
|
||||
|
||||
ComfyUI for image generation on the Strix Halo box via
|
||||
`kyuz0/amd-strix-halo-comfyui` (battle-tested gfx1151 toolbox). Web UI
|
||||
at `http://framework:8188`. **Not** auto-started — bring up manually so
|
||||
it doesn't contend with kimi-linear for GPU.
|
||||
|
||||
## Coexistence notes (read first)
|
||||
|
||||
ComfyUI competes for GPU memory with the always-resident kimi-linear
|
||||
(vLLM) and on-demand ollama. The stack reflects this:
|
||||
|
||||
- `restart: "no"` — won't come back on box reboot. You start it.
|
||||
- Stop kimi-linear before heavy ComfyUI work, or accept slower generation while the two contend for GPU memory.
|
||||
- Typical session: `docker compose up -d` here → use the UI → `docker compose down`.
|
||||
|
||||
## Prereqs
|
||||
|
||||
- Pyinfra deploy has run (creates `/srv/docker/comfyui/{models,output,
|
||||
custom_nodes,workflows}` with the right perms).
|
||||
- BIOS UMA at 0.5 GB + ttm.pages_limit cmdline active (same recipe as
|
||||
kimi-linear). Verify with `cat /proc/cmdline | grep ttm.pages_limit`.
|
||||
|
||||
## Bring up
|
||||
|
||||
```sh
|
||||
cd /srv/docker/comfyui
|
||||
docker compose pull # ~8-12 GB image
|
||||
docker compose up -d
|
||||
docker compose logs -f # wait for "To see the GUI go to: http://0.0.0.0:8188"
|
||||
./smoke.sh
|
||||
```
|
||||
|
||||
Open `http://framework:8188` in a browser. The blank workflow loads.
|
||||
The model dropdowns stay empty until you add models — see the next section.
|
||||
|
||||
## Adding Flux.1-Dev (CF-P1)
|
||||
|
||||
Flux.1-Dev is gated on HF — accept the license at
|
||||
<https://huggingface.co/black-forest-labs/FLUX.1-dev> first, set
|
||||
`HF_TOKEN` in your shell.
|
||||
|
||||
```sh
|
||||
export HF_TOKEN=...
|
||||
|
||||
# Diffusion model (UNet)
|
||||
hf download black-forest-labs/FLUX.1-dev \
|
||||
flux1-dev.safetensors \
|
||||
--local-dir /srv/docker/comfyui/models/diffusion_models
|
||||
|
||||
# VAE
|
||||
hf download black-forest-labs/FLUX.1-dev \
|
||||
ae.safetensors \
|
||||
--local-dir /srv/docker/comfyui/models/vae
|
||||
|
||||
# Text encoders (CLIP-L + T5XXL)
|
||||
hf download comfyanonymous/flux_text_encoders \
|
||||
clip_l.safetensors t5xxl_fp8_e4m3fn.safetensors \
|
||||
--local-dir /srv/docker/comfyui/models/text_encoders
|
||||
```
|
||||
|
||||
We download the fp8 T5-XXL encoder (~5 GB) rather than fp16 (~9 GB) — generally indistinguishable
|
||||
quality, much faster. Refresh the UI; the canonical Flux txt2img
|
||||
workflow is at File → Load → flux-txt2img.
|
||||
|
||||
## Model dir layout
|
||||
|
||||
```
|
||||
/srv/docker/comfyui/models/
|
||||
├── checkpoints/ # full pipeline checkpoints (SDXL, SD1.5)
|
||||
├── diffusion_models/ # standalone UNet/transformer (Flux, HiDream, etc.)
|
||||
├── vae/ # VAEs
|
||||
├── text_encoders/ # CLIP-L, T5XXL, etc.
|
||||
├── loras/
|
||||
├── controlnet/
|
||||
├── clip_vision/
|
||||
└── upscale_models/
|
||||
```
|
||||
|
||||
The image's `extra_model_paths.yaml` maps these into ComfyUI's load
|
||||
paths automatically.
|
||||
|
||||
## Operations
|
||||
|
||||
```sh
|
||||
docker compose logs -f # tail
|
||||
docker compose restart comfyui # reload
|
||||
docker compose down # stop
|
||||
docker compose exec comfyui bash # shell in
|
||||
amdgpu_top # GPU view on host
|
||||
./smoke.sh # health check
|
||||
```
|
||||
|
||||
## Pin manifest
|
||||
|
||||
| Component | Pin |
|
||||
|---|---|
|
||||
| Image | `kyuz0/amd-strix-halo-comfyui:20260213-143435` (sha-7242b4d) |
|
||||
| Default port | 8188 (host) → 8188 (container) |
|
||||
| Flux text encoder quant | fp8_e4m3fn (override to fp16 if you observe quality loss) |
|
||||
|
||||
Bump the image pin deliberately after re-validating that Flux + any
|
||||
custom nodes still work.
|
||||
|
||||
## Known caveats
|
||||
|
||||
- **`--disable-mmap` is mandatory** (kyuz0 README: "mmap above 64 GB is
|
||||
currently very slow due to a ROCm issue"). Already set in compose.
|
||||
- **No `controlnet/` subdir created by `set_extra_paths.sh`** — we add
|
||||
it manually via pyinfra. If kyuz0 adds it to their script, our
|
||||
pyinfra dir creation is harmless idempotent overhead.
|
||||
- **Custom nodes can write to `/opt/ComfyUI/custom_nodes`** — that dir
|
||||
is bind-mounted RW so installs persist across container recreates.
|
||||
- **Flux.2 crashes on Strix Halo ROCm** as of 2026-05; stick with
|
||||
Flux.1-Dev or HiDream-O1 until upstream patch lands.
|
||||
|
||||
## Status
|
||||
|
||||
CF-P0 in progress (this commit). CF-P1 (Flux first image), CF-P2
|
||||
(productionize), CF-P3 (workflow library + OpenWebUI hook), CF-P4
|
||||
(HiDream, LTX-2, LoRAs) — see top-level task list.
|
||||
20
pyinfra/framework/compose/comfyui/smoke.sh
Executable file
20
pyinfra/framework/compose/comfyui/smoke.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env bash
# Health probe for a running ComfyUI container. Two GETs:
#   /system_stats — pretty-printed as JSON (device + memory info)
#   /object_info  — reduced to a count of its top-level entries
# Any HTTP error or malformed JSON aborts the script (set -e + pipefail).
# Override the target with COMFYUI_HOST=host:port.
set -euo pipefail

HOST="${COMFYUI_HOST:-127.0.0.1:8188}"

# fetch <path> — GET <path> from the ComfyUI server; -f turns HTTP
# errors into a non-zero exit, -sS hides progress but keeps errors.
fetch() {
  curl -fsS "http://$HOST$1"
}

echo "[smoke] GET /system_stats on $HOST"
fetch /system_stats | python3 -m json.tool

echo
echo "[smoke] GET /object_info — node count"
fetch /object_info | python3 -c "
import json, sys
data = json.load(sys.stdin)
print(f'{len(data)} nodes registered')
"

echo
echo "[smoke] passed"
|
||||
@@ -17,6 +17,14 @@ services:
|
||||
# 7575 picked to avoid the soup of 30xx ports already in use
|
||||
# (OpenWebUI 3000, OpenLIT 3001, OpenHands 3030).
|
||||
- "7575:3000"
|
||||
extra_hosts:
|
||||
# Required for customapi widgets to reach host services. Inside the
|
||||
# container, `framework` resolves to 127.0.1.1 (Ubuntu hostname
|
||||
# loopback alias), which is the container itself. Use
|
||||
# host.docker.internal in widget.url fields to route via the host
|
||||
# gateway. The user-facing `href` fields keep `framework:` since
|
||||
# those resolve on the user's browser, not the container.
|
||||
- "host.docker.internal:host-gateway"
|
||||
volumes:
|
||||
- /srv/docker/homepage/config:/app/config
|
||||
# Read-only docker socket so homepage can render container status
|
||||
|
||||
@@ -10,9 +10,21 @@
|
||||
description: Local model server (Qwen3-Coder-30B and friends)
|
||||
server: localhost-docker
|
||||
container: ollama
|
||||
# Built-in `type: ollama` widget is missing on the installed
|
||||
# Homepage version. customapi against /api/ps gives a better
|
||||
# signal anyway: actually-loaded model + its VRAM footprint.
|
||||
# When no model is loaded the models array is empty and fields
|
||||
# render as N/A — that itself is useful state.
|
||||
widget:
|
||||
type: ollama
|
||||
url: http://framework:11434
|
||||
type: customapi
|
||||
url: http://host.docker.internal:11434/api/ps
|
||||
refreshInterval: 30000
|
||||
mappings:
|
||||
- field: models.0.name
|
||||
label: Loaded
|
||||
- field: models.0.size_vram
|
||||
label: VRAM
|
||||
format: bytes
|
||||
|
||||
- llama.cpp:
|
||||
icon: si-llama
|
||||
@@ -23,18 +35,50 @@
|
||||
# No native widget; a ping check confirms liveness.
|
||||
widget:
|
||||
type: customapi
|
||||
url: http://framework:8080/health
|
||||
url: http://host.docker.internal:8080/health
|
||||
refreshInterval: 30000
|
||||
mappings:
|
||||
- field: status
|
||||
label: Status
|
||||
|
||||
- vLLM:
|
||||
- vLLM (Kimi-Linear):
|
||||
icon: mdi-server-network
|
||||
href: http://framework:8000
|
||||
description: Batched OpenAI-compatible serving (ROCm)
|
||||
description: Batched OpenAI-compatible serving — Kimi-Linear-48B-A3B (long-context)
|
||||
server: localhost-docker
|
||||
container: vllm
|
||||
# Actual vLLM container is `kimi-linear` (compose/kimi-linear.yml).
|
||||
# The legacy `vllm` container in compose/vllm.yml is an unused stub.
|
||||
container: kimi-linear
|
||||
widget:
|
||||
type: customapi
|
||||
url: http://host.docker.internal:8000/v1/models
|
||||
refreshInterval: 30000
|
||||
mappings:
|
||||
- field: data.0.id
|
||||
label: Served
|
||||
- field: data.0.max_model_len
|
||||
label: Context
|
||||
format: number
|
||||
|
||||
- ComfyUI:
|
||||
icon: mdi-image-edit
|
||||
href: http://framework:8188
|
||||
description: Image generation (Flux.1-Dev via kyuz0 gfx1151 toolbox)
|
||||
server: localhost-docker
|
||||
container: comfyui
|
||||
# ComfyUI's /system_stats returns nested {system, devices[0]}.
|
||||
# Surfacing version + free VRAM gives a quick "is it healthy
|
||||
# and does it have memory" read at a glance.
|
||||
widget:
|
||||
type: customapi
|
||||
url: http://host.docker.internal:8188/system_stats
|
||||
refreshInterval: 30000
|
||||
mappings:
|
||||
- field: system.comfyui_version
|
||||
label: Version
|
||||
- field: devices.0.vram_free
|
||||
label: VRAM Free
|
||||
format: bytes
|
||||
|
||||
- Agent UIs:
|
||||
- OpenWebUI:
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
|
||||
# are actually `compressed-tensors` int4 group-quantized — see config.json.
|
||||
# Download with:
|
||||
# huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
|
||||
# hf download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
|
||||
# --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
|
||||
# Size: ~35 GB on disk (4-bit). 8-bit variant is ~54 GB if quality drives
|
||||
# us up later; both fit 128 GB unified comfortably.
|
||||
|
||||
@@ -13,14 +13,14 @@ Smoke-test below confirms all three at once.
|
||||
|
||||
- Pyinfra deploy has run (`./run.sh` from `pyinfra/framework/`) — gives
|
||||
you `/srv/docker/kimi-linear/`, GPU group membership, `/models/`
|
||||
layout, and `huggingface-cli` on the box.
|
||||
- Hugging Face CLI authenticated (`huggingface-cli login`) if the
|
||||
layout, and the `hf` CLI on the box.
|
||||
- Hugging Face CLI authenticated (`hf auth login`) if the
|
||||
weights repo gates downloads. cyankiwi's repo is currently public.
|
||||
|
||||
## Step 1 — Download weights
|
||||
|
||||
```sh
|
||||
huggingface-cli download \
|
||||
hf download \
|
||||
cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
|
||||
--local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
|
||||
```
|
||||
|
||||
65
pyinfra/framework/compose/litellm.yml
Normal file
65
pyinfra/framework/compose/litellm.yml
Normal file
@@ -0,0 +1,65 @@
|
||||
# LiteLLM proxy — single OpenAI-compatible endpoint in front of all the
# local model backends on this box (Ollama 11434, llama.cpp 30B 8080,
# vLLM Kimi-Linear 8000, llama.cpp Qwen3-235B 8081).
#
# Why this exists. With ≥3 backends running and ≥2 client harnesses
# (opencode on Mac, OpenHands on the box, future orchestrator on another
# server), each client otherwise carries its own per-backend config.
# LiteLLM centralizes: model_name → backend_url mapping lives here once,
# clients just speak "model: qwen3-235b" to a single URL.
#
# Routing model is documented in compose/litellm/README.md — opencode
# stays direct-wired for now (fewer hops, simpler debug); OpenHands +
# the future orchestrator will point here.
#
# Backend reachability. `extra_hosts: host.docker.internal:host-gateway`
# resolves to the host's docker0 IP from inside this container, which
# is how it reaches the other compose services published on host ports.
# Don't use container_name-based DNS — those containers live on separate
# bridge networks (each compose stack has its own).
services:
  litellm:
    image: ghcr.io/berriai/litellm:main-stable
    container_name: litellm
    restart: unless-stopped
    extra_hosts:
      # On Linux, `host-gateway` is Docker's magic alias for the host's
      # docker0 IP — equivalent to host.docker.internal on Mac/Windows.
      # Lets LiteLLM dial localhost-bound backends as
      # http://host.docker.internal:<port>.
      - "host.docker.internal:host-gateway"
    environment:
      # Master key. LiteLLM requires one for admin endpoints + serves
      # as the default Bearer for client requests. Sibling .env file
      # holds the value (created by pyinfra as a placeholder; you fill
      # it in on first deploy). Same pattern as compose/beszel.yml.
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}
      # Optional: salt for hashing virtual keys at rest. Unused in the
      # single-user setup but LiteLLM logs a warning without it.
      - LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-sk-localgenai-salt}
    volumes:
      # Source-of-truth config lives in the repo; pyinfra syncs it to
      # /srv/docker/litellm/config.yaml on every `./run.sh`. Don't edit
      # on the box — drift gets overwritten.
      - /srv/docker/litellm/config.yaml:/app/config.yaml:ro
    ports:
      - "4000:4000"
    command:
      - --config
      - /app/config.yaml
      - --port
      - "4000"
      # --num_workers 1 keeps memory minimal; LiteLLM is I/O-bound here,
      # not CPU-bound. Bump if you see queueing.
      - --num_workers
      - "1"
    healthcheck:
      # LiteLLM exposes both /health (verifies all backends are reachable
      # — heavy) and /health/readiness (just the proxy itself — cheap).
      # Use readiness for the compose healthcheck so a stopped backend
      # doesn't mark LiteLLM unhealthy.
      #
      # Probe with Python rather than curl: the check runs INSIDE the
      # container, curl is not guaranteed to be installed in the image,
      # and a missing binary would leave the container permanently
      # "unhealthy". urlopen raises (→ non-zero exit) on connection
      # failure and on HTTP >= 400. Its 4 s timeout stays under the 5 s
      # compose timeout so the failure is reported by the probe itself.
      # NOTE(review): confirm `python3` is on PATH in the pinned image.
      test:
        - CMD
        - python3
        - -c
        - "import urllib.request; urllib.request.urlopen('http://127.0.0.1:4000/health/readiness', timeout=4)"
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
|
||||
122
pyinfra/framework/compose/litellm/README.md
Normal file
122
pyinfra/framework/compose/litellm/README.md
Normal file
@@ -0,0 +1,122 @@
|
||||
# litellm
|
||||
|
||||
OpenAI-compatible router in front of all the local model backends on
|
||||
this box. One endpoint (`http://framework:4000/v1`) → four backends
|
||||
(Ollama, llama.cpp 30B, vLLM Kimi, llama.cpp 235B). Maintained from
|
||||
[`config.yaml`](config.yaml) in this repo; edits land on the box via
|
||||
`./run.sh` + `docker compose restart litellm`.
|
||||
|
||||
## When to route through LiteLLM vs direct
|
||||
|
||||
| Client | Recommended | Why |
|
||||
|---|---|---|
|
||||
| **opencode (Mac)** | Direct providers in `opencode.json` | One Mac, four providers fits cleanly in opencode's config schema. Adds no debugging burden. |
|
||||
| **OpenHands (the box)** | Via LiteLLM | OpenHands wants a single `LLM_BASE_URL` + `LLM_MODEL`; switching between qwen3-coder and qwen3-235b for different tasks is a config flip, not a re-wire. |
|
||||
| **Future orchestrator (other server)** | Via LiteLLM | Same reason as OpenHands. Also: one URL to firewall/Tailscale-expose. |
|
||||
| **OpenWebUI** | Direct (already wired to vLLM) | No reason to add a hop. |
|
||||
| **Quick curl / scripts** | Either | LiteLLM is convenient because you don't need to remember port numbers. |
|
||||
|
||||
The trade-off: a hop adds a few ms per request (negligible at our tok/s)
|
||||
and one more thing to debug when something breaks. The win is config
|
||||
centralization.
|
||||
|
||||
## Prereqs
|
||||
|
||||
- Pyinfra deploy has run (creates `/srv/docker/litellm/{config.yaml,.env}`).
|
||||
- Fill in `/srv/docker/litellm/.env`:
|
||||
```
|
||||
LITELLM_MASTER_KEY=sk-<choose-something-stable>
|
||||
LITELLM_SALT_KEY=sk-<choose-something-stable>
|
||||
```
|
||||
Mode 640 root:docker — readable by docker group at compose-parse time,
|
||||
not world-readable.
|
||||
- At least one backend running. LiteLLM itself comes up regardless and
|
||||
`/v1/models` still lists every configured model, but requests routed to a stopped backend fail.
|
||||
|
||||
## Bring up
|
||||
|
||||
```sh
|
||||
cd /srv/docker/litellm
|
||||
docker compose up -d
|
||||
docker compose logs -f # wait for "Application startup complete"
|
||||
|
||||
./smoke.sh # /health/readiness + /v1/models
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
From any client — pass `LITELLM_MASTER_KEY` as the Bearer token:
|
||||
|
||||
```sh
|
||||
# List configured models
|
||||
curl -fsS http://framework:4000/v1/models \
|
||||
-H "Authorization: Bearer $LITELLM_MASTER_KEY" \
|
||||
| jq '.data[].id'
|
||||
|
||||
# Chat completion (qwen3-coder via the qwen3-coder model_name in config)
|
||||
curl -fsS http://framework:4000/v1/chat/completions \
|
||||
-H "Authorization: Bearer $LITELLM_MASTER_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "qwen3-coder", "messages": [{"role": "user", "content": "ping"}]}' \
|
||||
| jq '.choices[0].message.content'
|
||||
|
||||
# Long-task model (assumes you stopped 30B/Kimi and brought qwen3-235b up)
|
||||
curl -fsS http://framework:4000/v1/chat/completions \
|
||||
-H "Authorization: Bearer $LITELLM_MASTER_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model": "qwen3-235b", "messages": [{"role": "user", "content": "Plan a refactor of foo.py"}], "max_tokens": 2000}'
|
||||
```
|
||||
|
||||
`/health` (no `/readiness`) hits every backend — useful for diagnosing
|
||||
which one is down:
|
||||
|
||||
```sh
|
||||
curl -fsS http://framework:4000/health \
|
||||
-H "Authorization: Bearer $LITELLM_MASTER_KEY" | jq
|
||||
```
|
||||
|
||||
## Backend down semantics
|
||||
|
||||
LiteLLM does **not** chain fallbacks across our model_list — each
|
||||
`model_name` maps to exactly one backend. If `qwen3-235b` is down,
|
||||
requests for `model: qwen3-235b` return 503. That's deliberate:
|
||||
|
||||
- Silent fallback `qwen3-235b → qwen3-coder` would be confusing
|
||||
(different model, different quality, different latency profile).
|
||||
- The 235B has a manual-start workflow on purpose (GPU contention).
|
||||
Fail-fast surfaces the "you forgot to bring it up" case immediately.
|
||||
|
||||
If you want explicit fallback for a specific harness, add it client-side
|
||||
(OpenHands has `LLM_FALLBACKS=...`).
|
||||
|
||||
## Adding a model
|
||||
|
||||
Edit [`config.yaml`](config.yaml) → `./run.sh` from `pyinfra/framework/`
|
||||
on the Mac → `docker compose restart litellm` on the box. New model
|
||||
appears in `/v1/models` immediately. No state to migrate (database_url
|
||||
is null).
|
||||
|
||||
## Operations
|
||||
|
||||
```sh
|
||||
docker compose logs -f # tail
|
||||
docker compose restart litellm # reload after config edit
|
||||
docker compose down # stop
|
||||
./smoke.sh # health + model list
|
||||
```
|
||||
|
||||
## Pin manifest
|
||||
|
||||
| Component | Pin |
|
||||
|---|---|
|
||||
| Image | `ghcr.io/berriai/litellm:main-stable` |
|
||||
| Default port | 4000 |
|
||||
| Auth | Master key only (no virtual keys, no database) |
|
||||
| Backends | qwen3-coder (Ollama), qwen3-coder-llama, kimi-linear (vLLM), qwen3-235b |
|
||||
|
||||
## Status
|
||||
|
||||
M1 — compose + config artifacts written; awaiting box-side bring-up.
|
||||
M2 will wire OpenHands' `LLM_BASE_URL` to this endpoint. M3 (the
|
||||
orchestrator on the other server) will point at this endpoint over
|
||||
Tailscale.
|
||||
73
pyinfra/framework/compose/litellm/config.yaml
Normal file
73
pyinfra/framework/compose/litellm/config.yaml
Normal file
@@ -0,0 +1,73 @@
|
||||
# LiteLLM model routing. model_name is what clients request; the
# litellm_params block is how LiteLLM reaches the backend.
#
# `model: openai/<served-name>` tells LiteLLM to use its
# openai-compatible adapter and forward <served-name> to the backend.
# api_base is the backend's /v1 root reachable from inside the LiteLLM
# container (host.docker.internal = host's docker0 IP via the
# extra_hosts entry in litellm.yml).
#
# Backend running-state matters: requests to a stopped backend return
# 503/connection-refused. By design — no fallback chain, since these
# backends compete for GPU and silently routing "qwen3-235b" to the 30B
# would be more confusing than failing fast.
#
# Edits here require `./run.sh` on the Mac to push to the box, then
# `docker compose restart litellm` on the box to reload.

model_list:
  # Daily-driver coding model. Ollama with gfx1100-coerced ROCm —
  # currently the default opencode provider. Always-resident
  # (OLLAMA_KEEP_ALIVE=24h).
  - model_name: qwen3-coder
    litellm_params:
      model: openai/qwen3-coder:30b
      api_base: http://host.docker.internal:11434/v1
      # Placeholder key for the openai-compatible adapter; same value is
      # used for every backend below.
      api_key: dummy

  # Same weights as qwen3-coder above but served via llama.cpp on the
  # kyuz0 rocm-7.2.2 image (native gfx1151 + rocWMMA). LL-P0 measures
  # whether the eval_tps win justifies switching default opencode to
  # this. Manual start until then.
  - model_name: qwen3-coder-llama
    litellm_params:
      model: openai/qwen3-coder
      api_base: http://host.docker.internal:8080/v1
      api_key: dummy

  # Long-context chat (no tool calling) via vLLM. P0 verified at 32K;
  # context ramp tracked in kimi-linear/NEXT_STEPS.md.
  - model_name: kimi-linear
    litellm_params:
      model: openai/kimi-linear
      api_base: http://host.docker.internal:8000/v1
      api_key: dummy

  # Long-task model — Qwen3-235B-A22B-Instruct-2507 UD-Q2_K_XL via
  # llama.cpp on port 8081. ~5-10 tok/s decode; manual start only
  # (can't coexist with the other GPU services). Requests will fail
  # with connection refused when the container is down — that's the
  # intended UX: a stopped service is a clear signal.
  - model_name: qwen3-235b
    litellm_params:
      model: openai/qwen3-235b
      api_base: http://host.docker.internal:8081/v1
      api_key: dummy

litellm_settings:
  # Don't let LiteLLM silently strip request params it considers
  # unsupported for a provider. Our backends vary (vLLM, llama.cpp,
  # Ollama) and each accepts a slightly different superset; we'd rather
  # get an explicit error than have a param quietly filtered out.
  # NOTE(review): `false` is also LiteLLM's documented default — kept
  # explicit so the intent is visible and survives a default change.
  drop_params: false
  # Response caching off. NOTE(review): `cache` controls LiteLLM's
  # LLM-response cache, not the /v1/models listing — that list is built
  # from model_list above.
  cache: false
  # No verbose/debug logging; tail the normal proxy logs with
  # `docker compose logs litellm`.
  set_verbose: false

general_settings:
  # Disable LiteLLM's database mode — we're stateless. No user/key/spend
  # tracking needed for a single-user trusted LAN setup.
  database_url: null
|
||||
50
pyinfra/framework/compose/litellm/smoke.sh
Normal file
50
pyinfra/framework/compose/litellm/smoke.sh
Normal file
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env bash
# Smoke-test the LiteLLM proxy. /health/readiness for liveness, /v1/models
# for the configured backends, then /health (which dials every backend)
# to surface which ones are actually reachable right now.
#
# Environment:
#   LITELLM_HOST        host:port of the proxy (default 127.0.0.1:4000)
#   LITELLM_MASTER_KEY  Bearer token; read from a .env file if unset
#
# Exits non-zero if the key cannot be found, if any request fails
# (curl -f), or if a response is not valid JSON. Unhealthy backends do
# NOT fail the script — they are only reported.
set -euo pipefail

HOST="${LITELLM_HOST:-127.0.0.1:4000}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Read master key from a .env file if it isn't already in the environment.
# The sibling .env is tried first: the script is run from
# /srv/docker/litellm and the key lives in /srv/docker/litellm/.env. The
# parent directory is kept as a fallback because the original script
# looked there.
if [[ -z "${LITELLM_MASTER_KEY:-}" ]]; then
  for env_file in "$SCRIPT_DIR/.env" "$SCRIPT_DIR/../.env"; do
    if [[ -f "$env_file" ]]; then
      # shellcheck disable=SC1090
      source "$env_file"
      break
    fi
  done
fi
if [[ -z "${LITELLM_MASTER_KEY:-}" ]]; then
  echo "[smoke] LITELLM_MASTER_KEY not set — export it or populate /srv/docker/litellm/.env" >&2
  exit 1
fi

echo "[smoke] GET /health/readiness on $HOST (proxy alive?)"
curl -fsS "http://$HOST/health/readiness" \
  -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
  | python3 -m json.tool

echo
echo "[smoke] GET /v1/models (configured model_names)"
curl -fsS "http://$HOST/v1/models" \
  -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
  | python3 -c "
import json, sys
r = json.load(sys.stdin)
for m in r.get('data', []):
    print(f\"  - {m.get('id', '?')}\")"

echo
echo "[smoke] GET /health (each backend's reachability — slow, ~10s)"
# str(...) around the error: it is sliced to 80 chars, and slicing a
# non-string value (None, dict) would raise TypeError and fail the run.
curl -fsS "http://$HOST/health" \
  -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
  | python3 -c "
import json, sys
r = json.load(sys.stdin)
healthy = r.get('healthy_endpoints', [])
unhealthy = r.get('unhealthy_endpoints', [])
print(f'  healthy:   {len(healthy)}')
for e in healthy:
    print(f'    + {e.get(\"model\", \"?\")}')
print(f'  unhealthy: {len(unhealthy)}')
for e in unhealthy:
    print(f'    - {e.get(\"model\", \"?\")}: {str(e.get(\"error\", \"?\"))[:80]}')"

echo
echo "[smoke] passed — proxy up, model list populated. Unhealthy backends are expected if their compose stacks are down."
|
||||
@@ -1,44 +1,101 @@
|
||||
# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
|
||||
# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
|
||||
# https://github.com/kyuz0/amd-strix-halo-toolboxes
|
||||
#
|
||||
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
|
||||
# vulkan-radv — most stable, recommended default (this one)
|
||||
# vulkan-amdvlk — alternate Vulkan driver, sometimes faster
|
||||
# rocm-7.2.2 — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
|
||||
# rocm-6.4.4 — ROCm 6.x fallback
|
||||
# rocm-7.2.2 — ROCm 7.x, native gfx1151 + rocWMMA (this one;
|
||||
# best perf for Qwen3-Coder-class models)
|
||||
# vulkan-radv — most-stable Vulkan; fallback if ROCm regresses
|
||||
# vulkan-amdvlk — alternate Vulkan driver
|
||||
# rocm-6.4.4 — older ROCm; only if 7.2.2 breaks
|
||||
# rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
|
||||
#
|
||||
# Toolbox images use a shell entrypoint, so we override to launch
|
||||
# llama-server directly. Edit the --model path before `docker compose up -d`.
|
||||
# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL preserves more important
|
||||
# weights at higher precision than naive Q4_K_M, closer to Q5 quality at
|
||||
# Q4 size. Download path on the box (see compose/llama/README.md):
|
||||
# hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
|
||||
# 'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
|
||||
# --local-dir /models/qwen
|
||||
# Verify exact filename in the HF repo before downloading — Unsloth's
|
||||
# file naming varies (sometimes split into shards).
|
||||
#
|
||||
# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
|
||||
# stays the default opencode provider until LL-P0 confirms the eval_tps
|
||||
# bump is real on this box.
|
||||
services:
|
||||
llama:
|
||||
image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
|
||||
image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
|
||||
container_name: llama
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
# ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
|
||||
# only needs dri. Don't drop kfd when on the rocm-* tag.
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri:/dev/dri
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- seccomp=unconfined
|
||||
# Numeric GIDs of host's video (44) and render (991) groups —
|
||||
# required for /dev/kfd + /dev/dri access from inside the container.
|
||||
group_add:
|
||||
- "44"
|
||||
- "991"
|
||||
shm_size: 8g
|
||||
ipc: host
|
||||
environment:
|
||||
# Unified-memory recipe (same as compose/kimi-linear.yml +
|
||||
# compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
|
||||
# ttm.pages_limit cmdline → these flags merge the rocminfo pools
|
||||
# into one ~110 GB arena via the HIP allocator's demand-paging.
|
||||
# kyuz0's image is native gfx1151 so no HSA_OVERRIDE.
|
||||
- HSA_XNACK=1
|
||||
- HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
volumes:
|
||||
- /models:/models:ro
|
||||
ports:
|
||||
- "8080:8080"
|
||||
# Toolbox image drops to shell by default; explicit entrypoint.
|
||||
entrypoint: ["llama-server"]
|
||||
command:
|
||||
- --model
|
||||
- /models/REPLACE/ME/model.gguf
|
||||
- /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
|
||||
# OpenAI-compatible served name (matches what opencode/llm/curl
|
||||
# request as "model"). Keep simple — provider-side name lives
|
||||
# in opencode.json.
|
||||
- --alias
|
||||
- qwen3-coder
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8080"
|
||||
# Push all layers to GPU. "999" is shorthand for "all available."
|
||||
# gfx1151 with 110 GB merged arena fits 30B-class models easily.
|
||||
- --n-gpu-layers
|
||||
- "999"
|
||||
# Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
|
||||
# same across providers. Bump if a workflow needs more; KV cost
|
||||
# at this size is small with q8_0 cache.
|
||||
- --ctx-size
|
||||
- "32768"
|
||||
# Required for GPU backends on Strix Halo per Gygeek's setup
|
||||
# guide. Forces full load into GPU memory rather than mmap.
|
||||
- "65536"
|
||||
# No-mmap is the Strix Halo standard — mmap >64 GB is slow on
|
||||
# ROCm. Forces full GPU load.
|
||||
- --no-mmap
|
||||
# Flash attention — works on Vulkan too; the big win is on the
|
||||
# ROCm tag where kyuz0's build has rocWMMA acceleration.
|
||||
# Flash attention — biggest single win, ~20-40 % faster on MoE.
|
||||
# Modern llama-server takes a value (on/off/auto); bare --flash-attn
|
||||
# is deprecated and consumes the next arg as its value.
|
||||
- --flash-attn
|
||||
- "on"
|
||||
# Quantize KV cache to int8 — halves KV memory at minor / no
|
||||
# quality loss; sometimes faster due to smaller working set.
|
||||
# Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
|
||||
- --cache-type-k
|
||||
- q8_0
|
||||
- --cache-type-v
|
||||
- q8_0
|
||||
# Use the model's embedded jinja chat template (rather than
|
||||
# llama.cpp's hardcoded default). Important for Qwen3-Coder which
|
||||
# has a specific chat format.
|
||||
- --jinja
|
||||
# Expose Prometheus metrics at /metrics — scraped by OpenLIT for
|
||||
# tokens/sec, KV-cache use, queue depth, and request latency.
|
||||
# tokens/sec, KV-cache use, queue depth, request latency.
|
||||
- --metrics
|
||||
|
||||
92
pyinfra/framework/compose/llama/README.md
Normal file
92
pyinfra/framework/compose/llama/README.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# llama
|
||||
|
||||
llama.cpp server with **native gfx1151** kernels via kyuz0's ROCm 7.2.2
|
||||
toolbox. Listens on port 8080, beside Ollama (11434) and vLLM (8000). Same
|
||||
Qwen3-Coder model as Ollama, faster path.
|
||||
|
||||
## Why this exists
|
||||
|
||||
Ollama's bundled ROCm doesn't ship native gfx1151 — we coerce gfx1100
|
||||
kernels via `HSA_OVERRIDE_GFX_VERSION=11.0.0`. kyuz0's image is built
|
||||
against gfx1151 with rocWMMA acceleration. Expected eval_tps delta on
|
||||
Qwen3-Coder-30B-A3B-Q4: **~30-50 % faster**, with ~2× prefill speedup.
|
||||
The compose stub used to be vulkan-radv with a placeholder model path;
|
||||
this rewrite makes it the second working coding endpoint.
|
||||
|
||||
## Bring up (LL-P0 verification)
|
||||
|
||||
```sh
|
||||
# 1. Pull the Unsloth UD-Q4_K_XL Qwen3-Coder GGUF on the box.
|
||||
# Verify the actual filename in the HF repo first — Unsloth's naming
|
||||
# sometimes splits into shards. As of 2026-05 the single-file
|
||||
# UD-Q4_K_XL is ~17-19 GB.
|
||||
hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
|
||||
'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
|
||||
--local-dir /models/qwen
|
||||
|
||||
# 2. Stand up the container.
|
||||
cd /srv/docker/llama
|
||||
docker compose pull # ~6-10 GB image
|
||||
docker compose up -d
|
||||
docker compose logs -f # wait for "main: server is listening on http://0.0.0.0:8080"
|
||||
|
||||
# 3. Smoke + perf measure.
|
||||
./smoke.sh
|
||||
```
|
||||
|
||||
If `predicted_per_second` is meaningfully higher than what Ollama
|
||||
reports for the same prompt, the migration is justified. If it's the
|
||||
same or worse, leave Ollama as the default and treat llama.cpp as a
|
||||
secondary option.
|
||||
|
||||
## Comparison test (vs Ollama)
|
||||
|
||||
Run the same prompt against both for a clean A/B:
|
||||
|
||||
```sh
|
||||
# Ollama
|
||||
curl -s http://framework:11434/api/generate \
|
||||
-d '{"model":"qwen3-coder:30b","prompt":"Write a Python fibonacci function with type hints.","stream":false}' \
|
||||
| jq '{eval_tps:(.eval_count/(.eval_duration/1e9)), prompt_tps:(.prompt_eval_count/(.prompt_eval_duration/1e9))}'
|
||||
|
||||
# llama.cpp (this stack)
|
||||
curl -s http://framework:8080/completion \
|
||||
-d '{"prompt":"Write a Python fibonacci function with type hints.","n_predict":200,"temperature":0}' \
|
||||
| jq '.timings | {predicted_per_second, prompt_per_second}'
|
||||
```
|
||||
|
||||
## Coexistence with Ollama
|
||||
|
||||
Both can run simultaneously — different ports, different model files on
|
||||
disk (Ollama's content-addressed store at `/models/ollama/` vs the raw
|
||||
GGUF at `/models/qwen/`). They will compete for GPU memory if both have
|
||||
their models hot. With `OLLAMA_KEEP_ALIVE=24h` Ollama keeps Qwen3
|
||||
resident; if you want to A/B without contention, `docker exec ollama
|
||||
ollama stop qwen3-coder:30b` while testing llama.cpp.
|
||||
|
||||
If LL-P0 confirms the perf win, LL-P1 wires this as a third opencode
|
||||
provider (`framework-llama/qwen3-coder` alongside `framework/qwen3-coder:30b`
|
||||
and `framework-vllm/kimi-linear`).
|
||||
|
||||
## Pin manifest
|
||||
|
||||
| Component | Pin |
|
||||
|---|---|
|
||||
| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` |
|
||||
| Weights | `unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF` (UD-Q4_K_XL variant) |
|
||||
| Default port | 8080 |
|
||||
| Context | 65536 (matches Ollama config) |
|
||||
|
||||
## Operations
|
||||
|
||||
```sh
|
||||
docker compose logs -f # tail
|
||||
docker compose restart llama # reload
|
||||
docker compose down # stop
|
||||
docker compose exec llama bash # shell in
|
||||
./smoke.sh # health + perf check
|
||||
```
|
||||
|
||||
## Status
|
||||
|
||||
LL-P0 in progress. LL-P1 (opencode provider wire-up) pending verification.
|
||||
45
pyinfra/framework/compose/llama/smoke.sh
Executable file
45
pyinfra/framework/compose/llama/smoke.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
# Smoke-test the running llama-server (kyuz0 rocm-7.2.2). Three steps:
#   1. GET /health for liveness.
#   2. A tiny OpenAI-compatible chat completion.
#   3. A 200-token /completion run whose timings (eval_tps, prompt_tps)
#      can be compared to Ollama directly.
#
# Environment overrides:
#   LLAMA_HOST   host:port of the server     (default 127.0.0.1:8080)
#   LLAMA_MODEL  model name sent to the API  (default qwen3-coder)
#
# Exits non-zero on the first failing step (set -e, curl -f, pipefail).
set -euo pipefail

HOST="${LLAMA_HOST:-127.0.0.1:8080}"
MODEL="${LLAMA_MODEL:-qwen3-coder}"

echo "[smoke] GET /health on $HOST"
curl -fsS "http://$HOST/health" | python3 -m json.tool

echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "{
    \"model\": \"$MODEL\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
    \"max_tokens\": 16,
    \"temperature\": 0.0
  }" | python3 -m json.tool

echo
echo "[smoke] perf measure — eval_tps and prompt_tps"
# Use llama.cpp's native /completion endpoint which returns timings.
# The Python program is single-quoted for bash, so it must not contain
# a single quote anywhere (it uses double quotes throughout).
curl -fsS "http://$HOST/completion" \
  -H 'Content-Type: application/json' \
  -d '{
    "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
    "n_predict": 200,
    "temperature": 0.0,
    "stream": false
  }' | python3 -c '
import json, sys

timings = json.load(sys.stdin).get("timings", {})


def rate(key):
    # A missing or non-numeric field prints "?" rather than crashing
    # the ":.2f" format with a ValueError.
    value = timings.get(key)
    if isinstance(value, (int, float)):
        return f"{value:.2f} tok/s"
    return "?"


print(f"predicted_per_second: {rate(\"predicted_per_second\")}" if False else "predicted_per_second: " + rate("predicted_per_second"))
print("prompt_per_second: " + rate("prompt_per_second"))
print("predicted_n: " + str(timings.get("predicted_n", "?")))
print("prompt_n: " + str(timings.get("prompt_n", "?")))
'

echo
echo "[smoke] passed"
|
||||
@@ -31,6 +31,31 @@ services:
|
||||
# layers between GPU and CPU. 64K keeps the model fully on GPU
|
||||
# while still being plenty for coding contexts.
|
||||
- OLLAMA_CONTEXT_LENGTH=65536
|
||||
# Perf tuning. Flash attention is the biggest single win on MoE
|
||||
# models at long context (20-40 % faster generation). q8_0 KV
|
||||
# cache halves KV memory at minor / no quality loss; sometimes
|
||||
# faster due to smaller working set. The parallel/loaded-models
|
||||
# caps avoid Ollama slicing memory across speculative concurrent
|
||||
# requests we never have.
|
||||
- OLLAMA_FLASH_ATTENTION=1
|
||||
- OLLAMA_KV_CACHE_TYPE=q8_0
|
||||
- OLLAMA_NUM_PARALLEL=1
|
||||
- OLLAMA_MAX_LOADED_MODELS=1
|
||||
# Keep the model resident for 24h instead of the default 5 min.
|
||||
# Avoids cold-start latency between sessions; safe because we cap
|
||||
# max_loaded_models above so memory doesn't drift.
|
||||
- OLLAMA_KEEP_ALIVE=24h
|
||||
# Unified-memory recipe. With BIOS UMA=0.5 GB the dedicated VRAM
|
||||
# pool is tiny; the model lives in GTT (system RAM the GPU borrows
|
||||
# via ttm.pages_limit=33554432 on the kernel cmdline). XNACK +
|
||||
# FINE_GRAIN_PCIE put the HIP allocator into demand-paging mode so
|
||||
# it treats the merged VRAM+GTT pool as one arena. Same flags as
|
||||
# compose/kimi-linear.yml and compose/comfyui.yml — Ollama uses
|
||||
# ggml/llama.cpp underneath but its allocator goes through HIP.
|
||||
# PYTORCH_HIP_ALLOC_CONF is intentionally absent (Ollama isn't
|
||||
# PyTorch).
|
||||
- HSA_XNACK=1
|
||||
- HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
volumes:
|
||||
- /models/ollama:/root/.ollama
|
||||
- /models:/models:ro
|
||||
|
||||
@@ -1,9 +1,14 @@
|
||||
# OpenWebUI — ChatGPT-like web UI in front of Ollama. Pre-configured to
|
||||
# use the host's Ollama instance and the project's SearXNG for web
|
||||
# search. Default port 3000.
|
||||
# use the host's Ollama instance and Kagi for web search. Default port
|
||||
# 3000.
|
||||
#
|
||||
# Persistent state (users, conversations, uploaded docs, RAG vector
|
||||
# index) lives at /srv/docker/openwebui/data so backups touch one path.
|
||||
#
|
||||
# Sibling .env file holds KAGI_API_KEY (single shared key — opencode's
|
||||
# kagimcp MCP on the Mac and this container use the same one). Same
|
||||
# placeholder-then-fill-in-by-hand pattern as compose/beszel.yml and
|
||||
# compose/litellm.yml.
|
||||
services:
|
||||
openwebui:
|
||||
image: ghcr.io/open-webui/open-webui:main
|
||||
@@ -22,9 +27,16 @@ services:
|
||||
# prompt confuses it. OpenWebUI's plain chat UI is the right home.
|
||||
- OPENAI_API_BASE_URLS=http://host.docker.internal:8000/v1
|
||||
- OPENAI_API_KEYS=dummy
|
||||
# Built-in web search via the project's SearXNG instance.
|
||||
# Built-in web search via Kagi. Kagi-specific env var name is
|
||||
# KAGI_SEARCH_API_KEY (OpenWebUI's convention); kagimcp on the Mac
|
||||
# uses KAGI_API_KEY (Kagi's official convention). We standardize on
|
||||
# KAGI_API_KEY in the .env file and let compose interpolate it
|
||||
# into OpenWebUI's expected name here.
|
||||
- ENABLE_RAG_WEB_SEARCH=true
|
||||
- RAG_WEB_SEARCH_ENGINE=searxng
|
||||
- SEARXNG_QUERY_URL=https://searxng.n0n.io/search?q=<query>&format=json
|
||||
- RAG_WEB_SEARCH_ENGINE=kagi
|
||||
- KAGI_SEARCH_API_KEY=${KAGI_API_KEY}
|
||||
# Fallback (commented): the self-hosted SearXNG path. Re-enable by
|
||||
# swapping RAG_WEB_SEARCH_ENGINE back to searxng and uncommenting:
|
||||
# - SEARXNG_QUERY_URL=https://searxng.n0n.io/search?q=<query>&format=json
|
||||
volumes:
|
||||
- /srv/docker/openwebui/data:/app/backend/data
|
||||
|
||||
97
pyinfra/framework/compose/qwen3-235b.yml
Normal file
97
pyinfra/framework/compose/qwen3-235b.yml
Normal file
@@ -0,0 +1,97 @@
|
||||
# Qwen3-235B-A22B-Instruct-2507 (Unsloth UD-Q2_K_XL ~88.8 GB) via the
# kyuz0 rocm-7.2.2 Strix Halo toolbox. Same image + unified-memory
# recipe as compose/llama.yml; the only deltas are model path, port,
# alias, context, and the no-coexist `restart: "no"`.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Coexistence. At ~88.8 GB weights this CANNOT coexist with the
# 30B llama service or Kimi-Linear (vLLM) — the merged GPU arena is
# only ~110 GB. Stop those before bringing this up. Same pattern as
# compose/comfyui.yml: `restart: "no"`, manual start, swap workflow
# documented in compose/qwen3-235b/README.md.
#
# Weights. UD-Q2_K_XL is Unsloth's "Dynamic" quant — important tensors
# kept at higher precision; closer to Q3 quality than naive Q2. 2 shards
# (~50 GB + 38.8 GB); llama.cpp auto-discovers shard 2 from shard 1.
# Download path on the box (see compose/qwen3-235b/README.md):
#   hf download unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF \
#     --include 'UD-Q2_K_XL/*' \
#     --local-dir /models/qwen/Qwen3-235B-A22B-Instruct-2507
#
# Port 8081 — distinct from llama 30B (8080) so opencode/curl/etc. can
# address either explicitly even though only one runs at a time.
#
# Performance target. Bandwidth-bound: 256 GB/s ÷ ~22 GB active-bytes →
# ~5-10 tok/s decode. This is the "overnight long-task" model, NOT the
# interactive driver — see StrixHaloMemory.md for the bandwidth math.
services:
  qwen3-235b:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: qwen3-235b
    # Manual start only — see header note about GPU contention.
    restart: "no"
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear).
      # BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these flags merge the
      # rocminfo pools into one ~110 GB arena. kyuz0's image is native
      # gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models:/models:ro
    ports:
      - "8081:8081"
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/qwen/Qwen3-235B-A22B-Instruct-2507/UD-Q2_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf
      # OpenAI-compatible served name. Provider-side name lives in
      # opencode.json once M0 perf is verified and we wire it up.
      - --alias
      - qwen3-235b
      - --host
      - 0.0.0.0
      - --port
      - "8081"
      - --n-gpu-layers
      - "999"
      # 64K — opencode auto-compaction triggers at ~75-80 % of the
      # stated context limit, so a small ctx fires the summarize-and-
      # rewind loop after only a few turns. 64K roughly doubles how
      # many turns fit. KV at q8_0 ≈ 6.7 GB, budgeted as ~8 GB
      # (94 layers × 4 kv-heads × 128 head-dim × 2 [K+V] × 65536
      # tokens × ~1.06 bytes/elem); arena headroom still ~11 GB.
      # Stretch goal 131072 documented in compose/qwen3-235b/README.md
      # but tight — verify allocator behaviour first.
      - --ctx-size
      - "65536"
      - --no-mmap
      # Flash attention is required for q8_0 KV cache in llama.cpp.
      - --flash-attn
      - "on"
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Qwen3-235B-Instruct-2507 ships its own chat template; let
      # llama.cpp use it rather than the hardcoded default.
      - --jinja
      # Single sequence — KV pool isn't sliced across speculative
      # concurrent requests we'll never have (long-task model, one
      # request at a time).
      - --parallel
      - "1"
      - --metrics
|
||||
163
pyinfra/framework/compose/qwen3-235b/README.md
Normal file
163
pyinfra/framework/compose/qwen3-235b/README.md
Normal file
@@ -0,0 +1,163 @@
|
||||
# qwen3-235b
|
||||
|
||||
Qwen3-235B-A22B-Instruct-2507 on Strix Halo via `kyuz0:rocm-7.2.2`.
|
||||
The "overnight long-task" model — bandwidth math says ~5-10 tok/s
|
||||
decode, so this is for fire-and-forget runs (deep refactors, long-form
|
||||
analysis), **not** interactive coding. Daily driver stays on Ollama /
|
||||
llama 30B.
|
||||
|
||||
OpenAI-compatible endpoint at `http://framework:8081` once running.
|
||||
|
||||
## Coexistence notes (read first)
|
||||
|
||||
At ~88.8 GB weights this can't share the GPU with anything else:
|
||||
|
||||
| Concurrent service | Action |
|
||||
|---|---|
|
||||
| `llama` (Qwen3-Coder-30B, port 8080) | `docker compose down` in `/srv/docker/llama` first |
|
||||
| `kimi-linear` (vLLM, port 8000) | `docker compose down` in `/srv/docker/kimi-linear` first |
|
||||
| `ollama` (port 11434) | `docker exec ollama ollama stop qwen3-coder:30b` (Ollama itself can stay up) |
|
||||
| `comfyui` (port 8188) | `docker compose down` in `/srv/docker/comfyui` first |
|
||||
|
||||
The stack reflects this: `restart: "no"` — won't come back after a box
|
||||
reboot. You start it deliberately.
|
||||
|
||||
## Prereqs
|
||||
|
||||
- Pyinfra deploy has run (creates `/srv/docker/qwen3-235b/` with right perms).
|
||||
- BIOS UMA at 0.5 GB + `ttm.pages_limit=33554432` kernel cmdline active.
|
||||
Verify: `cat /proc/cmdline | grep ttm.pages_limit`.
|
||||
- Other GPU services stopped per the table above.
|
||||
|
||||
## Download weights (M0.1 — ~88.8 GB, 2 shards)
|
||||
|
||||
```sh
|
||||
# /models/qwen exists via pyinfra; just create the model subdir.
|
||||
mkdir -p /models/qwen/Qwen3-235B-A22B-Instruct-2507
|
||||
|
||||
hf download unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF \
|
||||
--include 'UD-Q2_K_XL/*' \
|
||||
--local-dir /models/qwen/Qwen3-235B-A22B-Instruct-2507
|
||||
|
||||
# Files land at:
|
||||
# /models/qwen/Qwen3-235B-A22B-Instruct-2507/UD-Q2_K_XL/
|
||||
# Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf (~50 GB)
|
||||
# Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00002-of-00002.gguf (~38.8 GB)
|
||||
#
|
||||
# llama.cpp auto-discovers shard 2 from shard 1 — only point --model at
|
||||
# the 00001-of-00002 file.
|
||||
```
|
||||
|
||||
Disk: needs ~90 GB free on `/models`. Pull is bandwidth-bound; expect
|
||||
20-60 minutes on a fast home link.
|
||||
|
||||
## Bring up (M0.2 — first generation)
|
||||
|
||||
```sh
|
||||
cd /srv/docker/qwen3-235b
|
||||
docker compose pull # already-cached image if you ran llama first
|
||||
docker compose up -d
|
||||
docker compose logs -f # wait for "main: server is listening on http://0.0.0.0:8081"
|
||||
|
||||
./smoke.sh # /health + tiny generation + perf
|
||||
```
|
||||
|
||||
Expect **2-5 minutes** for first start — llama.cpp has to load ~88 GB
|
||||
of weights off disk into the merged arena. Subsequent starts are faster
|
||||
if the page cache is warm.
|
||||
|
||||
If `./smoke.sh` reports `predicted_per_second` in the 5-10 tok/s range,
|
||||
M0 is verified. Lower than 3 tok/s = something's wrong (likely the GPU
|
||||
arena is < 100 GB — see "Troubleshooting").
|
||||
|
||||
## Ramping context
|
||||
|
||||
Defaults to 64K — chosen because opencode's auto-compaction triggers
|
||||
at ~75-80 % of the stated limit, so a smaller ctx fires the rewrite-
|
||||
the-conversation loop after only a handful of turns. 64K roughly
|
||||
doubles how many turns fit. Stages:
|
||||
|
||||
| Stage | `--ctx-size` | KV (q8_0) | Margin in arena |
|
||||
|---|---|---|---|
|
||||
| Previous (M0) | 32768 | ~4 GB | ~15 GB |
|
||||
| **Current default** | **65536** | **~8 GB** | **~11 GB** |
|
||||
| M0.4 stretch | 131072 | ~16 GB | ~3 GB (tight) |
|
||||
|
||||
Edit `--ctx-size` in `docker-compose.yml`, run `docker compose down && docker compose up -d`,
|
||||
re-run `./smoke.sh`. If you see an alloc error in the logs, dial it back.
|
||||
|
||||
opencode's `limit.context` in `opencode.json` should match — otherwise
|
||||
opencode either compacts too early (limit lower than server) or sends
|
||||
prompts longer than the server can handle (limit higher).
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**OOM on startup.** Check the arena size first:
|
||||
```sh
|
||||
rocminfo | grep -A2 "Pool Info" | head -20
|
||||
```
|
||||
If it reports two ~31 GB pools instead of one ~110 GB arena, the
|
||||
unified-memory recipe didn't apply. Verify (in order):
|
||||
|
||||
1. `cat /proc/cmdline` includes `amdgpu.gttsize=131072 ttm.pages_limit=33554432`
|
||||
2. BIOS UMA Frame Buffer Size is **0.5 GB** (not 64 GB) — Framework BIOS
|
||||
`lfsp0.03.05+`. Counter-intuitive: a tiny UMA frees more pages for GTT.
|
||||
3. Container env shows `HSA_XNACK=1 HSA_FORCE_FINE_GRAIN_PCIE=1` —
|
||||
`docker compose exec qwen3-235b env | grep HSA`.
|
||||
|
||||
If all three are right and OOM persists, drop to Q2_K_L (~85.8 GB) — edit
|
||||
the model path in `docker-compose.yml` after a separate `hf download` of
|
||||
that quant.
|
||||
|
||||
**`predicted_per_second` very low (<3 tok/s).** Likely cold page cache.
|
||||
Re-run `./smoke.sh` once — second run should be in band. If still slow,
|
||||
verify the model file isn't being swapped from disk: `iostat -x 1` should
|
||||
show ~0 read bandwidth during inference.
|
||||
|
||||
**Server starts but answers gibberish.** `--jinja` not picked up; check
|
||||
`docker compose logs qwen3-235b | grep -i 'chat template'`. Should
|
||||
say "using chat template from gguf metadata".
|
||||
|
||||
## Operations
|
||||
|
||||
```sh
|
||||
docker compose logs -f # tail
|
||||
docker compose down # stop (always — coexists with nothing)
|
||||
docker compose exec qwen3-235b bash # shell in
|
||||
./smoke.sh # health + perf
|
||||
amdgpu_top # GPU view on host
|
||||
```
|
||||
|
||||
Suggested cycle:
|
||||
```
|
||||
[evening] stop llama 30B / kimi-linear; up qwen3-235b; submit batch tasks
|
||||
[overnight] qwen3-235b grinds; results land in your harness state
|
||||
[morning] down qwen3-235b; up llama 30B / kimi-linear; back to interactive
|
||||
```
|
||||
|
||||
M3 will automate this swap; M0 does it by hand.
|
||||
|
||||
## Pin manifest
|
||||
|
||||
| Component | Pin |
|
||||
|---|---|
|
||||
| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` (shared with `llama`) |
|
||||
| Weights | `unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF` UD-Q2_K_XL |
|
||||
| Default port | 8081 |
|
||||
| Default context | 65536 (ramp to 131072 deliberately) |
|
||||
| KV cache type | q8_0 (k and v) |
|
||||
|
||||
## Status
|
||||
|
||||
M0 — compose artifacts written; awaiting box-side weight pull + bring-up.
|
||||
M0.3-M0.4 (context ramp) follow once M0 boots cleanly. M1 wires this
|
||||
endpoint as a 4th opencode/LiteLLM provider used by the long-task
|
||||
orchestrator.
|
||||
|
||||
## Why Instruct-2507 not Thinking-2507
|
||||
|
||||
Both are published; Thinking emits a `<think>` block before every answer.
|
||||
At ~7 tok/s decode, a 2K-token think block = ~5 min of wall time per
|
||||
response, then the actual answer. For autonomous coding/refactor tasks
|
||||
that's a tax we don't want. Thinking-2507 is worth adding as a separate
|
||||
compose later for hard-reasoning one-shots; not the long-task default.
|
||||
48
pyinfra/framework/compose/qwen3-235b/smoke.sh
Normal file
48
pyinfra/framework/compose/qwen3-235b/smoke.sh
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
# Smoke-test the running qwen3-235b llama-server (port 8081). Three steps:
#   1. GET /health for liveness.
#   2. A tiny (16-token) OpenAI-compatible chat completion.
#   3. A 64-token /completion run that reports eval_tps / prompt_tps.
#
# The perf sample is 64 tokens: larger than the 16-token chat probe,
# where at 5-10 tok/s the first few tokens' warmup would dominate the
# measurement, but smaller than the 200 tokens llama/smoke.sh generates.
#
# Environment overrides:
#   QWEN235_HOST   host:port of the server     (default 127.0.0.1:8081)
#   QWEN235_MODEL  model name sent to the API  (default qwen3-235b)
#
# Exits non-zero on the first failing step (set -e, curl -f, pipefail).
set -euo pipefail

HOST="${QWEN235_HOST:-127.0.0.1:8081}"
MODEL="${QWEN235_MODEL:-qwen3-235b}"

echo "[smoke] GET /health on $HOST"
curl -fsS "http://$HOST/health" | python3 -m json.tool

echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
  -H 'Content-Type: application/json' \
  -d "{
    \"model\": \"$MODEL\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
    \"max_tokens\": 16,
    \"temperature\": 0.0
  }" | python3 -m json.tool

echo
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=64)"
# llama.cpp's native /completion endpoint returns a timings object.
# The Python program is single-quoted for bash, so it must not contain
# a single quote anywhere (it uses double quotes throughout).
curl -fsS "http://$HOST/completion" \
  -H 'Content-Type: application/json' \
  -d '{
    "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
    "n_predict": 64,
    "temperature": 0.0,
    "stream": false
  }' | python3 -c '
import json, sys

timings = json.load(sys.stdin).get("timings", {})


def rate(key):
    # A missing or non-numeric field prints "?" rather than crashing
    # the ":.2f" format with a ValueError.
    value = timings.get(key)
    if isinstance(value, (int, float)):
        return f"{value:.2f} tok/s"
    return "?"


print("predicted_per_second: " + rate("predicted_per_second"))
print("prompt_per_second: " + rate("prompt_per_second"))
print("predicted_n: " + str(timings.get("predicted_n", "?")))
print("prompt_n: " + str(timings.get("prompt_n", "?")))
'

echo
echo "[smoke] passed — expected band 5-10 tok/s decode; <3 tok/s = investigate"
|
||||
Reference in New Issue
Block a user