# Qwen3-235B-A22B-Instruct-2507 (Unsloth UD-Q2_K_XL ~88.8 GB) via the
# kyuz0 rocm-7.2.2 Strix Halo toolbox. Same image + unified-memory
# recipe as compose/llama.yml; the only deltas are model path, port,
# alias, context, and the no-coexist `restart: "no"`.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Coexistence. At ~88.8 GB weights this CANNOT coexist with the
# 30B llama service or Kimi-Linear (vLLM) — the merged GPU arena is
# only ~110 GB. Stop those before bringing this up. Same pattern as
# compose/comfyui.yml: `restart: "no"`, manual start, swap workflow
# documented in compose/qwen3-235b/README.md.
#
# Weights. UD-Q2_K_XL is Unsloth's "Dynamic" quant — important tensors
# kept at higher precision; closer to Q3 quality than naive Q2. 2 shards
# (~50 GB + 38.8 GB); llama.cpp auto-discovers shard 2 from shard 1.
# Download path on the box (see compose/qwen3-235b/README.md):
#   hf download unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF \
#     --include 'UD-Q2_K_XL/*' \
#     --local-dir /models/qwen/Qwen3-235B-A22B-Instruct-2507
#
# Port 8081 — distinct from llama 30B (8080) so opencode/curl/etc. can
# address either explicitly even though only one runs at a time.
#
# Performance target. Bandwidth-bound: 256 GB/s ÷ ~22 GB active-bytes →
# ~5-10 tok/s decode. This is the "overnight long-task" model, NOT the
# interactive driver — see StrixHaloMemory.md for the bandwidth math.
services:
  # Single llama-server container serving the Qwen3-235B GGUF on port 8081.
  qwen3-235b:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: qwen3-235b
    # Manual start only — this service is never auto-restarted because of
    # GPU memory contention with the other model services.
    restart: "no"
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear).
      # BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these flags merge the
      # rocminfo pools into one ~110 GB arena. kyuz0's image is native
      # gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models:/models:ro
    ports:
      - "8081:8081"
    entrypoint: ["llama-server"]
    command:
      # Path to shard 1 of 2 (see the -00001-of-00002 suffix).
      - --model
      - /models/qwen/Qwen3-235B-A22B-Instruct-2507/UD-Q2_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf
      # OpenAI-compatible served name. Provider-side name lives in
      # opencode.json once M0 perf is verified and we wire it up.
      - --alias
      - qwen3-235b
      - --host
      - 0.0.0.0
      # Must match the container side of the `ports` mapping.
      - --port
      - "8081"
      - --n-gpu-layers
      - "999"
      # 64K — opencode auto-compaction triggers at ~75-80 % of the
      # stated context limit, so a small ctx fires the summarize-and-
      # rewind loop after only a few turns. 64K roughly doubles how
      # many turns fit. KV at q8_0 ≈ 6.7 GB (94 layers × 4 kv-heads ×
      # 128 head-dim × 2 (K+V) × 65536 tokens × ~1.06 bytes/elem), which
      # leaves roughly 110 − 88.8 − 6.7 ≈ 14 GB of arena for compute
      # buffers and slack.
      # NOTE(review): kv-head count is taken from the upstream model
      # card — confirm against the GGUF metadata llama-server prints at
      # load. With 8 kv-heads the same formula gives ~13 GB, not 8 GB.
      # Stretch goal 131072 documented in compose/qwen3-235b/README.md
      # but tight — verify allocator behaviour first.
      - --ctx-size
      - "65536"
      - --no-mmap
      # Flash attention is required for q8_0 KV cache in llama.cpp.
      - --flash-attn
      - "on"
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Qwen3-235B-Instruct-2507 ships its own chat template; let
      # llama.cpp use it rather than the hardcoded default.
      - --jinja
      # Single sequence — KV pool isn't sliced across speculative
      # concurrent requests we'll never have (long-task model, one
      # request at a time).
      - --parallel
      - "1"
      - --metrics