Files
localgenai/pyinfra/framework/compose/qwen3-235b.yml
2026-06-08 15:31:50 +01:00

98 lines
3.8 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Qwen3-235B-A22B-Instruct-2507 (Unsloth UD-Q2_K_XL ~88.8 GB) via the
# kyuz0 rocm-7.2.2 Strix Halo toolbox. Same image + unified-memory
# recipe as compose/llama.yml; the only deltas are model path, port,
# alias, context, and the no-coexist `restart: "no"`.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Coexistence. At ~88.8 GB weights this CANNOT coexist with the
# 30B llama service or Kimi-Linear (vLLM) — the merged GPU arena is
# only ~110 GB. Stop those before bringing this up. Same pattern as
# compose/comfyui.yml: `restart: "no"`, manual start, swap workflow
# documented in compose/qwen3-235b/README.md.
#
# Weights. UD-Q2_K_XL is Unsloth's "Dynamic" quant — important tensors
# kept at higher precision; closer to Q3 quality than naive Q2. 2 shards
# (~50 GB + 38.8 GB); llama.cpp auto-discovers shard 2 from shard 1.
# Download path on the box (see compose/qwen3-235b/README.md):
#   hf download unsloth/Qwen3-235B-A22B-Instruct-2507-GGUF \
#     --include 'UD-Q2_K_XL/*' \
#     --local-dir /models/qwen/Qwen3-235B-A22B-Instruct-2507
#
# Port 8081 — distinct from llama 30B (8080) so opencode/curl/etc. can
# address either explicitly even though only one runs at a time.
#
# Performance target. Bandwidth-bound: 256 GB/s ÷ ~22 GB active-bytes
# per token ≈ 11.6 tok/s theoretical ceiling → expect ~5-10 tok/s decode
# in practice. This is the "overnight long-task" model, NOT the
# interactive driver — see StrixHaloMemory.md for the bandwidth math.
services:
  # Single llama.cpp server container serving Qwen3-235B-A22B-Instruct-2507
  # on port 8081 through the OpenAI-compatible llama-server API.
  qwen3-235b:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: qwen3-235b
    # Manual start only — see header note about GPU contention.
    # Quoted so YAML 1.1 parsers do not read it as boolean false.
    restart: "no"
    # ROCm compute (kfd) and render (dri) device nodes.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's
    # /dev/shm, so `shm_size` likely has no effect — confirm and drop
    # one of the two if so.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear).
      # BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these flags merge the
      # rocminfo pools into one ~110 GB arena. kyuz0's image is native
      # gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Weights are mounted read-only; the server never writes to /models.
      - /models:/models:ro
    ports:
      - "8081:8081"
    entrypoint: ["llama-server"]
    command:
      # Shard 1 of 2; llama.cpp discovers the second shard from this path.
      - --model
      - /models/qwen/Qwen3-235B-A22B-Instruct-2507/UD-Q2_K_XL/Qwen3-235B-A22B-Instruct-2507-UD-Q2_K_XL-00001-of-00002.gguf
      # OpenAI-compatible served name. Provider-side name lives in
      # opencode.json once M0 perf is verified and we wire it up.
      - --alias
      - qwen3-235b
      - --host
      - 0.0.0.0
      # Must match the container side of the `ports` mapping above.
      - --port
      - "8081"
      # Larger than the model's layer count, i.e. offload every layer.
      - --n-gpu-layers
      - "999"
      # 64K — opencode auto-compaction triggers at ~75-80 % of the
      # stated context limit, so a small ctx fires the summarize-and-
      # rewind loop after only a few turns. 64K roughly doubles how
      # many turns fit.
      # KV sizing at q8_0 (~1 byte/element):
      #   layers × kv-heads × head-dim × 2 (K+V) × ctx × 1 byte
      #   94 × 8 × 128 × 2 × 65536 ≈ 12.6 GB
      # NOTE(review): this comment previously said "≈ 8 GB" with ~11 GB
      # arena headroom, which does not match the product above. The
      # upstream model card appears to list 4 KV heads (→ ≈ 6.3 GB) —
      # confirm against the GGUF metadata, then recompute headroom
      # (~110 GB arena − ~88.8 GB weights − KV − compute buffers).
      # Stretch goal 131072 documented in compose/qwen3-235b/README.md
      # but tight — verify allocator behaviour first.
      - --ctx-size
      - "65536"
      - --no-mmap
      # Flash attention is required for q8_0 KV cache in llama.cpp.
      - --flash-attn
      - "on"
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Qwen3-235B-Instruct-2507 ships its own chat template; let
      # llama.cpp use it rather than the hardcoded default.
      - --jinja
      # Single sequence — KV pool isn't sliced across speculative
      # concurrent requests we'll never have (long-task model, one
      # request at a time).
      - --parallel
      - "1"
      - --metrics