---
# llama.cpp server running natively on gfx1151, using kyuz0's Strix Halo
# toolbox image.
#   https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tags published on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   rocm-7.2.2      — ROCm 7.x, native gfx1151 + rocWMMA (the tag used
#                     here; best perf for Qwen3-Coder-class models)
#   vulkan-radv     — most-stable Vulkan; fallback if ROCm regresses
#   vulkan-amdvlk   — alternate Vulkan driver
#   rocm-6.4.4      — older ROCm; only if 7.2.2 breaks
#   rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
#
# Weights: Unsloth "dynamic" quant. UD-Q4_K_XL keeps the more important
# weights at higher precision than a naive Q4_K_M, giving close to Q5
# quality at Q4 size. To download onto the box (see
# compose/llama/README.md):
#   hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
#     'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
#     --local-dir /models/qwen
# Check the exact filename in the HF repo before downloading — Unsloth's
# file naming varies (files are sometimes split into shards).
#
# Runs alongside Ollama (11434) and vLLM (8000); this service uses port
# 8080. Ollama remains the default opencode provider until LL-P0
# confirms the eval_tps improvement is real on this box.

services:
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: llama
    restart: unless-stopped

    # ROCm requires both kfd (kernel fusion driver) and dri (DRM); Vulkan
    # needs dri only. Keep kfd for as long as a rocm-* tag is in use.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined

    # Numeric GIDs of the host's video (44) and render (991) groups.
    # Without them the container cannot open /dev/kfd or /dev/dri.
    group_add:
      - "44"
      - "991"

    # NOTE(review): with `ipc: host` the container presumably shares the
    # host's /dev/shm, in which case shm_size has no effect — confirm
    # before relying on the 8g figure.
    shm_size: 8g
    ipc: host

    # Unified-memory recipe (same as compose/kimi-linear.yml,
    # compose/comfyui.yml and compose/ollama.yml). With BIOS UMA=0.5 GB
    # and ttm.pages_limit on the kernel cmdline, these flags merge the
    # rocminfo pools into one ~110 GB arena via the HIP allocator's
    # demand-paging. kyuz0's image is native gfx1151, so no HSA_OVERRIDE
    # is needed.
    environment:
      HSA_XNACK: "1"
      HSA_FORCE_FINE_GRAIN_PCIE: "1"

    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"

    # The toolbox image drops to a shell by default, so the entrypoint
    # is set explicitly.
    entrypoint:
      - llama-server
    command:
      - --model
      - /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf

      # OpenAI-compatible served name — the value opencode/llm/curl send
      # as "model". Keep it simple; the provider-side name lives in
      # opencode.json.
      - --alias
      - qwen3-coder

      - --host
      - 0.0.0.0
      - --port
      - "8080"

      # Offload every layer to the GPU; "999" is shorthand for "all
      # available". gfx1151 with the 110 GB merged arena fits 30B-class
      # models easily.
      - --n-gpu-layers
      - "999"

      # Same value as Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves
      # identically across providers. Raise it if a workflow needs more;
      # KV cost at this size is small with the q8_0 cache.
      - --ctx-size
      - "65536"

      # No-mmap is the Strix Halo standard: mmap >64 GB is slow on ROCm.
      # Forces a full GPU load.
      - --no-mmap

      # Flash attention — the biggest single win, ~20-40 % faster on
      # MoE. Modern llama-server expects a value (on/off/auto); a bare
      # --flash-attn is deprecated and would consume the next argument
      # as its value.
      - --flash-attn
      - "on"

      # Quantize the KV cache to int8: halves KV memory at minor / no
      # quality loss, and is sometimes faster thanks to the smaller
      # working set. Matches OLLAMA_KV_CACHE_TYPE=q8_0 in
      # compose/ollama.yml.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0

      # Use the jinja chat template embedded in the model instead of
      # llama.cpp's hardcoded default. Important for Qwen3-Coder, which
      # has a specific chat format.
      - --jinja

      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth and request latency.
      - --metrics