Files
2026-06-08 15:31:50 +01:00

102 lines
4.1 KiB
YAML

# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   rocm-7.2.2      — ROCm 7.x, native gfx1151 + rocWMMA (this one;
#                     best perf for Qwen3-Coder-class models)
#   vulkan-radv     — most-stable Vulkan; fallback if ROCm regresses
#   vulkan-amdvlk   — alternate Vulkan driver
#   rocm-6.4.4      — older ROCm; only if 7.2.2 breaks
#   rocm7-nightlies — avoid: caps memory allocation to 64 GB (as of May 2026)
#
# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL keeps the more important
# weights at higher precision than naive Q4_K_M does, landing closer to
# Q5 quality at Q4 size. Download path on the box (see
# compose/llama/README.md):
#   hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
#     'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
#     --local-dir /models/qwen
# Verify the exact filename in the HF repo before downloading — Unsloth's
# file naming varies (files are sometimes split into shards).
#
# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
# stays the default opencode provider until LL-P0 confirms the eval_tps
# bump is real on this box.
services:
  # llama.cpp's llama-server, run from the ROCm toolbox image and
  # published on host port 8080 with an OpenAI-compatible model alias.
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: llama
    restart: unless-stopped
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    # The numbers are host-specific; re-check them (e.g. with
    # `getent group video render`) if the host is rebuilt.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's IPC
    # namespace, so `shm_size` is presumably redundant here — confirm
    # before relying on the 8g figure.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/kimi-linear.yml +
      # compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
      # ttm.pages_limit cmdline → these flags merge the rocminfo pools
      # into one ~110 GB arena via the HIP allocator's demand-paging.
      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Weights are mounted read-only; the server only needs to read them.
      - /models:/models:ro
    ports:
      # Quoted so the host:container pair is always read as a string.
      - "8080:8080"
    # Toolbox image drops to shell by default; explicit entrypoint.
    entrypoint: ["llama-server"]
    # One list item per argv token: each flag and its value are separate
    # entries, and numeric / boolean-looking values are quoted so they
    # reach llama-server as strings.
    command:
      - --model
      - /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
      # OpenAI-compatible served name (matches what opencode/llm/curl
      # request as "model"). Keep simple — provider-side name lives
      # in opencode.json.
      - --alias
      - qwen3-coder
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      # Push all layers to GPU. "999" is shorthand for "all available."
      # gfx1151 with 110 GB merged arena fits 30B-class models easily.
      - --n-gpu-layers
      - "999"
      # Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
      # same across providers. Bump if a workflow needs more; KV cost
      # at this size is small with q8_0 cache.
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — mmap >64 GB is slow on
      # ROCm. Forces full GPU load.
      - --no-mmap
      # Flash attention — biggest single win, ~20-40 % faster on MoE.
      # Modern llama-server takes a value (on/off/auto); bare --flash-attn
      # is deprecated and consumes the next arg as its value.
      # Keep "on" quoted: a bare `on` can be parsed as a YAML boolean.
      - --flash-attn
      - "on"
      # Quantize KV cache to int8 — halves KV memory at minor / no
      # quality loss; sometimes faster due to smaller working set.
      # Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template (rather than
      # llama.cpp's hardcoded default). Important for Qwen3-Coder which
      # has a specific chat format.
      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, request latency.
      - --metrics