# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   rocm-7.2.2      — ROCm 7.x, native gfx1151 + rocWMMA (this one;
#                     best perf for Qwen3-Coder-class models)
#   vulkan-radv     — most-stable Vulkan; fallback if ROCm regresses
#   vulkan-amdvlk   — alternate Vulkan driver
#   rocm-6.4.4      — older ROCm; only if 7.2.2 breaks
#   rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
#
# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL preserves more important
# weights at higher precision than naive Q4_K_M, closer to Q5 quality at
# Q4 size. Download path on the box (see compose/llama/README.md):
#   hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
#     'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
#     --local-dir /models/qwen
# Verify exact filename in the HF repo before downloading — Unsloth's
# file naming varies (sometimes split into shards).
#
# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
# stays the default opencode provider until LL-P0 confirms the eval_tps
# bump is real on this box.
services:
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: llama
    restart: unless-stopped
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's
    # /dev/shm, so shm_size is presumably a no-op here — confirm against
    # the Docker docs and drop one of the two if so.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/kimi-linear.yml +
      # compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
      # ttm.pages_limit cmdline → these flags merge the rocminfo pools
      # into one ~110 GB arena via the HIP allocator's demand-paging.
      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"
    # Toolbox image drops to shell by default; explicit entrypoint.
    entrypoint: ["llama-server"]
    # One argv token per list item: a flag and its value are separate
    # entries, and numeric values are quoted so they stay strings.
    command:
      - --model
      - /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
      # OpenAI-compatible served name (matches what opencode/llm/curl
      # request as "model"). Keep simple — provider-side name lives
      # in opencode.json.
      - --alias
      - qwen3-coder
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      # Push all layers to GPU. "999" is shorthand for "all available."
      # gfx1151 with 110 GB merged arena fits 30B-class models easily.
      - --n-gpu-layers
      - "999"
      # Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
      # same across providers. Bump if a workflow needs more; KV cost
      # at this size is small with q8_0 cache.
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — mmap >64 GB is slow on
      # ROCm. Forces full GPU load.
      - --no-mmap
      # Flash attention — biggest single win, ~20-40 % faster on MoE.
      # Modern llama-server takes a value (on/off/auto); bare --flash-attn
      # is deprecated and consumes the next arg as its value.
      - --flash-attn
      - "on"
      # Quantize KV cache to int8 — halves KV memory at minor / no
      # quality loss; sometimes faster due to smaller working set.
      # Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template (rather than
      # llama.cpp's hardcoded default). Important for Qwen3-Coder which
      # has a specific chat format.
      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, request latency.
      - --metrics