progress 235b
This commit is contained in:
@@ -1,44 +1,101 @@
|
||||
# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
|
||||
# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
|
||||
# https://github.com/kyuz0/amd-strix-halo-toolboxes
|
||||
#
|
||||
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
|
||||
# vulkan-radv — most stable, recommended default (this one)
|
||||
# vulkan-amdvlk — alternate Vulkan driver, sometimes faster
|
||||
# rocm-7.2.2 — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
|
||||
# rocm-6.4.4 — ROCm 6.x fallback
|
||||
# rocm-7.2.2 — ROCm 7.x, native gfx1151 + rocWMMA (this one;
|
||||
# best perf for Qwen3-Coder-class models)
|
||||
# vulkan-radv — most-stable Vulkan; fallback if ROCm regresses
|
||||
# vulkan-amdvlk — alternate Vulkan driver
|
||||
# rocm-6.4.4 — older ROCm; only if 7.2.2 breaks
|
||||
# rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
|
||||
#
|
||||
# Toolbox images use a shell entrypoint, so we override to launch
|
||||
# llama-server directly. Edit the --model path before `docker compose up -d`.
|
||||
# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL keeps the most important
|
||||
# weights at higher precision than plain Q4_K_M, giving close to Q5 quality
|
||||
# at roughly Q4 size. Download onto the box with (see compose/llama/README.md):
|
||||
# hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
|
||||
# 'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
|
||||
# --local-dir /models/qwen
|
||||
# Verify exact filename in the HF repo before downloading — Unsloth's
|
||||
# file naming varies (sometimes split into shards).
|
||||
#
|
||||
# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
|
||||
# stays the default opencode provider until LL-P0 confirms the eval_tps
|
||||
# bump is real on this box.
|
||||
services:
|
||||
llama:
|
||||
image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
|
||||
image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
|
||||
container_name: llama
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
# ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
|
||||
# only needs dri. Don't drop kfd when on the rocm-* tag.
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri:/dev/dri
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- seccomp=unconfined
|
||||
# Numeric GIDs of this host's video (44) and render (991) groups — required
|
||||
# for /dev/kfd + /dev/dri access from inside the container. GIDs are host-specific; check with `getent group video render`.
|
||||
group_add:
|
||||
- "44"
|
||||
- "991"
|
||||
shm_size: 8g
|
||||
ipc: host
|
||||
environment:
|
||||
# Unified-memory recipe (same as compose/kimi-linear.yml +
|
||||
# compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
|
||||
# ttm.pages_limit cmdline → these flags merge the rocminfo pools
|
||||
# into one ~110 GB arena via the HIP allocator's demand-paging.
|
||||
# kyuz0's image is native gfx1151, so no HSA_OVERRIDE_GFX_VERSION is needed.
|
||||
- HSA_XNACK=1
|
||||
- HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
volumes:
|
||||
- /models:/models:ro
|
||||
ports:
|
||||
- "8080:8080"
|
||||
# Toolbox image drops to shell by default; explicit entrypoint.
|
||||
entrypoint: ["llama-server"]
|
||||
command:
|
||||
- --model
|
||||
- /models/REPLACE/ME/model.gguf
|
||||
- /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
|
||||
# OpenAI-compatible served name (matches what opencode/llm/curl
|
||||
# request as "model"). Keep simple — provider-side name lives
|
||||
# in opencode.json.
|
||||
- --alias
|
||||
- qwen3-coder
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8080"
|
||||
# Push all layers to GPU. "999" exceeds any model's layer count, so every layer is offloaded.
|
||||
# gfx1151 with 110 GB merged arena fits 30B-class models easily.
|
||||
- --n-gpu-layers
|
||||
- "999"
|
||||
# Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
|
||||
# same across providers. Bump if a workflow needs more; KV cost
|
||||
# at this size is small with q8_0 cache.
|
||||
- --ctx-size
|
||||
- "32768"
|
||||
# Required for GPU backends on Strix Halo per Gygeek's setup
|
||||
# guide. Forces full load into GPU memory rather than mmap.
|
||||
- "65536"
|
||||
# No-mmap is the Strix Halo standard — mmap >64 GB is slow on
|
||||
# ROCm. Forces full GPU load.
|
||||
- --no-mmap
|
||||
# Flash attention — works on Vulkan too; the big win is on the
|
||||
# ROCm tag where kyuz0's build has rocWMMA acceleration.
|
||||
# Flash attention — biggest single win, ~20-40 % faster on MoE.
|
||||
# Modern llama-server takes a value (on/off/auto); bare --flash-attn
|
||||
# is deprecated and consumes the next arg as its value.
|
||||
- --flash-attn
|
||||
- "on"
|
||||
# Quantize KV cache to q8_0 (8-bit) — roughly halves KV memory vs. the
|
||||
# f16 default at little or no quality loss; sometimes faster due to smaller working set.
|
||||
# Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
|
||||
- --cache-type-k
|
||||
- q8_0
|
||||
- --cache-type-v
|
||||
- q8_0
|
||||
# Use the model's embedded jinja chat template (rather than
|
||||
# llama.cpp's hardcoded default). Important for Qwen3-Coder which
|
||||
# has a specific chat format.
|
||||
- --jinja
|
||||
# Expose Prometheus metrics at /metrics — scraped by OpenLIT for
|
||||
# tokens/sec, KV-cache use, queue depth, and request latency.
|
||||
# tokens/sec, KV-cache use, queue depth, request latency.
|
||||
- --metrics
|
||||
|
||||
Reference in New Issue
Block a user