45 lines
1.6 KiB
YAML
45 lines
1.6 KiB
YAML
# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   vulkan-radv     — most stable, recommended default (this one)
#   vulkan-amdvlk   — alternate Vulkan driver, sometimes faster
#   rocm-7.2.2      — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
#   rocm-6.4.4      — ROCm 6.x fallback
#   rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
#
# Toolbox images use a shell entrypoint, so we override to launch
# llama-server directly. Edit the --model path before `docker compose up -d`.
services:
  # llama.cpp HTTP server running on the Vulkan (RADV) toolbox image.
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
    container_name: llama
    restart: unless-stopped
    devices:
      # Pass the host's DRM device nodes through so the Vulkan backend
      # can reach the GPU.
      - /dev/dri:/dev/dri
    volumes:
      # Host model directory, mounted read-only inside the container.
      - /models:/models:ro
    ports:
      # Quoted so YAML never reinterprets the host:container pair.
      - "8080:8080"
    # Replace the image's shell entrypoint with the server binary; the
    # `command` list below supplies its arguments.
    entrypoint: ["llama-server"]
    command:
      # Placeholder path: point this at a real GGUF file under /models
      # before starting the service.
      - --model
      - /models/REPLACE/ME/model.gguf
      # Listen on all interfaces inside the container so the published
      # port above is reachable from the host.
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      # Required for GPU backends on Strix Halo per Gygeek's setup
      # guide. Forces full load into GPU memory rather than mmap.
      - --no-mmap
      # Flash attention — works on Vulkan too; the big win is on the
      # ROCm tag where kyuz0's build has rocWMMA acceleration.
      # Current llama.cpp builds expect an explicit value here
      # (on|off|auto); a bare flag would consume the next argument as
      # its value. Keep "on" quoted: unquoted `on` is a YAML 1.1 boolean.
      - --flash-attn
      - "on"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, and request latency.
      - --metrics
|