# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   vulkan-radv      — most stable, recommended default (this one)
#   vulkan-amdvlk    — alternate Vulkan driver, sometimes faster
#   rocm-7.2.2       — ROCm 7.x; needs /dev/kfd + render group_add
#                      (see vllm.yml pattern)
#   rocm-6.4.4       — ROCm 6.x fallback
#   rocm7-nightlies  — avoid: caps memory allocation to 64 GB (May 2026)
#
# Toolbox images use a shell entrypoint, so we override to launch
# llama-server directly. Edit the --model path before `docker compose up -d`.

services:
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
    container_name: llama
    restart: unless-stopped
    # Pass the GPU render/card nodes through to the container.
    devices:
      - /dev/dri:/dev/dri
    # Host model directory, mounted read-only.
    volumes:
      - /models:/models:ro
    # Quoted so the mapping is always read as a string.
    ports:
      - "8080:8080"
    # Replace the image's shell entrypoint with the server binary; the
    # `command` list below is passed to it as arguments.
    entrypoint: ["llama-server"]
    command:
      # Placeholder path — must point at a real GGUF file under /models.
      - --model
      - /models/REPLACE/ME/model.gguf
      # Listen on all interfaces so the published port is reachable.
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      # Required for GPU backends on Strix Halo per Gygeek's setup
      # guide. Forces full load into GPU memory rather than mmap.
      - --no-mmap
      # Flash attention — works on Vulkan too; the big win is on the
      # ROCm tag where kyuz0's build has rocWMMA acceleration.
      # Current llama.cpp builds expect an explicit value
      # (on | off | auto); a bare flag would consume the next argument
      # (`--metrics`) as its value.
      - --flash-attn
      - "on"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, and request latency.
      - --metrics