localgenai/pyinfra/framework/compose/llama.yml
noisedestroyers 2c4bfefa95 Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00


# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
# vulkan-radv — most stable, recommended default (this one)
# vulkan-amdvlk — alternate Vulkan driver, sometimes faster
# rocm-7.2.2 — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
# rocm-6.4.4 — ROCm 6.x fallback
# rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
#
# Toolbox images use a shell entrypoint, so we override to launch
# llama-server directly. Edit the --model path before `docker compose up -d`.
services:
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
    container_name: llama
    restart: unless-stopped
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/REPLACE/ME/model.gguf
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      # Required for GPU backends on Strix Halo per Gygeek's setup
      # guide. Forces full load into GPU memory rather than mmap.
      - --no-mmap
      # Flash attention — works on Vulkan too; the big win is on the
      # ROCm tag where kyuz0's build has rocWMMA acceleration.
      - --flash-attn
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, and request latency.
      - --metrics
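Once the container is up, llama-server serves an OpenAI-compatible HTTP API. A minimal smoke-test sketch (the prompt and token budget are illustrative; assumes the 8080 port mapping above):

```python
import json
from urllib import request

# Build a chat-completion request for llama-server's
# OpenAI-compatible endpoint at /v1/chat/completions.
payload = {
    "messages": [{"role": "user", "content": "Say hello in one word."}],
    "max_tokens": 8,
    "temperature": 0.0,
}
body = json.dumps(payload).encode()

req = request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)

# Uncomment once the container is running and the model is loaded:
# with request.urlopen(req) as resp:
#     reply = json.load(resp)
#     print(reply["choices"][0]["message"]["content"])
```

llama-server also answers `GET /health`, which is a cheaper probe while a large GGUF is still loading.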