# localgenai/framework/compose/llama.yml

# llama.cpp server, Vulkan backend (RADV on Strix Halo).
# Edit the --model path before `docker compose up -d`.
services:
  # One container running the llama.cpp HTTP server (Vulkan build of the image).
  llama:
    container_name: llama
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    restart: unless-stopped

    # Publish container port 8080 on host port 8080.
    ports:
      - '8080:8080'

    # Pass the host's /dev/dri device nodes into the container so the
    # Vulkan backend can reach the GPU.
    devices:
      - /dev/dri:/dev/dri

    # Host /models directory, mounted read-only at /models in the container.
    volumes:
      - /models:/models:ro

    # Server arguments. Each flag and its value are separate list items so
    # they arrive as separate argv entries.
    command:
      # Placeholder path: point this at a real GGUF file under /models.
      - --model
      - /models/REPLACE/ME/model.gguf
      # Listen on all interfaces inside the container so the published
      # port above is reachable.
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      # NOTE(review): 999 is presumably "more than any model has", i.e.
      # offload every layer to the GPU; confirm against llama.cpp docs.
      - --n-gpu-layers
      - '999'
      # Context size passed to the server (32768).
      - --ctx-size
      - '32768'