# localgenai/pyinfra/framework/compose/llama.yml

# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
#   rocm-7.2.2       — ROCm 7.x, native gfx1151 + rocWMMA (this one;
#                      best perf for Qwen3-Coder-class models)
#   vulkan-radv      — most-stable Vulkan; fallback if ROCm regresses
#   vulkan-amdvlk    — alternate Vulkan driver
#   rocm-6.4.4       — older ROCm; only if 7.2.2 breaks
#   rocm7-nightlies  — avoid: caps memory allocation to 64 GB (May 2026)
#
# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL preserves more important
# weights at higher precision than naive Q4_K_M, closer to Q5 quality at
# Q4 size. Download path on the box (see compose/llama/README.md):
#   hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
#       'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
#       --local-dir /models/qwen
# Verify exact filename in the HF repo before downloading — Unsloth's
# file naming varies (sometimes split into shards).
#
# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
# stays the default opencode provider until LL-P0 confirms the eval_tps
# bump is real on this box.
services:
  # Single llama.cpp `llama-server` container: loads one GGUF from the
  # read-only /models mount and serves it over HTTP on port 8080.
  llama:
    # Registry-qualified on purpose. Docker resolves the short form
    # (kyuz0/...) to docker.io anyway, but Podman may prompt or refuse
    # unqualified names depending on its short-name policy.
    image: docker.io/kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: llama
    restart: unless-stopped
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    # These are host-specific; re-check with `getent group video render`
    # when deploying to a different box.
    group_add:
      - "44"
      - "991"
    # NOTE: with `ipc: host` the container uses the host's /dev/shm, so
    # shm_size is not applied. It only takes effect if `ipc: host` is
    # removed.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/kimi-linear.yml +
      # compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
      # ttm.pages_limit cmdline → these flags merge the rocminfo pools
      # into one ~110 GB arena via the HIP allocator's demand-paging.
      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Weights are mounted read-only; the server never writes to /models.
      - /models:/models:ro
    ports:
      # Published on all host interfaces. Quoted to keep the mapping a
      # string regardless of YAML parser version.
      - "8080:8080"
    # Toolbox image drops to shell by default; explicit entrypoint.
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
      # OpenAI-compatible served name (matches what opencode/llm/curl
      # request as "model"). Keep simple — provider-side name lives
      # in opencode.json.
      - --alias
      - qwen3-coder
      # Listen on every interface inside the container so the published
      # port above is reachable; must match the container side of `ports`.
      - --host
      - "0.0.0.0"
      - --port
      - "8080"
      # Push all layers to GPU. "999" is shorthand for "all available."
      # gfx1151 with 110 GB merged arena fits 30B-class models easily.
      - --n-gpu-layers
      - "999"
      # Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
      # same across providers. Bump if a workflow needs more; KV cost
      # at this size is small with q8_0 cache.
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — mmap >64 GB is slow on
      # ROCm. Forces full GPU load.
      - --no-mmap
      # Flash attention — biggest single win, ~20-40 % faster on MoE.
      # Modern llama-server takes a value (on/off/auto); bare --flash-attn
      # is deprecated and consumes the next arg as its value.
      # Keep this on while the V cache is quantized below.
      - --flash-attn
      - "on"
      # Quantize the KV cache to 8-bit (q8_0) — roughly halves KV memory
      # versus the f16 default at minor / no quality loss; sometimes
      # faster due to the smaller working set.
      # Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template (rather than
      # llama.cpp's hardcoded default). Important for Qwen3-Coder which
      # has a specific chat format.
      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, request latency.
      - --metrics