localgenai/pyinfra/framework/compose/ollama.yml
noisedestroyers 2c4bfefa95 Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00

# Ollama, ROCm backend. Serves models on demand — safe to start before
# you've put anything in /models.
#
# Storage: Ollama's content-addressed blob store is bind-mounted under
# /models/ollama so all model data on the host lives under /models.
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
# for those engines.
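#
# A sketch of what that looks like on the host (layout per Ollama's blob
# store; the digest and vendor directory are placeholders):
#   /models/ollama/models/blobs/sha256-<digest>    (Ollama-managed, opaque)
#   /models/<vendor>/<model>.gguf                  (curated, for llama.cpp/vLLM)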
services:
  ollama:
    image: ollama/ollama:rocm
    container_name: ollama
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Numeric GIDs of the host's video (44) and render (991) groups — the
    # names don't exist inside the container, but the GIDs need to match
    # the host so /dev/kfd + /dev/dri are accessible.
    group_add:
      - "44"
      - "991"
    environment:
      # Strix Halo's iGPU is gfx1151 (RDNA 3.5), which Ollama's bundled
      # ROCm runtime doesn't recognize — without this override it falls
      # back to CPU silently. 11.0.0 = gfx1100 (Navi 31); the RDNA 3.x
      # ISAs are close enough that gfx1100 kernels run on gfx1151.
      - HSA_OVERRIDE_GFX_VERSION=11.0.0
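      # A sketch of how to confirm the override took (assumes the current
      # `ollama ps` output format): after loading a model, the PROCESSOR
      # column should read "100% GPU"; a silent CPU fallback reads "100% CPU".
      #   docker exec ollama ollama ps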
      # Default context. 256K (the upstream default for Qwen3-Coder)
      # blows the KV cache up to ~25-30 GB and forces Ollama to split
      # layers between GPU and CPU. 64K keeps the model fully on GPU
      # while still being plenty for coding contexts.
      - OLLAMA_CONTEXT_LENGTH=65536
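      # Back-of-envelope for the ~25-30 GB figure, assuming the commonly
      # published Qwen3-Coder-30B-A3B config (48 layers, 4 KV heads,
      # head_dim 128) and an fp16 KV cache:
      #   per token: 2 (K and V) * 48 layers * 4 heads * 128 dim * 2 B = 96 KiB
      #   256K context: 96 KiB * 262,144 = 24 GiB
      #    64K context: 96 KiB *  65,536 =  6 GiB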
    volumes:
      - /models/ollama:/root/.ollama
      - /models:/models:ro
    ports:
      - "11434:11434"