# Ollama, ROCm backend. Serves models on demand — safe to start before
# you've put anything in /models.
#
# Storage: Ollama's content-addressed blob store is bind-mounted under
# /models/ollama so all model data on the host lives under /models.
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
# can't load them directly. Keep curated GGUFs at /models/<subdir>/...
# for those engines.

services:
  ollama:
    image: ollama/ollama:rocm
    container_name: ollama
    restart: unless-stopped

    # ROCm compute (kfd) and render (dri) device nodes passed through
    # from the host.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri

    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible.
    group_add:
      - "44"
      - "991"

    environment:
      # Strix Halo's iGPU is gfx1151 (RDNA 3.5), which Ollama's bundled
      # ROCm runtime doesn't recognize — without this override it falls
      # back to CPU silently. 11.0.0 = gfx1100 (Navi 31); the RDNA 3.x
      # ISAs are close enough that gfx1100 kernels run on gfx1151.
      - HSA_OVERRIDE_GFX_VERSION=11.0.0

      # Default context. 256K (the upstream default for Qwen3-Coder)
      # blows the KV cache up to ~25-30 GB and forces ollama to split
      # layers between GPU and CPU. 64K keeps the model fully on GPU
      # while still being plenty for coding contexts.
      - OLLAMA_CONTEXT_LENGTH=65536

      # Perf tuning. Flash attention is the biggest single win on MoE
      # models at long context (20-40 % faster generation). q8_0 KV
      # cache halves KV memory at minor / no quality loss; sometimes
      # faster due to smaller working set. The parallel/loaded-models
      # caps avoid Ollama slicing memory across speculative concurrent
      # requests we never have.
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KV_CACHE_TYPE=q8_0
      - OLLAMA_NUM_PARALLEL=1
      - OLLAMA_MAX_LOADED_MODELS=1

      # Keep the model resident for 24h instead of the default 5 min.
      # Avoids cold-start latency between sessions; safe because
      # OLLAMA_MAX_LOADED_MODELS=1 is set in this list, so memory
      # doesn't drift.
      - OLLAMA_KEEP_ALIVE=24h

      # Unified-memory recipe. With BIOS UMA=0.5 GB the dedicated VRAM
      # pool is tiny; the model lives in GTT (system RAM the GPU borrows
      # via ttm.pages_limit=33554432 on the kernel cmdline). XNACK +
      # FINE_GRAIN_PCIE put the HIP allocator into demand-paging mode so
      # it treats the merged VRAM+GTT pool as one arena. Same flags as
      # compose/kimi-linear.yml and compose/comfyui.yml — Ollama uses
      # ggml/llama.cpp underneath but its allocator goes through HIP.
      # PYTORCH_HIP_ALLOC_CONF is intentionally absent (Ollama isn't
      # PyTorch).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1

    volumes:
      # Ollama's blob store, kept on the host under /models/ollama.
      - /models/ollama:/root/.ollama
      # The rest of /models, mounted read-only inside the container.
      - /models:/models:ro

    # Quoted so the mapping is always read as a string.
    # NOTE(review): no host IP is given, so the port is published on all
    # host interfaces — use "127.0.0.1:11434:11434" if the API should
    # only be reachable locally. Confirm the intended exposure.
    ports:
      - "11434:11434"