# Ollama, ROCm backend. Serves models on demand — safe to start before
# you've put anything in /models.
#
# Storage: Ollama's content-addressed blob store is bind-mounted under
# /models/ollama so all model data on the host lives under /models.
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
# for those engines.
services:
  ollama:
    image: ollama/ollama:rocm
    container_name: ollama
    restart: unless-stopped
    # ROCm compute (kfd) and render (dri) device nodes passed through
    # from the host.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible.
    group_add:
      - "44"
      - "991"
    environment:
      # Strix Halo's iGPU is gfx1151 (RDNA 3.5), which Ollama's bundled
      # ROCm runtime doesn't recognize — without this override it falls
      # back to CPU silently. 11.0.0 = gfx1100 (Navi 31); the RDNA 3.x
      # ISAs are close enough that gfx1100 kernels run on gfx1151.
      - HSA_OVERRIDE_GFX_VERSION=11.0.0
      # Default context. 256K (the upstream default for Qwen3-Coder)
      # blows the KV cache up to ~25-30 GB and forces ollama to split
      # layers between GPU and CPU. 64K keeps the model fully on GPU
      # while still being plenty for coding contexts.
      - OLLAMA_CONTEXT_LENGTH=65536
      # Perf tuning. Flash attention is the biggest single win on MoE
      # models at long context (20-40% faster generation). q8_0 KV
      # cache halves KV memory at little to no quality loss; sometimes
      # faster due to smaller working set. The parallel/loaded-models
      # caps avoid Ollama slicing memory across speculative concurrent
      # requests we never have.
      - OLLAMA_FLASH_ATTENTION=1
      - OLLAMA_KV_CACHE_TYPE=q8_0
      - OLLAMA_NUM_PARALLEL=1
      - OLLAMA_MAX_LOADED_MODELS=1
      # Keep the model resident for 24h instead of the default 5 min.
      # Avoids cold-start latency between sessions; safe because we cap
      # max_loaded_models above so memory doesn't drift.
      - OLLAMA_KEEP_ALIVE=24h
      # Unified-memory recipe. With BIOS UMA=0.5 GB the dedicated VRAM
      # pool is tiny; the model lives in GTT (system RAM the GPU borrows
      # via ttm.pages_limit=33554432 on the kernel cmdline). XNACK +
      # FINE_GRAIN_PCIE put the HIP allocator into demand-paging mode so
      # it treats the merged VRAM+GTT pool as one arena. Same flags as
      # compose/kimi-linear.yml and compose/comfyui.yml — Ollama uses
      # ggml/llama.cpp underneath but its allocator goes through HIP.
      # PYTORCH_HIP_ALLOC_CONF is intentionally absent (Ollama isn't
      # PyTorch).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Ollama's own blob store and manifests (read-write).
      - /models/ollama:/root/.ollama
      # The whole host /models tree, mounted read-only.
      - /models:/models:ro
    ports:
      # Quoted so YAML never reinterprets the mapping. No host IP is
      # given, so Compose publishes the port on all host interfaces.
      - "11434:11434"