progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/ollama.yml
+++ b/pyinfra/framework/compose/ollama.yml
@@ -31,6 +31,31 @@ services:
      # layers between GPU and CPU. 64K keeps the model fully on GPU
      # while still being plenty for coding contexts.
      - OLLAMA_CONTEXT_LENGTH=65536
+      # Perf tuning. Flash attention is the biggest single win on MoE
+      # models at long context (20-40% faster generation). A q8_0 KV
+      # cache halves KV memory (vs. the f16 default) with little to no
+      # quality loss, and is sometimes faster thanks to the smaller
+      # working set; it only takes effect with flash attention enabled.
+      # Capping parallel requests and loaded models at 1 stops Ollama
+      # from splitting memory across hypothetical concurrent requests
+      # that we never make.
+      - OLLAMA_FLASH_ATTENTION=1
+      - OLLAMA_KV_CACHE_TYPE=q8_0
+      - OLLAMA_NUM_PARALLEL=1
+      - OLLAMA_MAX_LOADED_MODELS=1
+      # Keep the model resident for 24h instead of the default 5 min.
+      # This avoids cold-start latency between sessions; it is safe
+      # because OLLAMA_MAX_LOADED_MODELS=1 above keeps additional models
+      # from piling up in memory over that window.
+      - OLLAMA_KEEP_ALIVE=24h
+      # Unified-memory recipe. With BIOS UMA=0.5 GB the dedicated VRAM
+      # pool is tiny, so the model lives in GTT (system RAM the GPU
+      # borrows, up to the ttm.pages_limit=33554432 set on the kernel
+      # cmdline). HSA_XNACK + HSA_FORCE_FINE_GRAIN_PCIE put the HIP
+      # allocator into demand-paging mode so it treats the merged
+      # VRAM+GTT pool as one arena. These are the same flags as in
+      # compose/kimi-linear.yml and compose/comfyui.yml — Ollama uses
+      # ggml/llama.cpp underneath, but its allocator still goes through
+      # HIP, so they apply here too. PYTORCH_HIP_ALLOC_CONF is
+      # intentionally absent (Ollama isn't PyTorch).
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models/ollama:/root/.ollama
      - /models:/models:ro