progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/llama.yml
+++ b/pyinfra/framework/compose/llama.yml
@@ -1,44 +1,101 @@
-# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
+# llama.cpp server, gfx1151-native via kyuz0's Strix Halo toolbox.
 # https://github.com/kyuz0/amd-strix-halo-toolboxes
 #
 # Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
-#   vulkan-radv      — most stable, recommended default (this one)
-#   vulkan-amdvlk    — alternate Vulkan driver, sometimes faster
-#   rocm-7.2.2       — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
-#   rocm-6.4.4       — ROCm 6.x fallback
+#   rocm-7.2.2       — ROCm 7.x, native gfx1151 + rocWMMA (this one;
+#                      best perf for Qwen3-Coder-class models)
+#   vulkan-radv      — most-stable Vulkan; fallback if ROCm regresses
+#   vulkan-amdvlk    — alternate Vulkan driver
+#   rocm-6.4.4       — older ROCm; only if 7.2.2 breaks
 #   rocm7-nightlies  — avoid: caps memory allocation to 64 GB (May 2026)
 #
-# Toolbox images use a shell entrypoint, so we override to launch
-# llama-server directly. Edit the --model path before `docker compose up -d`.
+# Weights: Unsloth "dynamic" quant — UD-Q4_K_XL keeps the most important
+# weights at higher precision than plain Q4_K_M, giving close to Q5
+# quality at Q4 size. To download on the box (see compose/llama/README.md):
+#   hf download unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF \
+#       'Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf' \
+#       --local-dir /models/qwen
+# Verify the exact filename in the HF repo before downloading — Unsloth's
+# naming varies (sometimes sharded) — and keep --model below in sync.
+#
+# Coexists with Ollama (11434) and vLLM (8000). Port 8080 here. Ollama
+# stays the default opencode provider until LL-P0 confirms the eval_tps
+# bump is real on this box.
 services:
  llama:
-    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
+    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: llama
    restart: unless-stopped
    devices:
+      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
+      # only needs dri. Don't drop kfd when on the rocm-* tag.
+      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
+    cap_add:
+      - SYS_PTRACE
+    security_opt:
+      - seccomp=unconfined
+    # Numeric GIDs of the host's video (44) and render (991) groups, needed
+    # for /dev/kfd + /dev/dri access; host-specific (`getent group render`).
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 8g
+    ipc: host
+    environment:
+      # Unified-memory recipe (same as compose/kimi-linear.yml +
+      # compose/comfyui.yml + compose/ollama.yml). BIOS UMA=0.5 GB +
+      # ttm.pages_limit cmdline → these flags merge the rocminfo pools
+      # into one ~110 GB arena via the HIP allocator's demand-paging.
+      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE.
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"
+    # Toolbox image drops to shell by default; explicit entrypoint.
    entrypoint: ["llama-server"]
    command:
      - --model
-      - /models/REPLACE/ME/model.gguf
+      - /models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf
+      # OpenAI-compatible served name (matches what opencode/llm/curl
+      # request as "model"). Keep simple — provider-side name lives
+      # in opencode.json.
+      - --alias
+      - qwen3-coder
      - --host
      - 0.0.0.0
      - --port
      - "8080"
+      # Push all layers to GPU: "999" exceeds any real layer count, i.e. "all".
+      # gfx1151 with 110 GB merged arena fits 30B-class models easily.
      - --n-gpu-layers
      - "999"
+      # Match Ollama's OLLAMA_CONTEXT_LENGTH so opencode behaves the
+      # same across providers. Bump if a workflow needs more; KV cost
+      # at this size is small with q8_0 cache.
      - --ctx-size
-      - "32768"
-      # Required for GPU backends on Strix Halo per Gygeek's setup
-      # guide. Forces full load into GPU memory rather than mmap.
+      - "65536"
+      # No-mmap is the Strix Halo standard — mmap >64 GB is slow on
+      # ROCm. Forces full GPU load.
      - --no-mmap
-      # Flash attention — works on Vulkan too; the big win is on the
-      # ROCm tag where kyuz0's build has rocWMMA acceleration.
+      # Flash attention — biggest single win, ~20-40 % faster on MoE.
+      # Modern llama-server takes a value (on/off/auto); bare --flash-attn
+      # is deprecated and consumes the next arg as its value.
      - --flash-attn
+      - "on"
+      # Quantize KV cache to q8_0 (8-bit) — roughly halves KV memory vs f16
+      # at minor / no quality loss; sometimes faster (smaller working set).
+      # Matches OLLAMA_KV_CACHE_TYPE=q8_0 in compose/ollama.yml.
+      - --cache-type-k
+      - q8_0
+      - --cache-type-v
+      - q8_0
+      # Use the model's embedded jinja chat template (rather than
+      # llama.cpp's hardcoded default). Important for Qwen3-Coder which
+      # has a specific chat format.
+      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
-      # tokens/sec, KV-cache use, queue depth, and request latency.
+      # tokens/sec, KV-cache use, queue depth, request latency.
      - --metrics