Document current coding-workflow stack state

Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
2026-05-10 21:14:43 -04:00
parent 228fe8d1ac
commit a29793032d
35 changed files with 2067 additions and 37 deletions
--- a/pyinfra/framework/compose/kimi-linear.yml
+++ b/pyinfra/framework/compose/kimi-linear.yml
@@ -0,0 +1,112 @@
+# Kimi-Linear-48B-A3B-Instruct on vLLM, gfx1151, via kyuz0's TheRock 7.x
+# toolbox. Pioneer-grade: no public Strix Halo benchmarks exist for this
+# model as of 2026-05.
+#
+# Three risks P0 verifies in one shot:
+#   - KDA Triton kernel on gfx1151 (fla-core)             unverified
+#   - compressed-tensors loader on ROCm                   unverified
+#   - HIP-graph-capture on gfx1151                        broken; mitigated
+#                                                         via --enforce-eager
+#
+# Image strategy. `image:` below is a locally built derivative of upstream
+# `kyuz0:stable` (vLLM ~6aa057c from 2026-04-22); see ./Dockerfile. If it
+# crashes with the v0.12-class `MLAModules.__init__() missing
+# 'indexer_rotary_emb'`, build a v0.11.2-pinned image locally with ./build.sh
+# and set `image:` to `kimi-linear-local:v0.11.2`. Source build is multi-hour.
+#
+# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
+# are actually `compressed-tensors` int4 group-quantized — see config.json.
+# Download with:
+#   huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
+#     --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
+# Size: ~35 GB on disk (4-bit). 8-bit variant is ~54 GB if quality drives
+# us up later; both fit 128 GB unified comfortably.
+services:
+  kimi-linear:
+    # vLLM server for Kimi-Linear. Derived image: kyuz0:stable + gfx1151
+    # AITER GEMM config fallbacks (Kimi-Linear's MLA layers hit FP8 BMM
+    # ops kyuz0 didn't validate with their tested models). See
+    # ./Dockerfile. Build is fast — just file copies inside the image.
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: kimi-linear-local:aiter-fixed  # tag given to the build above
+    container_name: kimi-linear
+    restart: unless-stopped
+    devices:  # GPU device nodes passed through from the host
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    cap_add:  # NOTE(review): looks like AMD's stock ROCm docker flags — confirm
+      - SYS_PTRACE
+    security_opt:
+      - seccomp=unconfined
+    # Numeric GIDs of host's video (44) and render (991) groups — names
+    # don't exist inside the container, but the GIDs need to match the
+    # host so /dev/kfd + /dev/dri are accessible.
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 16g  # NOTE(review): likely moot with `ipc: host` — confirm
+    ipc: host
+    environment:
+      # gfx1151 native: kyuz0 image is built with GFX=gfx1151, so unlike
+      # ollama.yml (which uses 11.0.0 to coerce gfx1100 kernels), the
+      # override here names the GPU's real ISA (11.5.1 = gfx1151).
+      - HSA_OVERRIDE_GFX_VERSION=11.5.1
+      # AITER attention path — kyuz0's image patches AITER for RDNA
+      # ds_swizzle fallbacks; the env flag opts vLLM into using it.
+      - VLLM_ROCM_USE_AITER=1
+      # MLA pre-processing via AITER triton_fp8_bmm tries to materialize
+      # a ~30 GB intermediate alongside resident weights. Bypass that op;
+      # other AITER paths stay on.
+      - VLLM_ROCM_USE_AITER_MLA=0
+      # Unified-memory recipe (BIOS UMA=0.5 GB + ttm.pages_limit cmdline
+      # + the env triple below). Lets PyTorch's HIP allocator treat the
+      # two rocminfo pools as one ~110 GB arena. Without the
+      # FINE_GRAIN_PCIE flag, XNACK alone is a trap (vLLM mis-computes
+      # KV budget vs. allocator ceiling).
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
+      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9
+    volumes:
+      - /models:/models:ro  # host weights dir, mounted read-only
+    ports:
+      - "8000:8000"
+    # kyuz0 toolboxes drop into a shell by default; without an explicit
+    # entrypoint, the first `command:` item would be exec'd as a program
+    # (the `exec "--model": executable file not found` failure).
+    entrypoint: ["vllm", "serve"]
+    command:
+      # Positional model path (vllm serve's documented form).
+      - /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
+      - --served-model-name
+      - kimi-linear
+      # Auto-detect would also work — config.json carries quant_method.
+      # Explicit flag makes the failure mode loud if the loader is wrong.
+      - --quantization
+      - compressed-tensors
+      # Conservative restart point after BIOS+cmdline+env unblock.
+      # P3 ramps further: 32K → 128K → 256K → 512K → 1M.
+      - --max-model-len
+      - "32768"
+      - --gpu-memory-utilization  # fraction of GPU memory vLLM may claim
+      - "0.92"
+      - --max-num-seqs  # cap on sequences batched per iteration
+      - "4"
+      # gfx1151 V1-engine HIP-graph-capture is broken (vllm-project/vllm#32180).
+      # Eager costs throughput, not correctness; do not remove without
+      # verifying upstream fix landed.
+      - --enforce-eager
+      # Kimi-Linear ships custom modeling_kimi.py — required.
+      - --trust-remote-code
+      # Tool-calling support — opencode sends tool_choice:"auto" whenever
+      # MCP servers are connected. vLLM is strict and rejects unless both
+      # flags are present. Moonshot's Kimi family uses the kimi_k2 parser
+      # for tool-call formatting; Kimi-Linear inherits the same template.
+      - --enable-auto-tool-choice
+      - --tool-call-parser
+      - kimi_k2
+      - --host
+      - 0.0.0.0  # all container interfaces, so the 8000:8000 mapping works
+      - --port
+      - "8000"