---
# Kimi-Linear-48B-A3B-Instruct on vLLM, gfx1151, via kyuz0's TheRock 7.x
# toolbox. Pioneer-grade: no public Strix Halo benchmarks exist for this
# model as of 2026-05.
#
# Three risks P0 verifies in one shot:
#   - KDA Triton kernel on gfx1151 (fla-core): unverified
#   - compressed-tensors loader on ROCm: unverified
#   - HIP-graph-capture on gfx1151: broken; mitigated via --enforce-eager
#
# Image strategy. The service builds a thin derived image
# (`kimi-linear-local:aiter-fixed`, see ./Dockerfile) on top of upstream
# `kyuz0:stable` (vLLM ~6aa057c from 2026-04-22). If that base crashes with
# the v0.12-class `MLAModules.__init__() missing 'indexer_rotary_emb'`,
# build a v0.11.2-pinned image locally with ./build.sh and point the
# service at `kimi-linear-local:v0.11.2` instead. Source build is
# multi-hour.
# NOTE(review): while the `build:` stanza below is present, changing only
# `image:` presumably makes `docker compose build` re-tag the ./Dockerfile
# build under the new name; drop `build:` or re-base the Dockerfile when
# switching. Confirm against what ./build.sh actually produces.
#
# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
# are actually `compressed-tensors` int4 group-quantized — see config.json.
# Download (on the host; the container mounts /models read-only) with:
#   huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
#     --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
# Size: ~35 GB on disk (4-bit). 8-bit variant is ~54 GB if quality drives
# us up later; both fit 128 GB unified comfortably.

services:
  kimi-linear:
    # Derived image: kyuz0:stable + gfx1151 AITER GEMM config fallbacks
    # (Kimi-Linear's MLA layers hit FP8 BMM ops kyuz0 didn't validate
    # with their tested models). See ./Dockerfile. Build is fast — just
    # file copies inside the image.
    build:
      context: .
      dockerfile: Dockerfile
    image: kimi-linear-local:aiter-fixed
    container_name: kimi-linear
    restart: unless-stopped

    # ROCm device nodes: compute (kfd) and render (dri).
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible. These are host-specific;
    # re-check with `getent group video render` on a new machine.
    group_add:
      - "44"
      - "991"

    # NOTE(review): with `ipc: host` the container shares the host's
    # /dev/shm, so `shm_size` is presumably a no-op here. It is kept as the
    # fallback should `ipc: host` ever be removed — confirm before relying
    # on the 16g figure.
    shm_size: 16g
    ipc: host

    environment:
      # gfx1151 native: kyuz0 image is built with GFX=gfx1151, so unlike
      # ollama.yml (which uses 11.0.0 to coerce gfx1100 kernels), here we
      # want the GPU to report its real ISA.
      - HSA_OVERRIDE_GFX_VERSION=11.5.1
      # AITER attention path — kyuz0's image patches AITER for RDNA
      # ds_swizzle fallbacks; the env flag opts vLLM into using it.
      - VLLM_ROCM_USE_AITER=1
      # MLA pre-processing via AITER triton_fp8_bmm tries to materialize
      # a ~30 GB intermediate alongside resident weights. Bypass that op;
      # other AITER paths stay on.
      - VLLM_ROCM_USE_AITER_MLA=0
      # Unified-memory recipe (BIOS UMA=0.5 GB + ttm.pages_limit cmdline
      # + the env triple below). Lets PyTorch's HIP allocator treat the
      # two rocminfo pools as one ~110 GB arena. Without the
      # FINE_GRAIN_PCIE flag, XNACK alone is a trap (vLLM mis-computes
      # KV budget vs. allocator ceiling).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9

    # Weights are only read at runtime, hence the read-only mount.
    volumes:
      - /models:/models:ro
    # NOTE(review): published on all host interfaces, and the command below
    # passes no `--api-key` while enabling `--trust-remote-code`. Fine on a
    # trusted LAN; otherwise bind to loopback ("127.0.0.1:8000:8000").
    ports:
      - "8000:8000"

    # kyuz0 toolboxes drop into a shell by default; without an explicit
    # entrypoint, `command:` would be exec'd as a program (the
    # `exec "--model": executable file not found` failure).
    entrypoint: ["vllm", "serve"]
    command:
      # Positional model path (vllm serve's documented form).
      - /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
      - --served-model-name
      - kimi-linear
      # Auto-detect would also work — config.json carries quant_method.
      # Explicit flag makes the failure mode loud if the loader is wrong.
      - --quantization
      - compressed-tensors
      # Conservative restart point after BIOS+cmdline+env unblock.
      # P3 ramps further: 32K → 128K → 256K → 512K → 1M.
      - --max-model-len
      - "32768"
      - --gpu-memory-utilization
      - "0.92"
      - --max-num-seqs
      - "4"
      # gfx1151 V1-engine HIP-graph-capture is broken
      # (vllm-project/vllm#32180). Eager costs throughput, not
      # correctness; do not remove without verifying upstream fix landed.
      - --enforce-eager
      # Kimi-Linear ships custom modeling_kimi.py — required.
      - --trust-remote-code
      # Tool-calling support — opencode sends tool_choice:"auto" whenever
      # MCP servers are connected. vLLM is strict and rejects unless both
      # flags are present. Moonshot's Kimi family uses the kimi_k2 parser
      # for tool-call formatting; Kimi-Linear inherits the same template.
      - --enable-auto-tool-choice
      - --tool-call-parser
      - kimi_k2
      # Listen on every interface inside the container so the published
      # port above can reach the server.
      - --host
      - 0.0.0.0
      - --port
      - "8000"