Document current coding-workflow stack state
Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
This commit is contained in:
112
pyinfra/framework/compose/kimi-linear.yml
Normal file
112
pyinfra/framework/compose/kimi-linear.yml
Normal file
@@ -0,0 +1,112 @@
|
||||
# Kimi-Linear-48B-A3B-Instruct on vLLM, gfx1151, via kyuz0's TheRock 7.x
|
||||
# toolbox. Pioneer-grade: no public Strix Halo benchmarks exist for this
|
||||
# model as of 2026-05.
|
||||
#
|
||||
# Three risks P0 verifies in one shot:
#   - KDA Triton kernel on gfx1151 (fla-core): unverified
#   - compressed-tensors loader on ROCm: unverified
#   - HIP-graph-capture on gfx1151: broken; mitigated via --enforce-eager
|
||||
#
|
||||
# Image strategy. The service below builds and runs a derived image,
# `kimi-linear-local:aiter-fixed`, layered on upstream `kyuz0:stable` (vLLM
# ~6aa057c from 2026-04-22) via ./Dockerfile. If that crashes with the
# v0.12-class `MLAModules.__init__() missing 'indexer_rotary_emb'` error,
# build a v0.11.2-pinned image locally with ./build.sh and edit `image:`
# below to `kimi-linear-local:v0.11.2`. Source build is multi-hour.
|
||||
#
|
||||
# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
# are actually `compressed-tensors` int4 group-quantized — see config.json.
# Download with:
#   huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
#     --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
# Size: ~35 GB on disk (4-bit). The 8-bit variant is ~54 GB if quality drives
# us up later; both fit in 128 GB of unified memory comfortably.
|
||||
services:
  # Single-service stack: vLLM serving Kimi-Linear on port 8000 under the
  # served model name `kimi-linear`.
  kimi-linear:
    # Derived image: kyuz0:stable + gfx1151 AITER GEMM config fallbacks
    # (Kimi-Linear's MLA layers hit FP8 BMM ops kyuz0 didn't validate
    # with their tested models). See ./Dockerfile. Build is fast — just
    # file copies inside the image.
    build:
      context: .
      dockerfile: Dockerfile
    image: kimi-linear-local:aiter-fixed
    container_name: kimi-linear
    restart: unless-stopped

    # GPU device nodes passed through from the host.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # NOTE(review): ptrace + unconfined seccomp are presumably here for ROCm
    # tooling — confirm they are still required before tightening.
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's
    # /dev/shm, so `shm_size` likely has no effect — confirm before relying
    # on the 16g figure.
    shm_size: 16g
    ipc: host

    environment:
      # gfx1151 native: kyuz0 image is built with GFX=gfx1151, so unlike
      # ollama.yml (which uses 11.0.0 to coerce gfx1100 kernels), here we
      # want the GPU to report its real ISA.
      - HSA_OVERRIDE_GFX_VERSION=11.5.1
      # AITER attention path — kyuz0's image patches AITER for RDNA
      # ds_swizzle fallbacks; the env flag opts vLLM into using it.
      - VLLM_ROCM_USE_AITER=1
      # MLA pre-processing via AITER triton_fp8_bmm tries to materialize
      # a ~30 GB intermediate alongside resident weights. Bypass that op;
      # other AITER paths stay on.
      # NOTE(review): vLLM also exposes VLLM_ROCM_USE_AITER_FP8BMM — confirm
      # which flag actually gates the fp8 bmm path on the pinned vLLM build.
      - VLLM_ROCM_USE_AITER_MLA=0
      # Unified-memory recipe (BIOS UMA=0.5 GB + ttm.pages_limit cmdline
      # + the env triple below). Lets PyTorch's HIP allocator treat the
      # two rocminfo pools as one ~110 GB arena. Without the
      # FINE_GRAIN_PCIE flag, XNACK alone is a trap (vLLM mis-computes
      # KV budget vs. allocator ceiling).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9

    # Model weights are mounted read-only; the serve command below reads
    # them from /models.
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"

    # kyuz0 toolboxes drop into a shell by default; without an explicit
    # entrypoint, `command:` would be exec'd as a program (the
    # `exec "--model": executable file not found` failure).
    entrypoint: ["vllm", "serve"]
    command:
      # Positional model path (vllm serve's documented form).
      - /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
      - --served-model-name
      - kimi-linear
      # Auto-detect would also work — config.json carries quant_method.
      # Explicit flag makes the failure mode loud if the loader is wrong.
      - --quantization
      - compressed-tensors
      # Conservative restart point after BIOS+cmdline+env unblock.
      # P3 ramps further: 32K → 128K → 256K → 512K → 1M.
      - --max-model-len
      - "32768"
      - --gpu-memory-utilization
      - "0.92"
      - --max-num-seqs
      - "4"
      # gfx1151 V1-engine HIP-graph-capture is broken (vllm-project/vllm#32180).
      # Eager costs throughput, not correctness; do not remove without
      # verifying upstream fix landed.
      - --enforce-eager
      # Kimi-Linear ships custom modeling_kimi.py — required.
      - --trust-remote-code
      # Tool-calling support — opencode sends tool_choice:"auto" whenever
      # MCP servers are connected. vLLM is strict and rejects unless both
      # flags are present. Moonshot's Kimi family uses the kimi_k2 parser
      # for tool-call formatting; Kimi-Linear inherits the same template.
      - --enable-auto-tool-choice
      - --tool-call-parser
      - kimi_k2
      # Quoted like the other arguments so every argv entry is an explicit
      # string.
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
|
||||
Reference in New Issue
Block a user