Document current coding-workflow stack state
Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
This commit is contained in:
112
pyinfra/framework/compose/kimi-linear.yml
Normal file
112
pyinfra/framework/compose/kimi-linear.yml
Normal file
@@ -0,0 +1,112 @@
|
||||
# Kimi-Linear-48B-A3B-Instruct on vLLM, gfx1151, via kyuz0's TheRock 7.x
# toolbox. Pioneer-grade: no public Strix Halo benchmarks exist for this
# model as of 2026-05.
#
# Three risks P0 verifies in one shot:
#   - KDA Triton kernel on gfx1151 (fla-core) unverified
#   - compressed-tensors loader on ROCm unverified
#   - HIP-graph-capture on gfx1151 broken; mitigated via --enforce-eager
#
# Image strategy. Default `image:` is upstream `kyuz0:stable` (vLLM
# ~6aa057c from 2026-04-22). If that crashes with the v0.12-class
# `MLAModules.__init__() missing 'indexer_rotary_emb'`, build a
# v0.11.2-pinned image locally with ./build.sh and edit `image:` below to
# `kimi-linear-local:v0.11.2`. Source build is multi-hour.
#
# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
# are actually `compressed-tensors` int4 group-quantized — see config.json.
# Download with:
#   huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
#     --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
# Size: ~35 GB on disk (4-bit). 8-bit variant is ~54 GB if quality drives
# us up later; both fit 128 GB unified comfortably.
services:
  kimi-linear:
    # Derived image: kyuz0:stable + gfx1151 AITER GEMM config fallbacks
    # (Kimi-Linear's MLA layers hit FP8 BMM ops kyuz0 didn't validate
    # with their tested models). See ./Dockerfile. Build is fast — just
    # file copies inside the image.
    build:
      context: .
      dockerfile: Dockerfile
    image: kimi-linear-local:aiter-fixed
    container_name: kimi-linear
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible.
    group_add:
      - "44"
      - "991"
    shm_size: 16g
    ipc: host
    environment:
      # gfx1151 native: kyuz0 image is built with GFX=gfx1151, so unlike
      # ollama.yml (which uses 11.0.0 to coerce gfx1100 kernels), here we
      # want the GPU to report its real ISA.
      - HSA_OVERRIDE_GFX_VERSION=11.5.1
      # AITER attention path — kyuz0's image patches AITER for RDNA
      # ds_swizzle fallbacks; the env flag opts vLLM into using it.
      - VLLM_ROCM_USE_AITER=1
      # MLA pre-processing via AITER triton_fp8_bmm tries to materialize
      # a ~30 GB intermediate alongside resident weights. Bypass that op;
      # other AITER paths stay on.
      - VLLM_ROCM_USE_AITER_MLA=0
      # Unified-memory recipe (BIOS UMA=0.5 GB + ttm.pages_limit cmdline
      # + the env triple below). Lets PyTorch's HIP allocator treat the
      # two rocminfo pools as one ~110 GB arena. Without the
      # FINE_GRAIN_PCIE flag, XNACK alone is a trap (vLLM mis-computes
      # KV budget vs. allocator ceiling).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"
    # kyuz0 toolboxes drop into a shell by default; without an explicit
    # entrypoint, `command:` would be exec'd as a program (the
    # `exec "--model": executable file not found` failure).
    entrypoint: ["vllm", "serve"]
    command:
      # Positional model path (vllm serve's documented form).
      - /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
      - --served-model-name
      - kimi-linear
      # Auto-detect would also work — config.json carries quant_method.
      # Explicit flag makes the failure mode loud if the loader is wrong.
      - --quantization
      - compressed-tensors
      # Conservative restart point after BIOS+cmdline+env unblock.
      # P3 ramps further: 32K → 128K → 256K → 512K → 1M.
      - --max-model-len
      - "32768"
      - --gpu-memory-utilization
      - "0.92"
      - --max-num-seqs
      - "4"
      # gfx1151 V1-engine HIP-graph-capture is broken (vllm-project/vllm#32180).
      # Eager costs throughput, not correctness; do not remove without
      # verifying upstream fix landed.
      - --enforce-eager
      # Kimi-Linear ships custom modeling_kimi.py — required.
      - --trust-remote-code
      # Tool-calling support — opencode sends tool_choice:"auto" whenever
      # MCP servers are connected. vLLM is strict and rejects unless both
      # flags are present. Moonshot's Kimi family uses the kimi_k2 parser
      # for tool-call formatting; Kimi-Linear inherits the same template.
      - --enable-auto-tool-choice
      - --tool-call-parser
      - kimi_k2
      - --host
      # Quoted so a generic YAML loader can never mis-type the address.
      - "0.0.0.0"
      - --port
      - "8000"
|
||||
35
pyinfra/framework/compose/kimi-linear/Dockerfile
Normal file
35
pyinfra/framework/compose/kimi-linear/Dockerfile
Normal file
@@ -0,0 +1,35 @@
|
||||
# Derived image: kyuz0:stable plus gfx1151 AITER GEMM config fallbacks.
#
# kyuz0's image is built for gfx1151 but doesn't ship every per-op AITER
# autotuning config. Kimi-Linear's MLA layers hit FP8 BMM ops
# (BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT
# and friends) that have no gfx1151 config in the bundle. We synthesize
# them by copying from the closest-arch config that does exist (RDNA3
# gfx1100 is closest to RDNA3.5 gfx1151). Tile sizes won't be optimal
# but the kernels will compile and run.
#
# Idempotent — only fills slots that don't already have a gfx1151 config.
#
# If we ever need a vLLM-pinned base (e.g. upstream regresses on
# Kimi-Linear), build it via ./build.sh first and change FROM here to
# kimi-linear-local:v0.11.2.

FROM kyuz0/vllm-therock-gfx1151:stable

# Preference order: nearest architecture first (gfx1100/gfx1101 RDNA3,
# then CDNA gfx942/gfx90a). First prefix that ships a config for a given
# op wins because later iterations skip already-existing gfx1151 files.
RUN set -e; \
    DIR=/opt/venv/lib64/python3.12/site-packages/aiter/ops/triton/configs/gemm; \
    cd "$DIR"; \
    filled=0; \
    for SRC_PREFIX in gfx1100 gfx1101 gfx942 gfx90a; do \
      for SRC in ${SRC_PREFIX}-*.json; do \
        [ -f "$SRC" ] || continue; \
        OP=${SRC#${SRC_PREFIX}-}; \
        DST=gfx1151-${OP}; \
        if [ ! -f "$DST" ]; then \
          cp "$SRC" "$DST"; \
          echo "[fix-aiter] $SRC -> $DST"; \
          filled=$((filled+1)); \
        fi; \
      done; \
    done; \
    echo "[fix-aiter] filled $filled gfx1151 config slots"
|
||||
124
pyinfra/framework/compose/kimi-linear/README.md
Normal file
124
pyinfra/framework/compose/kimi-linear/README.md
Normal file
@@ -0,0 +1,124 @@
|
||||
# kimi-linear

Kimi-Linear-48B-A3B-Instruct on vLLM, ROCm/TheRock 7.x, gfx1151. Sits
beside Ollama (port 11434, Qwen3-Coder) on port 8000. OpenAI-compatible.

This is the **P0 verification stage** — no public Strix Halo numbers
exist for this model as of 2026-05. Three things are unverified until a
first generation succeeds: KDA Triton kernel on gfx1151,
compressed-tensors loader on ROCm, and AITER + Kimi MoE topology.
Smoke-test below confirms all three at once.

## Prereqs

- Pyinfra deploy has run (`./run.sh` from `pyinfra/framework/`) — gives
  you `/srv/docker/kimi-linear/`, GPU group membership, `/models/`
  layout, and `huggingface-cli` on the box.
- Hugging Face CLI authenticated (`huggingface-cli login`) if the
  weights repo gates downloads. cyankiwi's repo is currently public.

## Step 1 — Download weights

```sh
huggingface-cli download \
  cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
  --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
```

~35 GB. The repo is named `AWQ-4bit` but the actual format is
`compressed-tensors` int4 group-quantized — see `config.json`.

## Step 2 — Try the upstream image first

```sh
cd /srv/docker/kimi-linear
docker compose pull   # ~8.5 GB
docker compose up -d
docker compose logs -f
```

Watch for one of three things:

- **Loads cleanly, model serves on :8000** → P0 passes. Run `./smoke.sh`.
- **`MLAModules.__init__() missing 'indexer_rotary_emb'`** → upstream
  image is on vLLM 0.12.x; need the v0.11.2 source build. Skip to
  Step 3.
- **KDA / Triton / fla-core compile error** → kernel doesn't work on
  gfx1151 yet. Fall-back path: llama.cpp ROCm + bartowski Q4_K_M GGUF
  in `compose/llama.yml`. Document the error in
  `localgenai/kimi-linear/NOTES.md` and stop.

## Step 3 — Source build (if needed)

```sh
cd /srv/docker/kimi-linear
tmux new -s kimi-build
./build.sh   # multi-hour. Detach with C-b d; reattach with `tmux a -t kimi-build`
```

Builds `kimi-linear-local:v0.11.2` from kyuz0 SHA `e2288d6` with
`VLLM_COMMIT=v0.11.2`. Then edit `docker-compose.yml`:

```yaml
image: kimi-linear-local:v0.11.2
```

…and `docker compose up -d` again.

## Step 4 — Smoke test

```sh
./smoke.sh
```

Expects: `/v1/models` returns `kimi-linear`; a four-token generation
returns "ok". If both pass, **P0 is done**. Update task #6 and proceed
to P1.

## Operations

```sh
docker compose logs -f kimi-linear    # tail
docker compose restart kimi-linear    # reload
docker compose down                   # stop
docker compose exec kimi-linear bash  # shell in
amdgpu_top                            # on host: GPU power, mem, util
```

## Pin manifest

| Component                | Pin                                 |
| ------------------------ | ----------------------------------- |
| kyuz0 toolbox            | commit `e2288d6` (2026-04-22)       |
| vLLM                     | tag `v0.11.2` (Moonshot recipe)     |
| Image (default)          | `kyuz0/vllm-therock-gfx1151:stable` |
| Image (pinned, if built) | `kimi-linear-local:v0.11.2`         |
| Weights                  | `cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit` (compressed-tensors int4) |
| ROCm                     | TheRock nightlies via kyuz0 base    |
| Python                   | 3.12 (hardcoded in kyuz0 Dockerfile) |

Bump policy: don't move vLLM to 0.12.x; don't move the kyuz0 commit
without re-running the smoke test; bump weights only when an 8-bit A/B
comparison is in scope (P3).

## Port collision warning

`compose/vllm.yml` is a placeholder stub that also binds `:8000`. Only
one of `kimi-linear` and `vllm` can run at a time. Don't `docker compose
up` both. Long term, either delete the stub or move it to a different
port; not in scope here.

## Known issues / mitigations

- **HIP graph capture broken on gfx1151** (vllm-project/vllm#32180) —
  `--enforce-eager` mitigates at a throughput cost. Re-test without it
  once the upstream fix lands.
- **vLLM 0.12.0 crash on Kimi-Linear** —
  `MLAModules.__init__() missing 'indexer_rotary_emb'`. Hard pin to
  0.11.2.
- **No published gfx1151 numbers** — we are first. Findings stay
  private (no upstream filings) per project policy.

## Status

P0 in progress. Update an `oc-tree`-style `NEXT_STEPS.md` if you set
this aside mid-verification.
|
||||
51
pyinfra/framework/compose/kimi-linear/build.sh
Executable file
51
pyinfra/framework/compose/kimi-linear/build.sh
Executable file
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
# Source-build a vLLM 0.11.2-pinned image from kyuz0's gfx1151 toolbox.
# Use only when the upstream `kyuz0/vllm-therock-gfx1151:stable` tag
# crashes on Kimi-Linear with a v0.12-class error
# (`MLAModules.__init__() missing 'indexer_rotary_emb'`).
#
# Compiles flash-attention, AITER+CK, vLLM, and bitsandbytes from source
# with MAX_JOBS=4 (fixed upstream). Expect a multi-hour wall-clock on
# Strix Halo. Idempotent — skips if the target tag already exists.
#
# Pin policy. KYUZ0_COMMIT is the upstream SHA whose CI build produced
# the published `:stable` on 2026-04-22; bump only after re-validating
# Kimi-Linear works with the new toolbox revision. VLLM_COMMIT is the
# Moonshot recipe pin for Kimi-Linear; do not bump to v0.12.x.

set -euo pipefail

KYUZ0_REPO="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes.git"
KYUZ0_COMMIT="e2288d6"
VLLM_COMMIT="v0.11.2"
IMAGE_TAG="kimi-linear-local:${VLLM_COMMIT}"
WORKDIR="/tmp/kimi-linear-build"

# Idempotence guard: a prior successful build leaves the tag behind.
if docker image inspect "$IMAGE_TAG" >/dev/null 2>&1; then
  echo "[build] $IMAGE_TAG already exists. To rebuild: docker rmi $IMAGE_TAG"
  exit 0
fi

# Clone once; a half-finished earlier clone (no .git) is wiped first.
if [ ! -d "$WORKDIR/.git" ]; then
  rm -rf "$WORKDIR"
  git clone "$KYUZ0_REPO" "$WORKDIR"
fi

cd "$WORKDIR"
git fetch origin
git checkout --quiet "$KYUZ0_COMMIT"

echo "[build] kyuz0 toolbox @ $(git rev-parse --short HEAD)"
echo "[build] vLLM pin: $VLLM_COMMIT"
echo "[build] image tag: $IMAGE_TAG"
echo "[build] expected wall-clock: hours. Use tmux."
echo

docker build \
  --build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \
  -t "$IMAGE_TAG" \
  -f Dockerfile \
  .

echo
echo "[build] done. Switch image: in docker-compose.yml to $IMAGE_TAG."
|
||||
66
pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
Executable file
66
pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
#   from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent — re-running is a no-op once patched.
#
# Run on the box, after weights are downloaded, before first
# `docker compose up`. Recreates the container at the end so
# `trust_remote_code` re-copies the patched file into its module cache.

set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"

if [ ! -f "$F" ]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi

# Marker comment is written by the patch below; its presence means done.
if grep -q '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi

# Refuse to patch blind: if upstream changed the import we target,
# a silent no-op replace would leave a broken tokenizer.
if ! grep -q 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "  grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi

# Quoted heredoc ('PYEOF') so the Python source is passed verbatim —
# no shell expansion inside.
python3 - "$F" <<'PYEOF'
import pathlib, sys
p = pathlib.Path(sys.argv[1])
s = p.read_text()
old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"\u00a1\"), ord(\"\u00ac\") + 1))\n"
    "          + list(range(ord(\"\u00ae\"), ord(\"\u00ff\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)
p.write_text(s.replace(old, new))
print("[patch-tokenizer] patched", p)
PYEOF

echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
cd "$(dirname "$0")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"
|
||||
24
pyinfra/framework/compose/kimi-linear/smoke.sh
Executable file
24
pyinfra/framework/compose/kimi-linear/smoke.sh
Executable file
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env bash
# Smoke-test the running kimi-linear vLLM container. Exits non-zero if
# anything's wrong, so it doubles as a P1 health check.
#
# Overridable via env: KIMI_HOST (host:port), KIMI_MODEL (served name).
set -euo pipefail

HOST="${KIMI_HOST:-127.0.0.1:8000}"
MODEL="${KIMI_MODEL:-kimi-linear}"

echo "[smoke] GET /v1/models on $HOST"
# -f makes curl fail (non-zero) on HTTP errors; json.tool also fails on
# a non-JSON body, so either problem aborts under `set -e`.
curl -fsS "http://$HOST/v1/models" | python3 -m json.tool

echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"$MODEL\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
    \"max_tokens\": 16,
    \"temperature\": 0.0
  }" | python3 -m json.tool

echo
echo "[smoke] passed"
|
||||
@@ -17,6 +17,11 @@ services:
|
||||
- "host.docker.internal:host-gateway"
|
||||
environment:
|
||||
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
||||
# vLLM (Kimi-Linear) exposed as an OpenAI-compatible backend. The
|
||||
# model isn't strongly tool-trained — opencode's agentic system
|
||||
# prompt confuses it. OpenWebUI's plain chat UI is the right home.
|
||||
- OPENAI_API_BASE_URLS=http://host.docker.internal:8000/v1
|
||||
- OPENAI_API_KEYS=dummy
|
||||
# Built-in web search via the project's SearXNG instance.
|
||||
- ENABLE_RAG_WEB_SEARCH=true
|
||||
- RAG_WEB_SEARCH_ENGINE=searxng
|
||||
|
||||
@@ -343,18 +343,23 @@ server.user(
|
||||
_sudo=True,
|
||||
)
|
||||
|
||||
# Kernel cmdline tuning per Gygeek/Framework-strix-halo-llm-setup:
|
||||
# - amd_iommu=off — ~6 % memory-read improvement on Strix Halo
|
||||
# - amdgpu.gttsize=117760 — ~115 GB GTT ceiling so the GPU can borrow
|
||||
# most of system RAM dynamically. Acts as a
|
||||
# ceiling, not an allocation. See ../../StrixHaloMemory.md
|
||||
# for the UMA-vs-GTT trade-off discussion.
|
||||
# Kernel cmdline tuning. The Strix Halo unified-memory recipe (kyuz0
|
||||
# vllm-toolboxes "Kernel Parameters and Unified Memory" + Framework's
|
||||
# "Linux + ROCm: January 2026 Stable Configurations" thread):
|
||||
# - amd_iommu=off — ~6 % memory-read improvement
|
||||
# - amdgpu.gttsize=131072 — 128 GiB GTT ceiling (deprecated knob
|
||||
# but still honored on kernel 6.16+)
|
||||
# - ttm.pages_limit=33554432 — 128 GiB in 4 KiB pages; forward-
|
||||
# compatible TTM page cap
|
||||
# Combined with BIOS UMA at 0.5 GB and HSA_FORCE_FINE_GRAIN_PCIE=1 in the
|
||||
# container, PyTorch's HIP allocator merges the two rocminfo pools into a
|
||||
# single ~110 GB arena. See ../../StrixHaloMemory.md for context.
|
||||
# Requires a reboot to take effect; pyinfra leaves that to you.
|
||||
files.line(
|
||||
name="GRUB cmdline (amd_iommu, gttsize)",
|
||||
name="GRUB cmdline (amd_iommu, gttsize, ttm)",
|
||||
path="/etc/default/grub",
|
||||
line=r"^GRUB_CMDLINE_LINUX_DEFAULT=.*",
|
||||
replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=117760"',
|
||||
replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=33554432"',
|
||||
_sudo=True,
|
||||
)
|
||||
server.shell(
|
||||
@@ -418,6 +423,7 @@ for svc in (
|
||||
"llama",
|
||||
"vllm",
|
||||
"ollama",
|
||||
"kimi-linear",
|
||||
"openwebui",
|
||||
"beszel",
|
||||
"openlit",
|
||||
@@ -559,6 +565,27 @@ for cfg in (
|
||||
_sudo=True,
|
||||
)
|
||||
|
||||
# Kimi-Linear container assets (build script, smoke test, operator doc).
|
||||
# The compose file itself is copied by the for-loop above; the rest of
|
||||
# the build context lives under compose/kimi-linear/ on the source side
|
||||
# and at /srv/docker/kimi-linear/ on the box. Source is the source of
|
||||
# truth — pyinfra overwrites drift.
|
||||
for asset, mode in (
|
||||
("Dockerfile", "0664"),
|
||||
("build.sh", "0775"),
|
||||
("smoke.sh", "0775"),
|
||||
("patch-tokenizer.sh", "0775"),
|
||||
("README.md", "0664"),
|
||||
):
|
||||
files.put(
|
||||
name=f"kimi-linear: {asset}",
|
||||
src=f"compose/kimi-linear/{asset}",
|
||||
dest=f"{COMPOSE_DIR}/kimi-linear/{asset}",
|
||||
group="docker",
|
||||
mode=mode,
|
||||
_sudo=True,
|
||||
)
|
||||
|
||||
# Voice stack — Wyoming-protocol Whisper (STT) and Piper (TTS). Models
|
||||
# are downloaded on first start; bind-mounting these dirs survives
|
||||
# container recreation.
|
||||
|
||||
@@ -4,4 +4,4 @@
|
||||
|
||||
set -euo pipefail
|
||||
cd "$(dirname "$0")"
|
||||
exec pyinfra -v --ssh-password-prompt inventory.py deploy.py "$@"
|
||||
exec pyinfra -v --ssh-password-prompt inventory.py deploy.py "$@"
|
||||
|
||||
Reference in New Issue
Block a user