Document current coding-workflow stack state

Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
2026-05-10 21:14:43 -04:00
parent 228fe8d1ac
commit a29793032d
35 changed files with 2067 additions and 37 deletions
--- a/pyinfra/framework/deploy.py
+++ b/pyinfra/framework/deploy.py
@@ -343,18 +343,23 @@ server.user(
    _sudo=True,
 )

-# Kernel cmdline tuning per Gygeek/Framework-strix-halo-llm-setup:
-#   - amd_iommu=off       — ~6 % memory-read improvement on Strix Halo
-#   - amdgpu.gttsize=117760 — ~115 GB GTT ceiling so the GPU can borrow
-#                            most of system RAM dynamically. Acts as a
-#                            ceiling, not an allocation. See ../../StrixHaloMemory.md
-#                            for the UMA-vs-GTT trade-off discussion.
+# Kernel cmdline tuning, following the Strix Halo unified-memory recipe
+# (kyuz0 vllm-toolboxes "Kernel Parameters and Unified Memory" and
+# Framework's "Linux + ROCm: January 2026 Stable Configurations" thread):
+#   - amd_iommu=off                — ~6 % memory-read improvement
+#   - amdgpu.gttsize=131072        — 128 GiB GTT ceiling, in MiB (deprecated
+#                                    knob but still honored on kernel 6.16+)
+#   - ttm.pages_limit=33554432     — 128 GiB in 4 KiB pages; forward-
+#                                    compatible TTM page cap
+# Combined with BIOS UMA at 0.5 GB and HSA_FORCE_FINE_GRAIN_PCIE=1 in the
+# container, PyTorch's HIP allocator merges the two rocminfo pools into a
+# single ~110 GB arena. See ../../StrixHaloMemory.md for context.
 # Requires a reboot to take effect; pyinfra leaves that to you.
 files.line(
-    name="GRUB cmdline (amd_iommu, gttsize)",
+    name="GRUB cmdline (amd_iommu, gttsize, ttm)",
    path="/etc/default/grub",
    line=r"^GRUB_CMDLINE_LINUX_DEFAULT=.*",
-    replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=117760"',
+    replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=33554432"',
    _sudo=True,
 )
 server.shell(
@@ -418,6 +423,7 @@ for svc in (
    "llama",
    "vllm",
    "ollama",
+    "kimi-linear",
    "openwebui",
    "beszel",
    "openlit",
@@ -559,6 +565,27 @@ for cfg in (
        _sudo=True,
    )

+# Kimi-Linear container assets (Dockerfile, build script, smoke test,
+# tokenizer patch, operator doc). The compose file itself is copied by the
+# for-loop above; the rest of the build context lives under
+# compose/kimi-linear/ in the repo and at /srv/docker/kimi-linear/ on the
+# box. The repo is the source of truth — pyinfra overwrites drift.
+for asset, mode in (
+    ("Dockerfile", "0664"),
+    ("build.sh", "0775"),
+    ("smoke.sh", "0775"),
+    ("patch-tokenizer.sh", "0775"),
+    ("README.md", "0664"),
+):
+    files.put(
+        name=f"kimi-linear: {asset}",
+        src=f"compose/kimi-linear/{asset}",
+        dest=f"{COMPOSE_DIR}/kimi-linear/{asset}",
+        group="docker",
+        mode=mode,
+        _sudo=True,
+    )
+
 # Voice stack — Wyoming-protocol Whisper (STT) and Piper (TTS). Models
 # are downloaded on first start; bind-mounting these dirs survives
 # container recreation.