Document current coding-workflow stack state

Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice
  + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear
  context ramp) and next (ComfyUI) items with pointers to per-project
  NEXT_STEPS.md guides.
2026-05-10 21:14:43 -04:00
parent 228fe8d1ac
commit a29793032d
35 changed files with 2067 additions and 37 deletions

View File

@@ -0,0 +1,112 @@
# Kimi-Linear-48B-A3B-Instruct on vLLM, gfx1151, via kyuz0's TheRock 7.x
# toolbox. Pioneer-grade: no public Strix Halo benchmarks exist for this
# model as of 2026-05.
#
# Three risks P0 verifies in one shot:
# - KDA Triton kernel on gfx1151 (fla-core) unverified
# - compressed-tensors loader on ROCm unverified
# - HIP-graph-capture on gfx1151 broken; mitigated
# via --enforce-eager
#
# Image strategy. Default `image:` is upstream `kyuz0:stable` (vLLM
# ~6aa057c from 2026-04-22). If that crashes with the v0.12-class
# `MLAModules.__init__() missing 'indexer_rotary_emb'`, build a
# v0.11.2-pinned image locally with ./build.sh and edit `image:` below to
# `kimi-linear-local:v0.11.2`. Source build is multi-hour.
#
# Weights. Despite their HF name, cyankiwi's "AWQ" Kimi-Linear weights
# are actually `compressed-tensors` int4 group-quantized — see config.json.
# Download with:
# huggingface-cli download cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
# --local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
# Size: ~35 GB on disk (4-bit). 8-bit variant is ~54 GB if quality drives
# us up later; both fit 128 GB unified comfortably.
services:
  kimi-linear:
    # Derived image: kyuz0:stable + gfx1151 AITER GEMM config fallbacks
    # (Kimi-Linear's MLA layers hit FP8 BMM ops kyuz0 didn't validate
    # with their tested models). See ./Dockerfile. Build is fast — just
    # file copies inside the image.
    build:
      context: .
      dockerfile: Dockerfile
    image: kimi-linear-local:aiter-fixed
    container_name: kimi-linear
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container, but the GIDs need to match the
    # host so /dev/kfd + /dev/dri are accessible.
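    # (To double-check the host-side numbers: `getent group video render`
    # on the host — standard glibc tool, not part of this stack. If they
    # differ from 44/991, update the list below.)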
    group_add:
      - "44"
      - "991"
    shm_size: 16g
    ipc: host
    environment:
      # gfx1151 native: kyuz0 image is built with GFX=gfx1151, so unlike
      # ollama.yml (which uses 11.0.0 to coerce gfx1100 kernels), here we
      # want the GPU to report its real ISA.
      - HSA_OVERRIDE_GFX_VERSION=11.5.1
      # AITER attention path — kyuz0's image patches AITER for RDNA
      # ds_swizzle fallbacks; the env flag opts vLLM into using it.
      - VLLM_ROCM_USE_AITER=1
      # MLA pre-processing via AITER triton_fp8_bmm tries to materialize
      # a ~30 GB intermediate alongside resident weights. Bypass that op;
      # other AITER paths stay on.
      - VLLM_ROCM_USE_AITER_MLA=0
      # Unified-memory recipe (BIOS UMA=0.5 GB + ttm.pages_limit cmdline
      # + the env triple below). Lets PyTorch's HIP allocator treat the
      # two rocminfo pools as one ~110 GB arena. Without the
      # FINE_GRAIN_PCIE flag, XNACK alone is a trap (vLLM mis-computes
      # KV budget vs. allocator ceiling).
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9
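      # (Host-side sanity check that both memory pools are visible before
      # trusting the merged arena — assumes rocminfo from ROCm is installed
      # on the host:  rocminfo | grep -iE 'pool|size')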
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"
    # kyuz0 toolboxes drop into a shell by default; without an explicit
    # entrypoint, `command:` would be exec'd as a program (the
    # `exec "--model": executable file not found` failure).
    entrypoint: ["vllm", "serve"]
    command:
      # Positional model path (vllm serve's documented form).
      - /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
      - --served-model-name
      - kimi-linear
      # Auto-detect would also work — config.json carries quant_method.
      # Explicit flag makes the failure mode loud if the loader is wrong.
      - --quantization
      - compressed-tensors
      # Conservative restart point after BIOS+cmdline+env unblock.
      # P3 ramps further: 32K → 128K → 256K → 512K → 1M.
      - --max-model-len
      - "32768"
      - --gpu-memory-utilization
      - "0.92"
      - --max-num-seqs
      - "4"
      # gfx1151 V1-engine HIP-graph-capture is broken (vllm-project/vllm#32180).
      # Eager costs throughput, not correctness; do not remove without
      # verifying upstream fix landed.
      - --enforce-eager
      # Kimi-Linear ships custom modeling_kimi.py — required.
      - --trust-remote-code
      # Tool-calling support — opencode sends tool_choice:"auto" whenever
      # MCP servers are connected. vLLM is strict and rejects unless both
      # flags are present. Moonshot's Kimi family uses the kimi_k2 parser
      # for tool-call formatting; Kimi-Linear inherits the same template.
      - --enable-auto-tool-choice
      - --tool-call-parser
      - kimi_k2
      - --host
      - 0.0.0.0
      - --port
      - "8000"

View File

@@ -0,0 +1,35 @@
# Derived image: kyuz0:stable plus gfx1151 AITER GEMM config fallbacks.
#
# kyuz0's image is built for gfx1151 but doesn't ship every per-op AITER
# autotuning config. Kimi-Linear's MLA layers hit FP8 BMM ops
# (BATCHED_GEMM-A8W8-A_PER_TOKEN_GROUP_PREQUANT_W_PER_BATCHED_TENSOR_QUANT
# and friends) that have no gfx1151 config in the bundle. We synthesize
# them by copying from the closest-arch config that does exist (RDNA3
# gfx1100 is closest to RDNA3.5 gfx1151). Tile sizes won't be optimal
# but the kernels will compile and run.
#
# Idempotent — only fills slots that don't already have a gfx1151 config.
#
# If we ever need a vLLM-pinned base (e.g. upstream regresses on
# Kimi-Linear), build it via ./build.sh first and change FROM here to
# kimi-linear-local:v0.11.2.
FROM kyuz0/vllm-therock-gfx1151:stable
RUN set -e; \
    DIR=/opt/venv/lib64/python3.12/site-packages/aiter/ops/triton/configs/gemm; \
    cd "$DIR"; \
    filled=0; \
    for SRC_PREFIX in gfx1100 gfx1101 gfx942 gfx90a; do \
      for SRC in ${SRC_PREFIX}-*.json; do \
        [ -f "$SRC" ] || continue; \
        OP=${SRC#${SRC_PREFIX}-}; \
        DST=gfx1151-${OP}; \
        if [ ! -f "$DST" ]; then \
          cp "$SRC" "$DST"; \
          echo "[fix-aiter] $SRC -> $DST"; \
          filled=$((filled+1)); \
        fi; \
      done; \
    done; \
    echo "[fix-aiter] filled $filled gfx1151 config slots"

View File

@@ -0,0 +1,124 @@
# kimi-linear
Kimi-Linear-48B-A3B-Instruct on vLLM, ROCm/TheRock 7.x, gfx1151. Sits
beside Ollama (port 11434, Qwen3-Coder) on port 8000. OpenAI-compatible.
This is the **P0 verification stage** — no public Strix Halo numbers
exist for this model as of 2026-05. Three things are unverified until a
first generation succeeds: KDA Triton kernel on gfx1151,
compressed-tensors loader on ROCm, and AITER + Kimi MoE topology.
Smoke-test below confirms all three at once.
## Prereqs
- Pyinfra deploy has run (`./run.sh` from `pyinfra/framework/`) — gives
you `/srv/docker/kimi-linear/`, GPU group membership, `/models/`
layout, and `huggingface-cli` on the box.
- Hugging Face CLI authenticated (`huggingface-cli login`) if the
weights repo gates downloads. cyankiwi's repo is currently public.
## Step 1 — Download weights
```sh
huggingface-cli download \
cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit \
--local-dir /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit
```
~35 GB. The repo is named `AWQ-4bit` but the actual format is
`compressed-tensors` int4 group-quantized — see `config.json`.
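To double-check what the loader will see (field names assumed to follow the
usual HF `quantization_config` convention; adjust if the repo layout changes):
```sh
python3 -c "import json; \
  c = json.load(open('/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit/config.json')); \
  print(c.get('quantization_config', {}).get('quant_method'))"
```
Expect `compressed-tensors`, not `awq`; if it prints something else, revisit
the `--quantization` flag in `docker-compose.yml`.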
## Step 2 — Try the upstream image first
```sh
cd /srv/docker/kimi-linear
docker compose pull # ~8.5 GB
docker compose up -d
docker compose logs -f
```
Watch for one of three things:
- **Loads cleanly, model serves on :8000** → P0 passes. Run `./smoke.sh`.
- **`MLAModules.__init__() missing 'indexer_rotary_emb'`** → upstream
image is on vLLM 0.12.x; need the v0.11.2 source build. Skip to
Step 3.
- **KDA / Triton / fla-core compile error** → kernel doesn't work on
  gfx1151 yet. Fallback path: llama.cpp ROCm + bartowski Q4_K_M GGUF
in `compose/llama.yml`. Document the error in
`localgenai/kimi-linear/NOTES.md` and stop.
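To see which branch you are on before the first full start, you can query the
vLLM version baked into the upstream image — a hedged one-liner that assumes
the image's default `python3` resolves to the `/opt/venv` install:
```sh
docker run --rm --entrypoint python3 kyuz0/vllm-therock-gfx1151:stable \
  -c 'import vllm; print(vllm.__version__)'
```
Anything 0.12.x means expect the `MLAModules` crash; plan for Step 3.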
## Step 3 — Source build (if needed)
```sh
cd /srv/docker/kimi-linear
tmux new -s kimi-build
./build.sh # multi-hour. Detach with C-b d; reattach with `tmux a -t kimi-build`
```
Builds `kimi-linear-local:v0.11.2` from kyuz0 SHA `e2288d6` with
`VLLM_COMMIT=v0.11.2`. Then edit `docker-compose.yml`:
```yaml
image: kimi-linear-local:v0.11.2
```
…and `docker compose up -d` again.
## Step 4 — Smoke test
```sh
./smoke.sh
```
Expects: `/v1/models` returns `kimi-linear`; a four-token generation
returns "ok". If both pass, **P0 is done**. Update task #6 and proceed
to P1.
## Operations
```sh
docker compose logs -f kimi-linear # tail
docker compose restart kimi-linear # reload
docker compose down # stop
docker compose exec kimi-linear bash # shell in
amdgpu_top # on host: GPU power, mem, util
```
## Pin manifest
| Component | Pin |
| --------------------------- | ---------------------------------- |
| kyuz0 toolbox | commit `e2288d6` (2026-04-22) |
| vLLM | tag `v0.11.2` (Moonshot recipe) |
| Image (default) | `kyuz0/vllm-therock-gfx1151:stable`|
| Image (pinned, if built) | `kimi-linear-local:v0.11.2` |
| Weights | `cyankiwi/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit` (compressed-tensors int4) |
| ROCm | TheRock nightlies via kyuz0 base |
| Python | 3.12 (hardcoded in kyuz0 Dockerfile) |
Bump policy: don't move vLLM to 0.12.x; don't move kyuz0 commit without
re-running smoke; bump weights only when an 8-bit A/B is in scope (P3).
## Port collision warning
`compose/vllm.yml` is a placeholder stub that also binds `:8000`. Only
one of `kimi-linear` and `vllm` can run at a time. Don't `docker compose
up` both. Long term either delete the stub or move it to a different
port; not in scope here.
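A quick pre-flight check for who currently holds the port (assumes `ss` from
iproute2 on the host; the `publish` filter is standard `docker ps`):
```sh
ss -ltn | grep ':8000' || echo 'port 8000 free'
docker ps --filter publish=8000 --format '{{.Names}}\t{{.Ports}}'
```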
## Known issues / mitigations
- **HIP graph capture broken on gfx1151** (vllm-project/vllm#32180) —
`--enforce-eager` mitigates at a throughput cost. Re-test without it
once the upstream fix lands.
- **vLLM 0.12.0 crash on Kimi-Linear** —
`MLAModules.__init__() missing 'indexer_rotary_emb'`. Hard pin to
0.11.2.
- **No published gfx1151 numbers** — we are first. Findings stay
private (no upstream filings) per project policy.
## Status
P0 in progress. Update `oc-tree`-style `NEXT_STEPS.md` if you set this
aside mid-verification.

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Source-build a vLLM 0.11.2-pinned image from kyuz0's gfx1151 toolbox.
# Use only when the upstream `kyuz0/vllm-therock-gfx1151:stable` tag
# crashes on Kimi-Linear with a v0.12-class error
# (`MLAModules.__init__() missing 'indexer_rotary_emb'`).
#
# Compiles flash-attention, AITER+CK, vLLM, and bitsandbytes from source
# with MAX_JOBS=4 (fixed upstream). Expect a multi-hour wall-clock on
# Strix Halo. Idempotent — skips if the target tag already exists.
#
# Pin policy. KYUZ0_COMMIT is the upstream SHA whose CI build produced
# the published `:stable` on 2026-04-22; bump only after re-validating
# Kimi-Linear works with the new toolbox revision. VLLM_COMMIT is the
# Moonshot recipe pin for Kimi-Linear; do not bump to v0.12.x.
set -euo pipefail
KYUZ0_REPO="https://github.com/kyuz0/amd-strix-halo-vllm-toolboxes.git"
KYUZ0_COMMIT="e2288d6"
VLLM_COMMIT="v0.11.2"
IMAGE_TAG="kimi-linear-local:${VLLM_COMMIT}"
WORKDIR="/tmp/kimi-linear-build"
if docker image inspect "$IMAGE_TAG" >/dev/null 2>&1; then
  echo "[build] $IMAGE_TAG already exists. To rebuild: docker rmi $IMAGE_TAG"
  exit 0
fi
if [ ! -d "$WORKDIR/.git" ]; then
  rm -rf "$WORKDIR"
  git clone "$KYUZ0_REPO" "$WORKDIR"
fi
cd "$WORKDIR"
git fetch origin
git checkout --quiet "$KYUZ0_COMMIT"
echo "[build] kyuz0 toolbox @ $(git rev-parse --short HEAD)"
echo "[build] vLLM pin: $VLLM_COMMIT"
echo "[build] image tag: $IMAGE_TAG"
echo "[build] expected wall-clock: hours. Use tmux."
echo
docker build \
--build-arg "VLLM_COMMIT=${VLLM_COMMIT}" \
-t "$IMAGE_TAG" \
-f Dockerfile \
.
echo
echo "[build] done. Switch image: in docker-compose.yml to $IMAGE_TAG."

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
# from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent — re-running is a no-op once patched.
#
# Run on the box, after weights are downloaded, before first
# `docker compose up`. Recreates the container at the end so
# `trust_remote_code` re-copies the patched file into its module cache.
set -euo pipefail
MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"
if [ ! -f "$F" ]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi
if grep -q '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi
if ! grep -q 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "  grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi
python3 - "$F" <<'PYEOF'
import pathlib, sys
p = pathlib.Path(sys.argv[1])
s = p.read_text()
old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"¡\"), ord(\"¬\") + 1))\n"
    "          + list(range(ord(\"®\"), ord(\"ÿ\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)
p.write_text(s.replace(old, new))
print("[patch-tokenizer] patched", p)
PYEOF
echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
cd "$(dirname "$0")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Smoke-test the running kimi-linear vLLM container. Exits non-zero if
# anything's wrong, so it doubles as a P1 health check.
set -euo pipefail
HOST="${KIMI_HOST:-127.0.0.1:8000}"
MODEL="${KIMI_MODEL:-kimi-linear}"
echo "[smoke] GET /v1/models on $HOST"
curl -fsS "http://$HOST/v1/models" | python3 -m json.tool
echo
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
curl -fsS "http://$HOST/v1/chat/completions" \
-H "Content-Type: application/json" \
-d "{
\"model\": \"$MODEL\",
\"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
\"max_tokens\": 16,
\"temperature\": 0.0
}" | python3 -m json.tool
echo
echo "[smoke] passed"

View File

@@ -17,6 +17,11 @@ services:
- "host.docker.internal:host-gateway"
environment:
- OLLAMA_BASE_URL=http://host.docker.internal:11434
# vLLM (Kimi-Linear) exposed as an OpenAI-compatible backend. The
# model isn't strongly tool-trained — opencode's agentic system
# prompt confuses it. OpenWebUI's plain chat UI is the right home.
- OPENAI_API_BASE_URLS=http://host.docker.internal:8000/v1
- OPENAI_API_KEYS=dummy
# Built-in web search via the project's SearXNG instance.
- ENABLE_RAG_WEB_SEARCH=true
- RAG_WEB_SEARCH_ENGINE=searxng

View File

@@ -343,18 +343,23 @@ server.user(
_sudo=True,
)
# Kernel cmdline tuning per Gygeek/Framework-strix-halo-llm-setup:
# - amd_iommu=off — ~6 % memory-read improvement on Strix Halo
# - amdgpu.gttsize=117760 — ~115 GB GTT ceiling so the GPU can borrow
# most of system RAM dynamically. Acts as a
# ceiling, not an allocation. See ../../StrixHaloMemory.md
# for the UMA-vs-GTT trade-off discussion.
# Kernel cmdline tuning. The Strix Halo unified-memory recipe (kyuz0
# vllm-toolboxes "Kernel Parameters and Unified Memory" + Framework's
# "Linux + ROCm: January 2026 Stable Configurations" thread):
# - amd_iommu=off — ~6 % memory-read improvement
# - amdgpu.gttsize=131072 — 128 GiB GTT ceiling (deprecated knob
# but still honored on kernel 6.16+)
# - ttm.pages_limit=33554432 — 128 GiB in 4 KiB pages; forward-
# compatible TTM page cap
# Combined with BIOS UMA at 0.5 GB and HSA_FORCE_FINE_GRAIN_PCIE=1 in the
# container, PyTorch's HIP allocator merges the two rocminfo pools into a
# single ~110 GB arena. See ../../StrixHaloMemory.md for context.
# Requires a reboot to take effect; pyinfra leaves that to you.
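# (Arithmetic check: amdgpu.gttsize is in MiB, so 131072 MiB = 128 GiB;
#  ttm.pages_limit is in 4 KiB pages, so 33554432 * 4 KiB = 128 GiB.)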
files.line(
name="GRUB cmdline (amd_iommu, gttsize)",
name="GRUB cmdline (amd_iommu, gttsize, ttm)",
path="/etc/default/grub",
line=r"^GRUB_CMDLINE_LINUX_DEFAULT=.*",
replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=117760"',
replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=131072 ttm.pages_limit=33554432"',
_sudo=True,
)
server.shell(
@@ -418,6 +423,7 @@ for svc in (
"llama",
"vllm",
"ollama",
"kimi-linear",
"openwebui",
"beszel",
"openlit",
@@ -559,6 +565,27 @@ for cfg in (
_sudo=True,
)
# Kimi-Linear container assets (build script, smoke test, operator doc).
# The compose file itself is copied by the for-loop above; the rest of
# the build context lives under compose/kimi-linear/ on the source side
# and at /srv/docker/kimi-linear/ on the box. Source is the source of
# truth — pyinfra overwrites drift.
for asset, mode in (
    ("Dockerfile", "0664"),
    ("build.sh", "0775"),
    ("smoke.sh", "0775"),
    ("patch-tokenizer.sh", "0775"),
    ("README.md", "0664"),
):
    files.put(
        name=f"kimi-linear: {asset}",
        src=f"compose/kimi-linear/{asset}",
        dest=f"{COMPOSE_DIR}/kimi-linear/{asset}",
        group="docker",
        mode=mode,
        _sudo=True,
    )
# Voice stack — Wyoming-protocol Whisper (STT) and Piper (TTS). Models
# are downloaded on first start; bind-mounting these dirs survives
# container recreation.

View File

@@ -4,4 +4,4 @@
set -euo pipefail
cd "$(dirname "$0")"
exec pyinfra -v --ssh-password-prompt inventory.py deploy.py "$@"
exec pyinfra -v --ssh-password-prompt inventory.py deploy.py "$@"