Files
localgenai/pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
noisedestroyers a29793032d Document current coding-workflow stack state
Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice
  + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear
  context ramp) and next (ComfyUI) items with pointers to per-project
  NEXT_STEPS.md guides.
2026-05-10 21:14:43 -04:00

67 lines
2.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
#   from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent — re-running is a no-op once patched.
#
# Run on the box, after weights are downloaded, before first
# `docker compose up`. Recreates the container at the end so
# `trust_remote_code` re-copies the patched file into its module cache.
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"

if [ ! -f "$F" ]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi

# Idempotency: the inlined function carries this marker in a comment.
if grep -q '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi

# -F: match the import line literally — it is full of regex metacharacters
# ('.'), and we want byte-for-byte presence, not a pattern match.
if ! grep -qF 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "  grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi

# Swap the dead import for an inlined copy of the public GPT-2
# byte<->unicode mapping. Quoted 'PYEOF' delimiter: bash expands nothing
# inside the heredoc, so the Python source goes through verbatim.
python3 - "$F" <<'PYEOF'
import pathlib, sys

p = pathlib.Path(sys.argv[1])
s = p.read_text()
old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"¡\"), ord(\"¬\") + 1))\n"
    "          + list(range(ord(\"®\"), ord(\"ÿ\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)
# Belt and braces: the shell grep already checked this, but never write a
# file we silently failed to modify.
if old not in s:
    sys.exit("[patch-tokenizer] import line vanished between grep and patch")
patched = s.replace(old, new)
# Refuse to write a syntactically broken file — vLLM will execute it via
# trust_remote_code, so a bad patch would fail far from here.
compile(patched, str(p), "exec")
p.write_text(patched)
print("[patch-tokenizer] patched", p)
PYEOF

echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
cd "$(dirname "$0")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"