#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
#     from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent: re-running is a no-op once patched (detected via
# the `__patched_bytes_to_unicode__` marker written into the file).
#
# Usage:
#   Run on the box, after weights are downloaded, before the first
#   `docker compose up`. The script recreates the container at the end so
#   `trust_remote_code` re-copies the patched file into its module cache.
#
# Environment:
#   MODEL_DIR  directory containing tokenization_kimi.py
#              (default: /models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit)
#
# Exit codes:
#   0  patched successfully, or already patched
#   1  tokenization_kimi.py not found under MODEL_DIR
#   2  the expected import line is not present (upstream changed?)
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"

if [[ ! -f "$F" ]]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi

# The marker comment is written by the Python step below; its presence means
# a previous run already patched this file.
if grep -qF -- '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi

# -F: match the import as a literal string (the dots are not regex wildcards).
if ! grep -qF -- 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "    grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi

# Quoted heredoc delimiter: the Python source is passed through verbatim,
# with no shell expansion.
python3 - "$F" <<'PYEOF'
import pathlib
import sys

p = pathlib.Path(sys.argv[1])
# Explicit UTF-8: the inlined helper contains non-ASCII literals, and the
# locale's default encoding must not decide how the Python source is written.
s = p.read_text(encoding="utf-8")

old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = "\n".join([
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers",
    "def bytes_to_unicode():",
    '    bs = (list(range(ord("!"), ord("~") + 1))',
    '          + list(range(ord("¡"), ord("¬") + 1))',
    '          + list(range(ord("®"), ord("ÿ") + 1)))',
    "    cs = bs[:]",
    "    n = 0",
    "    for b in range(2**8):",
    "        if b not in bs:",
    "            bs.append(b)",
    "            cs.append(2**8 + n)",
    "            n += 1",
    "    cs = [chr(n) for n in cs]",
    "    return dict(zip(bs, cs))",
])

# Defensive: never rewrite the file and report success if nothing matched.
if old not in s:
    print("[patch-tokenizer] expected import line not present in", p, file=sys.stderr)
    sys.exit(2)

p.write_text(s.replace(old, new), encoding="utf-8")
print("[patch-tokenizer] patched", p)
PYEOF

echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
# Run compose from the directory holding this script, regardless of the
# caller's working directory.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"