Files
localgenai/pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
noisedestroyers a29793032d Document current coding-workflow stack state
Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice
  + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear
  context ramp) and next (ComfyUI) items with pointers to per-project
  NEXT_STEPS.md guides.
2026-05-10 21:14:43 -04:00

67 lines
2.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
#   from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent — re-running is a no-op once patched.
#
# Run on the box, after weights are downloaded, before first
# `docker compose up`. Recreates the container at the end so
# `trust_remote_code` re-copies the patched file into its module cache.
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"

if [ ! -f "$F" ]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi

# Idempotency: the inlined function carries this marker in a comment.
if grep -q '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi

# -F: match the import line literally — it is full of regex metacharacters
# ('.'), and we want byte-for-byte presence, not a pattern match.
if ! grep -qF 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "  grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi

# Swap the dead import for an inlined copy of the public GPT-2
# byte<->unicode mapping. Quoted 'PYEOF' delimiter: bash expands nothing
# inside the heredoc, so the Python source goes through verbatim.
python3 - "$F" <<'PYEOF'
import pathlib, sys

p = pathlib.Path(sys.argv[1])
s = p.read_text()
old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"¡\"), ord(\"¬\") + 1))\n"
    "          + list(range(ord(\"®\"), ord(\"ÿ\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)
# Belt and braces: the shell grep already checked this, but never write a
# file we silently failed to modify.
if old not in s:
    sys.exit("[patch-tokenizer] import line vanished between grep and patch")
patched = s.replace(old, new)
# Refuse to write a syntactically broken file — vLLM will execute it via
# trust_remote_code, so a bad patch would fail far from here.
compile(patched, str(p), "exec")
p.write_text(patched)
print("[patch-tokenizer] patched", p)
PYEOF

echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
cd "$(dirname "$0")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"