Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
67 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
#
# Why: tokenization_kimi.py does
#     from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# which fails on recent transformers (the helper was removed/relocated).
# The function itself is ~10 lines of public BPE byte-mapping math; we
# inline it. Idempotent — re-running is a no-op once patched.
#
# Run on the box, after weights are downloaded, before first
# `docker compose up`. Recreates the container at the end so
# `trust_remote_code` re-copies the patched file into its module cache.

set -euo pipefail

# Weight directory; override with MODEL_DIR=... for non-default layouts.
MODEL_DIR="${MODEL_DIR:-/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"
F="$MODEL_DIR/tokenization_kimi.py"

# Guard: the tokenizer file must exist (i.e. weights already downloaded).
if [ ! -f "$F" ]; then
  echo "[patch-tokenizer] not found: $F" >&2
  echo "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
  exit 1
fi

# Idempotence: the inlined replacement carries this marker comment, so a
# second run detects it and exits successfully without touching the file.
if grep -q '__patched_bytes_to_unicode__' "$F"; then
  echo "[patch-tokenizer] $F already patched. Nothing to do."
  exit 0
fi

# Safety: only patch the exact import line we expect. If upstream changed
# the file, bail out loudly rather than corrupt it.
if ! grep -q 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
  echo "[patch-tokenizer] expected import line not present in $F." >&2
  echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
  echo "  grep -n bytes_to_unicode '$F'" >&2
  exit 2
fi

# Replace the failing import with an inlined copy of the (public, ~10-line)
# GPT-2 BPE byte<->unicode mapping. The marker comment on the first injected
# line is what the idempotence check above greps for.
python3 - "$F" <<'PYEOF'
import pathlib, sys

p = pathlib.Path(sys.argv[1])
s = p.read_text()
old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"¡\"), ord(\"¬\") + 1))\n"
    "          + list(range(ord(\"®\"), ord(\"ÿ\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)
p.write_text(s.replace(old, new))
print("[patch-tokenizer] patched", p)
PYEOF

# trust_remote_code caches a copy of the tokenizer module inside the
# container; recreate it so the patched file is picked up.
echo "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
cd "$(dirname "$0")"
docker compose down
docker compose up -d
echo "[patch-tokenizer] done. Tail logs with: docker compose logs -f"