Document current coding-workflow stack state
Snapshot of where opencode, Qwen3-Coder, the MCPs, Kimi-Linear, voice, and Phoenix tracing stand today, plus the in-flight items (oc-tree, the kimi-linear context ramp) and the next item (ComfyUI), with pointers to the per-project NEXT_STEPS.md guides.
This commit is contained in:
66
pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
Executable file
66
pyinfra/framework/compose/kimi-linear/patch-tokenizer.sh
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env bash
|
||||
# Patch cyankiwi's tokenization_kimi.py to inline `bytes_to_unicode`.
|
||||
#
|
||||
# Why: tokenization_kimi.py does
|
||||
# from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||
# which fails on recent transformers (the helper was removed/relocated).
|
||||
# The function itself is ~10 lines of public BPE byte-mapping math; we
|
||||
# inline it. Idempotent — re-running is a no-op once patched.
|
||||
#
|
||||
# Run on the box, after weights are downloaded, before first
|
||||
# `docker compose up`. Recreates the container at the end so
|
||||
# `trust_remote_code` re-copies the patched file into its module cache.
|
||||
|
||||
# Strict mode: stop on the first failing command (-e), on any reference to
# an unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -e -u -o pipefail

# Directory holding the downloaded model files. A non-empty MODEL_DIR already
# set by the caller wins; otherwise the default path below is used.
: "${MODEL_DIR:=/models/moonshotai/Kimi-Linear-48B-A3B-Instruct-AWQ-4bit}"

# The tokenizer module that this script patches in place.
F="${MODEL_DIR}/tokenization_kimi.py"
|
||||
|
||||
# Guard: the tokenizer module must exist as a regular file before anything
# else runs. Diagnostics go to stderr; exit status 1 means "file missing".
[[ -f "$F" ]] || {
    printf '%s\n' "[patch-tokenizer] not found: $F" >&2
    printf '%s\n' "[patch-tokenizer] download weights first, or set MODEL_DIR=" >&2
    exit 1
}
|
||||
|
||||
# Idempotency guard: the patch leaves this marker in the file, so finding it
# means a previous run already did the work. Exit 0 without touching anything.
patched_marker='__patched_bytes_to_unicode__'
if grep -q -F -e "$patched_marker" "$F"; then
    printf '%s\n' "[patch-tokenizer] $F already patched. Nothing to do."
    exit 0
fi
|
||||
|
||||
# Guard: refuse to continue unless the exact import statement that the patch
# rewrites is present. Exit status 2 means "upstream file looks different".
#
# -F makes grep treat the statement as a fixed string. Read as a regular
# expression, every "." in it would match any character, so this guard could
# accept a line that the literal text replacement in the patch step would
# then leave untouched.
if ! grep -qF 'from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode' "$F"; then
    echo "[patch-tokenizer] expected import line not present in $F." >&2
    echo "[patch-tokenizer] upstream may have changed — inspect manually:" >&2
    echo " grep -n bytes_to_unicode '$F'" >&2
    exit 2
fi
|
||||
|
||||
# Rewrite the tokenizer module in place: swap the broken import for an inlined
# definition of bytes_to_unicode(). The heredoc delimiter is quoted, so the
# shell passes the Python program through verbatim (no expansion). The target
# path arrives as sys.argv[1].
python3 - "$F" <<'PYEOF'
import pathlib
import sys

path = pathlib.Path(sys.argv[1])

# Read and write explicitly as UTF-8: the inlined code contains non-ASCII
# characters, and the locale's preferred encoding is not guaranteed to be
# UTF-8 on every host.
source = path.read_text(encoding="utf-8")

old = "from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode"

# Replacement text. The first line carries the marker that the idempotency
# check greps for. The function body must keep real 4-space indentation inside
# the string literals, otherwise the patched module fails to import.
new = (
    "# __patched_bytes_to_unicode__ — inlined; helper removed from recent transformers\n"
    "def bytes_to_unicode():\n"
    "    bs = (list(range(ord(\"!\"), ord(\"~\") + 1))\n"
    "          + list(range(ord(\"¡\"), ord(\"¬\") + 1))\n"
    "          + list(range(ord(\"®\"), ord(\"ÿ\") + 1)))\n"
    "    cs = bs[:]\n"
    "    n = 0\n"
    "    for b in range(2**8):\n"
    "        if b not in bs:\n"
    "            bs.append(b)\n"
    "            cs.append(2**8 + n)\n"
    "            n += 1\n"
    "    cs = [chr(n) for n in cs]\n"
    "    return dict(zip(bs, cs))"
)

# Never report success for a no-op: if the literal import is absent, exit
# non-zero (message on stderr) so the calling script stops before it
# recreates the container.
if old not in source:
    sys.exit("[patch-tokenizer] import statement not found in {}; nothing patched".format(path))

path.write_text(source.replace(old, new), encoding="utf-8")
print("[patch-tokenizer] patched", path)
PYEOF
|
||||
|
||||
# Tear the compose project down and bring it back up detached, so the
# container starts fresh against the patched tokenizer file. The compose
# commands run from the directory that contains this script. Each command is
# its own statement so that a failure of either one aborts the script.
printf '%s\n' "[patch-tokenizer] recreating container to refresh trust_remote_code module cache"
compose_dir="$(dirname "$0")"
cd "$compose_dir"
docker compose down
docker compose up -d
printf '%s\n' "[patch-tokenizer] done. Tail logs with: docker compose logs -f"
|
||||
Reference in New Issue
Block a user