progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/litellm/config.yaml
+++ b/pyinfra/framework/compose/litellm/config.yaml
@@ -0,0 +1,73 @@
+# LiteLLM model routing. model_name is what clients request; the
+# litellm_params block is how LiteLLM reaches the backend.
+#
+# `model: openai/<served-name>` tells LiteLLM to use its
+# openai-compatible adapter and forward <served-name> to the backend.
+# api_base is the backend's /v1 root reachable from inside the LiteLLM
+# container (host.docker.internal = host's docker0 IP via the
+# extra_hosts entry in litellm.yml).
+#
+# Backend running-state matters: requests to a stopped backend return
+# 503/connection-refused. This is by design: there is no fallback
+# chain, because these backends compete for the GPU and silently
+# routing "qwen3-235b" to the 30B would be more confusing than failing
+# fast.
+#
+# Edits here require `./run.sh` on the Mac to push to the box, then
+# `docker compose restart litellm` on the box to reload.
+
+model_list:
+  # Everyday coding model, served by Ollama with ROCm coerced to
+  # gfx1100. Currently the default opencode provider and kept resident
+  # at all times (OLLAMA_KEEP_ALIVE=24h).
+  - model_name: 'qwen3-coder'
+    litellm_params:
+      model: 'openai/qwen3-coder:30b'
+      api_base: 'http://host.docker.internal:11434/v1'
+      api_key: 'dummy'
+
+  # Identical weights to qwen3-coder above, but served through
+  # llama.cpp on the kyuz0 rocm-7.2.2 image (native gfx1151 + rocWMMA).
+  # LL-P0 measures whether the eval_tps gain is large enough to make
+  # this the default for opencode. Started manually until then.
+  - model_name: 'qwen3-coder-llama'
+    litellm_params:
+      model: 'openai/qwen3-coder'
+      api_base: 'http://host.docker.internal:8080/v1'
+      api_key: 'dummy'
+
+  # vLLM-served long-context chat model (no tool calling). P0 was
+  # verified at 32K; the context ramp is tracked in
+  # kimi-linear/NEXT_STEPS.md.
+  - model_name: 'kimi-linear'
+    litellm_params:
+      model: 'openai/kimi-linear'
+      api_base: 'http://host.docker.internal:8000/v1'
+      api_key: 'dummy'
+
+  # Long-task model: Qwen3-235B-A22B-Instruct-2507 UD-Q2_K_XL served by
+  # llama.cpp on port 8081. Decodes at ~5-10 tok/s and is started
+  # manually only, because it can't coexist with the other GPU
+  # services. While the container is down, requests fail with
+  # connection refused — the intended UX, since a stopped service is a
+  # clear signal.
+  - model_name: 'qwen3-235b'
+    litellm_params:
+      model: 'openai/qwen3-235b'
+      api_base: 'http://host.docker.internal:8081/v1'
+      api_key: 'dummy'
+
+litellm_settings:
+  # Do not let LiteLLM strip request params it cannot map for a
+  # provider. Our backends vary (vLLM, llama.cpp, Ollama) and each
+  # accepts a slightly different superset, so an unsupported param
+  # should surface as an explicit rejection rather than being silently
+  # filtered by LiteLLM.
+  # NOTE(review): LiteLLM documents drop_params as defaulting to false,
+  # so this line pins the default rather than overriding it — confirm.
+  drop_params: false
+  # Caching off. Intent: the list is short and stop/start of backends
+  # should be visible immediately, so nothing is served from a cache.
+  # NOTE(review): this key appears to toggle LiteLLM's response cache,
+  # not the /v1/models listing (which is presumably built from
+  # model_list regardless of backend state) — confirm.
+  cache: false
+  # Keep LiteLLM's verbose debug output off. Tail the regular proxy
+  # logs with `docker compose logs litellm`.
+  set_verbose: false
+
+general_settings:
+  # Run LiteLLM without its database mode — this proxy is stateless.
+  # User/key/spend tracking is not needed for a single-user, trusted
+  # LAN setup. The explicit null records that having no database is a
+  # deliberate choice rather than an omission.
+  database_url: null