---
# LiteLLM model routing. model_name is what clients request; the
# litellm_params block is how LiteLLM reaches the backend.
#
# `model: openai/NAME` tells LiteLLM to use its openai-compatible
# adapter and forward NAME to the backend as the model id.
# api_base is the backend's /v1 root reachable from inside the LiteLLM
# container (host.docker.internal = host's docker0 IP via the
# extra_hosts entry in litellm.yml).
# api_key is a placeholder value on every entry — presumably the local
# backends do not check it; confirm before exposing any of them.
#
# Backend running-state matters: requests to a stopped backend return
# 503/connection-refused. By design — no fallback chain, since these
# backends compete for GPU and silently routing "qwen3-235b" to the 30B
# would be more confusing than failing fast.
#
# Edits here require `./run.sh` on the Mac to push to the box, then
# `docker compose restart litellm` on the box to reload.

model_list:
  # Daily-driver coding model. Ollama with gfx1100-coerced ROCm —
  # currently the default opencode provider. Always-resident
  # (OLLAMA_KEEP_ALIVE=24h).
  - model_name: qwen3-coder
    litellm_params:
      # Quoted: the Ollama tag separator `:` must stay part of the string.
      model: 'openai/qwen3-coder:30b'
      api_base: 'http://host.docker.internal:11434/v1'
      api_key: dummy

  # Same weights as qwen3-coder above but served via llama.cpp on the
  # kyuz0 rocm-7.2.2 image (native gfx1151 + rocWMMA). LL-P0 measures
  # whether the eval_tps win justifies switching default opencode to
  # this. Manual start until then.
  - model_name: qwen3-coder-llama
    litellm_params:
      model: openai/qwen3-coder
      api_base: 'http://host.docker.internal:8080/v1'
      api_key: dummy

  # Long-context chat (no tool calling) via vLLM. P0 verified at 32K;
  # context ramp tracked in kimi-linear/NEXT_STEPS.md.
  - model_name: kimi-linear
    litellm_params:
      model: openai/kimi-linear
      api_base: 'http://host.docker.internal:8000/v1'
      api_key: dummy

  # Long-task model — Qwen3-235B-A22B-Instruct-2507 UD-Q2_K_XL via
  # llama.cpp on port 8081. ~5-10 tok/s decode; manual start only
  # (can't coexist with the other GPU services). Requests will fail
  # with connection refused when the container is down — that's the
  # intended UX: a stopped service is a clear signal.
  - model_name: qwen3-235b
    litellm_params:
      model: openai/qwen3-235b
      api_base: 'http://host.docker.internal:8081/v1'
      api_key: dummy

litellm_settings:
  # Never let LiteLLM silently filter client params. With
  # `drop_params: true` LiteLLM strips OpenAI params it believes the
  # provider does not support — fine for hosted models, but our
  # backends vary (vLLM, llama.cpp, Ollama) and each accepts a slightly
  # different superset. Let the backend reject what it can't use.
  # NOTE(review): `false` is LiteLLM's documented default; it is pinned
  # here only to make the intent explicit.
  drop_params: false

  # Caching off — the model list is short and we want stop/start of
  # backends to reflect immediately.
  # NOTE(review): in LiteLLM this key controls response caching, not
  # the /v1/models listing; confirm that matches the intent.
  cache: false

  # Keep verbose/debug logging off; normal proxy request logs are still
  # available via `docker compose logs litellm`.
  set_verbose: false

general_settings:
  # Disable LiteLLM's database mode — we're stateless. No user/key/spend
  # tracking needed for a single-user trusted LAN setup.
  database_url: null