# LiteLLM proxy — single OpenAI-compatible endpoint in front of all the
# local model backends on this box (Ollama 11434, llama.cpp 30B 8080,
# vLLM Kimi-Linear 8000, llama.cpp Qwen3-235B 8081).
#
# Why this exists. With ≥3 backends running and ≥2 client harnesses
# (opencode on Mac, OpenHands on the box, future orchestrator on another
# server), each client otherwise carries its own per-backend config.
# LiteLLM centralizes: model_name → backend_url mapping lives here once,
# clients just speak "model: qwen3-235b" to a single URL.
#
# Routing model is documented in compose/litellm/README.md — opencode
# stays direct-wired for now (fewer hops, simpler debug); OpenHands +
# the future orchestrator will point here.
#
# Backend reachability. `extra_hosts: host.docker.internal:host-gateway`
# resolves to the host's docker0 IP from inside this container, which
# is how it reaches the other compose services published on host ports.
# Don't use container_name-based DNS — those containers live on separate
# bridge networks (each compose stack has its own).
services:
  litellm:
    image: ghcr.io/berriai/litellm:main-stable
    container_name: litellm
    restart: unless-stopped
    extra_hosts:
      # On Linux, `host-gateway` is Docker's magic alias for the host's
      # docker0 IP — equivalent to host.docker.internal on Mac/Windows.
      # Lets LiteLLM dial localhost-bound backends as
      # http://host.docker.internal:<port>.
      - "host.docker.internal:host-gateway"
    environment:
      # Master key. LiteLLM requires one for admin endpoints + serves
      # as the default Bearer for client requests. Sibling .env file
      # holds the value (created by pyinfra as a placeholder; you fill
      # it in on first deploy). Same pattern as compose/beszel.yml.
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}
      # Optional: salt for hashing virtual keys at rest. Unused in the
      # single-user setup but LiteLLM logs a warning without it.
      - LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-sk-localgenai-salt}
    volumes:
      # Source-of-truth config lives in the repo; pyinfra syncs it to
      # /srv/docker/litellm/config.yaml on every `./run.sh`. Don't edit
      # on the box — drift gets overwritten.
      - /srv/docker/litellm/config.yaml:/app/config.yaml:ro
    ports:
      - "4000:4000"
    command:
      - --config
      - /app/config.yaml
      - --port
      - "4000"
      # --num_workers 1 keeps memory minimal; LiteLLM is I/O-bound here,
      # not CPU-bound. Bump if you see queueing.
      - --num_workers
      - "1"
    healthcheck:
      # LiteLLM exposes both /health (verifies all backends are reachable
      # — heavy) and /health/readiness (just the proxy itself — cheap).
      # Use readiness for the compose healthcheck so a stopped backend
      # doesn't mark LiteLLM unhealthy.
      #
      # Probe with python3 rather than curl: the probe runs inside the
      # container, and curl is not guaranteed to be present in the
      # LiteLLM image, whereas the Python interpreter always is. urlopen
      # raises on connection errors and on non-2xx responses, so the
      # probe exits non-zero exactly when the proxy is not ready. The 4s
      # socket timeout stays below the 5s compose timeout so a hung
      # proxy fails the probe cleanly instead of being killed mid-run.
      test:
        - CMD
        - python3
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://127.0.0.1:4000/health/readiness', timeout=4)"
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s