# pyinfra/framework/compose/litellm.yml

# LiteLLM proxy — single OpenAI-compatible endpoint in front of all the
# local model backends on this box (Ollama 11434, llama.cpp 30B 8080,
# vLLM Kimi-Linear 8000, llama.cpp Qwen3-235B 8081).
#
# Why this exists. With ≥3 backends running and ≥2 client harnesses
# (opencode on Mac, OpenHands on the box, future orchestrator on another
# server), each client otherwise carries its own per-backend config.
# LiteLLM centralizes: model_name → backend_url mapping lives here once,
# clients just speak "model: qwen3-235b" to a single URL.
#
# Routing model is documented in compose/litellm/README.md — opencode
# stays direct-wired for now (fewer hops, simpler debug); OpenHands +
# the future orchestrator will point here.
#
# Backend reachability. `extra_hosts: host.docker.internal:host-gateway`
# resolves to the host's docker0 IP from inside this container, which
# is how it reaches the other compose services published on host ports.
# Don't use container_name-based DNS — those containers live on separate
# bridge networks (each compose stack has its own).
services:
  # The LiteLLM proxy: one container, listening on port 4000, configured
  # entirely from the read-only config.yaml mounted below.
  litellm:
    image: ghcr.io/berriai/litellm:main-stable
    container_name: litellm
    restart: unless-stopped
    extra_hosts:
      # On Linux, `host-gateway` is Docker's magic alias for the host's
      # docker0 IP — the equivalent of host.docker.internal on Mac/Windows.
      # It lets LiteLLM dial backends published on host ports as
      # http://host.docker.internal:<port>.
      - "host.docker.internal:host-gateway"
    environment:
      # Master key. LiteLLM requires one for its admin endpoints, and it
      # also serves as the default Bearer token for client requests. The
      # value comes from the sibling .env file (created by pyinfra as a
      # placeholder; you fill it in on first deploy). Same pattern as
      # compose/beszel.yml.
      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}
      # Optional: salt for hashing virtual keys at rest. Unused in the
      # single-user setup, but LiteLLM logs a warning without it, so a
      # fixed fallback is supplied when the variable is not set.
      - LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-sk-localgenai-salt}
    volumes:
      # Source-of-truth config lives in the repo; pyinfra syncs it to
      # /srv/docker/litellm/config.yaml on every `./run.sh`. Don't edit
      # it on the box — drift gets overwritten. Mounted read-only.
      - /srv/docker/litellm/config.yaml:/app/config.yaml:ro
    ports:
      # Quoted so the host:container pair is always read as a string.
      - "4000:4000"
    command:
      - --config
      - /app/config.yaml
      - --port
      - "4000"
      # --num_workers 1 keeps memory minimal; LiteLLM is I/O-bound here,
      # not CPU-bound. Bump if you see queueing.
      - --num_workers
      - "1"
    healthcheck:
      # LiteLLM exposes both /health (verifies all backends are reachable
      # — heavy) and /health/readiness (just the proxy itself — cheap).
      # Use readiness for the compose healthcheck so a stopped backend
      # doesn't mark LiteLLM unhealthy.
      #
      # The probe runs through the image's own Python interpreter instead
      # of curl: curl is not guaranteed to be installed in the image, and a
      # missing binary would leave the container permanently "unhealthy"
      # even while the proxy is serving. urlopen raises on connection
      # errors and on HTTP status >= 400, so the exit code is non-zero in
      # the same cases `curl -f` would fail. The 4s socket timeout stays
      # just under the 5s compose timeout below.
      test:
        - CMD
        - python3
        - "-c"
        - >-
          import urllib.request;
          urllib.request.urlopen("http://127.0.0.1:4000/health/readiness",
          timeout=4)
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
progress 235b 2026-06-08 15:31:50 +01:00			`# LiteLLM proxy — single OpenAI-compatible endpoint in front of all the`
			`# local model backends on this box (Ollama 11434, llama.cpp 30B 8080,`
			`# vLLM Kimi-Linear 8000, llama.cpp Qwen3-235B 8081).`
			`#`
			`# Why this exists. With ≥3 backends running and ≥2 client harnesses`
			`# (opencode on Mac, OpenHands on the box, future orchestrator on another`
			`# server), each client otherwise carries its own per-backend config.`
			`# LiteLLM centralizes: model_name → backend_url mapping lives here once,`
			`# clients just speak "model: qwen3-235b" to a single URL.`
			`#`
			`# Routing model is documented in compose/litellm/README.md — opencode`
			`# stays direct-wired for now (fewer hops, simpler debug); OpenHands +`
			`# the future orchestrator will point here.`
			`#`
			# Backend reachability. `extra_hosts: host.docker.internal:host-gateway`
			`# resolves to the host's docker0 IP from inside this container, which`
			`# is how it reaches the other compose services published on host ports.`
			`# Don't use container_name-based DNS — those containers live on separate`
			`# bridge networks (each compose stack has its own).`
			`services:`
			`litellm:`
			`image: ghcr.io/berriai/litellm:main-stable`
			`container_name: litellm`
			`restart: unless-stopped`
			`extra_hosts:`
			# On Linux, `host-gateway` is Docker's magic alias for the host's
			`# docker0 IP — equivalent to host.docker.internal on Mac/Windows.`
			`# Lets LiteLLM dial localhost-bound backends as`
			`# http://host.docker.internal:<port>.`
			`- "host.docker.internal:host-gateway"`
			`environment:`
			`# Master key. LiteLLM requires one for admin endpoints + serves`
			`# as the default Bearer for client requests. Sibling .env file`
			`# holds the value (created by pyinfra as a placeholder; you fill`
			`# it in on first deploy). Same pattern as compose/beszel.yml.`
			`- LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}`
			`# Optional: salt for hashing virtual keys at rest. Unused in the`
			`# single-user setup but LiteLLM logs a warning without it.`
			`- LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-sk-localgenai-salt}`
			`volumes:`
			`# Source-of-truth config lives in the repo; pyinfra syncs it to`
			# /srv/docker/litellm/config.yaml on every `./run.sh`. Don't edit
			`# on the box — drift gets overwritten.`
			`- /srv/docker/litellm/config.yaml:/app/config.yaml:ro`
			`ports:`
			`- "4000:4000"`
			`command:`
			`- --config`
			`- /app/config.yaml`
			`- --port`
			`- "4000"`
			`# --num_workers 1 keeps memory minimal; LiteLLM is I/O-bound here,`
			`# not CPU-bound. Bump if you see queueing.`
			`- --num_workers`
			`- "1"`
			`healthcheck:`
			`# LiteLLM exposes both /health (verifies all backends are reachable`
			`# — heavy) and /health/readiness (just the proxy itself — cheap).`
			`# Use readiness for the compose healthcheck so a stopped backend`
			`# doesn't mark LiteLLM unhealthy.`
			`test: ["CMD", "curl", "-fsS", "http://127.0.0.1:4000/health/readiness"]`
			`interval: 30s`
			`timeout: 5s`
			`retries: 3`
			`start_period: 30s`