progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/litellm.yml
+++ b/pyinfra/framework/compose/litellm.yml
@@ -0,0 +1,65 @@
+# LiteLLM proxy — single OpenAI-compatible endpoint in front of all the
+# local model backends on this box (Ollama 11434, llama.cpp 30B 8080,
+# vLLM Kimi-Linear 8000, llama.cpp Qwen3-235B 8081).
+#
+# Why this exists. With ≥3 backends running and ≥2 client harnesses
+# (opencode on Mac, OpenHands on the box, future orchestrator on another
+# server), each client otherwise carries its own per-backend config.
+# LiteLLM centralizes: model_name → backend_url mapping lives here once,
+# clients just speak "model: qwen3-235b" to a single URL.
+#
+# Routing model is documented in compose/litellm/README.md — opencode
+# stays direct-wired for now (fewer hops, simpler debug); OpenHands +
+# the future orchestrator will point here.
+#
+# Backend reachability. `extra_hosts: host.docker.internal:host-gateway`
+# resolves to the host's bridge-gateway IP (docker0 by default) from inside
+# this container, which is how it reaches backends published on host ports;
+# a backend bound only to 127.0.0.1 is NOT reachable this way. Don't use
+# container_name DNS — each compose stack has its own bridge network.
+services:
+  litellm:
+    image: ghcr.io/berriai/litellm:main-stable
+    container_name: litellm
+    restart: unless-stopped
+    extra_hosts:
+      # `host-gateway` maps host.docker.internal to the host's bridge
+      # gateway IP on Linux. Backends must listen on a non-loopback address
+      # (e.g. a published compose port); 127.0.0.1-only binds are unreachable.
+      - "host.docker.internal:host-gateway"
+    environment:
+      # Master key: admin credential and default Bearer token for clients.
+      # Value comes from the sibling .env (pyinfra creates a placeholder).
+      # `:?` aborts `compose up` while it is unset or empty, so the proxy
+      # never starts unauthenticated on the published port.
+      - "LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:?set LITELLM_MASTER_KEY in .env}"
+      # Salt key — per LiteLLM docs it encrypts credentials stored in the DB.
+      # Unused in the single-user setup, but LiteLLM logs a warning without it.
+      - LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-sk-localgenai-salt}
+    volumes:
+      # Repo is the source of truth; pyinfra syncs the config to this host
+      # path on every `./run.sh`, so edits made on the box are overwritten.
+      - /srv/docker/litellm/config.yaml:/app/config.yaml:ro
+    ports:
+      - "4000:4000"
+    command:
+      - --config
+      - /app/config.yaml
+      - --port
+      - "4000"
+      # One worker keeps memory low (I/O-bound here); bump if you see queueing.
+      - --num_workers
+      - "1"
+    healthcheck:
+      # /health/readiness checks only the proxy (cheap); /health probes every
+      # backend, so a stopped backend would mark LiteLLM unhealthy. Probe via
+      # python3 because the image is not guaranteed to ship curl.
+      test:
+        - CMD
+        - python3
+        - "-c"
+        - "import urllib.request as u; u.urlopen('http://127.0.0.1:4000/health/readiness', timeout=4)"
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 30s