Initial commit: localgenai stack

Containerized local LLM stack for the Framework Desktop / Strix Halo, plus the OpenCode harness on the Mac side. - pyinfra/framework/: pyinfra deploy targeting the box - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override for gfx1151), OpenWebUI - Beszel (host + container + AMD GPU dashboard via sysfs) - OpenLIT (LLM fleet metrics) - Phoenix (per-trace agent waterfall) - OpenHands (autonomous agent in a Docker sandbox) - opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter) - install.sh deploys to ~/.config/opencode/ - StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md: documentation and planning - testing/qwen3-coder-30b/: small evaluation harness Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00
commit 2c4bfefa95
36 changed files with 5265 additions and 0 deletions
--- a/pyinfra/framework/compose/beszel.yml
+++ b/pyinfra/framework/compose/beszel.yml
@@ -0,0 +1,66 @@
+# Beszel — host + container + GPU dashboard.
+# https://beszel.dev
+#
+# Picked over Prometheus+Grafana for this box because:
+# - The agent's `amd_sysfs` collector reads /sys/class/drm/card*/device/
+#   directly, which is the only reliable GPU metric source on Strix Halo
+#   (gfx1151). AMD's amd-smi / Device Metrics Exporter return N/A for
+#   util/power/temp on this APU (ROCm#6035), so the official Prometheus
+#   exporter path is dead.
+# - Two containers vs six.
+#
+# First-time setup (WebSocket connection model — current Beszel default):
+#   1. `docker compose up -d beszel`            (start the hub)
+#   2. Open http://framework:8090, create the admin account
+#   3. Click "Add system" — the dialog gives you a TOKEN and an SSH KEY.
+#   4. Edit /srv/docker/beszel/.env (created empty by pyinfra; pyinfra
+#      doesn't overwrite). Add:
+#        BESZEL_TOKEN=<token-from-dialog>
+#        BESZEL_KEY=ssh-ed25519 AAAA…
+#   5. `docker compose up -d --force-recreate beszel-agent`
+#
+# Docker Compose auto-reads the sibling .env file for ${VAR} interpolation
+# in the environment block below — so secrets stay out of the compose
+# file (which pyinfra overwrites) but the env-var names match exactly
+# what the agent expects.
+#
+# Why both TOKEN and KEY: TOKEN identifies which system this agent is,
+# KEY authenticates the agent (the SSH key is reused as the auth secret
+# in the WebSocket handshake). Rotate either by editing the .env and
+# `docker compose up -d --force-recreate`.
+services:
+  beszel:
+    # Hub: web UI on :8090, data directory bind-mounted from the host.
+    container_name: beszel
+    image: henrygd/beszel:latest
+    restart: unless-stopped
+    ports: ['8090:8090']
+    volumes:
+      - '/srv/docker/beszel/data:/beszel_data'
+
+  beszel-agent:
+    container_name: beszel-agent
+    image: henrygd/beszel-agent:latest
+    restart: unless-stopped
+    # Runs in the host network namespace: CPU/memory/network counters
+    # are the machine's own rather than a bridge-NAT view.
+    network_mode: host
+    environment:
+      # Interpolated from /srv/docker/beszel/.env when compose parses
+      # this file; both default to empty until that file is filled in.
+      - "TOKEN=${BESZEL_TOKEN:-}"
+      - "KEY=${BESZEL_KEY:-}"
+      # Hub address the agent dials over WebSocket. With host
+      # networking, localhost:8090 is the hub's published port.
+      - "HUB_URL=http://localhost:8090"
+      # Legacy SSH listener, kept as a fallback for hub-initiated probing;
+      # the hub only falls back to it when WebSocket is unreachable.
+      - "LISTEN=45876"
+      # Turns on the AMD sysfs GPU collector.
+      - "GPU=true"
+    volumes:
+      # Docker socket, mounted read-only, for per-container CPU/mem/net.
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      # Sysfs trees read by the AMD GPU collector.
+      - /sys/class/drm:/sys/class/drm:ro
+      - /sys/class/hwmon:/sys/class/hwmon:ro
--- a/pyinfra/framework/compose/llama.yml
+++ b/pyinfra/framework/compose/llama.yml
@@ -0,0 +1,44 @@
+# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
+# https://github.com/kyuz0/amd-strix-halo-toolboxes
+#
+# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
+#   vulkan-radv      — most stable, recommended default (this one)
+#   vulkan-amdvlk    — alternate Vulkan driver, sometimes faster
+#   rocm-7.2.2       — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
+#   rocm-6.4.4       — ROCm 6.x fallback
+#   rocm7-nightlies  — avoid: caps memory allocation to 64 GB (May 2026)
+#
+# Toolbox images use a shell entrypoint, so we override to launch
+# llama-server directly. Edit the --model path before `docker compose up -d`.
+services:
+  llama:
+    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
+    container_name: llama
+    restart: unless-stopped
+    devices:
+      - /dev/dri:/dev/dri
+    volumes:
+      - /models:/models:ro
+    ports:
+      - "8080:8080"
+    entrypoint: ["llama-server"]
+    command:
+      - --model
+      - /models/REPLACE/ME/model.gguf
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8080"
+      - --n-gpu-layers
+      - "999"
+      - --ctx-size
+      - "32768"
+      # Load fully into GPU memory instead of mmap (Gygeek's Strix Halo guide).
+      - --no-mmap
+      # Flash attention (also works on Vulkan; biggest win is rocWMMA on
+      # the ROCm tag). Current llama.cpp takes on|off|auto here — a bare
+      # flag would swallow the next argument as its value.
+      - --flash-attn
+      - "on"
+      # Prometheus /metrics for OpenLIT: tok/s, KV cache, queue, latency.
+      - --metrics
--- a/pyinfra/framework/compose/ollama.yml
+++ b/pyinfra/framework/compose/ollama.yml
@@ -0,0 +1,38 @@
+# Ollama, ROCm backend. Serves models on demand — safe to start before
+# you've put anything in /models.
+#
+# Storage: Ollama's content-addressed blob store is bind-mounted under
+# /models/ollama so all model data on the host lives under /models.
+# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
+# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
+# for those engines.
+services:
+  ollama:
+    container_name: ollama
+    image: ollama/ollama:rocm
+    restart: unless-stopped
+    ports:
+      - '11434:11434'
+    volumes:
+      - '/models/ollama:/root/.ollama'
+      - '/models:/models:ro'
+    devices:
+      - '/dev/kfd:/dev/kfd'
+      - '/dev/dri:/dev/dri'
+    # Supplementary groups by numeric GID — host video (44) and render
+    # (991). The names aren't defined inside the container, and the
+    # numbers must equal the host's for /dev/kfd + /dev/dri access.
+    group_add:
+      - '44'
+      - '991'
+    environment:
+      # gfx1151 (Strix Halo, RDNA 3.5) isn't recognised by the ROCm
+      # runtime bundled with Ollama, which then drops to CPU without
+      # warning. Reporting 11.0.0 (gfx1100 / Navi 31) works because the
+      # RDNA 3.x ISAs are close enough for gfx1100 kernels to run.
+      HSA_OVERRIDE_GFX_VERSION: '11.0.0'
+      # Default context window of 64K. Qwen3-Coder's upstream 256K
+      # default grows the KV cache to ~25-30 GB and makes ollama split
+      # layers across GPU and CPU; 64K stays fully on GPU and is still
+      # ample for coding work.
+      OLLAMA_CONTEXT_LENGTH: '65536'
--- a/pyinfra/framework/compose/openhands.yml
+++ b/pyinfra/framework/compose/openhands.yml
@@ -0,0 +1,94 @@
+# OpenHands 1.7 (May 2026) — autonomous agent in a Docker sandbox.
+# https://docs.openhands.dev — repo: github.com/OpenHands/OpenHands
+#
+# Architecture: this container is a thin orchestrator. Per conversation
+# it spawns a separate `agent-server` container on the host Docker daemon
+# (that's what the docker.sock mount is for) and talks to it over REST.
+# AGENT_SERVER_IMAGE_TAG below pins the per-session sandbox image.
+#
+# Complements OpenCode: OpenCode is the interactive terminal driver,
+# OpenHands is for autonomous loops (write code, run tests, browse the
+# web in a sandbox, report back).
+services:
+  openhands:
+    # Org rebranded All-Hands-AI → OpenHands at v1.0 (Dec 2025); the old
+    # docker.all-hands.dev/all-hands-ai/openhands image is gone.
+    image: docker.openhands.dev/openhands/openhands:1.7
+    container_name: openhands
+    restart: unless-stopped
+
+    # 3030 host-side because :3000 is OpenWebUI and :3001 is OpenLIT.
+    # Loopback-only — reach via SSH tunnel or Tailscale, don't expose
+    # this directly.
+    ports:
+      - "127.0.0.1:3030:3000"
+
+    volumes:
+      # Required: orchestrator spawns sandbox containers via the host daemon.
+      - /var/run/docker.sock:/var/run/docker.sock
+      # State, settings, conversation history, MCP config, secrets.
+      # Pre-0.44 used ~/.openhands-state — N/A on a fresh install.
+      - /srv/docker/openhands/state:/.openhands
+      # Workspace the sandbox reads/writes. The host path on the LEFT must
+      # match SANDBOX_VOLUMES below — the sandbox container is spawned by
+      # the host daemon, so its bind mount is resolved on the host, not
+      # via this container's filesystem.
+      - /srv/docker/openhands/workspace:/srv/docker/openhands/workspace
+
+    # Linux Docker doesn't auto-provide host.docker.internal; this fixes it.
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+
+    environment:
+      # ---- Sandbox / agent-server image pin ----
+      # Replaces the V0.x SANDBOX_RUNTIME_CONTAINER_IMAGE. 1.19.1-python is
+      # the agent-server tag the 1.7 main image expects; bumping the main
+      # image will likely want a newer agent-server tag — check the
+      # upstream docker-compose.yml on each upgrade.
+      AGENT_SERVER_IMAGE_REPOSITORY: ghcr.io/openhands/agent-server
+      AGENT_SERVER_IMAGE_TAG: 1.19.1-python
+
+      # ---- Workspace mount into the per-session sandbox ----
+      # SANDBOX_VOLUMES is the V1 replacement for the deprecated
+      # WORKSPACE_BASE / WORKSPACE_MOUNT_PATH variables.
+      SANDBOX_VOLUMES: /srv/docker/openhands/workspace:/workspace:rw
+      # Match the host's `noise` UID so files the agent writes aren't
+      # owned by root.
+      SANDBOX_USER_ID: "1000"
+
+      # ---- LLM: host Ollama via OpenAI-compatible endpoint ----
+      # Per the official local-llms doc, the recommended path is the
+      # /v1 OpenAI-compatible endpoint with the `openai/` LiteLLM prefix
+      # — NOT `ollama/...`, which has worse tool-call behaviour.
+      LLM_MODEL: "openai/qwen3-coder:30b"
+      LLM_BASE_URL: "http://host.docker.internal:11434/v1"
+      LLM_API_KEY: "ollama"   # any non-empty string; Ollama doesn't auth.
+
+      # Default tool-calling renderer mismatches Qwen3-Coder's training
+      # format and produces malformed calls (issue #8140). Forcing false
+      # falls back to OpenHands' prompt-based protocol — costs some token
+      # efficiency, gains reliability with local models.
+      LLM_NATIVE_TOOL_CALLING: "false"
+
+      LOG_ALL_EVENTS: "true"
+
+      # ---- Optional: ship traces to Phoenix (comment out to disable) ----
+      # OpenHands V1 uses LiteLLM + OpenTelemetry; standard OTLP env vars
+      # are honoured. Phoenix takes OTLP/HTTP on its UI port 6006, not 4318.
+      OTEL_EXPORTER_OTLP_ENDPOINT: "http://host.docker.internal:6006"
+      OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf"
+      OTEL_SERVICE_NAME: "openhands"
+
+    # Per-session agent-server containers spawn headless chromium for
+    # browser tasks; default 64 MB shm causes silent crashes.
+    shm_size: "2gb"
+
+    # Playwright/chromium needs higher fd limits than Docker's default.
+    ulimits:
+      nofile:
+        soft: 65536
+        hard: 65536
+
+    # Bridge networking is correct here. Don't switch to network_mode: host
+    # — the spawned sandbox containers reach this orchestrator via Docker
+    # bridge DNS, which only works on a bridge network.
--- a/pyinfra/framework/compose/openlit.yml
+++ b/pyinfra/framework/compose/openlit.yml
@@ -0,0 +1,59 @@
+# OpenLIT — LLM observability (traces, costs, KV-cache, prompt/decode
+# latencies, tokens/sec). https://openlit.io
+#
+# Two services:
+#   - clickhouse  : columnar store for traces (internal only, no host port)
+#   - openlit     : Next.js UI on :3001 (3000 is OpenWebUI)
+#
+# Why OpenLIT vs Langfuse/Phoenix/Laminar: it's the only OSS dashboard
+# (May 2026) that auto-instruments Ollama AND vLLM via OpenTelemetry
+# without adding code to client apps. For llama.cpp, start the server
+# with --metrics (see ../llama/docker-compose.yml) and OpenLIT can scrape
+# /metrics.
+#
+# To send traces from a Python script calling Ollama/vLLM:
+#   pip install openlit
+#   python -c "import openlit; openlit.init(otlp_endpoint='http://framework:4328')"
+#
+# To wire OpenWebUI → OpenLIT, install OpenLIT's pipeline middleware
+# in OpenWebUI per https://openlit.io/blogs/openlit-openwebui.
+services:
+  clickhouse:
+    # Trace store behind OpenLIT. Publishes no host ports; credentials
+    # here must stay in sync with the openlit service's INIT_DB_* values.
+    container_name: openlit-clickhouse
+    image: clickhouse/clickhouse-server:25.3-alpine
+    restart: unless-stopped
+    volumes:
+      - '/srv/docker/openlit/clickhouse:/var/lib/clickhouse'
+    environment:
+      - CLICKHOUSE_DB=openlit
+      - CLICKHOUSE_USER=default
+      - CLICKHOUSE_PASSWORD=OPENLIT
+    ulimits:
+      nofile: {soft: 262144, hard: 262144}
+
+  openlit:
+    container_name: openlit
+    image: ghcr.io/openlit/openlit:latest
+    restart: unless-stopped
+    depends_on: [clickhouse]
+    volumes:
+      - '/srv/docker/openlit/data:/app/client/data'
+    environment:
+      - INIT_DB_HOST=clickhouse
+      - INIT_DB_PORT=8123
+      - INIT_DB_DATABASE=openlit
+      - INIT_DB_USERNAME=default
+      - INIT_DB_PASSWORD=OPENLIT
+      - SQLITE_DATABASE_URL=file:/app/client/data/data.db
+    ports:
+      # host:container. The UI is published on 3001 since OpenWebUI
+      # already holds 3000.
+      - '3001:3000'
+      # OTLP receivers (gRPC, then HTTP) published for off-box SDKs.
+      # Container 4317/4318 are shifted to host 4327/4328: Phoenix binds
+      # host 4317 for OpenCode traces, and OpenLIT is the secondary,
+      # fleet-metrics destination.
+      - '4327:4317'
+      - '4328:4318'
--- a/pyinfra/framework/compose/openwebui.yml
+++ b/pyinfra/framework/compose/openwebui.yml
@@ -0,0 +1,25 @@
+# OpenWebUI — ChatGPT-like web UI in front of Ollama. Pre-configured to
+# use the host's Ollama instance and the project's SearXNG for web
+# search. Default port 3000.
+#
+# Persistent state (users, conversations, uploaded docs, RAG vector
+# index) lives at /srv/docker/openwebui/data so backups touch one path.
+services:
+  openwebui:
+    image: ghcr.io/open-webui/open-webui:main
+    container_name: openwebui
+    restart: unless-stopped
+    ports:
+      - "3000:8080"
+    extra_hosts:
+      # Reach host Ollama (:11434) without sharing a Docker network.
+      - "host.docker.internal:host-gateway"
+    environment:
+      - OLLAMA_BASE_URL=http://host.docker.internal:11434
+      # Built-in web search via the project's SearXNG instance. Current
+      # Open WebUI reads ENABLE_WEB_SEARCH / WEB_SEARCH_ENGINE (no RAG_ prefix).
+      - ENABLE_WEB_SEARCH=true
+      - WEB_SEARCH_ENGINE=searxng
+      - SEARXNG_QUERY_URL=https://searxng.n0n.io/search?q=<query>&format=json
+    volumes:
+      - /srv/docker/openwebui/data:/app/backend/data
--- a/pyinfra/framework/compose/phoenix.yml
+++ b/pyinfra/framework/compose/phoenix.yml
@@ -0,0 +1,35 @@
+# Arize Phoenix — per-trace agent waterfall / flamegraph viz.
+# https://github.com/Arize-ai/phoenix
+#
+# Picked over Langfuse for "show me one OpenCode turn as a tree":
+#   - Single container vs Langfuse's six (Postgres+ClickHouse+Redis+MinIO+web+worker).
+#   - First-class ingestion of Vercel AI SDK spans (which is what OpenCode
+#     emits under the hood when experimental.openTelemetry=true).
+#   - Best-in-class waterfall + agent-graph view for nested LLM/tool calls.
+#
+# Complements OpenLIT, doesn't replace it: OpenLIT is the fleet-metrics
+# layer (cost / tokens / latency aggregated across sessions). Phoenix is
+# the per-prompt debugger (see what one turn actually did).
+#
+# Bring-up: `docker compose up -d` — no first-run setup needed; UI prompts
+# for project name on first trace ingest. Storage is SQLite at /data.
+services:
+  phoenix:
+    container_name: phoenix
+    image: arizephoenix/phoenix:latest
+    restart: unless-stopped
+    volumes:
+      - '/srv/docker/phoenix/data:/data'
+    environment:
+      - PHOENIX_WORKING_DIR=/data
+      # All interfaces is already Phoenix's default; spelled out for clarity.
+      - PHOENIX_HOST=0.0.0.0
+      - PHOENIX_PORT=6006
+      - PHOENIX_GRPC_PORT=4317
+    ports:
+      # In Phoenix 15.x the UI and OTLP/HTTP share 6006, so HTTP traces
+      # are posted to http://framework:6006/v1/traces. Releases before
+      # v15.0 had a separate 4318 listener.
+      - '6006:6006'
+      # OTLP/gRPC keeps its own port.
+      - '4317:4317'
--- a/pyinfra/framework/compose/vllm.yml
+++ b/pyinfra/framework/compose/vllm.yml
@@ -0,0 +1,36 @@
+# vLLM, ROCm backend.
+#
+# NOTE: vLLM's official ROCm support targets datacenter cards (MI300X /
+# gfx942). Strix Halo is gfx1151 — support varies by image tag and
+# release. If `rocm/vllm:latest` doesn't run on this iGPU, try
+# `rocm/vllm-dev:nightly` or build from source against ROCm 7.x.
+services:
+  vllm:
+    image: rocm/vllm:latest
+    container_name: vllm
+    restart: unless-stopped
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    cap_add: [SYS_PTRACE]
+    security_opt: ["seccomp=unconfined"]
+    # Numeric GIDs of host's video (44) and render (991) groups — names
+    # don't exist inside the container.
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 16g
+    ipc: host
+    volumes:
+      - /models:/models:ro
+    ports:
+      - "8000:8000"
+    # NOTE(review): rocm/vllm appears to default to a shell, not the API
+    # server — confirm for your tag. `vllm serve` takes the model positionally.
+    entrypoint: ["vllm", "serve"]
+    command:
+      - /models/REPLACE/ME
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"