Initial commit: localgenai stack

Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-08 11:35:10 -04:00
commit 2c4bfefa95
36 changed files with 5265 additions and 0 deletions

View File

@@ -0,0 +1,66 @@
# Beszel — host + container + GPU dashboard.
# https://beszel.dev
#
# Picked over Prometheus+Grafana for this box because:
# - The agent's `amd_sysfs` collector reads /sys/class/drm/card*/device/
# directly, which is the only reliable GPU metric source on Strix Halo
# (gfx1151). AMD's amd-smi / Device Metrics Exporter return N/A for
# util/power/temp on this APU (ROCm#6035), so the official Prometheus
# exporter path is dead.
# - Two containers vs six.
#
# First-time setup (WebSocket connection model — current Beszel default):
# 1. `docker compose up -d beszel` (start the hub)
# 2. Open http://framework:8090, create the admin account
# 3. Click "Add system" — the dialog gives you a TOKEN and an SSH KEY.
# 4. Edit /srv/docker/beszel/.env (created empty by pyinfra; pyinfra
# doesn't overwrite). Add:
# BESZEL_TOKEN=<token-from-dialog>
# BESZEL_KEY=ssh-ed25519 AAAA…
# 5. `docker compose up -d --force-recreate beszel-agent`
#
# Docker Compose auto-reads the sibling .env file for ${VAR} interpolation
# in the environment block below — so secrets stay out of the compose
# file (which pyinfra overwrites) but the env-var names match exactly
# what the agent expects.
#
# Why both TOKEN and KEY: TOKEN identifies which system this agent is,
# KEY authenticates the agent (the SSH key is reused as the auth secret
# in the WebSocket handshake). Rotate either by editing the .env and
# `docker compose up -d --force-recreate`.
services:
  # Hub: serves the dashboard on host port 8090 and keeps its state under
  # /srv/docker/beszel/data.
  beszel:
    container_name: beszel
    image: henrygd/beszel:latest
    restart: unless-stopped
    volumes:
      - /srv/docker/beszel/data:/beszel_data
    ports:
      - "8090:8090"

  # Agent: gathers host, per-container, and GPU metrics and dials out to
  # the hub.
  beszel-agent:
    container_name: beszel-agent
    image: henrygd/beszel-agent:latest
    restart: unless-stopped
    # Share the host's network namespace so CPU/memory/network counters are
    # the real host values, not a bridge-NAT view.
    network_mode: host
    environment:
      # Turns on the AMD sysfs GPU collector.
      GPU: "true"
      # Where the agent opens its WebSocket. With host networking,
      # localhost is the host itself, and the hub publishes 8090 there.
      HUB_URL: "http://localhost:8090"
      # Legacy SSH listener, kept as an optional fallback for
      # hub-initiated probing; the hub only uses it when the WebSocket
      # path is unreachable.
      LISTEN: "45876"
      # Both values are interpolated from /srv/docker/beszel/.env when
      # compose parses this file; they default to empty if unset.
      KEY: "${BESZEL_KEY:-}"
      TOKEN: "${BESZEL_TOKEN:-}"
    volumes:
      # Sysfs trees read by the AMD GPU collector (read-only).
      - /sys/class/drm:/sys/class/drm:ro
      - /sys/class/hwmon:/sys/class/hwmon:ro
      # Docker socket, read-only, for per-container CPU/mem/net stats.
      - /var/run/docker.sock:/var/run/docker.sock:ro

View File

@@ -0,0 +1,44 @@
# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
#
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
# vulkan-radv — most stable, recommended default (this one)
# vulkan-amdvlk — alternate Vulkan driver, sometimes faster
# rocm-7.2.2 — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
# rocm-6.4.4 — ROCm 6.x fallback
# rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
#
# Toolbox images use a shell entrypoint, so we override to launch
# llama-server directly. Edit the --model path before `docker compose up -d`.
services:
  # llama.cpp HTTP server (llama-server) running from the Vulkan/RADV
  # toolbox image, published on host port 8080.
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
    container_name: llama
    restart: unless-stopped
    devices:
      # DRM device nodes, passed through for the Vulkan backend.
      - /dev/dri:/dev/dri
    volumes:
      # Model files are only read, never written, by the server.
      - /models:/models:ro
    ports:
      - "8080:8080"
    # The toolbox image's default entrypoint is a shell; replace it so the
    # container launches llama-server with the arguments below.
    entrypoint: ["llama-server"]
    command:
      # Placeholder — edit to a real GGUF path before `docker compose up -d`.
      - --model
      - /models/REPLACE/ME/model.gguf
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      # Required for GPU backends on Strix Halo per Gygeek's setup
      # guide. Forces full load into GPU memory rather than mmap.
      - --no-mmap
      # Flash attention — works on Vulkan too; the big win is on the
      # ROCm tag where kyuz0's build has rocWMMA acceleration.
      # Current llama.cpp parses this option as `--flash-attn on|off|auto`,
      # so the value must be explicit; a bare flag would consume the next
      # argument (--metrics) as its value and abort at startup.
      # NOTE(review): builds that predate the on/off/auto form expect the
      # bare flag — drop the "on" line if pinning to an old image.
      - --flash-attn
      - "on"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, and request latency.
      - --metrics

View File

@@ -0,0 +1,38 @@
# Ollama, ROCm backend. Serves models on demand — safe to start before
# you've put anything in /models.
#
# Storage: Ollama's content-addressed blob store is bind-mounted under
# /models/ollama so all model data on the host lives under /models.
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
# for those engines.
services:
  # Ollama with the ROCm backend, published on host port 11434.
  ollama:
    container_name: ollama
    image: ollama/ollama:rocm
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      # Ollama's own blob store, kept under /models on the host.
      - /models/ollama:/root/.ollama
      # The rest of the model tree, read-only.
      - /models:/models:ro
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Host GIDs for the video (44) and render (991) groups. The group
    # names don't exist inside the container, so the numeric IDs are
    # added directly; they must match the host for /dev/kfd and /dev/dri
    # to be accessible.
    group_add:
      - "44"
      - "991"
    environment:
      # Ollama's bundled ROCm runtime doesn't recognize Strix Halo's
      # gfx1151 (RDNA 3.5) iGPU and silently falls back to CPU unless
      # overridden. 11.0.0 selects gfx1100 (Navi 31), whose kernels are
      # close enough to run on gfx1151.
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
      # Default context window. The 256K upstream default for
      # Qwen3-Coder inflates the KV cache to ~25-30 GB and makes Ollama
      # split layers across GPU and CPU; 64K keeps the model entirely
      # on the GPU and is still ample for coding work.
      OLLAMA_CONTEXT_LENGTH: "65536"

View File

@@ -0,0 +1,94 @@
# OpenHands 1.7 (May 2026) — autonomous agent in a Docker sandbox.
# https://docs.openhands.dev — repo: github.com/OpenHands/OpenHands
#
# Architecture: this container is a thin orchestrator. Per conversation
# it spawns a separate `agent-server` container on the host Docker daemon
# (that's what the docker.sock mount is for) and talks to it over REST.
# AGENT_SERVER_IMAGE_TAG below pins the per-session sandbox image.
#
# Complements OpenCode: OpenCode is the interactive terminal driver,
# OpenHands is for autonomous loops (write code, run tests, browse the
# web in a sandbox, report back).
services:
  # OpenHands orchestrator. It spawns one agent-server sandbox container
  # per conversation through the host Docker daemon.
  openhands:
    # Org rebranded All-Hands-AI → OpenHands at v1.0 (Dec 2025); the old
    # docker.all-hands.dev/all-hands-ai/openhands image is gone.
    image: docker.openhands.dev/openhands/openhands:1.7
    container_name: openhands
    restart: unless-stopped
    # 3030 host-side because :3000 is OpenWebUI and :3001 is OpenLIT.
    # Loopback-only — reach via SSH tunnel or Tailscale, don't expose
    # this directly.
    ports:
      - "127.0.0.1:3030:3000"
    volumes:
      # Required: orchestrator spawns sandbox containers via the host daemon.
      - /var/run/docker.sock:/var/run/docker.sock
      # State, settings, conversation history, MCP config, secrets.
      # Pre-0.44 used ~/.openhands-state — N/A on a fresh install.
      - /srv/docker/openhands/state:/.openhands
      # Workspace the sandbox reads/writes. The host path on the LEFT must
      # match SANDBOX_VOLUMES below — the sandbox container is spawned by
      # the host daemon, so its bind mount is resolved on the host, not
      # via this container's filesystem.
      - /srv/docker/openhands/workspace:/srv/docker/openhands/workspace
    # Linux Docker doesn't auto-provide host.docker.internal; this fixes it.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    environment:
      # ---- Sandbox / agent-server image pin ----
      # Replaces the V0.x SANDBOX_RUNTIME_CONTAINER_IMAGE. 1.19.1-python is
      # the agent-server tag the 1.7 main image expects; bumping the main
      # image will likely want a newer agent-server tag — check the
      # upstream docker-compose.yml on each upgrade.
      AGENT_SERVER_IMAGE_REPOSITORY: ghcr.io/openhands/agent-server
      AGENT_SERVER_IMAGE_TAG: 1.19.1-python
      # ---- Workspace mount into the per-session sandbox ----
      # SANDBOX_VOLUMES is the V1 replacement for the deprecated
      # WORKSPACE_BASE / WORKSPACE_MOUNT_PATH variables.
      SANDBOX_VOLUMES: /srv/docker/openhands/workspace:/workspace:rw
      # Match the host's `noise` UID so files the agent writes aren't
      # owned by root.
      SANDBOX_USER_ID: "1000"
      # ---- LLM: host Ollama via OpenAI-compatible endpoint ----
      # Per the official local-llms doc, the recommended path is the
      # /v1 OpenAI-compatible endpoint with the `openai/` LiteLLM prefix
      # — NOT `ollama/...`, which has worse tool-call behaviour.
      LLM_MODEL: "openai/qwen3-coder:30b"
      LLM_BASE_URL: "http://host.docker.internal:11434/v1"
      LLM_API_KEY: "ollama"  # any non-empty string; Ollama doesn't auth.
      # Default tool-calling renderer mismatches Qwen3-Coder's training
      # format and produces malformed calls (issue #8140). Forcing false
      # falls back to OpenHands' prompt-based protocol — costs some token
      # efficiency, gains reliability with local models.
      LLM_NATIVE_TOOL_CALLING: "false"
      LOG_ALL_EVENTS: "true"
      # ---- Optional: ship traces to Phoenix ----
      # OpenHands V1 uses LiteLLM + OpenTelemetry; standard OTLP env vars
      # are honoured. Comment out to disable.
      # The Phoenix service in this stack publishes OTLP/HTTP on 6006
      # (shared with its UI) and only gRPC on 4317; nothing listens on
      # host port 4318. With http/protobuf the OTel SDK appends
      # /v1/traces to this base URL, which is Phoenix's ingest path.
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://host.docker.internal:6006"
      OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf"
      OTEL_SERVICE_NAME: "openhands"
    # Per-session agent-server containers spawn headless chromium for
    # browser tasks; default 64 MB shm causes silent crashes.
    # NOTE(review): shm_size here applies to this orchestrator container;
    # confirm the spawned sandboxes actually inherit a larger /dev/shm.
    shm_size: "2gb"
    # Playwright/chromium needs higher fd limits than Docker's default.
    ulimits:
      nofile:
        soft: 65536
        hard: 65536
    # Bridge networking is correct here. Don't switch to network_mode: host
    # — the spawned sandbox containers reach this orchestrator via Docker
    # bridge DNS, which only works on a bridge network.

View File

@@ -0,0 +1,59 @@
# OpenLIT — LLM observability (traces, costs, KV-cache, prompt/decode
# latencies, tokens/sec). https://openlit.io
#
# Two services:
# - clickhouse : columnar store for traces (internal only, no host port)
# - openlit : Next.js UI on :3001 (3000 is OpenWebUI)
#
# Why OpenLIT vs Langfuse/Phoenix/Laminar: it's the only OSS dashboard
# (May 2026) that auto-instruments Ollama AND vLLM via OpenTelemetry
# without adding code to client apps. For llama.cpp, start the server
# with --metrics (see ../llama/docker-compose.yml) and OpenLIT can scrape
# /metrics.
#
# To send traces from a Python script calling Ollama/vLLM:
# pip install openlit
# python -c "import openlit; openlit.init(otlp_endpoint='http://framework:4328')"
# (4328 is the host port this file maps to OpenLIT's OTLP/HTTP receiver.)
#
# To wire OpenWebUI → OpenLIT, install OpenLIT's pipeline middleware
# in OpenWebUI per https://openlit.io/blogs/openlit-openwebui.
services:
  # Trace store for OpenLIT. No ports are published, so it is reachable
  # only from other containers in this compose project.
  clickhouse:
    container_name: openlit-clickhouse
    image: clickhouse/clickhouse-server:25.3-alpine
    restart: unless-stopped
    ulimits:
      nofile:
        hard: 262144
        soft: 262144
    volumes:
      - /srv/docker/openlit/clickhouse:/var/lib/clickhouse
    environment:
      CLICKHOUSE_DB: openlit
      CLICKHOUSE_USER: default
      CLICKHOUSE_PASSWORD: OPENLIT

  # OpenLIT UI plus OTLP receivers.
  openlit:
    container_name: openlit
    image: ghcr.io/openlit/openlit:latest
    restart: unless-stopped
    depends_on:
      - clickhouse
    volumes:
      - /srv/docker/openlit/data:/app/client/data
    environment:
      # Connection details for the clickhouse service above; the
      # credentials and database name must match its CLICKHOUSE_* values.
      INIT_DB_HOST: clickhouse
      INIT_DB_PORT: "8123"
      INIT_DB_DATABASE: openlit
      INIT_DB_USERNAME: default
      INIT_DB_PASSWORD: OPENLIT
      SQLITE_DATABASE_URL: file:/app/client/data/data.db
    ports:
      # host:container — the UI is served on host port 3001 because
      # OpenWebUI already owns 3000.
      - "3001:3000"
      # OTLP receivers (gRPC, then HTTP), published on the host so SDKs
      # running off-box can ship traces here. Host ports are shifted to
      # 4327/4328 (container side stays 4317/4318) because the canonical
      # OTLP ports are left to Phoenix for OpenCode traces — OpenLIT is
      # the secondary, fleet-metrics destination.
      - "4327:4317"
      - "4328:4318"

View File

@@ -0,0 +1,25 @@
# OpenWebUI — ChatGPT-like web UI in front of Ollama. Pre-configured to
# use the host's Ollama instance and the project's SearXNG for web
# search. Default port 3000.
#
# Persistent state (users, conversations, uploaded docs, RAG vector
# index) lives at /srv/docker/openwebui/data so backups touch one path.
services:
  # Open WebUI front end, published on host port 3000 (container 8080).
  openwebui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: openwebui
    restart: unless-stopped
    ports:
      - "3000:8080"
    extra_hosts:
      # Lets the container reach Ollama on the host's :11434 without
      # needing to share Docker networks.
      - "host.docker.internal:host-gateway"
    environment:
      - OLLAMA_BASE_URL=http://host.docker.internal:11434
      # Built-in web search via the project's SearXNG instance.
      # Open WebUI renamed the RAG_-prefixed web-search settings to
      # ENABLE_WEB_SEARCH / WEB_SEARCH_ENGINE. This service tracks the
      # rolling :main tag, so set the current names and keep the legacy
      # ones so an older pinned image behaves the same.
      # NOTE(review): these appear to be persisted settings that only
      # seed the config on first launch — confirm in the admin UI if the
      # data volume already exists.
      - ENABLE_WEB_SEARCH=true
      - WEB_SEARCH_ENGINE=searxng
      - ENABLE_RAG_WEB_SEARCH=true
      - RAG_WEB_SEARCH_ENGINE=searxng
      - SEARXNG_QUERY_URL=https://searxng.n0n.io/search?q=<query>&format=json
    volumes:
      # Users, conversations, uploads, and the RAG vector index.
      - /srv/docker/openwebui/data:/app/backend/data

View File

@@ -0,0 +1,35 @@
# Arize Phoenix — per-trace agent waterfall / flamegraph viz.
# https://github.com/Arize-ai/phoenix
#
# Picked over Langfuse for "show me one OpenCode turn as a tree":
# - Single container vs Langfuse's six (Postgres+ClickHouse+Redis+MinIO+web+worker).
# - First-class ingestion of Vercel AI SDK spans (which is what OpenCode
# emits under the hood when experimental.openTelemetry=true).
# - Best-in-class waterfall + agent-graph view for nested LLM/tool calls.
#
# Complements OpenLIT, doesn't replace it: OpenLIT is the fleet-metrics
# layer (cost / tokens / latency aggregated across sessions). Phoenix is
# the per-prompt debugger (see what one turn actually did).
#
# Bring-up: `docker compose up -d` — no first-run setup needed; UI prompts
# for project name on first trace ingest. Storage is SQLite at /data.
services:
  # Arize Phoenix: trace UI and OTLP ingest in a single container.
  phoenix:
    container_name: phoenix
    image: arizephoenix/phoenix:latest
    restart: unless-stopped
    volumes:
      # Backing directory for PHOENIX_WORKING_DIR (SQLite storage).
      - /srv/docker/phoenix/data:/data
    environment:
      PHOENIX_WORKING_DIR: /data
      # All interfaces is already Phoenix's default; spelled out so the
      # bind address is obvious.
      PHOENIX_HOST: "0.0.0.0"
      PHOENIX_PORT: "6006"
      PHOENIX_GRPC_PORT: "4317"
    ports:
      # In Phoenix 15.x the UI and OTLP/HTTP share port 6006, so HTTP
      # traces are sent to http://framework:6006/v1/traces. Releases
      # before v15.0 used a separate 4318 for OTLP/HTTP.
      - "6006:6006"
      # OTLP/gRPC keeps its own port.
      - "4317:4317"

View File

@@ -0,0 +1,36 @@
# vLLM, ROCm backend.
#
# NOTE: vLLM's official ROCm support targets datacenter cards (MI300X /
# gfx942). Strix Halo is gfx1151 — support varies by image tag and
# release. If `rocm/vllm:latest` doesn't run on this iGPU, try
# `rocm/vllm-dev:nightly` or build from source against ROCm 7.x.
services:
  # vLLM OpenAI-compatible API server on the ROCm image, published on
  # host port 8000.
  vllm:
    image: rocm/vllm:latest
    container_name: vllm
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container uses the host's
    # /dev/shm, so shm_size is likely ignored — confirm and drop one.
    shm_size: 16g
    ipc: host
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"
    # Launch the API server explicitly instead of relying on the image's
    # default entrypoint: `command` below is only a list of flags, so
    # without a server entrypoint Docker would try to exec `--model`.
    # Same pattern as the llama service's entrypoint override.
    # NOTE(review): rocm/vllm images are believed to default to a shell
    # (CMD /bin/bash) — verify against the tag in use.
    entrypoint: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
    command:
      # Placeholder — edit to a real model directory before starting.
      - --model
      - /models/REPLACE/ME
      - --host
      - 0.0.0.0
      - --port
      - "8000"