Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.
- pyinfra/framework/: pyinfra deploy targeting the box
- llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
for gfx1151), OpenWebUI
- Beszel (host + container + AMD GPU dashboard via sysfs)
- OpenLIT (LLM fleet metrics)
- Phoenix (per-trace agent waterfall)
- OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
- install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
66
pyinfra/framework/compose/beszel.yml
Normal file
66
pyinfra/framework/compose/beszel.yml
Normal file
@@ -0,0 +1,66 @@
|
||||
# Beszel — host + container + GPU dashboard.
|
||||
# https://beszel.dev
|
||||
#
|
||||
# Picked over Prometheus+Grafana for this box because:
|
||||
# - The agent's `amd_sysfs` collector reads /sys/class/drm/card*/device/
|
||||
# directly, which is the only reliable GPU metric source on Strix Halo
|
||||
# (gfx1151). AMD's amd-smi / Device Metrics Exporter return N/A for
|
||||
# util/power/temp on this APU (ROCm#6035), so the official Prometheus
|
||||
# exporter path is dead.
|
||||
# - Two containers vs six.
|
||||
#
|
||||
# First-time setup (WebSocket connection model — current Beszel default):
|
||||
# 1. `docker compose up -d beszel` (start the hub)
|
||||
# 2. Open http://framework:8090, create the admin account
|
||||
# 3. Click "Add system" — the dialog gives you a TOKEN and an SSH KEY.
|
||||
# 4. Edit /srv/docker/beszel/.env (created empty by pyinfra; pyinfra
|
||||
# doesn't overwrite). Add:
|
||||
# BESZEL_TOKEN=<token-from-dialog>
|
||||
# BESZEL_KEY=ssh-ed25519 AAAA…
|
||||
# 5. `docker compose up -d --force-recreate beszel-agent`
|
||||
#
|
||||
# Docker Compose auto-reads the sibling .env file for ${VAR} interpolation
|
||||
# in the environment block below — so secrets stay out of the compose
|
||||
# file (which pyinfra overwrites) but the env-var names match exactly
|
||||
# what the agent expects.
|
||||
#
|
||||
# Why both TOKEN and KEY: TOKEN identifies which system this agent is,
|
||||
# KEY authenticates the agent (the SSH key is reused as the auth secret
|
||||
# in the WebSocket handshake). Rotate either by editing the .env and
|
||||
# `docker compose up -d --force-recreate`.
|
||||
services:
  # Hub: the dashboard opened at http://framework:8090 during setup.
  beszel:
    container_name: beszel
    image: henrygd/beszel:latest
    restart: unless-stopped
    volumes:
      - /srv/docker/beszel/data:/beszel_data
    ports:
      - "8090:8090"

  # Agent: gathers host, per-container and AMD GPU metrics for the hub.
  beszel-agent:
    container_name: beszel-agent
    image: henrygd/beszel-agent:latest
    restart: unless-stopped
    # Runs in the host's network namespace so the CPU/memory/network
    # counters it reports are the host's own, free of bridge-NAT skew.
    network_mode: host
    environment:
      # Interpolated from /srv/docker/beszel/.env when Compose parses this
      # file; both fall back to an empty string when the variable is unset.
      - "TOKEN=${BESZEL_TOKEN:-}"
      - "KEY=${BESZEL_KEY:-}"
      # Where the agent dials out over WebSocket. With host networking,
      # localhost is the host itself, and the hub publishes 8090 there.
      - "HUB_URL=http://localhost:8090"
      # Legacy SSH listener kept as a fallback for hub-initiated probing;
      # the hub only uses it when the WebSocket path is unreachable.
      - "LISTEN=45876"
      # Turns on the AMD sysfs GPU collector.
      - "GPU=true"
    volumes:
      # Docker socket (mounted :ro) — source of per-container CPU/mem/net.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # Sysfs trees read by the AMD GPU collector.
      - /sys/class/drm:/sys/class/drm:ro
      - /sys/class/hwmon:/sys/class/hwmon:ro
|
||||
44
pyinfra/framework/compose/llama.yml
Normal file
44
pyinfra/framework/compose/llama.yml
Normal file
@@ -0,0 +1,44 @@
|
||||
# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
|
||||
# https://github.com/kyuz0/amd-strix-halo-toolboxes
|
||||
#
|
||||
# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
|
||||
# vulkan-radv — most stable, recommended default (this one)
|
||||
# vulkan-amdvlk — alternate Vulkan driver, sometimes faster
|
||||
# rocm-7.2.2 — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
|
||||
# rocm-6.4.4 — ROCm 6.x fallback
|
||||
# rocm7-nightlies — avoid: caps memory allocation to 64 GB (May 2026)
|
||||
#
|
||||
# Toolbox images use a shell entrypoint, so we override to launch
|
||||
# llama-server directly. Edit the --model path before `docker compose up -d`.
|
||||
services:
  # llama.cpp HTTP server on host port 8080, serving one GGUF from /models.
  llama:
    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
    container_name: llama
    restart: unless-stopped
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"
    # The toolbox image's own entrypoint is a shell, so launch the server
    # binary directly; `command` below is its argument list.
    entrypoint: ["llama-server"]
    command:
      # Placeholder — point this at a real GGUF before `docker compose up -d`.
      - --model
      - /models/REPLACE/ME/model.gguf
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
      # Required for GPU backends on Strix Halo per Gygeek's setup
      # guide. Forces full load into GPU memory rather than mmap.
      - --no-mmap
      # Flash attention — works on Vulkan too; the big win is on the
      # ROCm tag where kyuz0's build has rocWMMA acceleration.
      # Current llama.cpp builds expect an explicit on/off/auto value here;
      # left bare, the flag would consume the next argument (--metrics) as
      # its value. Quoted so YAML keeps "on" a string, not a boolean.
      - --flash-attn
      - "on"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
      # tokens/sec, KV-cache use, queue depth, and request latency.
      - --metrics
|
||||
38
pyinfra/framework/compose/ollama.yml
Normal file
38
pyinfra/framework/compose/ollama.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
# Ollama, ROCm backend. Serves models on demand — safe to start before
|
||||
# you've put anything in /models.
|
||||
#
|
||||
# Storage: Ollama's content-addressed blob store is bind-mounted under
|
||||
# /models/ollama so all model data on the host lives under /models.
|
||||
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
|
||||
# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
|
||||
# for those engines.
|
||||
services:
  ollama:
    container_name: ollama
    image: ollama/ollama:rocm
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - /models/ollama:/root/.ollama
      - /models:/models:ro
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Supplementary groups by number: 44 is the host's video group and 991
    # its render group. The names are unknown inside the container, but the
    # GIDs have to line up with the host for /dev/kfd and /dev/dri to be
    # accessible.
    group_add:
      - "44"
      - "991"
    environment:
      # The bundled ROCm runtime does not recognise Strix Halo's gfx1151
      # (RDNA 3.5) iGPU and, left alone, silently falls back to CPU.
      # 11.0.0 selects gfx1100 (Navi 31); the RDNA 3.x ISAs are close
      # enough that gfx1100 kernels run on gfx1151.
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
      # Default context length. Qwen3-Coder's upstream default of 256K
      # grows the KV cache to ~25-30 GB and makes Ollama split layers
      # between GPU and CPU; 64K keeps the model entirely on the GPU and
      # is still plenty for coding contexts.
      OLLAMA_CONTEXT_LENGTH: "65536"
|
||||
94
pyinfra/framework/compose/openhands.yml
Normal file
94
pyinfra/framework/compose/openhands.yml
Normal file
@@ -0,0 +1,94 @@
|
||||
# OpenHands 1.7 (May 2026) — autonomous agent in a Docker sandbox.
|
||||
# https://docs.openhands.dev — repo: github.com/OpenHands/OpenHands
|
||||
#
|
||||
# Architecture: this container is a thin orchestrator. Per conversation
|
||||
# it spawns a separate `agent-server` container on the host Docker daemon
|
||||
# (that's what the docker.sock mount is for) and talks to it over REST.
|
||||
# AGENT_SERVER_IMAGE_TAG below pins the per-session sandbox image.
|
||||
#
|
||||
# Complements OpenCode: OpenCode is the interactive terminal driver,
|
||||
# OpenHands is for autonomous loops (write code, run tests, browse the
|
||||
# web in a sandbox, report back).
|
||||
services:
  openhands:
    # Org rebranded All-Hands-AI → OpenHands at v1.0 (Dec 2025); the old
    # docker.all-hands.dev/all-hands-ai/openhands image is gone.
    image: docker.openhands.dev/openhands/openhands:1.7
    container_name: openhands
    restart: unless-stopped

    # 3030 host-side because :3000 is OpenWebUI and :3001 is OpenLIT.
    # Loopback-only — reach via SSH tunnel or Tailscale, don't expose
    # this directly.
    ports:
      - "127.0.0.1:3030:3000"

    volumes:
      # Required: orchestrator spawns sandbox containers via the host daemon.
      - /var/run/docker.sock:/var/run/docker.sock
      # State, settings, conversation history, MCP config, secrets.
      # Pre-0.44 used ~/.openhands-state — N/A on a fresh install.
      - /srv/docker/openhands/state:/.openhands
      # Workspace the sandbox reads/writes. The host path on the LEFT must
      # match SANDBOX_VOLUMES below — the sandbox container is spawned by
      # the host daemon, so its bind mount is resolved on the host, not
      # via this container's filesystem.
      - /srv/docker/openhands/workspace:/srv/docker/openhands/workspace

    # Linux Docker doesn't auto-provide host.docker.internal; this fixes it.
    extra_hosts:
      - "host.docker.internal:host-gateway"

    environment:
      # ---- Sandbox / agent-server image pin ----
      # Replaces the V0.x SANDBOX_RUNTIME_CONTAINER_IMAGE. 1.19.1-python is
      # the agent-server tag the 1.7 main image expects; bumping the main
      # image will likely want a newer agent-server tag — check the
      # upstream docker-compose.yml on each upgrade.
      AGENT_SERVER_IMAGE_REPOSITORY: ghcr.io/openhands/agent-server
      AGENT_SERVER_IMAGE_TAG: "1.19.1-python"

      # ---- Workspace mount into the per-session sandbox ----
      # SANDBOX_VOLUMES is the V1 replacement for the deprecated
      # WORKSPACE_BASE / WORKSPACE_MOUNT_PATH variables.
      SANDBOX_VOLUMES: "/srv/docker/openhands/workspace:/workspace:rw"
      # Match the host's `noise` UID so files the agent writes aren't
      # owned by root.
      SANDBOX_USER_ID: "1000"

      # ---- LLM: host Ollama via OpenAI-compatible endpoint ----
      # Per the official local-llms doc, the recommended path is the
      # /v1 OpenAI-compatible endpoint with the `openai/` LiteLLM prefix
      # — NOT `ollama/...`, which has worse tool-call behaviour.
      LLM_MODEL: "openai/qwen3-coder:30b"
      LLM_BASE_URL: "http://host.docker.internal:11434/v1"
      LLM_API_KEY: "ollama"  # any non-empty string; Ollama doesn't auth.

      # Default tool-calling renderer mismatches Qwen3-Coder's training
      # format and produces malformed calls (issue #8140). Forcing false
      # falls back to OpenHands' prompt-based protocol — costs some token
      # efficiency, gains reliability with local models.
      LLM_NATIVE_TOOL_CALLING: "false"

      LOG_ALL_EVENTS: "true"

      # ---- Optional: ship traces to Phoenix (OTLP/HTTP on :6006) ----
      # OpenHands V1 uses LiteLLM + OpenTelemetry; standard OTLP env vars
      # are honoured. Comment out to disable.
      # phoenix.yml publishes its OTLP/HTTP receiver on 6006 (shared with
      # the UI) and only OTLP/gRPC on 4317, and OpenLIT's HTTP receiver is
      # remapped to 4328 — so nothing on the host listens on 4318. The
      # exporter appends /v1/traces to this base URL itself.
      OTEL_EXPORTER_OTLP_ENDPOINT: "http://host.docker.internal:6006"
      OTEL_EXPORTER_OTLP_PROTOCOL: "http/protobuf"
      OTEL_SERVICE_NAME: "openhands"

    # Per-session agent-server containers spawn headless chromium for
    # browser tasks; default 64 MB shm causes silent crashes.
    shm_size: "2gb"

    # Playwright/chromium needs higher fd limits than Docker's default.
    ulimits:
      nofile:
        soft: 65536
        hard: 65536

    # Bridge networking is correct here. Don't switch to network_mode: host
    # — the spawned sandbox containers reach this orchestrator via Docker
    # bridge DNS, which only works on a bridge network.
|
||||
59
pyinfra/framework/compose/openlit.yml
Normal file
59
pyinfra/framework/compose/openlit.yml
Normal file
@@ -0,0 +1,59 @@
|
||||
# OpenLIT — LLM observability (traces, costs, KV-cache, prompt/decode
|
||||
# latencies, tokens/sec). https://openlit.io
|
||||
#
|
||||
# Two services:
|
||||
# - clickhouse : columnar store for traces (internal only, no host port)
|
||||
# - openlit : Next.js UI on :3001 (3000 is OpenWebUI)
|
||||
#
|
||||
# Why OpenLIT vs Langfuse/Phoenix/Laminar: it's the only OSS dashboard
|
||||
# (May 2026) that auto-instruments Ollama AND vLLM via OpenTelemetry
|
||||
# without adding code to client apps. For llama.cpp, start the server
|
||||
# with --metrics (see ../llama/docker-compose.yml) and OpenLIT can scrape
|
||||
# /metrics.
|
||||
#
|
||||
# To send traces from a Python script calling Ollama/vLLM:
|
||||
# pip install openlit
|
||||
# python -c "import openlit; openlit.init(otlp_endpoint='http://framework:4328')"
|
||||
#
|
||||
# To wire OpenWebUI → OpenLIT, install OpenLIT's pipeline middleware
|
||||
# in OpenWebUI per https://openlit.io/blogs/openlit-openwebui.
|
||||
services:
  # Columnar trace store. No `ports:` entry — reachable only from other
  # containers on this Compose network.
  clickhouse:
    image: clickhouse/clickhouse-server:25.3-alpine
    container_name: openlit-clickhouse
    restart: unless-stopped
    environment:
      CLICKHOUSE_USER: default
      # Taken from OPENLIT_DB_PASSWORD (shell or sibling .env) when set;
      # otherwise the previous built-in default is used, so existing
      # deployments behave exactly as before. Must resolve to the same
      # value as INIT_DB_PASSWORD on the openlit service.
      CLICKHOUSE_PASSWORD: "${OPENLIT_DB_PASSWORD:-OPENLIT}"
      CLICKHOUSE_DB: openlit
    volumes:
      - /srv/docker/openlit/clickhouse:/var/lib/clickhouse
    ulimits:
      nofile:
        soft: 262144
        hard: 262144

  # OpenLIT UI plus its OTLP receivers.
  openlit:
    image: ghcr.io/openlit/openlit:latest
    container_name: openlit
    restart: unless-stopped
    depends_on:
      - clickhouse
    ports:
      # Host:container — UI on 3001 (OpenWebUI owns 3000).
      - "3001:3000"
      # OTLP receivers exposed on the host so SDKs running off-box can
      # ship traces here. gRPC + HTTP. Remapped (4327/4328 → 4317/4318)
      # because Phoenix owns the canonical 4317/4318 ports for OpenCode
      # traces — OpenLIT here is a secondary/fleet-metrics destination.
      - "4327:4317"
      - "4328:4318"
    environment:
      INIT_DB_HOST: clickhouse
      INIT_DB_PORT: "8123"
      INIT_DB_USERNAME: default
      # Same interpolation as CLICKHOUSE_PASSWORD on the clickhouse
      # service, so the two credentials cannot drift apart.
      INIT_DB_PASSWORD: "${OPENLIT_DB_PASSWORD:-OPENLIT}"
      INIT_DB_DATABASE: openlit
      SQLITE_DATABASE_URL: "file:/app/client/data/data.db"
    volumes:
      - /srv/docker/openlit/data:/app/client/data
|
||||
25
pyinfra/framework/compose/openwebui.yml
Normal file
25
pyinfra/framework/compose/openwebui.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
# OpenWebUI — ChatGPT-like web UI in front of Ollama. Pre-configured to
|
||||
# use the host's Ollama instance and the project's SearXNG for web
|
||||
# search. Default port 3000.
|
||||
#
|
||||
# Persistent state (users, conversations, uploaded docs, RAG vector
|
||||
# index) lives at /srv/docker/openwebui/data so backups touch one path.
|
||||
services:
  # Chat UI; the container listens on 8080, published on host port 3000.
  openwebui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: openwebui
    restart: unless-stopped
    ports:
      - "3000:8080"
    extra_hosts:
      # Lets the container reach Ollama on the host's :11434 without
      # needing to share Docker networks.
      - "host.docker.internal:host-gateway"
    environment:
      - OLLAMA_BASE_URL=http://host.docker.internal:11434
      # Built-in web search via the project's SearXNG instance.
      # Current Open WebUI releases read ENABLE_WEB_SEARCH and
      # WEB_SEARCH_ENGINE; the RAG_-prefixed names are the older spelling
      # and are kept so an older pinned image is configured the same way.
      - ENABLE_WEB_SEARCH=true
      - WEB_SEARCH_ENGINE=searxng
      - ENABLE_RAG_WEB_SEARCH=true
      - RAG_WEB_SEARCH_ENGINE=searxng
      - SEARXNG_QUERY_URL=https://searxng.n0n.io/search?q=<query>&format=json
    volumes:
      # Persistent state (users, conversations, uploads, RAG index).
      - /srv/docker/openwebui/data:/app/backend/data
|
||||
35
pyinfra/framework/compose/phoenix.yml
Normal file
35
pyinfra/framework/compose/phoenix.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
# Arize Phoenix — per-trace agent waterfall / flamegraph viz.
|
||||
# https://github.com/Arize-ai/phoenix
|
||||
#
|
||||
# Picked over Langfuse for "show me one OpenCode turn as a tree":
|
||||
# - Single container vs Langfuse's six (Postgres+ClickHouse+Redis+MinIO+web+worker).
|
||||
# - First-class ingestion of Vercel AI SDK spans (which is what OpenCode
|
||||
# emits under the hood when experimental.openTelemetry=true).
|
||||
# - Best-in-class waterfall + agent-graph view for nested LLM/tool calls.
|
||||
#
|
||||
# Complements OpenLIT, doesn't replace it: OpenLIT is the fleet-metrics
|
||||
# layer (cost / tokens / latency aggregated across sessions). Phoenix is
|
||||
# the per-prompt debugger (see what one turn actually did).
|
||||
#
|
||||
# Bring-up: `docker compose up -d` — no first-run setup needed; UI prompts
|
||||
# for project name on first trace ingest. Storage is SQLite at /data.
|
||||
services:
  phoenix:
    container_name: phoenix
    image: arizephoenix/phoenix:latest
    restart: unless-stopped
    volumes:
      - /srv/docker/phoenix/data:/data
    environment:
      PHOENIX_WORKING_DIR: "/data"
      # All interfaces is already Phoenix's default; stated for clarity.
      PHOENIX_HOST: "0.0.0.0"
      PHOENIX_PORT: "6006"
      PHOENIX_GRPC_PORT: "4317"
    ports:
      # In Phoenix 15.x the UI and the OTLP/HTTP receiver share 6006, so
      # HTTP traces are posted to http://framework:6006/v1/traces. (Before
      # v15.0 OTLP/HTTP lived on its own port, 4318.)
      - "6006:6006"
      # OTLP/gRPC keeps a port of its own.
      - "4317:4317"
|
||||
36
pyinfra/framework/compose/vllm.yml
Normal file
36
pyinfra/framework/compose/vllm.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
# vLLM, ROCm backend.
|
||||
#
|
||||
# NOTE: vLLM's official ROCm support targets datacenter cards (MI300X /
|
||||
# gfx942). Strix Halo is gfx1151 — support varies by image tag and
|
||||
# release. If `rocm/vllm:latest` doesn't run on this iGPU, try
|
||||
# `rocm/vllm-dev:nightly` or build from source against ROCm 7.x.
|
||||
services:
  # vLLM OpenAI-compatible server on host port 8000.
  vllm:
    image: rocm/vllm:latest
    container_name: vllm
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups — names
    # don't exist inside the container.
    group_add:
      - "44"
      - "991"
    shm_size: 16g
    ipc: host
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"
    # `command` below is only a list of flags. Name the program explicitly
    # so those flags always reach vLLM's OpenAI-compatible API server,
    # whatever ENTRYPOINT/CMD the chosen image tag ships with (same
    # approach as the llama service's entrypoint override).
    entrypoint: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
    command:
      # Placeholder — point this at a real model directory before starting.
      - --model
      - /models/REPLACE/ME
      - --host
      - 0.0.0.0
      - --port
      - "8000"
|
||||
Reference in New Issue
Block a user