#!/usr/bin/env bash
# Mac-side wrapper for swap-model on the Framework Desktop.
#
# Symlink or alias this so `swap-model X` works from any shell:
#   ln -s "$(pwd)/bin/swap-model" /usr/local/bin/swap-model
#   (or add ~/Documents/obsidian/localgenai/bin to PATH)
#
# Talks to whichever host name resolves the Framework Desktop. Default
# `framework` works whether you're on the LAN (mDNS/etc/hosts) or
# remote (Tailscale MagicDNS). Override per-shell when needed:
#   SWAP_MODEL_HOST=10.0.0.70 swap-model 235b
#
# Environment:
#   SWAP_MODEL_HOST  ssh destination (default: framework)
#
# Assumes your SSH public key is in the box's authorized_keys.
set -euo pipefail

readonly REMOTE_SWAP_MODEL=/usr/local/bin/swap-model
readonly target_host="${SWAP_MODEL_HOST:-framework}"

# ssh concatenates everything after the destination into ONE string that
# the remote login shell parses again. Quote each word here so arguments
# containing spaces or shell metacharacters arrive intact instead of being
# re-split (or executed) on the box.
# NOTE(review): %q emits bash-style quoting; assumes the remote login
# shell is bash-compatible — confirm for the box's account.
remote_cmd=$(printf '%q' "$REMOTE_SWAP_MODEL")
# `for arg; do` iterates the positional parameters without expanding "$@",
# which keeps `set -u` happy on older bash when no arguments are given.
for arg; do
  remote_cmd+=" $(printf '%q' "$arg")"
done

# `--` stops option parsing so a host value starting with '-' cannot be
# interpreted as an ssh flag.
exec ssh -- "$target_host" "$remote_cmd"
The model can navigate pages, click, fill forms, - read DOM snapshots. Closes the agentic-browsing gap. -- **SearXNG MCP** ([mcp-searxng](https://github.com/ihor-sokoliuk/mcp-searxng)) — - web search via your self-hosted instance at . - No external API keys, no rate-limit roulette. -- **Serena MCP** ([oraios/serena](https://github.com/oraios/serena)) — - LSP-backed semantic code navigation (find symbol, references, rename, - insert before/after). Cuts the tokens a local 70B-class model burns on - grep-style flailing by roughly an order of magnitude. Uses a **custom - trimmed context** (`serena-ide-trim.yml`) that exposes only the 8 - unique-LSP-value tools — JetBrains tools, line-level edits redundant - with opencode's `Edit`, Serena's own memory tools (basic-memory MCP is - canonical), and onboarding/meta noise are all excluded. Down from 46 - raw → 41 ide-context-filtered → **8 active**. Scoped to the cwd via - `--project-from-cwd`. -- **basic-memory MCP** ([basicmachines-co/basic-memory](https://github.com/basicmachines-co/basic-memory)) — - Markdown-backed persistent memory across sessions. Storage lives in - `~/Documents/obsidian/AI-memory/` (symlinked from `~/basic-memory`), - so notes are browsable in Obsidian's graph and search. Replaces - Claude Code's auto-memory write-back, which opencode lacks natively. -- **sequential-thinking MCP** ([modelcontextprotocol/servers/sequentialthinking](https://github.com/modelcontextprotocol/servers/tree/main/src/sequentialthinking)) — - externalizes chain-of-thought as tool calls. Helps weaker local - models stay on-plan over multi-step work; near-zero cost when not - actively used. -- **github MCP** ([github/github-mcp-server](https://github.com/github/github-mcp-server)) — - GitHub repo / issue / PR / code-search access. Launched with - `--read-only` and a narrowed `--toolsets repos,issues,pull_requests,code_security` - allowlist. 
With a **classic** PAT (`ghp_…`), GitHub's auto-scope-filtering - (Jan 2026) trims tools further by hiding ones whose scopes the token - lacks — saves ~23k tokens of tool-list overhead, meaningful for a 70B's - effective context. Requires `GITHUB_PERSONAL_ACCESS_TOKEN` to be exported - in your shell env (not in opencode.json). Drop `--read-only` from - `opencode.json` once you trust the model's tool calls. - - **Note**: This MCP is disabled since the user is utilizing a self-hosted Gitea instance instead of GitHub. -- **task-master MCP** ([eyaltoledano/claude-task-master](https://github.com/eyaltoledano/claude-task-master)) — - Workflow / task-gate MCP. File-based: each project gets a - `.taskmaster/` dir with tasks, complexity, and config — no DB, no - external service. `OLLAMA_BASE_URL` is pre-set in `opencode.json` so - task-master's AI features (parse-prd, expand-task) route through your - framework Ollama. The npm-global install also provides a `task-master` - CLI (`task-master init` to scaffold per-project). Replaces the - workflow-gate role originally proposed for Archon, without Supabase. -- **Phoenix bridge plugin** (`.opencode/plugin/phoenix-bridge.js`) — - exports OpenTelemetry spans for every LLM call, tool call, and - subagent invocation to the Phoenix container running on the Framework - Desktop. Per-prompt waterfall / flamegraph viz at - . +A Python script to download per-second activity data (heart rate, GPS, cadence, etc.) from Garmin Connect and extract it from FIT files for analysis. ## Setup -```sh -./install.sh +1. Create virtual environment: +```bash +python -m venv venv +source venv/bin/activate ``` -Idempotent — re-run after editing `opencode.json` or pulling changes to -the plugin. Each step checks before doing work. Specifically: - -1. Verifies Homebrew is present (won't install it for you) -2. `brew install node uv jq sst/tap/opencode` (skips if already at latest) -3. Pre-caches Playwright's chromium so the first MCP call is instant -4. 
`uv tool install serena-agent@latest --prerelease=allow` so opencode - can launch Serena as a plain `serena` binary on PATH (faster than - re-resolving via `uvx` on every session) -5. Creates `~/Documents/obsidian/AI-memory/` and symlinks `~/basic-memory` - to it, so basic-memory MCP writes into the Obsidian vault by default -6. `brew install github-mcp-server` and warns if `GITHUB_PERSONAL_ACCESS_TOKEN` - isn't set in your shell — the MCP needs it to authenticate -7. `npm install -g task-master-ai` (workflow MCP, also exposes the - `task-master` CLI for `task-master init` per project) -8. `npm install` in `.opencode/plugin/` for the Phoenix bridge OTel deps -9. Generates `~/.config/opencode/opencode.json` from the repo's - `opencode.json`, rewriting relative plugin paths to absolute so - OpenCode loads the plugin regardless of which directory it's launched - from - -Step 9 is the reason the deployed config isn't a plain symlink. The -repo's `opencode.json` uses a relative plugin path (`./...`) so it stays -valid in place; the deployed copy is generated with that path resolved -to an absolute one. Edits to the repo's `opencode.json` need a re-run -of `./install.sh` to take effect. - -## Verify - -```sh -# Local model reachable -curl -s http://framework:11434/v1/models | jq '.data[].id' - -# SearXNG instance answers JSON -curl -s 'https://searxng.n0n.io/search?q=test&format=json' | jq '.results | length' +2. 
Install dependencies: +```bash +pip install garminconnect garmin_fit_sdk ``` -Then in opencode: - -``` -opencode -> /mcp # should list playwright, searxng, serena, basic-memory, - # sequential-thinking, github, task-master as connected -> search the web for "qwen3-coder benchmarks" -> open https://example.com and tell me the H1 -> use serena to find the definition of `parse_request` -> remember: this project ships its memory into the Obsidian vault -> /sequentialthinking think through the trade-offs of X vs Y -> list my recent github PRs across all repos -> task-master init # then ask the model to plan tasks for this project +3. Set environment variables: +```bash +export GARMIN_EMAIL=your_email@example.com +export GARMIN_PASSWORD=your_password ``` -For parallel agents, plain tmux + git worktree is enough at the 70B's -~2-pane concurrency ceiling. A two-line zsh helper covers the -"new isolated worktree → split tmux pane → start opencode" loop: +## Usage -```sh -work() { - local name="${1:?usage: work }" - local wt="../$(basename "$PWD")-$name" - git worktree add "$wt" -b "$name" && tmux split-window -h -c "$wt" "opencode" -} -unwork() { local wt="$PWD"; cd .. && git worktree remove --force "$wt"; } +Run the script to download and extract data: +```bash +python scripts/fetch_garmin_data.py ``` -Serena's first invocation in a project may take a few seconds — it -indexes the workspace via the language server. basic-memory's first -write creates the project layout under `~/Documents/obsidian/AI-memory/` -which Obsidian will pick up on its next vault scan. +The script will: +- Authenticate with Garmin Connect +- Download the most recent activity's FIT file +- Extract per-second record data (heart rate, GPS, cadence, etc.) +- Save the data to a SQLite database (`garmin_data/garmin.db`) for analysis -## Phoenix tracing +## Output -The plugin at `.opencode/plugin/phoenix-bridge.js` boots an OpenTelemetry -SDK on OpenCode startup and ships every span to Phoenix on the Framework -Desktop. 
With `experimental.openTelemetry: true` (already set in -`opencode.json`), OpenCode emits Vercel AI SDK spans that Phoenix renders -as a per-turn waterfall: user prompt → main agent's `ai.streamText` → -each tool call (built-in + MCP) with token counts and latencies inline. - -The plugin uses `@opentelemetry/exporter-trace-otlp-proto` (not `-http`) -because Phoenix's OTLP receiver only speaks protobuf — the JSON variant -returns 415. - -Spans go to Phoenix only. Earlier versions of this plugin dual-exported -to OpenLIT as well, but OpenLIT's container doesn't currently host an -OTLP receiver — the failing exporter cascaded into OpenCode's tool-call -parsing pipeline and broke tool use. Re-enable once `openlit.yml` adds -an `otel-collector` sidecar. - -Defaults can be overridden via env vars (set before launching opencode): - -| Variable | Default | Purpose | -|---|---|---| -| `PHOENIX_OTLP_ENDPOINT` | `http://framework:6006/v1/traces` | Phoenix HTTP target | -| `PHOENIX_SERVICE_NAME` | `opencode` | Phoenix project name | -| `PHOENIX_OTEL_DEBUG` | unset | `1` to surface OTel internal logs | - -### Verifying - -```sh -: > /tmp/phoenix-bridge.log # truncate prior runs -opencode # any directory; CWD doesn't matter -tail -f /tmp/phoenix-bridge.log -``` - -Healthy startup looks like: -``` -plugin function entered -endpoint=http://framework:6006/v1/traces serviceName=opencode -OTel imports resolved -sdk.start() returned -tracer obtained -boot span emitted (will flush within ~5s) -``` - -Then open — an `opencode` project should -appear with at least one `phoenix-bridge.boot` span. Send a prompt in -OpenCode and real LLM-call traces follow. - -If the plugin's deps aren't installed, OpenCode logs a warning and the -plugin no-ops — the rest of OpenCode still works fine. 
- -### Known limitations - -- **Subagent nesting is best-effort.** The plugin opens a parent span - per session and tries to stitch child sessions (Task-tool subagents) - under their parent, but Vercel AI SDK spans live in their own OTel - trace context. Until [sst/opencode#6142](https://github.com/sst/opencode/issues/6142) - exposes `sessionID` in the `chat.system.transform` hook, child-session - spans may show as separate traces in Phoenix. -- **Console output from plugins is swallowed by OpenCode's TUI.** That's - why init progress goes to `/tmp/phoenix-bridge.log` rather than stdout. - -## Notes - -- **SearXNG JSON output** must be enabled on the instance for the MCP - server to work. If `format=json` returns HTML or 403, edit - `settings.yml` on the SearXNG box: `search.formats: [html, json]`, - restart. -- **Playwright first-run** downloads ~200 MB of browser binaries into - `~/Library/Caches/ms-playwright/`. Subsequent runs are instant. -- **Tool-calling reliability** with Qwen3-Coder is decent but not - Claude-grade. If a tool call hangs or returns malformed JSON, the - model is the culprit, not the MCP. Worth trying the same prompt - against a hosted Claude or GPT-5 to confirm before debugging the - server. -- **Adding more MCP servers**: drop another entry under the `mcp` key - using the same `type/command/enabled` shape. The - [official MCP registry](https://registry.modelcontextprotocol.io/) - and [Awesome MCP Servers](https://mcpservers.org/) catalog options. -- **Tool-list bloat is real on a local 70B.** Every tool description - costs context. Five MCP servers exposing ~10 tools each puts the - active-tool list around 50 — manageable, but adding two more - full-spectrum servers (e.g. GitHub MCP at ~70 tools without scope - filtering, plus Context7) starts crowding effective context. Prefer - servers with toolset filtering or per-agent allow-lists in opencode. 
-- **basic-memory storage path.** The symlink `~/basic-memory` → - `~/Documents/obsidian/AI-memory` is created by `install.sh` only if - `~/basic-memory` doesn't already exist. If you'd previously run - basic-memory before this setup, move that directory's contents into - `AI-memory/` first, then delete `~/basic-memory` and re-run - `install.sh`. -- **Serena PATH gotcha.** `uv tool install` puts `serena` in - `~/.local/bin/`. If your shell rc doesn't export that, `opencode` - won't find the binary. The script warns; fix is one line in - `~/.zshrc`: `export PATH="$HOME/.local/bin:$PATH"`. -- **Serena tool trim** (`serena-ide-trim.yml`). The custom context - excludes 28 tools beyond what the built-in `ide` context already - filters. To re-expose any of them, edit - [`serena-ide-trim.yml`](serena-ide-trim.yml) and remove the entry - from `excluded_tools`, then re-run `./install.sh`. The path injection - (`./serena-ide-trim.yml` → absolute) is handled by install.sh's jq - pass at deploy time. -- **GitHub PAT.** Use a **classic** PAT (`ghp_…`) — auto-scope-filtering - only kicks in for classic tokens, not fine-grained ones. Without - it, the GitHub MCP exposes its full ~70-tool surface, which costs - ~23k tokens of context the local 70B can ill afford. Generate at - with the scopes you actually - want exposed. 
"""Fetch the most recent Garmin Connect activity and store it in SQLite.

Reads credentials from the GARMIN_EMAIL / GARMIN_PASSWORD environment
variables, downloads the newest activity's original FIT file into
``garmin_data/``, decodes it, and writes the session summary plus the
per-record samples to ``garmin_data/garmin.db``.

The third-party packages (``garminconnect``, ``garmin_fit_sdk``) are imported
inside the functions that need them so the database helpers stay importable
and testable on a machine that only has the standard library.
"""
import io
import os
import sqlite3
import zipfile
from contextlib import closing
from datetime import datetime

# FIT files store coordinates as signed 32-bit "semicircles":
# 2**31 semicircles correspond to 180 degrees.
SEMICIRCLES_TO_DEGREES = 180.0 / 2 ** 31


def get_db_path():
    """Return path to SQLite database."""
    return os.path.join('garmin_data', 'garmin.db')


def init_database(db_path):
    """Create the ``activities`` and ``records`` tables (and indexes) if missing.

    Args:
        db_path: Location of the SQLite file; its parent directory is created
            when the path has one.
    """
    db_dir = os.path.dirname(db_path)
    if db_dir:  # os.makedirs('') raises, so skip it for a bare filename
        os.makedirs(db_dir, exist_ok=True)

    # sqlite3's own context manager only commits/rolls back; closing() is what
    # actually releases the connection.
    with closing(sqlite3.connect(db_path)) as conn, conn:
        # Create activities table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS activities (
                id INTEGER PRIMARY KEY,
                start_time TEXT,
                end_time TEXT,
                distance REAL,
                duration INTEGER,
                activity_type TEXT,
                avg_heart_rate INTEGER,
                max_heart_rate INTEGER,
                avg_speed REAL,
                max_speed REAL,
                calories INTEGER,
                climb INTEGER,
                UNIQUE(id)
            )
        ''')

        # Create records table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS records (
                id INTEGER PRIMARY KEY,
                activity_id INTEGER,
                timestamp TEXT,
                heart_rate INTEGER,
                cadence INTEGER,
                speed REAL,
                altitude REAL,
                latitude REAL,
                longitude REAL,
                power INTEGER,
                distance REAL,
                FOREIGN KEY (activity_id) REFERENCES activities (id)
            )
        ''')

        # Create indexes for better query performance
        conn.execute('CREATE INDEX IF NOT EXISTS idx_records_activity ON records(activity_id)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_records_time ON records(timestamp)')


def get_garmin_client(email, password):
    """Log in to Garmin Connect.

    Returns:
        An authenticated client, or ``None`` when the import or login fails
        (the error is printed).
    """
    try:
        from garminconnect import Garmin  # lazy: only needed for network access

        client = Garmin(email, password)
        client.login()
        return client
    except Exception as e:
        print(f"Error authenticating: {e}")
        return None


def download_activities(client):
    """Return a list holding the most recent activity, or ``None`` on error."""
    try:
        return client.get_activities(0, 1)  # Get most recent activity
    except Exception as e:
        print(f"Error downloading activity list: {e}")
        return None


def _unwrap_fit_payload(payload):
    """Return raw FIT bytes from a download that may be a ZIP archive.

    The "original" download format is delivered as a ZIP wrapping the .fit
    file; a payload that is not a ZIP is returned unchanged.

    Raises:
        ValueError: the payload is a ZIP without any ``.fit`` member.
    """
    buffer = io.BytesIO(payload)
    if not zipfile.is_zipfile(buffer):
        return payload
    with zipfile.ZipFile(buffer) as archive:
        fit_members = [name for name in archive.namelist()
                       if name.lower().endswith('.fit')]
        if not fit_members:
            raise ValueError("Downloaded archive contains no .fit file")
        return archive.read(fit_members[0])


def download_fit_file(client, activity_id, output_dir):
    """Download the activity's FIT file into ``output_dir``.

    Returns:
        Path of the written ``{activity_id}.fit`` file, or ``None`` on error
        (the error is printed).
    """
    try:
        payload = client.download_activity(
            activity_id, dl_fmt=client.ActivityDownloadFormat.ORIGINAL)
        fit_bytes = _unwrap_fit_payload(payload)
        os.makedirs(output_dir, exist_ok=True)
        fit_path = os.path.join(output_dir, f"{activity_id}.fit")
        with open(fit_path, 'wb') as f:
            f.write(fit_bytes)
        return fit_path
    except Exception as e:
        print(f"Error downloading FIT file: {e}")
        return None


def extract_fit_data(fit_file_path):
    """Decode a FIT file.

    Returns:
        ``(messages, errors)`` as produced by the FIT decoder, or
        ``(None, None)`` when decoding fails (the error is printed).
    """
    try:
        from garmin_fit_sdk import Decoder, Stream  # lazy, see module docstring

        stream = Stream.from_file(fit_file_path)
        decoder = Decoder(stream)
        messages, errors = decoder.read()
        return messages, errors
    except Exception as e:
        print(f"Error decoding FIT file: {e}")
        return None, None


def _messages_of_type(messages, kind):
    """Return the field dicts of all decoded messages of one FIT type.

    Two input shapes are accepted:
      * a dict keyed ``'<kind>_mesgs'`` (what the FIT decoder returns);
      * a list of ``{'type': kind, 'message': {...}}`` dicts (the shape this
        script originally assumed).
    """
    if not messages:
        return []
    if isinstance(messages, dict):
        return list(messages.get(f'{kind}_mesgs') or [])
    return [m['message'] for m in messages if m.get('type') == kind]


def _first_present(fields, *names):
    """Return the first non-None value among ``names`` in ``fields``, else None."""
    for name in names:
        value = fields.get(name)
        if value is not None:
            return value
    return None


def _sql_value(value):
    """Convert datetimes to ISO-8601 text; pass every other value through."""
    return value.isoformat() if isinstance(value, datetime) else value


def _degrees(semicircles):
    """Convert a FIT semicircle coordinate to degrees (``None`` stays ``None``)."""
    return None if semicircles is None else semicircles * SEMICIRCLES_TO_DEGREES


def save_to_database(db_path, messages, activity_id=None):
    """Store one decoded activity (session summary + records) in SQLite.

    Args:
        db_path: SQLite file previously prepared by ``init_database``.
        messages: Decoded FIT messages (see ``_messages_of_type`` for shapes).
        activity_id: Garmin activity id used as primary key. When omitted,
            SQLite assigns a fresh id.

    Returns:
        Number of record rows inserted; 0 when there is no session message or
        the activity is already stored (records are not duplicated on re-run).
    """
    sessions = _messages_of_type(messages, 'session')
    if not sessions:
        print("No session message found")
        return 0

    session = sessions[0]
    records = _messages_of_type(messages, 'record')

    with closing(sqlite3.connect(db_path)) as conn, conn:
        # Insert activity. .get()-style lookups store NULL for fields the
        # device did not record instead of aborting with KeyError.
        cursor = conn.execute('''
            INSERT OR IGNORE INTO activities
            (id, start_time, end_time, distance, duration, activity_type,
             avg_heart_rate, max_heart_rate, avg_speed, max_speed,
             calories, climb)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            activity_id,
            _sql_value(session.get('start_time')),
            # NOTE(review): FIT session messages carry no 'end_time' field;
            # falling back to the session 'timestamp' — confirm this is the
            # intended end marker.
            _sql_value(_first_present(session, 'end_time', 'timestamp')),
            session.get('total_distance'),
            session.get('total_elapsed_time'),
            session.get('sport'),
            session.get('avg_heart_rate'),
            session.get('max_heart_rate'),
            _first_present(session, 'avg_speed', 'enhanced_avg_speed'),
            _first_present(session, 'max_speed', 'enhanced_max_speed'),
            session.get('total_calories'),
            session.get('total_ascent'),
        ))
        if cursor.rowcount == 0:
            # Row was ignored: this activity (and its records) is already stored.
            print(f"Activity {activity_id} already in database; skipping")
            return 0
        if activity_id is None:
            activity_id = cursor.lastrowid

        # Insert records, linked to the activity by its id.
        conn.executemany('''
            INSERT INTO records
            (activity_id, timestamp, heart_rate, cadence, speed,
             altitude, latitude, longitude, power, distance)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', [
            (
                activity_id,
                _sql_value(record.get('timestamp')),
                record.get('heart_rate'),
                record.get('cadence'),
                _first_present(record, 'speed', 'enhanced_speed'),
                _first_present(record, 'altitude', 'enhanced_altitude'),
                _degrees(record.get('position_lat')),
                _degrees(record.get('position_long')),
                record.get('power'),
                record.get('distance'),
            )
            for record in records
        ])

    print(f"Saved {len(records)} records to database")
    return len(records)


def main():
    """Run the full pipeline: authenticate, download, decode, store."""
    # Authentication credentials
    email = os.getenv('GARMIN_EMAIL')
    password = os.getenv('GARMIN_PASSWORD')

    if not email or not password:
        print("Please set GARMIN_EMAIL and GARMIN_PASSWORD environment variables")
        return

    # Initialize database
    db_path = get_db_path()
    init_database(db_path)

    # Initialize client
    client = get_garmin_client(email, password)
    if not client:
        return

    # Get activities
    activities = download_activities(client)
    if not activities:
        return

    activity_id = activities[0]['activityId']

    # Download FIT file
    fit_path = download_fit_file(client, activity_id, 'garmin_data')
    if not fit_path:
        return

    # Extract data
    messages, errors = extract_fit_data(fit_path)
    if errors:
        print(f"FIT file errors: {errors}")
    if not messages:
        # Decoding failed (already reported); nothing to store.
        return

    # Save to database
    save_to_database(db_path, messages, activity_id)


if __name__ == "__main__":
    main()
+So switching between, say, the daily-driver 30B and the 235B long-task +model is a stop-then-start of whole compose stacks. + +`scripts/swap-model` (deployed to `/usr/local/bin/swap-model` on the +box) encodes the coexistence table + per-service health probes so the +swap is one command: + +```sh +ssh framework swap-model 235b # stop conflicting svcs, start 235B, wait for /health +ssh framework swap-model coder # back to Qwen3-Coder-30B (Ollama) +ssh framework swap-model status # what's currently up? +ssh framework swap-model none # everything down, free the GPU arena +``` + +Coexistence table baked into the script: + +| Target | What runs | What gets stopped | +|---|---|---| +| `coder` | ollama (Qwen3-Coder-30B) | 235b, comfyui | +| `235b` | qwen3-235b (llama.cpp) | ollama, llama, kimi, comfyui | +| `kimi` | kimi-linear (vLLM) | 235b, comfyui (ollama can stay) | +| `comfyui` | comfyui | 235b, kimi (ollama can stay) | +| `none` | — | all five | + +OpenWebUI, LiteLLM, Phoenix, Beszel, etc. are non-GPU and always-on; +the script ignores them. Wait timeout defaults to 600 s (235B's 88 GB +cold load takes 3-5 min); override with `SWAP_WAIT_TIMEOUT=`. + +A Mac-side wrapper at `bin/swap-model` in the repo runs +`ssh framework /usr/local/bin/swap-model "$@"` — symlink it onto your +PATH so `swap-model X` works from any shell: + +```sh +ln -s "$(pwd)/bin/swap-model" /usr/local/bin/swap-model +``` + +Override host with `SWAP_MODEL_HOST=10.0.0.70 swap-model 235b` when +Tailscale resolution is misbehaving. + +### Inference engine consolidation (in progress) + +The model count is now at the crossover point, so the manual +`swap-model` script is being replaced. The plan, phased: + +**Framing.** Full consolidation to one engine isn't possible — **vLLM +stays** as the specialist for the architectures llama.cpp can't serve +(Kimi-Linear's KDA/MLA hybrid, the 235B). 
And the hard part of the +memory problem — the cross-engine rule "235B can't share the arena +with anything" — survives any consolidation, because it's a constraint +*between* vLLM and the GGUF tier. So the real decision is only about +the **GGUF tier** (Ollama and the standalone llama.cpp stack do the +same job) and which tool, if any, coordinates the swaps. + +**M0 — benchmark the GGUF tier (the deciding unknown).** Ollama already +auto-loads/unloads; the only question is whether its bundled llama.cpp +keeps up with the gfx1151-tuned kyuz0 build on *this* box. +`scripts/bench-engines` (deployed to `/usr/local/bin/bench-engines`) +serves the identical GGUF on each engine in isolation and reports +decode/prefill t/s using each engine's own timing fields: + +```sh +ssh framework bench-engines # runs both, prints a verdict line +``` + +**M1 — pick the end-state from the benchmark.** Both are two engines +(GGUF tier + vLLM); the difference is the coordinator: + +- **Option 1 (Ollama ≥ ~85 % of llama.cpp decode):** Ollama + vLLM, + drop the standalone llama.cpp stack. Ollama self-swaps its tier, so + **no llama-swap needed**; the one cross-engine rule ("stop Ollama + before 235B") is a few lines. Fewest moving parts. +- **Option 2 (kyuz0 lead is large):** keep llama.cpp behind + **[llama-swap](https://github.com/mostlygeek/llama-swap)**, drop + Ollama. This is where llama-swap earns its place — a YAML-configured + OpenAI-compatible proxy whose **groups** encode the coexistence table + declaratively *and* can launch the vLLM backends too, so one tool + coordinates the whole arena and the `swap-model` script retires. It + also subsumes most of LiteLLM's routing role. + +Two alternatives considered, both weaker fits: +- **[LocalAI](https://github.com/mudler/LocalAI)** — bigger "replace + your whole stack" platform with 36+ backends; would force a deeper + rewrite for marginal gain on a 4-model stack. 
+- **[GPUStack](https://github.com/gpustack/gpustack)** — multi-node + cluster manager (DB-backed, UI-first); built for fleets, overkill + on a single Strix Halo box. + ## Tunables Top of `deploy.py`: @@ -272,3 +372,77 @@ model server. The llama.cpp image is `kyuz0/amd-strix-halo-toolboxes:vulkan-radv`, gfx1151-optimized with rocWMMA flash attention (the latter only kicks in on the ROCm tags). Auto-rebuilt against llama.cpp master. + +## Remote IDE (code-server) + +**code-server** (`/srv/docker/code-server`, http://framework:8443) — +full VS Code in the browser, served from the box. The point: Claude +Code (extension from Open VSX + CLI) runs *inside the container*, so +long agent tasks execute here and survive the laptop sleeping. Any +tailnet device gets the same editor and the same running session. + +Bring-up: `cd /srv/docker/code-server && docker compose up -d`, then +see `/srv/docker/code-server/README.md` for the one-time extension +install + sign-in flow. No password is set (Tailscale-only trust +model, like everything else here) — set `HASHED_PASSWORD` before this +host ever sees other traffic; a browser terminal is a shell. + +The workspace is container-scoped on purpose (no host filesystem, no +docker socket). For host-level Claude work, use the +"Claude Code on the box" section below instead. + +## Workspace manager (Coder) — pilot + +**Coder** (`/srv/docker/coder`, http://framework:7080) — self-hosted +workspace manager: a web dashboard that stamps out per-project dev +containers from Terraform templates. 
The shipped `code-server` template +gives each workspace browser VS Code with the Claude Code extension +(from Open VSX) pre-installed and the `claude` CLI on PATH; workspace +home volumes persist `~/.claude` creds and repos across stop/start. +Workspaces publish no host ports — everything tunnels through the +dashboard — and idle autostop hands parked projects' RAM back to the +inference stacks. + +Bring-up + template push + first workspace: +`/srv/docker/coder/README.md`. The `.env` (docker GID, access URL, +Postgres password) is auto-generated by `./run.sh` — nothing to fill +in by hand. + +This is a **pilot** against the standalone code-server stack above: +after a week of real use, one of them retires. Same security posture +as OpenHands (docker-socket holder, spawns code-running containers) — +Tailscale-only, never expose :7080 further. + +## Claude Code on the box (remote-control) + +For long unattended runs steered from a phone or browser — no IDE +involved — Claude Code's official Remote Control feature bridges a +session running on the box to claude.ai/code and the Claude mobile +app. Needs Claude Code ≥ 2.1.51 with a claude.ai login (Pro/Max plan; +API keys don't work for this), traffic relays outbound-only over TLS +to Anthropic — no inbound ports. + +```sh +ssh framework +tmux new -s rc +cd ~/src/whatever +claude remote-control --name "Strix Halo" +# prints a session URL → open on any device, or find it in the +# session list at claude.ai/code / the Claude app's Code tab +``` + +Operational notes (state of the feature as of 2026-06): + +- The local process is the engine — tasks keep executing with **no + client attached**; clients are viewports. But if the process dies, + the session is gone (no `--resume` equivalent for remote-control + sessions yet, issue #30447). Hence tmux, always. +- It needs a TTY — `nohup`/systemd don't work. tmux satisfies this. 
+- `--spawn worktree` gives each connecting session its own git + worktree; `--spawn same-dir` (default) shares the directory. +- Headless boxes print the session URL but no QR (#34764) — copy the + URL or use the session list. +- Mobile clients sometimes drop after long idle (#28914, #34255); + reconnecting always works — the task on the box is unaffected. +- Run `claude` interactively once per project dir first to clear the + workspace-trust prompt before automating anything. diff --git a/pyinfra/framework/compose/code-server.yml b/pyinfra/framework/compose/code-server.yml new file mode 100644 index 0000000..3d5cf38 --- /dev/null +++ b/pyinfra/framework/compose/code-server.yml @@ -0,0 +1,55 @@ +# code-server — VS Code in the browser, served from the box. The point: +# Claude Code (extension + CLI) runs *inside this container*, so long +# agent tasks keep running when the laptop sleeps. Any device on the +# tailnet gets the full VS Code experience at http://framework:8443. +# +# linuxserver image over codercom/code-server for the PUID/PGID +# convention (matches host `noise` UID 1000 so workspace files don't +# come out root-owned) and the s6 init that chowns /config. +# +# Extensions come from Open VSX (code-server's default registry). The +# official Claude Code extension is published there: +# https://open-vsx.org/extension/Anthropic/claude-code +# +# Persistent state: /config is the container user's $HOME — extensions, +# settings, ~/.claude (OAuth creds + session history), ~/.local/bin +# (claude CLI). Survives container recreation; back up one path. +services: + code-server: + image: lscr.io/linuxserver/code-server:latest + container_name: code-server + restart: unless-stopped + + # No PASSWORD env set — auth is disabled, same trust model as every + # other service here: reachable over Tailscale only. 
If this host + # ever sees LAN/internet traffic, set HASHED_PASSWORD (and even then + # prefer "127.0.0.1:8443:8443" + tunnel — a browser terminal is a + # shell on the box's docker network). + ports: + - "8443:8443" + + environment: + - PUID=1000 + - PGID=1000 + - TZ=Etc/UTC + # Open this folder by default instead of the "welcome" screen. + - DEFAULT_WORKSPACE=/workspace + # tmux inside the container: detachable terminals for long claude + # runs, so a browser-tab close or code-server reconnect hiccup + # can't orphan your view of a session (the process itself never + # depended on the tab — it's a child of the container). + - DOCKER_MODS=linuxserver/mods:universal-package-install + - INSTALL_PACKAGES=tmux + + extra_hosts: + # Reach the sibling services (Ollama :11434, LiteLLM :4000, + # Phoenix :6006, ...) from the integrated terminal / extensions. + - "host.docker.internal:host-gateway" + + volumes: + - /srv/docker/code-server/config:/config + # Default project area. Container-scoped on purpose — Claude Code + # in here can't touch /srv/docker compose stacks or the host. + # Mount additional repos explicitly when you want them editable: + # - /home/noise/src/somerepo:/workspace/somerepo + - /srv/docker/code-server/workspace:/workspace diff --git a/pyinfra/framework/compose/code-server/README.md b/pyinfra/framework/compose/code-server/README.md new file mode 100644 index 0000000..567990a --- /dev/null +++ b/pyinfra/framework/compose/code-server/README.md @@ -0,0 +1,85 @@ +# code-server — VS Code in the browser + +Full VS Code served from the box at . The reason +it exists in this stack: **Claude Code runs inside the container**, so +long agent tasks execute on the Framework Desktop and survive the +laptop sleeping, the browser tab closing, or you walking away with the +phone. Any device on the tailnet gets the same editor with the same +state. 
+ +## Bring-up + +```sh +cd /srv/docker/code-server && docker compose up -d +``` + +First start pulls the image and runs the universal-package-install mod +(installs tmux), so give it a minute before the UI answers on :8443. + +> **HTTPS is required for extension panels.** VS Code webviews (the +> Claude Code panel included) need a secure context; over plain +> `http://framework:8443` they render blank. Serve it via Tailscale: +> `sudo tailscale serve --bg --https=8443 8443` → +> `https://framework.<tailnet>.ts.net:8443`. Localhost also counts as +> secure (`ssh -L 8443:localhost:8443 framework`). + +## One-time setup (in the code-server UI) + +1. **Install the Claude Code extension.** Extensions panel → search + "Claude Code". code-server uses the Open VSX registry, where + Anthropic publishes the official extension + (<https://open-vsx.org/extension/Anthropic/claude-code>). +2. **Sign in.** The OAuth flow in a browser context is a copy/paste + dance: the extension shows a URL → open it in another tab → approve + → paste the code back. Credentials land in `/config/.claude/` and + persist across container recreates. +3. **(Optional) CLI in the terminal.** The extension bundles its own + CLI, but for tmux-based long runs install it explicitly: + + ```sh + curl -fsSL https://claude.ai/install.sh | bash + ``` + + Lands in `~/.local/bin` (= `/config/.local/bin`, persisted). + +## Long-running tasks + +The Claude process is a child of the container, not of your browser +tab — closing the tab does nothing to it. Reattach from any device and +the session is where you left it. For multi-hour unattended runs, +prefer a tmux pane in the integrated terminal: + +```sh +tmux new -s longtask +claude # kick off the task, detach with C-b d +``` + +If you want to steer from a phone or claude.ai/code instead of this UI, +use `claude remote-control` (see the framework README's +"Claude Code on the box" section) — that works in the host's tmux too, +no container needed.
+ +## Scope and security + +- **No password is set** — same trust model as every other service on + this Tailscale-only box. If the host ever sees LAN/internet traffic, + set `HASHED_PASSWORD` in the compose env and bind to localhost; a + browser terminal is a shell. +- The workspace is **container-scoped on purpose**: Claude Code in here + sees `/workspace` and `/config`, not the host filesystem or the + docker socket. Bind-mount specific repos into `/workspace/` + when you want them editable (host UID 1000 == container PUID 1000, + so ownership just works). +- `host.docker.internal` reaches the sibling services — handy for + pointing scripts at Ollama (:11434) or LiteLLM (:4000) from the + integrated terminal. + +## State layout + +| Path (host) | What lives there | +| -------------------------------------- | ------------------------------------------------- | +| `/srv/docker/code-server/config` | `$HOME`: extensions, settings, `~/.claude`, CLIs | +| `/srv/docker/code-server/workspace` | default project area (`DEFAULT_WORKSPACE`) | + +Back up `config` if you care about extension state and Claude session +history; everything else is reproducible from the repo. diff --git a/pyinfra/framework/compose/coder.yml b/pyinfra/framework/compose/coder.yml new file mode 100644 index 0000000..bacd975 --- /dev/null +++ b/pyinfra/framework/compose/coder.yml @@ -0,0 +1,64 @@ +# Coder — self-hosted workspace manager (PILOT). Web dashboard at :7080 +# that stamps out per-project dev containers from Terraform templates; +# each workspace gets browser code-server with the Claude Code extension +# pre-installed. Evaluating against the standalone code-server stack +# (compose/code-server.yml, kept as-is during the pilot) — verdict +# criteria in compose/coder/README.md. +# +# The server container holds the host docker socket and spawns workspace +# containers as siblings (same pattern + same security posture as +# OpenHands: fine Tailscale-only, never expose further). 
group_add needs +# the socket's host GID — host-specific, so it comes from the sibling +# .env, which pyinfra generates on the box on first deploy along with a +# random one-time Postgres password. +# +# Workspaces don't publish host ports — code-server and terminals are +# reached through the dashboard's tunnel. No port bookkeeping per +# project. +services: + coder: + # Pin and bump deliberately — Coder releases weekly and the workspace + # provisioner/agent protocol moves with it. Verify at + # https://github.com/coder/coder/releases. + image: ghcr.io/coder/coder:v2.33.8 + container_name: coder + restart: unless-stopped + ports: + - "7080:7080" + environment: + CODER_HTTP_ADDRESS: "0.0.0.0:7080" + # The URL clients are told to use — tailnet MagicDNS name, same + # convention as every other service tile. + CODER_ACCESS_URL: "${CODER_ACCESS_URL}" + CODER_PG_CONNECTION_URL: "postgresql://coder:${POSTGRES_PASSWORD}@database/coder?sslmode=disable" + group_add: + # GID of /var/run/docker.sock — generated into .env by deploy.py. + - "${DOCKER_GROUP_ID}" + volumes: + - /var/run/docker.sock:/var/run/docker.sock + # Repo-shipped Terraform templates (source of truth: + # pyinfra/framework/compose/coder/templates/). Push after changes: + # docker compose exec coder coder templates push code-server \ + # --directory /templates/code-server --yes + - /srv/docker/coder/templates:/templates:ro + depends_on: + database: + condition: service_healthy + + database: + image: postgres:17 + container_name: coder-db + restart: unless-stopped + environment: + POSTGRES_USER: coder + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_DB: coder + volumes: + # Entrypoint starts as root and chowns this to the postgres uid; + # deploy.py just creates the mount point. 
+ - /srv/docker/coder/postgres:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U coder -d coder"] + interval: 5s + timeout: 5s + retries: 5 diff --git a/pyinfra/framework/compose/coder/README.md b/pyinfra/framework/compose/coder/README.md new file mode 100644 index 0000000..0b34814 --- /dev/null +++ b/pyinfra/framework/compose/coder/README.md @@ -0,0 +1,110 @@ +# Coder — workspace manager (pilot) + +Self-hosted control plane at <http://framework:7080> that creates +per-project dev containers from Terraform templates. Each workspace: +its own container, browser code-server with the Claude Code extension +pre-installed, no host-port bookkeeping (everything tunnels through the +dashboard), and idle autostop so parked projects give their RAM back to +the inference stacks. + +**Pilot status.** Evaluating against the standalone code-server stack +(`/srv/docker/code-server`, kept as-is meanwhile). Verdict after a week +of real use: does the dashboard + autostop + create-from-template earn +an always-on control plane + Postgres? If yes, the standalone stack +retires; if no, Coder goes and a thin `workspace` spawner script +replaces it. + +## Bring-up + +```sh +cd /srv/docker/coder && docker compose up -d +``` + +The sibling `.env` (DOCKER_GROUP_ID, CODER_ACCESS_URL, random Postgres +password) is generated by deploy.py on first `./run.sh` — no hand-fill +needed. First visit to <http://framework:7080> creates the admin +account (pick anything; it's local to the box). + +> **HTTPS is required for extension panels.** VS Code webviews (the +> Claude Code panel, markdown preview, etc.) run on service workers, +> which browsers only allow in a secure context — over plain +> `http://framework:7080` the editor works but webview panels render +> blank.
Fix via Tailscale Serve (real Let's Encrypt cert for the +> tailnet name; enable "HTTPS Certificates" once in the Tailscale +> admin console): +> +> ```sh +> sudo tailscale serve --bg 7080 +> # then in .env: CODER_ACCESS_URL=https://framework.<tailnet>.ts.net +> docker compose up -d # recreate server with new access URL +> # restart any existing workspace — app URLs derive from the access URL +> ``` +> +> Localhost is also a secure context, so +> `ssh -L 7080:localhost:7080 framework` + http://localhost:7080 works +> in a pinch. + +## Push the template (one-time + after edits) + +The `coder` CLI ships inside the server image; authenticate it once: + +```sh +docker compose exec coder coder login http://localhost:7080 +# prints a /cli-auth URL — open http://framework:7080/cli-auth in your +# browser, copy the session token, paste it back +``` + +Then push: + +```sh +docker compose exec coder coder templates push code-server \ + --directory /templates/code-server --yes +``` + +Template source of truth is the repo +(`pyinfra/framework/compose/coder/templates/code-server/main.tf`) — +edit there, `./run.sh`, re-push. Edits on the box get overwritten. + +## First workspace + +Dashboard → Workspaces → Create → `code-server` template → name it +after the project. Open the code-server app tile, then one-time +Claude sign-in: the extension shows an OAuth URL → open in another tab +→ approve → paste the code back. Credentials live in `~/.claude` on +the workspace's home volume and survive stop/start and rebuilds. + +Set **idle autostop** under Template → Settings → Schedule (suggest +1–2 h inactivity). Activity = open code-server tab, SSH, web terminal.
+ +## What persists where + +| Thing | Where | +| -------------------------------- | -------------------------------------------- | +| Workspace home (repos, ~/.claude, extensions) | named volume `coder-<workspace-id>-home` | +| Control-plane state (users, templates, workspace defs) | Postgres → `/srv/docker/coder/postgres` | +| Template source | this repo, shipped to `/srv/docker/coder/templates` | + +A stopped workspace's container is deleted; only the home volume +remains. `docker volume ls | grep coder-` to audit. + +## Security + +Same posture as OpenHands: the server container holds the docker +socket and spawns code-running containers — root-equivalent on the +box. Tailscale-only exposure is the mitigation; never forward :7080 +anywhere else. Coder does have real auth (the admin account), but +treat that as defense-in-depth, not as permission to expose it. + +## Notes + +- Workspaces reach the sibling services via `host.docker.internal` + (Ollama :11434, LiteLLM :4000, Phoenix :6006, ...). +- Long Claude runs: same rules as anywhere — the process lives in the + workspace container, so it survives laptop/browser disconnects, but + **autostop will kill an idle-looking workspace mid-run**. For + multi-hour unattended tasks either bump the workspace's TTL in the + dashboard or use the host-side tmux + `claude remote-control` + pattern (framework README, "Claude Code on the box"). +- The base image is `codercom/enterprise-base:ubuntu` (sudo-enabled, + common toolchain). Per-project images are a later refinement — + swap the `image` in main.tf or parameterize with `coder_parameter`.
diff --git a/pyinfra/framework/compose/coder/templates/code-server/main.tf b/pyinfra/framework/compose/coder/templates/code-server/main.tf new file mode 100644 index 0000000..d4214e5 --- /dev/null +++ b/pyinfra/framework/compose/coder/templates/code-server/main.tf @@ -0,0 +1,121 @@ +# code-server workspace template — one dev container per project, with +# browser VS Code and Claude Code (extension + CLI) ready on first open. +# +# Source of truth is the repo copy at +# pyinfra/framework/compose/coder/templates/code-server/main.tf; pyinfra +# ships it to /srv/docker/coder/templates/, mounted read-only into the +# server container at /templates. Push after edits: +# cd /srv/docker/coder +# docker compose exec coder coder templates push code-server \ +# --directory /templates/code-server --yes + +terraform { + required_providers { + coder = { + source = "coder/coder" + } + docker = { + source = "kreuzwerker/docker" + } + } +} + +provider "coder" {} + +# Talks to the host daemon via the socket mounted into the server +# container (default unix:///var/run/docker.sock) — workspace containers +# are siblings of the compose stacks, not children. +provider "docker" {} + +data "coder_workspace" "me" {} +data "coder_workspace_owner" "me" {} + +resource "coder_agent" "main" { + arch = "amd64" + os = "linux" + + # Claude Code CLI — native installer, lands in ~/.local/bin inside the + # persisted home volume, so this is a no-op after the first start. The + # code-server extension bundles its own CLI, but having `claude` on + # PATH enables tmux-based long runs in the workspace terminal. 
+ startup_script = <<-EOT + set -e + command -v claude >/dev/null 2>&1 || curl -fsSL https://claude.ai/install.sh | bash + EOT + + env = { + GIT_AUTHOR_NAME = data.coder_workspace_owner.me.full_name + GIT_AUTHOR_EMAIL = data.coder_workspace_owner.me.email + GIT_COMMITTER_NAME = data.coder_workspace_owner.me.full_name + GIT_COMMITTER_EMAIL = data.coder_workspace_owner.me.email + } + + metadata { + display_name = "CPU" + key = "cpu" + script = "coder stat cpu" + interval = 10 + timeout = 1 + } + metadata { + display_name = "RAM" + key = "mem" + script = "coder stat mem" + interval = 10 + timeout = 1 + } +} + +# Browser VS Code inside the workspace, surfaced as a dashboard app. +# Extensions install from Open VSX — anthropic.claude-code is the +# official Claude Code extension +# (https://open-vsx.org/extension/Anthropic/claude-code). OAuth creds +# land in ~/.claude inside the home volume and survive rebuilds. +module "code_server" { + count = data.coder_workspace.me.start_count + source = "registry.coder.com/coder/code-server/coder" + version = "~> 1.0" + agent_id = coder_agent.main.id + folder = "/home/coder/project" + + extensions = [ + "anthropic.claude-code", + ] +} + +# Home survives workspace stop/start AND template-driven rebuilds — +# ignore_changes keeps Terraform from recreating the volume (and wiping +# ~/.claude, extensions, repos) when template metadata shifts. +resource "docker_volume" "home" { + name = "coder-${data.coder_workspace.me.id}-home" + lifecycle { + ignore_changes = all + } +} + +resource "docker_container" "workspace" { + # start_count is 0 when the workspace is stopped — the container is + # deleted but the home volume above persists. This is what idle + # autostop reclaims: RAM back to the inference stacks. 
+ count = data.coder_workspace.me.start_count + image = "codercom/enterprise-base:ubuntu" + name = "coder-${data.coder_workspace_owner.me.name}-${lower(data.coder_workspace.me.name)}" + hostname = data.coder_workspace.me.name + + # Agent bootstrap, rewritten to reach the control plane through the + # docker bridge (the workspace is a sibling container; "localhost" + # inside it isn't the Coder server). + entrypoint = ["sh", "-c", replace(coder_agent.main.init_script, "/localhost|127\\.0\\.0\\.1/", "host.docker.internal")] + env = ["CODER_AGENT_TOKEN=${coder_agent.main.token}"] + + host { + host = "host.docker.internal" + ip = "host-gateway" + } + + volumes { + container_path = "/home/coder" + volume_name = docker_volume.home.name + read_only = false + } +} diff --git a/pyinfra/framework/compose/homepage/services.yaml b/pyinfra/framework/compose/homepage/services.yaml index 9ebd0c6..0bc5ca9 100644 --- a/pyinfra/framework/compose/homepage/services.yaml +++ b/pyinfra/framework/compose/homepage/services.yaml @@ -95,6 +95,28 @@ server: localhost-docker container: openhands + - code-server: + icon: code-server.svg + href: http://framework:8443 + description: VS Code in the browser — Claude Code long tasks live here + server: localhost-docker + container: code-server + + - Coder: + icon: coder.svg + href: http://framework:7080 + description: Workspace manager (pilot) — per-project containers from templates + server: localhost-docker + container: coder + # /api/v2/buildinfo is unauthenticated — cheap liveness + version. 
+ widget: + type: customapi + url: http://host.docker.internal:7080/api/v2/buildinfo + refreshInterval: 30000 + mappings: + - field: version + label: Version + - Observability: - Beszel: icon: beszel.svg diff --git a/pyinfra/framework/compose/qwable.yml b/pyinfra/framework/compose/qwable.yml new file mode 100644 index 0000000..2367c4d --- /dev/null +++ b/pyinfra/framework/compose/qwable.yml @@ -0,0 +1,102 @@ +# Qwable-3.6-27B (Qwen3.6-27B fine-tuned on Fable-5-style reasoning +# traces — "Qwen + Fable") via the kyuz0 rocm-7.2.2 Strix Halo toolbox. +# Same image + unified-memory recipe as compose/llama.yml; deltas are +# model path, port, alias. +# https://github.com/kyuz0/amd-strix-halo-toolboxes +# Model: https://huggingface.co/Mia-AiLab/Qwable-3.6-27b (MIT) +# +# What it's for. A "thinks-like-Fable-5" interactive model — structured, +# step-by-step explanatory output. Dense 27B (NOT MoE), so it's slower +# per token than the 30B-A3B MoE workhorses despite being smaller on +# disk: all 27B weights load per token. Bandwidth math (256 GB/s ÷ +# ~16.5 GB) → ~10-15 tok/s decode. Interactive but not snappy. +# +# Coexistence. At ~16.5 GB (Q4_K_M) it's the smallest GPU resident here +# and fits alongside llama 30B (port 8080), Ollama, or Kimi in the +# ~110 GB merged arena. It does NOT fit alongside qwen3-235b (88.8 GB) +# or comfyui — swap-model tears those down for the `qwable` target. +# `restart: "no"`: you bring it up deliberately via swap-model, it won't +# auto-start after a reboot and surprise-collide with a big model. +# +# Weights. Single-file GGUF (not sharded). Download path on the box +# (see compose/qwable/README.md): +# hf download Mia-AiLab/Qwable-3.6-27b \ +# 'Qwable-27b_Q4_K_M.gguf' \ +# --local-dir /models/qwen/Qwable-3.6-27b +# Verify exact filename in the HF repo before downloading. +# +# Port 8082 — distinct from llama 30B (8080) and qwen3-235b (8081). 
+services: + qwable: + image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2 + container_name: qwable + # Manual start only — see header note about GPU contention with + # the big models. swap-model brings it up/down. + restart: "no" + devices: + # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan + # only needs dri. Don't drop kfd when on the rocm-* tag. + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + # Numeric GIDs of host's video (44) and render (991) groups — + # required for /dev/kfd + /dev/dri access from inside the container. + group_add: + - "44" + - "991" + shm_size: 8g + ipc: host + environment: + # Unified-memory recipe (same as compose/llama.yml + kimi-linear + + # qwen3-235b). BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these + # flags merge the rocminfo pools into one ~110 GB arena. kyuz0's + # image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION. + - HSA_XNACK=1 + - HSA_FORCE_FINE_GRAIN_PCIE=1 + volumes: + - /models:/models:ro + ports: + - "8082:8082" + entrypoint: ["llama-server"] + command: + - --model + - /models/qwen/Qwable-3.6-27b/Qwable-27b_Q4_K_M.gguf + # OpenAI-compatible served name (matches what opencode/curl request + # as "model"). Provider-side name lives in opencode.json if/when + # this gets wired as a provider. + - --alias + - qwable + - --host + - 0.0.0.0 + - --port + - "8082" + # Push all layers to GPU. "999" = all available. A 27B Q4 (~16.5 GB) + # fits the merged arena with huge headroom. + - --n-gpu-layers + - "999" + # 64K to match llama/qwen3-235b — keeps opencode auto-compaction + # behaviour consistent across providers. Tons of arena headroom + # here (model is small), so this can ramp far higher if a workflow + # needs it; see compose/qwable/README.md. + - --ctx-size + - "65536" + # No-mmap is the Strix Halo standard — forces full GPU load. 
+ - --no-mmap + # Flash attention — required for q8_0 KV cache; modern llama-server + # takes a value (on/off/auto), bare --flash-attn is deprecated. + - --flash-attn + - "on" + # Quantize KV cache to int8 — halves KV memory at minor/no quality + # loss. Matches the other llama.cpp stacks. + - --cache-type-k + - q8_0 + - --cache-type-v + - q8_0 + # Use the model's embedded jinja chat template — Qwable inherits + # Qwen3.6's chat format, which the Fable-trace fine-tune relies on. + - --jinja + # Expose Prometheus metrics at /metrics — scraped by OpenLIT. + - --metrics diff --git a/pyinfra/framework/compose/qwable/README.md b/pyinfra/framework/compose/qwable/README.md new file mode 100644 index 0000000..7338c4a --- /dev/null +++ b/pyinfra/framework/compose/qwable/README.md @@ -0,0 +1,130 @@ +# qwable + +Qwable-3.6-27B on Strix Halo via `kyuz0:rocm-7.2.2`. A full fine-tune +of Qwen3.6-27B trained on **Fable-5-style reasoning traces** — the dev +collected examples formatted like Fable 5's deliberate, step-by-step +answers and trained Qwen to reproduce that structured, explanatory +output. Think of it as a local "thinks-like-Fable" model. + +OpenAI-compatible endpoint at `http://framework:8082` once running. + +## Dense, not MoE (read first) + +Despite being smaller on disk than the 30B-A3B workhorses, Qwable is a +**dense** 27B — every weight loads per token. On this bandwidth-bound +box (256 GB/s ÷ ~16.5 GB) that's **~10-15 tok/s** decode, slower than +the MoE 30B (~100 tok/s, only ~3B active). So Qwable is for when you +specifically want the Fable-style reasoning, not for raw throughput. +The interactive daily driver stays on Ollama / llama 30B. + +## Coexistence notes + +At ~16.5 GB (Q4_K_M) Qwable is the smallest GPU resident here: + +| Concurrent service | Coexists? 
| +|---|---| +| `llama` (Qwen3-Coder-30B, 8080) | ✅ yes (~35 GB total) | +| `ollama` (11434) | ✅ yes | +| `kimi-linear` (vLLM, 8000) | ✅ yes (~47 GB total) | +| `qwen3-235b` (88.8 GB, 8081) | ❌ no — too tight, swap-model stops it | +| `comfyui` (8188) | ❌ no — swap-model stops it | + +`restart: "no"`: you bring it up deliberately (via `swap-model qwable`), +it won't auto-start after a reboot and surprise-collide with a big model. + +## Prereqs + +- Pyinfra deploy has run (creates `/srv/docker/qwable/` with right perms). +- BIOS UMA at 0.5 GB + `ttm.pages_limit=33554432` kernel cmdline active. + Verify: `cat /proc/cmdline | grep ttm.pages_limit`. + +## Download weights (~16.5 GB, single file) + +```sh +# /models/qwen exists via pyinfra; just create the model subdir. +mkdir -p /models/qwen/Qwable-3.6-27b + +hf download Mia-AiLab/Qwable-3.6-27b \ + 'Qwable-27b_Q4_K_M.gguf' \ + --local-dir /models/qwen/Qwable-3.6-27b + +# File lands at: +# /models/qwen/Qwable-3.6-27b/Qwable-27b_Q4_K_M.gguf (~16.5 GB) +``` + +Single-file GGUF (not sharded) — point `--model` straight at it. Disk: +needs ~17 GB free on `/models`. + +> Abliterated variant (refusals removed) lives at +> `huihui-ai/Huihui-Qwable-3.6-27b-abliterated-GGUF` +> (`Huihui-Qwable-3.6-27b-abliterated-Q4_K_M_Q8.gguf`, ~18.3 GB). +> Not the default — no safety filtering, careful with it. 
+ +## Bring up + +Easy path — `swap-model` handles stop-conflicting-services + waits for +`/health`: + +```sh +ssh framework swap-model qwable # ~1-2 min cold load (16.5 GB) +ssh framework /srv/docker/qwable/smoke.sh # perf measure +``` + +Manual equivalent (first-ever bring-up, before the image is cached): + +```sh +cd /srv/docker/qwable +docker compose pull # already-cached image if you ran llama first +docker compose up -d +docker compose logs -f # wait for "server is listening on http://0.0.0.0:8082" + +./smoke.sh # /health + tiny generation + perf +``` + +First start is ~1-2 min (16.5 GB load off disk; much faster than the +235B). If `./smoke.sh` reports `predicted_per_second` in the 10-15 tok/s +band, it's healthy. <6 tok/s = investigate (likely arena < 100 GB — see +qwen3-235b/README.md "Troubleshooting" for the arena checks). + +## Ramping context + +Defaults to 64K to match the other llama.cpp stacks (keeps opencode +auto-compaction consistent across providers). The model is tiny relative +to the arena, so there's plenty of room to push higher: + +| Stage | `--ctx-size` | Margin in arena | +|---|---|---| +| **Current default** | **65536** | huge (~90 GB free) | +| Stretch | 131072+ | still comfortable | + +Edit `--ctx-size` in `docker-compose.yml`, `docker compose down && docker compose up -d`, +re-run `./smoke.sh`. The real ceiling is Qwable's trained context length +(inherits Qwen3.6-27B's), not arena memory — verify the model's max +positions before going past 128K.
+ +## Operations + +```sh +docker compose logs -f # tail +docker compose down # stop +docker compose exec qwable bash # shell in +./smoke.sh # health + perf +amdgpu_top # GPU view on host +``` + +## Pin manifest + +| Component | Pin | +|---|---| +| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` (shared with `llama`) | +| Weights | `Mia-AiLab/Qwable-3.6-27b` → `Qwable-27b_Q4_K_M.gguf` (~16.5 GB) | +| Default port | 8082 | +| Default context | 65536 | +| KV cache type | q8_0 (k and v) | +| License | MIT (model); Qwen3.6-27B base license also applies | + +## Status + +Compose artifacts written; awaiting box-side weight pull + bring-up. +Wired as a `swap-model qwable` target. Wire as an opencode/LiteLLM +provider only if the Fable-style reasoning proves useful in practice. diff --git a/pyinfra/framework/compose/qwable/smoke.sh b/pyinfra/framework/compose/qwable/smoke.sh new file mode 100755 index 0000000..073003b --- /dev/null +++ b/pyinfra/framework/compose/qwable/smoke.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Smoke-test the running qwable llama-server (port 8082). Hits /health +# for liveness, then a tiny OpenAI-compatible chat completion, then +# measures eval_tps via /completion. Dense 27B → expect ~10-15 tok/s. 
+set -euo pipefail + +HOST="${QWABLE_HOST:-127.0.0.1:8082}" +MODEL="${QWABLE_MODEL:-qwable}" + +echo "[smoke] GET /health on $HOST" +curl -fsS "http://$HOST/health" | python3 -m json.tool + +echo +echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation" +curl -fsS "http://$HOST/v1/chat/completions" \ + -H 'Content-Type: application/json' \ + -d "{ + \"model\": \"$MODEL\", + \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}], + \"max_tokens\": 16, + \"temperature\": 0.0 + }" | python3 -m json.tool + +echo +echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)" +# 128 tokens — at ~10-15 tok/s the per-token warmup noise still matters, +# but a dense 27B settles faster than the 235B so we don't need 64-only. +curl -fsS "http://$HOST/completion" \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.", + "n_predict": 128, + "temperature": 0.0, + "stream": false + }' | python3 -c " +import json, sys +t = json.load(sys.stdin).get('timings', {}) +rate = lambda k: f'{t[k]:.2f}' if isinstance(t.get(k), (int, float)) else '?'  # missing/non-numeric timing prints '?' instead of raising ValueError +print(f'predicted_per_second: {rate(\"predicted_per_second\")} tok/s') +print(f'prompt_per_second: {rate(\"prompt_per_second\")} tok/s') +print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}') +print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}') +" + +echo +echo "[smoke] passed — expected band 10-15 tok/s decode (dense 27B Q4)" diff --git a/pyinfra/framework/compose/qwen3-235b/README.md b/pyinfra/framework/compose/qwen3-235b/README.md index 48da806..572cd37 100644 --- a/pyinfra/framework/compose/qwen3-235b/README.md +++ b/pyinfra/framework/compose/qwen3-235b/README.md @@ -53,6 +53,15 @@ Disk: needs ~90 GB free on `/models`.
Pull is bandwidth-bound; expect ## Bring up (M0.2 — first generation) +Easy path — `swap-model` handles the stop-conflicting-services dance + waits for `/health`: + +```sh +ssh framework swap-model 235b # ~3-5 min on first cold load +ssh framework /srv/docker/qwen3-235b/smoke.sh # perf measure +``` + +Manual equivalent (for first-ever bring-up before the image is cached): + ```sh cd /srv/docker/qwen3-235b docker compose pull # already-cached image if you ran llama first diff --git a/pyinfra/framework/deploy.py b/pyinfra/framework/deploy.py index 786c7b6..ada3f12 100644 --- a/pyinfra/framework/deploy.py +++ b/pyinfra/framework/deploy.py @@ -80,6 +80,8 @@ apt.packages( "ca-certificates", "unzip", "software-properties-common", # for add-apt-repository (g++-14 PPA) + "jq", # JSON parsing in operator scripts (bench-engines) + "bc", # float math in bench-engines ], _sudo=True, ) @@ -433,6 +435,7 @@ for svc in ( "ollama", "kimi-linear", "qwen3-235b", + "qwable", "litellm", "comfyui", "openwebui", @@ -440,6 +443,8 @@ for svc in ( "openlit", "phoenix", "openhands", + "code-server", + "coder", "homepage", "whisper", "piper", @@ -561,6 +566,89 @@ files.directory( _sudo=True, ) +# code-server persistent state. The linuxserver image's s6 init drops to +# PUID/PGID 1000 and treats /config as the container user's $HOME — +# extensions, settings, ~/.claude (Claude Code OAuth creds + session +# history), ~/.local/bin. Owned 1000:1000 to match (same pattern as +# kokoro: the container user isn't in the docker group, so 2775 +# root:docker wouldn't help it). +files.directory( + name="code-server config dir", + path=f"{COMPOSE_DIR}/code-server/config", + user="1000", + group="1000", + mode="0755", + _sudo=True, +) +# Default workspace. Host UID 1000 == container PUID 1000, so files +# created either side stay owned by the SSH user. 
+files.directory( + name="code-server workspace dir", + path=f"{COMPOSE_DIR}/code-server/workspace", + user=SSH_USER, + group=SSH_USER, + mode="2775", + _sudo=True, +) +files.put( + name="code-server: README.md", + src="compose/code-server/README.md", + dest=f"{COMPOSE_DIR}/code-server/README.md", + group="docker", + mode="0664", + _sudo=True, +) + +# Coder workspace manager (pilot — see compose/coder/README.md for the +# evaluation criteria vs the standalone code-server stack). Postgres +# chowns its own data dir at first start (entrypoint runs as root); we +# just create the mount point. Templates are repo-sourced Terraform +# mounted read-only into the server container, pushed with +# `docker compose exec coder coder templates push`. +files.directory( + name="Coder postgres data dir", + path=f"{COMPOSE_DIR}/coder/postgres", + group="docker", + mode="2775", + _sudo=True, +) +files.directory( + name="Coder templates/code-server dir", + path=f"{COMPOSE_DIR}/coder/templates/code-server", + group="docker", + mode="2775", + _sudo=True, +) +for asset, mode in ( + ("templates/code-server/main.tf", "0664"), + ("README.md", "0664"), +): + files.put( + name=f"coder: {asset}", + src=f"compose/coder/{asset}", + dest=f"{COMPOSE_DIR}/coder/{asset}", + group="docker", + mode=mode, + _sudo=True, + ) +# Sibling .env: DOCKER_GROUP_ID (host-specific GID of the docker socket, +# needed for the server container's group_add), the access URL, and a +# random one-time Postgres password. Generated on the box rather than +# placeholder-then-hand-fill because every value is derivable. Never +# overwritten once present. 
+server.shell( + name="Coder .env (generate once)", + commands=[ + f"test -f {COMPOSE_DIR}/coder/.env || {{ " + f"printf 'DOCKER_GROUP_ID=%s\\nCODER_ACCESS_URL=http://framework:7080\\nPOSTGRES_PASSWORD=%s\\n' " + f'"$(stat -c %g /var/run/docker.sock)" "$(openssl rand -hex 16)" ' + f"> {COMPOSE_DIR}/coder/.env && " + f"chown root:docker {COMPOSE_DIR}/coder/.env && " + f"chmod 640 {COMPOSE_DIR}/coder/.env; }}", + ], + _sudo=True, +) + # Homepage config. The compose loop above only copies homepage.yml; the # YAML config files live in compose/homepage/ on the source side and at # /srv/docker/homepage/config/ on the box. Source-of-truth is the repo — @@ -622,6 +710,22 @@ for asset, mode in ( _sudo=True, ) +# Qwable operator assets. Same image as llama (kyuz0 rocm-7.2.2); dense +# 27B Qwen3.6 fine-tuned on Fable-5 traces. Weights live at /models/qwen/ +# via manual `hf download` per the README. swap-model `qwable` target. +for asset, mode in ( + ("smoke.sh", "0775"), + ("README.md", "0664"), +): + files.put( + name=f"qwable: {asset}", + src=f"compose/qwable/{asset}", + dest=f"{COMPOSE_DIR}/qwable/{asset}", + group="docker", + mode=mode, + _sudo=True, + ) + # LiteLLM router assets. config.yaml is the source-of-truth model # routing table — pyinfra syncs it on every run; edits on the box get # overwritten. The .env file holds LITELLM_MASTER_KEY + LITELLM_SALT_KEY @@ -758,6 +862,37 @@ files.directory( _sudo=True, ) +# --- Operator scripts ------------------------------------------------------- + +# swap-model — one-command swap between which inference container is +# GPU-resident. Encodes the coexistence table (235B doesn't fit alongside +# anything; ollama+kimi do) + per-service health probes. Lives in +# /usr/local/bin so the SSH user (in the docker group) can run it +# directly: `ssh framework swap-model 235b`. See scripts/swap-model for +# the modes and bin/swap-model for the Mac-side wrapper. 
+files.put( + name="swap-model script (box-side)", + src="scripts/swap-model", + dest="/usr/local/bin/swap-model", + user="root", + group="root", + mode="0755", + _sudo=True, +) + +# bench-engines — one-shot decision tool for the GGUF-tier consolidation +# (Ollama vs kyuz0 llama.cpp decode t/s on gfx1151). See the framework +# README "Inference engine consolidation". +files.put( + name="bench-engines script (box-side)", + src="scripts/bench-engines", + dest="/usr/local/bin/bench-engines", + user="root", + group="root", + mode="0755", + _sudo=True, +) + # --- Cleanup of artifacts from the prior native-build deploy ---------------- # All idempotent — `present=False` is a no-op when the target is absent. diff --git a/pyinfra/framework/scripts/bench-engines b/pyinfra/framework/scripts/bench-engines new file mode 100644 index 0000000..dc27266 --- /dev/null +++ b/pyinfra/framework/scripts/bench-engines @@ -0,0 +1,177 @@ +#!/usr/bin/env bash +# bench-engines — compare decode/prefill throughput of Ollama vs +# llama.cpp (kyuz0 toolbox) on the SAME GGUF on gfx1151. +# +# Why this exists. The GGUF-tier consolidation decision (see the +# framework README "Inference engine consolidation") hinges on one +# hardware-specific unknown: how close is Ollama's bundled llama.cpp to +# the gfx1151-tuned kyuz0 build on *this* box? If decode t/s is within +# ~10-15 %, Ollama's convenience wins (it auto-swaps, so no llama-swap +# needed). If kyuz0's rocWMMA flash-attention lead is large, that argues +# for keeping llama.cpp behind llama-swap. This measures it. +# +# Method. Serves the identical GGUF on each engine in isolation (the +# other GGUF engine + 235b are stopped so nothing competes for the +# arena), warms up, then runs R raw-completion trials at a fixed decode +# length. 
# Reads each engine's own authoritative timing fields — no
# token-counting guesswork:
#   - llama.cpp /completion → .timings.{prompt,predicted}_per_second
#   - Ollama /api/generate → {prompt_eval,eval}_{count,duration}
# Uses raw prompts (no chat template) on both for an apples-to-apples
# prompt-in / tokens-out measurement.
#
# Run ON THE BOX (hits localhost + docker). Requires jq, curl and bc.
#
# Usage:
#   bench-engines               # bench the model llama.yml serves
#   bench-engines status        # show what's currently GPU-resident
#   BENCH_RUNS=5 bench-engines  # more trials (default 3)
#
# To bench a different model (e.g. Qwen3.6-27B): point compose/llama.yml
# at the new GGUF, set GGUF below (or BENCH_GGUF) to match, redeploy, rerun.

set -euo pipefail

COMPOSE_ROOT="/srv/docker"
RUNS="${BENCH_RUNS:-3}"
N_PREDICT="${BENCH_N_PREDICT:-256}"
WAIT_TIMEOUT="${BENCH_WAIT_TIMEOUT:-600}"

# Must match the GGUF that compose/llama.yml serves — this is the file
# registered into Ollama so both engines run identical weights. The
# path is the in-container path (/models is bind-mounted into both).
GGUF="${BENCH_GGUF:-/models/qwen/Qwen3-Coder-30B-A3B-Instruct-UD-Q4_K_XL.gguf}"
OLLAMA_BENCH_MODEL="bench-engines"

# A fixed, moderately long prompt so prefill is measurable. Decode is the
# number that actually decides the consolidation (bandwidth-bound).
# `read -d ''` returns non-zero at end of input, hence the `|| true`.
read -r -d '' PROMPT <<'EOF' || true
You are a careful systems engineer. Explain, in detail and step by step,
how a unified-memory APU shares a single physical RAM pool between the
CPU and an integrated GPU, what a GTT aperture is, why demand paging
matters for large language model weights, and how this differs from a
discrete GPU with dedicated VRAM. Be thorough and precise.
EOF

LLAMA_URL="http://127.0.0.1:8080"
OLLAMA_URL="http://127.0.0.1:11434"

# Print a prefixed error on stderr and abort the whole run.
die() {
  echo "bench-engines: $*" >&2
  exit 1
}

# Abort unless the named executable is on PATH.
need() {
  command -v "$1" >/dev/null 2>&1 || {
    echo "bench-engines: missing '$1'" >&2
    exit 1
  }
}
need jq
need curl

# True when the container named $1 exists and is running.
is_running() {
  docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null | grep -q true
}

# True when a GET on URL $1 succeeds within 5 s.
is_healthy() {
  curl -fsS --max-time 5 "$1" >/dev/null 2>&1
}

# Stop the compose stack $1 (directory under $COMPOSE_ROOT; also used as
# the container name). No-op when the container is not running.
down() {
  local dir="$COMPOSE_ROOT/$1"
  is_running "$1" || return 0
  echo " stopping $1"
  # compose output is suppressed, so say explicitly what failed instead of
  # letting `set -e` end the run without a message.
  if ! (cd "$dir" && docker compose down >/dev/null 2>&1); then
    die "'docker compose down' failed in $dir"
  fi
}

# Start the compose stack $1 and poll health URL $2 every 5 s until it
# answers or WAIT_TIMEOUT seconds elapse (then dump the log tail and exit).
up_wait() {
  local svc="$1" health="$2"
  local deadline=$(( SECONDS + WAIT_TIMEOUT ))
  echo " starting $svc"
  if ! (cd "$COMPOSE_ROOT/$svc" && docker compose up -d >/dev/null 2>&1); then
    die "'docker compose up -d' failed in $COMPOSE_ROOT/$svc"
  fi
  printf " waiting for health"
  while ! is_healthy "$health"; do
    if (( SECONDS > deadline )); then
      echo " TIMEOUT"
      docker logs --tail 20 "$svc" >&2 || true
      exit 1
    fi
    sleep 5
    printf "."
  done
  echo " ok"
}

# --- isolation: only the engine under test is GPU-resident ------------
isolate_for() {
  case "$1" in
    llama)
      down ollama
      down qwen3-235b
      down kimi-linear
      ;;
    ollama)
      down llama
      down qwen3-235b
      down kimi-linear
      ;;
    *)
      die "isolate_for: unknown engine '$1'"
      ;;
  esac
}

# --- register the GGUF into Ollama (idempotent) -----------------------
# NOTE(review): the check is by model name only. If BENCH_GGUF changes
# between runs, a previously registered model is reused as-is — remove it
# first (`ollama rm`) to re-register; confirm this is acceptable.
register_ollama_model() {
  local models
  # Capture first instead of piping into `grep -q`: under pipefail an
  # early grep exit can SIGPIPE the producer and fail the whole pipeline.
  models=$(docker exec ollama ollama list 2>/dev/null) || models=""
  if grep -q "^${OLLAMA_BENCH_MODEL}" <<<"$models"; then
    echo " ollama model '${OLLAMA_BENCH_MODEL}' already registered"
    return 0
  fi
  echo " registering ${OLLAMA_BENCH_MODEL} from ${GGUF}"
  # FROM the in-container GGUF path. Only num_ctx is pinned here;
  # presumably 65536 matches llama.yml — verify when changing either side.
  printf 'FROM %s\nPARAMETER num_ctx 65536\n' "$GGUF" \
    | docker exec -i ollama ollama create "${OLLAMA_BENCH_MODEL}" -f -
}

# --- one trial; echoes "prefill_tps decode_tps" -----------------------
# Both trial functions return non-zero when the request or the JSON
# handling fails, so callers can report the failure.
trial_llama() {
  local body resp
  body=$(jq -n --arg p "$PROMPT" --argjson n "$N_PREDICT" \
    '{prompt:$p, n_predict:$n, temperature:0, cache_prompt:false}') || return 1
  resp=$(curl -fsS --max-time 300 "$LLAMA_URL/completion" \
    -H 'Content-Type: application/json' -d "$body") || return 1
  jq -r '"\(.timings.prompt_per_second) \(.timings.predicted_per_second)"' <<<"$resp"
}

trial_ollama() {
  local body resp
  body=$(jq -n --arg m "$OLLAMA_BENCH_MODEL" --arg p "$PROMPT" --argjson n "$N_PREDICT" \
    '{model:$m, prompt:$p, raw:true, stream:false, options:{temperature:0, num_predict:$n}}') || return 1
  resp=$(curl -fsS --max-time 300 "$OLLAMA_URL/api/generate" \
    -H 'Content-Type: application/json' -d "$body") || return 1
  # durations are ns; t/s = count / (duration/1e9)
  jq -r '
    "\(.prompt_eval_count / (.prompt_eval_duration/1e9)) \(.eval_count / (.eval_duration/1e9))"' <<<"$resp"
}

# --- run R trials, print per-trial + mean decode ----------------------
# Arguments: $1 - engine label for output, $2 - trial function name.
# Globals:   RUNS (read); MEAN_PP, MEAN_TG (written: mean prefill/decode t/s).
# Exits non-zero with a message when a trial fails or returns something
# other than two plain decimal numbers (bc cannot parse "null" or
# exponent notation).
bench() {
  local engine="$1" trialfn="$2"
  local i out pp tg
  local sum_pp=0 sum_tg=0
  local num_re='^[0-9]+([.][0-9]+)?$'

  [[ "$RUNS" =~ ^[1-9][0-9]*$ ]] \
    || die "BENCH_RUNS must be a positive integer (got '$RUNS')"

  echo " warmup..."
  "$trialfn" >/dev/null || die "$engine warmup request failed"

  for (( i = 1; i <= RUNS; i++ )); do
    out=$("$trialfn") || die "$engine trial $i failed"
    read -r pp tg <<<"$out"
    if ! [[ "$pp" =~ $num_re && "$tg" =~ $num_re ]]; then
      die "$engine trial $i returned non-numeric timings: '$out'"
    fi
    printf " trial %d: prefill %6.1f t/s decode %6.2f t/s\n" "$i" "$pp" "$tg"
    sum_pp=$(echo "$sum_pp + $pp" | bc -l)
    sum_tg=$(echo "$sum_tg + $tg" | bc -l)
  done

  MEAN_PP=$(echo "scale=1; $sum_pp / $RUNS" | bc -l)
  MEAN_TG=$(echo "scale=2; $sum_tg / $RUNS" | bc -l)
  printf " %s mean: prefill %s t/s decode %s t/s\n" "$engine" "$MEAN_PP" "$MEAN_TG"
}

# `status` only reports container state; it starts and stops nothing.
if [[ "${1:-}" == "status" ]]; then
  for c in ollama llama kimi-linear qwen3-235b; do
    if is_running "$c"; then
      echo "$c: up"
    else
      echo "$c: down"
    fi
  done
  exit 0
fi

need bc

echo "== llama.cpp (kyuz0 ${GGUF##*/}) =="
isolate_for llama
up_wait llama "$LLAMA_URL/health"
bench "llama.cpp" trial_llama
# Keep the llama.cpp decode mean before the Ollama run overwrites MEAN_TG.
LLAMA_TG="$MEAN_TG"
down llama

echo
echo "== Ollama (same GGUF) =="
isolate_for ollama
up_wait ollama "$OLLAMA_URL/api/tags"
register_ollama_model
bench "ollama" trial_ollama
OLLAMA_TG="$MEAN_TG"

echo
echo "== Verdict =="
# Ollama as % of llama.cpp decode throughput. With a zero divisor GNU bc
# only complains on stderr and prints nothing, so a zero/empty baseline is
# reported explicitly instead of showing an empty percentage.
if [[ -n "$LLAMA_TG" && "$(echo "$LLAMA_TG != 0" | bc -l 2>/dev/null)" == "1" ]]; then
  PCT=$(echo "scale=1; 100 * $OLLAMA_TG / $LLAMA_TG" | bc -l)
  PCT_LABEL="${PCT}% of llama.cpp"
else
  PCT_LABEL="ratio unavailable: llama.cpp decode is 0"
fi
printf " llama.cpp decode: %s t/s\n ollama decode: %s t/s (%s)\n" \
  "$LLAMA_TG" "$OLLAMA_TG" "$PCT_LABEL"
echo
echo " Guidance: Ollama >=85% of llama.cpp -> option 1 (Ollama + vLLM,"
echo " drop standalone llama.cpp; Ollama self-swaps, no llama-swap)."
echo " Larger gap -> option 2 (keep llama.cpp"
echo " behind llama-swap with coexistence groups; drop Ollama)."
diff --git a/pyinfra/framework/scripts/swap-model b/pyinfra/framework/scripts/swap-model
new file mode 100755
index 0000000..5078ba4
--- /dev/null
+++ b/pyinfra/framework/scripts/swap-model
@@ -0,0 +1,186 @@
#!/usr/bin/env bash
# swap-model — coordinate which inference container is GPU-resident on
# the Strix Halo box.
#
# Why this exists. The GPU's merged ~110 GB arena (BIOS UMA=0.5 GB +
# ttm.pages_limit + HSA_XNACK; see StrixHaloMemory.md) holds at most
# one 88 GB-class model at a time, and ROCm doesn't reclaim cleanly
# between consumers. So switching models means stop-then-start of
# whole compose stacks. This script encodes the per-target conflict
# table + per-service health probes so the swap is one command.
#
# Usage:
#   swap-model coder     # Qwen3-Coder-30B via Ollama (interactive)
#   swap-model 235b      # Qwen3-235B-A22B via llama.cpp (long-task)
#   swap-model kimi      # Kimi-Linear-48B-A3B via vLLM (long-context)
#   swap-model qwable    # Qwable-3.6-27B via llama.cpp (Fable-style)
#   swap-model comfyui   # ComfyUI (image generation)
#   swap-model none      # everything down — free the GPU
#   swap-model status    # show what's currently up
#   swap-model help      # print this usage
#
# Env knobs:
#   SWAP_WAIT_TIMEOUT  seconds to wait for /health after up; default 600
#                      (235B's 88 GB cold load can take 3-5 min)
#
# Out of scope (deliberately):
#   - Always-on services (openwebui, litellm, phoenix, beszel, etc.) —
#     no GPU footprint, left alone.
#   - llama.cpp 30B (port 8080) — same weights as Ollama's qwen3-coder
#     but still LL-P0 perf-evaluating. `coder` target uses Ollama only.
#   - Multi-target combos (e.g. kimi+ollama coexist on the arena);
#     for now run swap-model twice if you want both.

set -euo pipefail

COMPOSE_ROOT="/srv/docker"
WAIT_TIMEOUT="${SWAP_WAIT_TIMEOUT:-600}"

# --- Service table -----------------------------------------------------------
# Map short name → compose dir (under $COMPOSE_ROOT) and health URL.
# Container name == compose dir name in every case (intentional convention,
# enforced in compose/*.yml's container_name fields).
declare -A SVC_DIR=(
  [ollama]=ollama
  [llama]=llama
  [kimi]=kimi-linear
  [235b]=qwen3-235b
  [qwable]=qwable
  [comfyui]=comfyui
)
declare -A SVC_HEALTH=(
  [ollama]="http://127.0.0.1:11434/api/tags"
  [llama]="http://127.0.0.1:8080/health"
  [kimi]="http://127.0.0.1:8000/v1/models"
  [235b]="http://127.0.0.1:8081/health"
  [qwable]="http://127.0.0.1:8082/health"
  [comfyui]="http://127.0.0.1:8188/"
)

# --- Target → plan -----------------------------------------------------------
# UP   = services that should be running after the swap
# DOWN = services that must be stopped to free the GPU arena
# (anything not in either list is left untouched — e.g. switching to coder
# leaves kimi alone, since kimi(30 GB) + ollama(30 GB) fit in the arena.)
#
# Sets the globals UP and DOWN for target $1; returns 1 for an unknown target.
# NOTE(review): `coder` stops comfyui, but `comfyui` does not stop ollama —
# confirm the asymmetry is intentional.
plan() {
  UP=()
  DOWN=()
  case "$1" in
    coder)   UP=(ollama);  DOWN=(235b comfyui) ;;
    235b)    UP=(235b);    DOWN=(ollama llama kimi qwable comfyui) ;;
    kimi)    UP=(kimi);    DOWN=(235b comfyui) ;;
    qwable)  UP=(qwable);  DOWN=(235b comfyui) ;;
    comfyui) UP=(comfyui); DOWN=(235b kimi qwable) ;;
    none)    UP=();        DOWN=(ollama llama kimi 235b qwable comfyui) ;;
    *)       return 1 ;;
  esac
}

# --- Probes ------------------------------------------------------------------
# True when the container named $1 exists and is running.
is_running() {
  docker inspect -f '{{.State.Running}}' "$1" 2>/dev/null | grep -q true
}

# True when a GET on URL $1 succeeds within 5 s.
is_healthy() {
  curl -fsS --max-time 5 "$1" >/dev/null 2>&1
}

# Poll service $1's health URL every 5 s until it answers. After
# WAIT_TIMEOUT seconds, print the container's log tail to stderr and
# return 1.
wait_healthy() {
  local svc="$1" url="${SVC_HEALTH[$1]}"
  local deadline=$(( SECONDS + WAIT_TIMEOUT ))
  printf " waiting for %s health (timeout %ss)" "$svc" "$WAIT_TIMEOUT"
  while ! is_healthy "$url"; do
    if (( SECONDS > deadline )); then
      printf " TIMEOUT\n"
      echo " last 20 lines of container log:" >&2
      # Best-effort diagnostics: a failing `docker logs` must not pre-empt
      # the `return 1` below under `set -e` + pipefail.
      docker logs --tail 20 "${SVC_DIR[$svc]}" 2>&1 | sed 's/^/ /' >&2 || true
      return 1
    fi
    sleep 5
    printf "."
  done
  printf " ok\n"
}

# --- Actions -----------------------------------------------------------------
# Stop service $1's compose stack; no-op when its container is not running.
down_svc() {
  local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"
  if ! is_running "${SVC_DIR[$svc]}"; then
    echo " $svc: already down"
    return 0
  fi
  echo " stopping $svc"
  (cd "$dir" && docker compose down)
}

# Start service $1's compose stack and wait for its health probe.
# Returns 1 when the compose dir is missing or the probe times out.
up_svc() {
  local svc="$1" dir="$COMPOSE_ROOT/${SVC_DIR[$1]}"
  if is_running "${SVC_DIR[$svc]}" && is_healthy "${SVC_HEALTH[$svc]}"; then
    echo " $svc: already up + healthy"
    return 0
  fi
  if [[ ! -d "$dir" ]]; then
    echo " $svc: compose dir $dir missing — run pyinfra deploy first" >&2
    return 1
  fi
  echo " starting $svc"
  (cd "$dir" && docker compose up -d)
  wait_healthy "$svc"
}

# Print one line per inference service: up/down, plus probe result when up.
show_status() {
  local svc container state health
  echo "Inference services:"
  for svc in ollama llama kimi 235b qwable comfyui; do
    container="${SVC_DIR[$svc]}"
    state="down"
    health=""
    if is_running "$container"; then
      state="up"
      if is_healthy "${SVC_HEALTH[$svc]}"; then
        health=" (healthy)"
      else
        health=" (starting/unhealthy)"
      fi
    fi
    printf " %-8s %s%s\n" "$svc" "$state" "$health"
  done
}

# NOTE(review): in the reviewed copy, usage()'s heredoc and the first arms
# of the dispatch `case` were lost (only `cat <` and the tail of the `*)`
# arm survived). The usage text and the non-default arms below are
# reconstructed from the header comment; the "unknown target" wording is a
# guess. Confirm all of this against the repository version before merging.
usage() {
  cat <<'EOF'
Usage: swap-model <target>

Targets:
  coder     Qwen3-Coder-30B via Ollama (interactive)
  235b      Qwen3-235B-A22B via llama.cpp (long-task)
  kimi      Kimi-Linear-48B-A3B via vLLM (long-context)
  qwable    Qwable-3.6-27B via llama.cpp (Fable-style)
  comfyui   ComfyUI (image generation)
  none      everything down — free the GPU
  status    show what's currently up
  help      show this message

Env:
  SWAP_WAIT_TIMEOUT  seconds to wait for health after up (default 600)
EOF
}

TARGET="${1:-}"
case "$TARGET" in
  coder | 235b | kimi | qwable | comfyui | none)
    ;;
  status)
    show_status
    exit 0
    ;;
  "" | help | -h | --help)
    usage
    exit 0
    ;;
  *)
    echo "swap-model: unknown target: $TARGET" >&2
    echo "Try: swap-model help" >&2
    exit 2
    ;;
esac

plan "$TARGET"
echo "Plan: down=[${DOWN[*]:-}] up=[${UP[*]:-}]"
# `${arr[@]+...}` keeps an empty list (e.g. UP for `none`) from tripping
# `set -u` on bash older than 4.4.
for svc in ${DOWN[@]+"${DOWN[@]}"}; do down_svc "$svc"; done
for svc in ${UP[@]+"${UP[@]}"}; do up_svc "$svc"; done

echo
show_status