From 7b594c71b1e0b608643b824d540a1a61c97235b8 Mon Sep 17 00:00:00 2001 From: noisedestroyers Date: Thu, 7 May 2026 07:37:40 -0400 Subject: [PATCH] folder-per-station --- README.md | 74 +------- framework/README.md | 70 ++++++++ framework/compose/llama.yml | 24 +++ framework/compose/ollama.yml | 27 +++ framework/compose/vllm.yml | 36 ++++ framework/deploy.py | 318 +++++++++++++++++++++++++++++++++++ framework/inventory.py | 5 + framework/run.sh | 7 + 8 files changed, 496 insertions(+), 65 deletions(-) create mode 100644 framework/README.md create mode 100644 framework/compose/llama.yml create mode 100644 framework/compose/ollama.yml create mode 100644 framework/compose/vllm.yml create mode 100644 framework/deploy.py create mode 100644 framework/inventory.py create mode 100755 framework/run.sh diff --git a/README.md b/README.md index 1c8c9c8..f331e02 100644 --- a/README.md +++ b/README.md @@ -1,68 +1,12 @@ -# pyinfra: Strix Halo bring-up +# pyinfra -Containerized setup for the Framework Desktop (Ryzen AI Max+ 395, Radeon -8060S, 128 GB). The host stays minimal — kernel + driver + Docker + -diagnostics. Inference engines (llama.cpp, vLLM, Ollama) run as docker -compose services, each shipping its own ROCm/Vulkan stack. +One folder per station. Each subfolder is a self-contained pyinfra +deploy: `inventory.py`, `deploy.py`, `run.sh`, plus any compose files +or assets that ship to the host. -## Manual prerequisites +| Station | Host | Notes | +|---------|------|-------| +| [`framework/`](framework/README.md) | `10.0.0.237` | Framework Desktop (Strix Halo, 128 GB) — local LLM box | -1. **Phase 0** — update Framework BIOS, set GPU UMA carve-out (96 GB). -2. **OS install** — Ubuntu Server (24.04 LTS recommended; 26.04 also works - but ROCm host-side support is patchy — see `../TODO.md`). Enable SSH, - import your laptop key, create user `noise`. -3. The host must be reachable at `10.0.0.237` over SSH (edit `inventory.py` - if it moves). -4. 
**NOPASSWD sudo for `noise`** — pyinfra's fact layer doesn't reliably - thread sudo passwords. One-time setup: - ```sh - ssh noise@10.0.0.237 'echo "noise ALL=(ALL) NOPASSWD: ALL" | sudo tee /etc/sudoers.d/noise-nopasswd && sudo chmod 440 /etc/sudoers.d/noise-nopasswd' - ``` - -## Run - -```sh -uv tool install pyinfra -./run.sh # equivalent to: pyinfra inventory.py deploy.py -./run.sh --dry # any extra args are forwarded to pyinfra -``` - -Or run it ephemerally without installing: `uvx pyinfra inventory.py deploy.py`. - -## What the deploy does - -- Base CLI: tmux, vim, htop, btop, nvtop, radeontop, uv -- Tailscale (run `sudo tailscale up` on the box once, interactively) -- Docker engine + compose plugin, user added to `docker` group -- ROCm host diagnostics only (`rocminfo`, `rocm-smi`) — no full toolchain -- `/models//` layout -- `~/docker/{llama,vllm,ollama}/docker-compose.yml` dropped in, - not auto-started — you edit the model path then `docker compose up -d` - -If a previous run installed the native llama.cpp build / full ROCm / -native Ollama, those are auto-cleaned the next time `./run.sh` runs. - -## After the deploy: starting an inference service - -```sh -ssh noise@10.0.0.237 -sudo tailscale up # one-time, interactive - -# Drop a GGUF somewhere under /models, then: -cd ~/docker/llama -vim docker-compose.yml # edit the --model path -docker compose up -d -curl localhost:8080/v1/models # smoke test -``` - -Same shape for `vllm` (port 8000) and `ollama` (port 11434, no model edit -needed — Ollama serves models on demand). - -## Tunables - -Top of `deploy.py`: -- `ROCM_VERSION` and `AMDGPU_INSTALL_DEB` — bump when AMD ships a newer - release. The .deb filename has a build suffix that doesn't derive from - the version; find it at https://repo.radeon.com/amdgpu-install/. - -Compose images in `compose/{llama,vllm,ollama}.yml` — pin tags here. +To bring up a station, `cd` into its folder and run `./run.sh`. See the +station's own README for prerequisites. 
diff --git a/framework/README.md b/framework/README.md new file mode 100644 index 0000000..6c9e676 --- /dev/null +++ b/framework/README.md @@ -0,0 +1,70 @@ +# pyinfra: Strix Halo bring-up + +Containerized setup for the Framework Desktop (Ryzen AI Max+ 395, Radeon +8060S, 128 GB). The host stays minimal — kernel + driver + Docker + +diagnostics. Inference engines (llama.cpp, vLLM, Ollama) run as docker +compose services, each shipping its own ROCm/Vulkan stack. + +## Manual prerequisites + +1. **Phase 0** — update Framework BIOS, set GPU UMA carve-out (96 GB). +2. **OS install** — Ubuntu Server 24.04 LTS. AMD ROCm only ships for + jammy/noble; later Ubuntus install but break the host-side toolchain + (libxml2 ABI). Enable SSH, import your laptop key, create user `noise`. + Recommended partitioning: ≥300 GB on `/`, big disk mounted at `/models`, + plain ext4 (skip LVM). +3. The host must be reachable at `10.0.0.237` over SSH (edit `inventory.py` + if it moves). +4. **NOPASSWD sudo for `noise`** — pyinfra's fact layer doesn't reliably + thread sudo passwords. One-time setup: + ```sh + ssh noise@10.0.0.237 'echo "noise ALL=(ALL) NOPASSWD: ALL" | sudo tee /etc/sudoers.d/noise-nopasswd && sudo chmod 440 /etc/sudoers.d/noise-nopasswd' + ``` + +## Run + +```sh +uv tool install pyinfra +./run.sh # equivalent to: pyinfra inventory.py deploy.py +./run.sh --dry # any extra args are forwarded to pyinfra +``` + +Or run it ephemerally without installing: `uvx pyinfra inventory.py deploy.py`. 
+ +## What the deploy does + +- Base CLI: tmux, vim, htop, btop, nvtop, radeontop, uv +- Tailscale (run `sudo tailscale up` on the box once, interactively) +- Docker engine + compose plugin, user added to `docker` group +- ROCm host diagnostics only (`rocminfo`, `rocm-smi`) — no full toolchain +- `/models/<vendor>/` layout +- `~/docker/{llama,vllm,ollama}/docker-compose.yml` dropped in, + not auto-started — you edit the model path then `docker compose up -d` + +If a previous run installed the native llama.cpp build / full ROCm / +native Ollama, those are auto-cleaned the next time `./run.sh` runs. + +## After the deploy: starting an inference service + +```sh +ssh noise@10.0.0.237 +sudo tailscale up # one-time, interactive + +# Drop a GGUF somewhere under /models, then: +cd ~/docker/llama +vim docker-compose.yml # edit the --model path +docker compose up -d +curl localhost:8080/v1/models # smoke test +``` + +Same shape for `vllm` (port 8000) and `ollama` (port 11434, no model edit +needed — Ollama serves models on demand). + +## Tunables + +Top of `deploy.py`: +- `ROCM_VERSION` and `AMDGPU_INSTALL_DEB` — bump when AMD ships a newer + release. The .deb filename has a build suffix that doesn't derive from + the version; find it at https://repo.radeon.com/amdgpu-install/. + +Compose images in `compose/{llama,vllm,ollama}.yml` — pin tags here. diff --git a/framework/compose/llama.yml b/framework/compose/llama.yml new file mode 100644 index 0000000..5fbccfd --- /dev/null +++ b/framework/compose/llama.yml @@ -0,0 +1,24 @@ +# llama.cpp server, Vulkan backend (RADV on Strix Halo). +# Edit the --model path before `docker compose up -d`. 
+services: + llama: + image: ghcr.io/ggml-org/llama.cpp:server-vulkan + container_name: llama + restart: unless-stopped + devices: + - /dev/dri:/dev/dri + volumes: + - /models:/models:ro + ports: + - "8080:8080" + command: + - --model + - /models/REPLACE/ME/model.gguf + - --host + - 0.0.0.0 + - --port + - "8080" + - --n-gpu-layers + - "999" + - --ctx-size + - "32768" diff --git a/framework/compose/ollama.yml b/framework/compose/ollama.yml new file mode 100644 index 0000000..d672e23 --- /dev/null +++ b/framework/compose/ollama.yml @@ -0,0 +1,27 @@ +# Ollama, ROCm backend. Serves models on demand — safe to start before +# you've put anything in /models. +# +# Storage: Ollama's content-addressed blob store is bind-mounted under +# /models/ollama so all model data on the host lives under /models. +# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM +# can't load them directly. Keep curated GGUFs at /models/<vendor>/... +# for those engines. +services: + ollama: + image: ollama/ollama:rocm + container_name: ollama + restart: unless-stopped + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + # Numeric GIDs of host's video (44) and render (991) groups — names + # don't exist inside the container, but the GIDs need to match the + # host so /dev/kfd + /dev/dri are accessible. + group_add: + - "44" + - "991" + volumes: + - /models/ollama:/root/.ollama + - /models:/models:ro + ports: + - "11434:11434" diff --git a/framework/compose/vllm.yml b/framework/compose/vllm.yml new file mode 100644 index 0000000..dd5e00b --- /dev/null +++ b/framework/compose/vllm.yml @@ -0,0 +1,36 @@ +# vLLM, ROCm backend. +# +# NOTE: vLLM's official ROCm support targets datacenter cards (MI300X / +# gfx942). Strix Halo is gfx1151 — support varies by image tag and +# release. If `rocm/vllm:latest` doesn't run on this iGPU, try +# `rocm/vllm-dev:nightly` or build from source against ROCm 7.x. 
+services: + vllm: + image: rocm/vllm:latest + container_name: vllm + restart: unless-stopped + devices: + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + # Numeric GIDs of host's video (44) and render (991) groups — names + # don't exist inside the container. + group_add: + - "44" + - "991" + shm_size: 16g + ipc: host + volumes: + - /models:/models:ro + ports: + - "8000:8000" + command: + - --model + - /models/REPLACE/ME + - --host + - 0.0.0.0 + - --port + - "8000" diff --git a/framework/deploy.py b/framework/deploy.py new file mode 100644 index 0000000..fe233d5 --- /dev/null +++ b/framework/deploy.py @@ -0,0 +1,318 @@ +""" +pyinfra deploy for Framework Desktop (Ryzen AI Max+ 395 / Strix Halo). + +Containerized layout: inference engines (llama.cpp, vLLM, Ollama) run as +docker compose services, each shipping its own ROCm/Vulkan stack. The +host stays minimal — kernel + driver + diagnostics + Docker. + +Run with: + ./run.sh # equivalent to pyinfra inventory.py deploy.py + +Idempotent — re-running only does work that's actually needed. Includes +one-shot cleanup steps for artifacts left over from the earlier native +build (llama.cpp git checkout, symlinks, systemd unit, full ROCm). +""" + +from io import StringIO + +from pyinfra.operations import apt, files, server, systemd +from pyinfra import host + +# --- Tunables ---------------------------------------------------------------- + +# Latest stable as of 2026-04-30. Verify at https://repo.radeon.com/amdgpu-install/. +ROCM_VERSION = "7.2.3" +# The .deb filename has an opaque build suffix — find the exact name at +# the URL above and paste it here. 
+AMDGPU_INSTALL_DEB = "amdgpu-install_7.2.3.70203-1_all.deb" + +SSH_USER = host.data.get("ssh_user", "noise") +MODELS_DIR = "/models" +COMPOSE_DIR = f"/home/{SSH_USER}/docker" + +# --- Phase 1 — Base OS basics ------------------------------------------------ + +apt.update(name="apt update", _sudo=True) + +apt.packages( + name="HWE kernel (>=6.11 for ROCm 7)", + packages=["linux-generic-hwe-24.04"], + _sudo=True, +) + +# User basics + monitoring tools. +apt.packages( + name="Base CLI tools", + packages=[ + "tmux", + "vim", + "htop", + "btop", + "nvtop", + "radeontop", + "git", + "curl", + "ca-certificates", + "unzip", + ], + _sudo=True, +) + +# Vulkan diagnostics on the host (containers ship their own runtime). +apt.packages( + name="Vulkan host diagnostics", + packages=["mesa-vulkan-drivers", "vulkan-tools"], + _sudo=True, +) + +# uv (Python package/tool manager). +server.shell( + name="Install / upgrade uv", + commands=[ + "curl -LsSf https://astral.sh/uv/install.sh | " + "env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL=1 sh", + ], + _sudo=True, +) + +# gpakosz/.tmux config (Oh My Tmux!). +TMUX_CONF_DIR = f"/home/{SSH_USER}/.tmux" +server.shell( + name="Clone gpakosz/.tmux", + commands=[ + f"test -d {TMUX_CONF_DIR}/.git || " + f"git clone --depth 1 https://github.com/gpakosz/.tmux.git {TMUX_CONF_DIR}", + ], + _sudo=True, + _sudo_user=SSH_USER, +) +files.link( + name="Symlink ~/.tmux.conf -> ~/.tmux/.tmux.conf", + path=f"/home/{SSH_USER}/.tmux.conf", + target=f"{TMUX_CONF_DIR}/.tmux.conf", + user=SSH_USER, + group=SSH_USER, + _sudo=True, +) +# Seed the user override file only if absent — preserves customizations. 
+server.shell( + name="Seed ~/.tmux.conf.local (if missing)", + commands=[ + f"test -f /home/{SSH_USER}/.tmux.conf.local || " + f"cp {TMUX_CONF_DIR}/.tmux.conf.local /home/{SSH_USER}/", + ], + _sudo=True, + _sudo_user=SSH_USER, +) + +# --- Tailscale --------------------------------------------------------------- + +# Tailscale's noble repo works fine on later Ubuntus — the package is +# self-contained Go binaries. +files.download( + name="Fetch Tailscale apt key", + src="https://pkgs.tailscale.com/stable/ubuntu/noble.noarmor.gpg", + dest="/usr/share/keyrings/tailscale-archive-keyring.gpg", + mode="644", + _sudo=True, +) +files.download( + name="Fetch Tailscale apt list", + src="https://pkgs.tailscale.com/stable/ubuntu/noble.tailscale-keyring.list", + dest="/etc/apt/sources.list.d/tailscale.list", + mode="644", + _sudo=True, +) +apt.update(name="apt update (Tailscale)", _sudo=True) +apt.packages(name="Install Tailscale", packages=["tailscale"], _sudo=True) +systemd.service( + name="Enable tailscaled", + service="tailscaled", + running=True, + enabled=True, + _sudo=True, +) +# `sudo tailscale up` is interactive (browser auth) — run manually once. + +# --- Docker ----------------------------------------------------------------- + +apt.packages( + name="Install Docker + compose plugin", + packages=["docker.io", "docker-compose-v2"], + _sudo=True, +) +systemd.service( + name="Enable docker daemon", + service="docker", + running=True, + enabled=True, + _sudo=True, +) + +# --- GPU access (host kernel/driver bits only) ------------------------------ + +# AMD's amdgpu-install package adds the ROCm apt repo; we use it just to +# get rocminfo / rocm-smi-lib for host-side diagnostics. Containers ship +# their own ROCm. 
+files.directory( + name="apt keyring dir", + path="/etc/apt/keyrings", + mode="755", + _sudo=True, +) + +amdgpu_deb = f"/tmp/{AMDGPU_INSTALL_DEB}" +amdgpu_url = ( + f"https://repo.radeon.com/amdgpu-install/{ROCM_VERSION}/ubuntu/noble/" + f"{AMDGPU_INSTALL_DEB}" +) +server.shell( + name="Fetch amdgpu-install .deb", + commands=[f"test -f {amdgpu_deb} || curl -fsSL {amdgpu_url} -o {amdgpu_deb}"], + _sudo=True, +) +server.shell( + name="Install amdgpu-install package", + commands=[f"apt install -y {amdgpu_deb}"], + _sudo=True, +) + +# Idempotent cleanup: if a prior run installed the full ROCm userspace +# (~25 GB), tear it down before installing the diagnostic-only subset. +# On a fresh box this is a no-op. +server.shell( + name="Remove full ROCm install if present", + commands=[ + "if dpkg -l rocm-dev 2>/dev/null | grep -q '^ii'; then " + " amdgpu-install -y --uninstall || true; " + "fi", + ], + _sudo=True, +) +apt.packages( + name="ROCm host diagnostics (rocminfo, rocm-smi)", + packages=["rocminfo", "rocm-smi-lib"], + _sudo=True, +) + +# Group membership for /dev/kfd + /dev/dri access (needed for GPU passthrough +# into containers, and for unprivileged host-side rocminfo). +server.group(name="ensure render group", group="render", _sudo=True) +server.group(name="ensure video group", group="video", _sudo=True) +server.user( + name="Add login user to render/video/docker", + user=SSH_USER, + groups=["render", "video", "docker"], + append=True, + _sudo=True, +) + +# --- Storage layout --------------------------------------------------------- + +files.directory( + name="/models root", + path=MODELS_DIR, + user=SSH_USER, + group=SSH_USER, + mode="755", + _sudo=True, +) +for sub in ("moonshotai", "qwen", "deepseek", "zai", "mistralai"): + files.directory( + name=f"{MODELS_DIR}/{sub}", + path=f"{MODELS_DIR}/{sub}", + user=SSH_USER, + group=SSH_USER, + mode="755", + _sudo=True, + ) +# Ollama bind-mounts its content-addressed store here. 
+files.directory( + name=f"{MODELS_DIR}/ollama", + path=f"{MODELS_DIR}/ollama", + user=SSH_USER, + group=SSH_USER, + mode="755", + _sudo=True, +) + +# --- Compose files for inference services ---------------------------------- + +files.directory( + name="Compose root", + path=COMPOSE_DIR, + user=SSH_USER, + group=SSH_USER, + mode="755", + _sudo=True, +) +# Earlier iterations dropped compose at /srv/compose. Idempotent cleanup. +files.directory( + name="Remove old /srv/compose", + path="/srv/compose", + present=False, + _sudo=True, +) +for svc in ("llama", "vllm", "ollama"): + files.directory( + name=f"compose/{svc} dir", + path=f"{COMPOSE_DIR}/{svc}", + user=SSH_USER, + group=SSH_USER, + mode="755", + _sudo=True, + ) + files.put( + name=f"compose/{svc}/docker-compose.yml", + src=f"compose/{svc}.yml", + dest=f"{COMPOSE_DIR}/{svc}/docker-compose.yml", + user=SSH_USER, + group=SSH_USER, + mode="644", + _sudo=True, + ) + +# --- Cleanup of artifacts from the prior native-build deploy ---------------- +# All idempotent — `present=False` is a no-op when the target is absent. 
+ +server.shell( + name="Stop & disable old native llama-server.service", + commands=[ + "systemctl disable --now llama-server.service 2>/dev/null || true", + ], + _sudo=True, +) +files.file( + name="Remove old llama-server.service", + path="/etc/systemd/system/llama-server.service", + present=False, + _sudo=True, +) +files.link( + name="Remove old llama-server-vulkan symlink", + path="/usr/local/bin/llama-server-vulkan", + present=False, + _sudo=True, +) +files.link( + name="Remove old llama-server-rocm symlink", + path="/usr/local/bin/llama-server-rocm", + present=False, + _sudo=True, +) +files.directory( + name="Remove old llama.cpp checkout", + path="/opt/llama.cpp", + present=False, + _sudo=True, +) +server.shell( + name="Stop & remove native Ollama install", + commands=[ + "systemctl disable --now ollama.service 2>/dev/null || true", + "rm -f /etc/systemd/system/ollama.service /usr/local/bin/ollama", + "userdel ollama 2>/dev/null || true", + ], + _sudo=True, +) +systemd.daemon_reload(name="systemctl daemon-reload", _sudo=True) diff --git a/framework/inventory.py b/framework/inventory.py new file mode 100644 index 0000000..b9a8408 --- /dev/null +++ b/framework/inventory.py @@ -0,0 +1,5 @@ +# pyinfra inventory for the Framework Desktop / Strix Halo box. + +framework_desktop = [ + ("framework", {"ssh_hostname": "10.0.0.237", "ssh_user": "noise"}), +] diff --git a/framework/run.sh b/framework/run.sh new file mode 100755 index 0000000..39efb97 --- /dev/null +++ b/framework/run.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Wrapper: invokes pyinfra against the Framework Desktop, forwarding any +# extra args (e.g. --dry, -v) to pyinfra. Assumes NOPASSWD sudo on the box. + +set -euo pipefail +cd "$(dirname "$0")" +exec pyinfra inventory.py deploy.py "$@"