initial
This commit is contained in:
68
README.md
Normal file
68
README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# pyinfra: Strix Halo bring-up
|
||||
|
||||
Containerized setup for the Framework Desktop (Ryzen AI Max+ 395, Radeon
|
||||
8060S, 128 GB). The host stays minimal — kernel + driver + Docker +
|
||||
diagnostics. Inference engines (llama.cpp, vLLM, Ollama) run as docker
|
||||
compose services, each shipping its own ROCm/Vulkan stack.
|
||||
|
||||
## Manual prerequisites
|
||||
|
||||
1. **Phase 0** — update Framework BIOS, set GPU UMA carve-out (96 GB).
|
||||
2. **OS install** — Ubuntu Server (24.04 LTS recommended; 26.04 also works
|
||||
but ROCm host-side support is patchy — see `../TODO.md`). Enable SSH,
|
||||
import your laptop key, create user `noise`.
|
||||
3. The host must be reachable at `10.0.0.237` over SSH (edit `inventory.py`
|
||||
if it moves).
|
||||
4. **NOPASSWD sudo for `noise`** — pyinfra's fact layer doesn't reliably
|
||||
thread sudo passwords. One-time setup:
|
||||
```sh
|
||||
ssh noise@10.0.0.237 'echo "noise ALL=(ALL) NOPASSWD: ALL" | sudo tee /etc/sudoers.d/noise-nopasswd && sudo chmod 440 /etc/sudoers.d/noise-nopasswd'
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```sh
|
||||
uv tool install pyinfra
|
||||
./run.sh # equivalent to: pyinfra inventory.py deploy.py
|
||||
./run.sh --dry # any extra args are forwarded to pyinfra
|
||||
```
|
||||
|
||||
Or run it ephemerally without installing: `uvx pyinfra inventory.py deploy.py`.
|
||||
|
||||
## What the deploy does
|
||||
|
||||
- Base CLI: tmux, vim, htop, btop, nvtop, radeontop, uv
|
||||
- Tailscale (run `sudo tailscale up` on the box once, interactively)
|
||||
- Docker engine + compose plugin, user added to `docker` group
|
||||
- ROCm host diagnostics only (`rocminfo`, `rocm-smi`) — no full toolchain
|
||||
- `/models/<vendor>/` layout
|
||||
- `~/docker/{llama,vllm,ollama}/docker-compose.yml` dropped in,
|
||||
not auto-started — you edit the model path then `docker compose up -d`
|
||||
|
||||
If a previous run installed the native llama.cpp build / full ROCm /
|
||||
native Ollama, those are auto-cleaned the next time `./run.sh` runs.
|
||||
|
||||
## After the deploy: starting an inference service
|
||||
|
||||
```sh
|
||||
ssh noise@10.0.0.237
|
||||
sudo tailscale up # one-time, interactive
|
||||
|
||||
# Drop a GGUF somewhere under /models, then:
|
||||
cd ~/docker/llama
|
||||
vim docker-compose.yml # edit the --model path
|
||||
docker compose up -d
|
||||
curl localhost:8080/v1/models # smoke test
|
||||
```
|
||||
|
||||
Same shape for `vllm` (port 8000) and `ollama` (port 11434, no model edit
|
||||
needed — Ollama serves models on demand).
|
||||
|
||||
## Tunables
|
||||
|
||||
Top of `deploy.py`:
|
||||
- `ROCM_VERSION` and `AMDGPU_INSTALL_DEB` — bump when AMD ships a newer
|
||||
release. The .deb filename has a build suffix that doesn't derive from
|
||||
the version; find it at https://repo.radeon.com/amdgpu-install/.
|
||||
|
||||
Compose images in `compose/{llama,vllm,ollama}.yml` — pin tags here.
|
||||
24
compose/llama.yml
Normal file
24
compose/llama.yml
Normal file
@@ -0,0 +1,24 @@
|
||||
---
# llama.cpp HTTP server built with the Vulkan backend (RADV on Strix Halo).
# Point --model at a real GGUF under /models before `docker compose up -d`.
services:
  llama:
    container_name: llama
    image: ghcr.io/ggml-org/llama.cpp:server-vulkan
    restart: unless-stopped
    # GPU device nodes exposed to the container.
    devices:
      - /dev/dri:/dev/dri
    # Host model store, mounted read-only.
    volumes:
      - /models:/models:ro
    ports:
      - "8080:8080"
    # Server arguments, one argv token per list item.
    command:
      - --model
      - /models/REPLACE/ME/model.gguf
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      - --n-gpu-layers
      - "999"
      - --ctx-size
      - "32768"
|
||||
27
compose/ollama.yml
Normal file
27
compose/ollama.yml
Normal file
@@ -0,0 +1,27 @@
|
||||
---
# Ollama, ROCm backend. Serves models on demand — safe to start before
# you've put anything in /models.
#
# Storage: Ollama's content-addressed blob store is bind-mounted under
# /models/ollama so all model data on the host lives under /models.
# Note: Ollama's blobs are SHA256-named, not raw GGUFs — llama.cpp/vLLM
# can't load them directly. Keep curated GGUFs at /models/<vendor>/...
# for those engines.
#
# GPU group IDs: VIDEO_GID and RENDER_GID may be set in the environment or
# in a `.env` file next to this compose file. They default to 44 / 991;
# check the real values on the host with `getent group video render`.
services:
  ollama:
    image: ollama/ollama:rocm
    container_name: ollama
    restart: unless-stopped
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Numeric GIDs of the host's video and render groups — the names don't
    # exist inside the container, but the GIDs need to match the host so
    # /dev/kfd + /dev/dri are accessible. The render GID in particular is
    # assigned per host, hence the overridable defaults.
    group_add:
      - "${VIDEO_GID:-44}"
      - "${RENDER_GID:-991}"
    volumes:
      - /models/ollama:/root/.ollama
      - /models:/models:ro
    ports:
      - "11434:11434"
|
||||
37
compose/vllm.yml
Normal file
37
compose/vllm.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
---
# vLLM, ROCm backend.
#
# NOTE: vLLM's official ROCm support targets datacenter cards (MI300X /
# gfx942). Strix Halo is gfx1151 — support varies by image tag and
# release. If `rocm/vllm:latest` doesn't run on this iGPU, try
# `rocm/vllm-dev:nightly` or build from source against ROCm 7.x.
# Track this in ../../TODO.md.
#
# GPU group IDs: VIDEO_GID and RENDER_GID may be set in the environment or
# in a `.env` file next to this compose file. They default to 44 / 991;
# check the real values on the host with `getent group video render`.
services:
  vllm:
    image: rocm/vllm:latest
    container_name: vllm
    restart: unless-stopped
    # Launch the OpenAI-compatible API server explicitly so the argument
    # list under `command` works whether or not the image defines its own
    # ENTRYPOINT.
    # NOTE(review): assumes `python3` inside the image is the interpreter
    # vLLM is installed for — confirm for the tag you pin.
    entrypoint:
      - python3
      - "-m"
      - vllm.entrypoints.openai.api_server
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of the host's video and render groups — the names don't
    # exist inside the container. The render GID is assigned per host,
    # hence the overridable defaults.
    group_add:
      - "${VIDEO_GID:-44}"
      - "${RENDER_GID:-991}"
    # NOTE(review): with `ipc: host` the container presumably uses the
    # host's /dev/shm, in which case shm_size has no effect; it is kept so
    # the limit still applies if `ipc: host` is ever removed.
    shm_size: 16g
    ipc: host
    volumes:
      - /models:/models:ro
    ports:
      - "8000:8000"
    # Edit the --model path before `docker compose up -d`.
    command:
      - --model
      - /models/REPLACE/ME
      - --host
      - 0.0.0.0
      - --port
      - "8000"
|
||||
318
deploy.py
Normal file
318
deploy.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
pyinfra deploy for Framework Desktop (Ryzen AI Max+ 395 / Strix Halo).
|
||||
|
||||
Containerized layout: inference engines (llama.cpp, vLLM, Ollama) run as
|
||||
docker compose services, each shipping its own ROCm/Vulkan stack. The
|
||||
host stays minimal — kernel + driver + diagnostics + Docker.
|
||||
|
||||
Run with:
|
||||
./run.sh # equivalent to pyinfra inventory.py deploy.py
|
||||
|
||||
Idempotent — re-running only does work that's actually needed. Includes
|
||||
one-shot cleanup steps for artifacts left over from the earlier native
|
||||
build (llama.cpp git checkout, symlinks, systemd unit, full ROCm).
|
||||
"""
|
||||
|
||||
from io import StringIO
|
||||
|
||||
from pyinfra.operations import apt, files, server, systemd
|
||||
from pyinfra import host
|
||||
|
||||
# --- Tunables ----------------------------------------------------------------
|
||||
|
||||
# ROCm release used to build the amdgpu-install download URL. Latest stable
# as of 2026-04-30; verify at https://repo.radeon.com/amdgpu-install/.
ROCM_VERSION = "7.2.3"
# Exact installer filename. It carries an opaque build suffix, so look the
# name up at the URL above and paste it here whenever the version changes.
AMDGPU_INSTALL_DEB = "amdgpu-install_7.2.3.70203-1_all.deb"

# Login user: taken from the inventory's host data, defaulting to "noise".
SSH_USER = host.data.get("ssh_user", "noise")
# Root of the on-host model store.
MODELS_DIR = "/models"
# Per-user directory that receives one sub-directory per compose service.
COMPOSE_DIR = "/home/{}/docker".format(SSH_USER)
|
||||
|
||||
# --- Phase 1 — Base OS basics ------------------------------------------------
|
||||
|
||||
# Refresh the package index before the first install.
apt.update(name="apt update", _sudo=True)

# HWE kernel meta-package; the operation name records why it is needed.
apt.packages(
    packages=["linux-generic-hwe-24.04"],
    name="HWE kernel (>=6.11 for ROCm 7)",
    _sudo=True,
)

# Everyday shell tooling plus CPU/GPU monitors.
_base_cli_packages = [
    "tmux",
    "vim",
    "htop",
    "btop",
    "nvtop",
    "radeontop",
    "git",
    "curl",
    "ca-certificates",
    "unzip",
]
apt.packages(name="Base CLI tools", packages=_base_cli_packages, _sudo=True)

# Host-side Vulkan diagnostics only; containers bring their own runtime.
apt.packages(
    packages=["mesa-vulkan-drivers", "vulkan-tools"],
    name="Vulkan host diagnostics",
    _sudo=True,
)

# uv (Python package/tool manager), installed via the upstream script with
# /usr/local/bin as the install directory. Runs on every deploy.
_uv_install_cmd = (
    "curl -LsSf https://astral.sh/uv/install.sh | "
    "env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL=1 sh"
)
server.shell(name="Install / upgrade uv", commands=[_uv_install_cmd], _sudo=True)

# gpakosz/.tmux ("Oh My Tmux!") configuration for the login user.
TMUX_CONF_DIR = f"/home/{SSH_USER}/.tmux"

# Clone once; an existing checkout (detected by its .git directory) is kept.
_tmux_clone_cmd = (
    f"test -d {TMUX_CONF_DIR}/.git || "
    f"git clone --depth 1 https://github.com/gpakosz/.tmux.git {TMUX_CONF_DIR}"
)
server.shell(
    name="Clone gpakosz/.tmux",
    commands=[_tmux_clone_cmd],
    _sudo=True,
    _sudo_user=SSH_USER,
)

# ~/.tmux.conf is a symlink into the checkout.
files.link(
    name="Symlink ~/.tmux.conf -> ~/.tmux/.tmux.conf",
    target=f"{TMUX_CONF_DIR}/.tmux.conf",
    path=f"/home/{SSH_USER}/.tmux.conf",
    group=SSH_USER,
    user=SSH_USER,
    _sudo=True,
)

# Copy the override file only when the user has none yet, so local
# customizations survive re-runs.
_tmux_seed_cmd = (
    f"test -f /home/{SSH_USER}/.tmux.conf.local || "
    f"cp {TMUX_CONF_DIR}/.tmux.conf.local /home/{SSH_USER}/"
)
server.shell(
    name="Seed ~/.tmux.conf.local (if missing)",
    commands=[_tmux_seed_cmd],
    _sudo=True,
    _sudo_user=SSH_USER,
)
|
||||
|
||||
# --- Tailscale ---------------------------------------------------------------
|
||||
|
||||
# Tailscale's noble repo works fine on later Ubuntus — the package is
|
||||
# self-contained Go binaries.
|
||||
# Fetch the signing key and the apt source list from Tailscale's noble repo.
_tailscale_repo = "https://pkgs.tailscale.com/stable/ubuntu"
_tailscale_downloads = (
    (
        "Fetch Tailscale apt key",
        "noble.noarmor.gpg",
        "/usr/share/keyrings/tailscale-archive-keyring.gpg",
    ),
    (
        "Fetch Tailscale apt list",
        "noble.tailscale-keyring.list",
        "/etc/apt/sources.list.d/tailscale.list",
    ),
)
for _label, _remote_name, _local_path in _tailscale_downloads:
    files.download(
        name=_label,
        src=f"{_tailscale_repo}/{_remote_name}",
        dest=_local_path,
        mode="644",
        _sudo=True,
    )

# Pick up the newly added source, then install and start the daemon.
apt.update(name="apt update (Tailscale)", _sudo=True)
apt.packages(
    name="Install Tailscale",
    packages=["tailscale"],
    _sudo=True,
)
systemd.service(
    name="Enable tailscaled",
    service="tailscaled",
    enabled=True,
    running=True,
    _sudo=True,
)
|
||||
# `sudo tailscale up` is interactive (browser auth) — run manually once.
|
||||
|
||||
# --- Docker -----------------------------------------------------------------
|
||||
|
||||
# Docker engine and the compose v2 plugin from the distribution archive.
_docker_packages = ["docker.io", "docker-compose-v2"]
apt.packages(name="Install Docker + compose plugin", packages=_docker_packages, _sudo=True)

# Start the daemon now and on every boot.
systemd.service(
    name="Enable docker daemon",
    service="docker",
    enabled=True,
    running=True,
    _sudo=True,
)
|
||||
|
||||
# --- GPU access (host kernel/driver bits only) ------------------------------
|
||||
|
||||
# AMD's amdgpu-install package adds the ROCm apt repo; we use it just to
|
||||
# get rocminfo / rocm-smi-lib for host-side diagnostics. Containers ship
|
||||
# their own ROCm.
|
||||
files.directory(
    name="apt keyring dir",
    path="/etc/apt/keyrings",
    mode="755",
    _sudo=True,
)

# Local cache path and download URL for AMD's amdgpu-install package.
amdgpu_deb = f"/tmp/{AMDGPU_INSTALL_DEB}"
amdgpu_url = (
    f"https://repo.radeon.com/amdgpu-install/{ROCM_VERSION}/ubuntu/noble/"
    f"{AMDGPU_INSTALL_DEB}"
)

# Download to a ".part" file and rename only on success. The `test -f`
# guard would otherwise treat a truncated file left by an interrupted
# download as already fetched and never retry it.
server.shell(
    name="Fetch amdgpu-install .deb",
    commands=[
        f"test -f {amdgpu_deb} || "
        f"{{ curl -fsSL {amdgpu_url} -o {amdgpu_deb}.part && "
        f"mv {amdgpu_deb}.part {amdgpu_deb}; }}",
    ],
    _sudo=True,
)
server.shell(
    name="Install amdgpu-install package",
    commands=[f"apt install -y {amdgpu_deb}"],
    _sudo=True,
)

# Cleanup for boxes where a prior run installed the full ROCm userspace
# (~25 GB): tear it down before installing the diagnostic-only subset.
# The guard makes this a no-op when rocm-dev is not installed; a failing
# uninstall is tolerated (`|| true`).
server.shell(
    name="Remove full ROCm install if present",
    commands=[
        "if dpkg -l rocm-dev 2>/dev/null | grep -q '^ii'; then "
        "  amdgpu-install -y --uninstall || true; "
        "fi",
    ],
    _sudo=True,
)

# The amdgpu-install package adds the ROCm apt repo; refresh the index so
# the packages below resolve from it (same pattern as the Tailscale repo).
apt.update(name="apt update (ROCm)", _sudo=True)
apt.packages(
    name="ROCm host diagnostics (rocminfo, rocm-smi)",
    packages=["rocminfo", "rocm-smi-lib"],
    _sudo=True,
)
|
||||
|
||||
# Group membership for /dev/kfd + /dev/dri access (needed for GPU passthrough
|
||||
# into containers, and for unprivileged host-side rocminfo).
|
||||
# Make sure both GPU access groups exist, then add the login user to them
# and to the docker group (appending to the user's existing groups).
_gpu_groups = ["render", "video"]
for _grp in _gpu_groups:
    server.group(name=f"ensure {_grp} group", group=_grp, _sudo=True)

server.user(
    name="Add login user to render/video/docker",
    user=SSH_USER,
    groups=_gpu_groups + ["docker"],
    append=True,
    _sudo=True,
)
|
||||
|
||||
# --- Storage layout ---------------------------------------------------------
|
||||
|
||||
# Every directory in the model store is owned by the login user, mode 755.
_model_dir_attrs = dict(user=SSH_USER, group=SSH_USER, mode="755", _sudo=True)

files.directory(name="/models root", path=MODELS_DIR, **_model_dir_attrs)

# One directory per model vendor, followed by "ollama", which the Ollama
# container bind-mounts as its content-addressed store.
for _leaf in ("moonshotai", "qwen", "deepseek", "zai", "mistralai", "ollama"):
    _models_subdir = f"{MODELS_DIR}/{_leaf}"
    files.directory(name=_models_subdir, path=_models_subdir, **_model_dir_attrs)
|
||||
|
||||
# --- Compose files for inference services ----------------------------------
|
||||
|
||||
files.directory(
    name="Compose root",
    path=COMPOSE_DIR,
    user=SSH_USER,
    group=SSH_USER,
    mode="755",
    _sudo=True,
)
# Earlier iterations dropped compose at /srv/compose. Idempotent cleanup.
files.directory(
    name="Remove old /srv/compose",
    path="/srv/compose",
    present=False,
    _sudo=True,
)
for svc in ("llama", "vllm", "ollama"):
    svc_dir = f"{COMPOSE_DIR}/{svc}"
    files.directory(
        name=f"compose/{svc} dir",
        path=svc_dir,
        user=SSH_USER,
        group=SSH_USER,
        mode="755",
        _sudo=True,
    )
    # Always upload the pristine template next to the live file. Docker
    # compose does not read the ".dist" name, so this is reference-only.
    files.put(
        name=f"compose/{svc}/docker-compose.yml.dist",
        src=f"compose/{svc}.yml",
        dest=f"{svc_dir}/docker-compose.yml.dist",
        user=SSH_USER,
        group=SSH_USER,
        mode="644",
        _sudo=True,
    )
    # Seed the live compose file only if absent. The operator edits it on
    # the box (model path), and overwriting it on every deploy would
    # silently discard those edits. Template changes therefore have to be
    # merged by hand from the ".dist" copy.
    server.shell(
        name=f"Seed compose/{svc}/docker-compose.yml (if missing)",
        commands=[
            f"test -f {svc_dir}/docker-compose.yml || "
            f"install -m 644 {svc_dir}/docker-compose.yml.dist "
            f"{svc_dir}/docker-compose.yml",
        ],
        _sudo=True,
        _sudo_user=SSH_USER,
    )
|
||||
|
||||
# --- Cleanup of artifacts from the prior native-build deploy ----------------
|
||||
# All idempotent — `present=False` is a no-op when the target is absent.
|
||||
|
||||
# Native llama-server unit: stop and disable it (errors ignored), then
# delete the unit file.
_disable_llama_unit = "systemctl disable --now llama-server.service 2>/dev/null || true"
server.shell(
    name="Stop & disable old native llama-server.service",
    commands=[_disable_llama_unit],
    _sudo=True,
)
files.file(
    name="Remove old llama-server.service",
    path="/etc/systemd/system/llama-server.service",
    present=False,
    _sudo=True,
)

# Per-backend launcher symlinks left in /usr/local/bin.
for _backend in ("vulkan", "rocm"):
    files.link(
        name=f"Remove old llama-server-{_backend} symlink",
        path=f"/usr/local/bin/llama-server-{_backend}",
        present=False,
        _sudo=True,
    )

# Source checkout of llama.cpp.
files.directory(
    name="Remove old llama.cpp checkout",
    path="/opt/llama.cpp",
    present=False,
    _sudo=True,
)

# Native Ollama: service, unit file, binary and service account. Each
# command tolerates the target already being gone.
_ollama_cleanup_cmds = [
    "systemctl disable --now ollama.service 2>/dev/null || true",
    "rm -f /etc/systemd/system/ollama.service /usr/local/bin/ollama",
    "userdel ollama 2>/dev/null || true",
]
server.shell(
    name="Stop & remove native Ollama install",
    commands=_ollama_cleanup_cmds,
    _sudo=True,
)

# Make systemd re-read its unit files after the removals above.
systemd.daemon_reload(name="systemctl daemon-reload", _sudo=True)
|
||||
5
inventory.py
Normal file
5
inventory.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# pyinfra inventory for the Framework Desktop / Strix Halo box.
|
||||
|
||||
# Single-host group: (host name, host data) with the SSH connection details.
framework_desktop = [
    (
        "framework",
        dict(ssh_hostname="10.0.0.237", ssh_user="noise"),
    ),
]
|
||||
9
pyinfra-debug.log
Normal file
9
pyinfra-debug.log
Normal file
@@ -0,0 +1,9 @@
|
||||
File "/Users/noise/.local/share/uv/tools/pyinfra/lib/python3.12/site-packages/pyinfra_cli/cli.py", line 263, in cli
|
||||
_main(*args, **kwargs)
|
||||
File "/Users/noise/.local/share/uv/tools/pyinfra/lib/python3.12/site-packages/pyinfra_cli/cli.py", line 442, in _main
|
||||
and not _do_confirm("Detected changes displayed above, skip this step with -y")
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/Users/noise/.local/share/uv/tools/pyinfra/lib/python3.12/site-packages/pyinfra_cli/cli.py", line 468, in _do_confirm
|
||||
v = input()
|
||||
^^^^^^^
|
||||
EOFError: EOF when reading a line
|
||||
Reference in New Issue
Block a user