# localgenai/pyinfra/framework/deploy.py
"""
pyinfra deploy for Framework Desktop (Ryzen AI Max+ 395 / Strix Halo).
Containerized layout: inference engines (llama.cpp, vLLM, Ollama) run as
docker compose services, each shipping its own ROCm/Vulkan stack. The
host stays minimal kernel + driver + diagnostics + Docker.
Run with:
./run.sh # equivalent to pyinfra inventory.py deploy.py
Idempotent re-running only does work that's actually needed. Includes
one-shot cleanup steps for artifacts left over from the earlier native
build (llama.cpp git checkout, symlinks, systemd unit, full ROCm).
"""
from io import StringIO
from pyinfra.operations import apt, files, server, systemd
from pyinfra import host
# --- Tunables ----------------------------------------------------------------
# Latest stable as of 2026-04-30. Verify at https://repo.radeon.com/amdgpu-install/.
ROCM_VERSION = "7.2.3"
# The .deb filename has an opaque build suffix — find the exact name at
# the URL above and paste it here.
AMDGPU_INSTALL_DEB = "amdgpu-install_7.2.3.70203-1_all.deb"
# amdgpu_top — modern GPU monitor that handles new AMD cards / APUs
# (Strix Halo / gfx1151). Replaces radeontop, which doesn't know about
# RDNA 3.5. Verify at https://github.com/Umio-Yasuno/amdgpu_top/releases.
AMDGPU_TOP_VERSION = "0.11.4-1"
AMDGPU_TOP_DEB = f"amdgpu-top_without_gui_{AMDGPU_TOP_VERSION}_amd64.deb"
# nvtop — htop-like GPU monitor with multi-vendor support. Ubuntu's apt
# package predates the gfx1151 sysfs detection improvements; we build
# 3.2.x from source instead. Verify at
# https://github.com/Syllo/nvtop/releases.
NVTOP_VERSION = "3.2.0"
# btop — system monitor with optional GPU panel. AMD GPU support
# requires building with GPU_SUPPORT=true against librocm-smi-dev.
# Ubuntu 24.04's apt package is 1.3.x (no GPU support). 1.4+ requires
# C++23, hence g++-14 from the ubuntu-toolchain-r/test PPA. Verify at
# https://github.com/aristocratos/btop/releases.
BTOP_VERSION = "1.4.7"
SSH_USER = host.data.get("ssh_user", "noise")
MODELS_DIR = "/models"
# /srv is the FHS-blessed location for "data and configuration for
# services this system provides." Owned root:docker with setgid +
# group-write so any docker-group member can manage compose stacks.
COMPOSE_DIR = "/srv/docker"
# --- Phase 1 — Base OS basics ------------------------------------------------
apt.update(name="apt update", _sudo=True)

# ROCm 7 wants a >=6.11 kernel; the 24.04 HWE stack provides it.
apt.packages(
    name="HWE kernel (>=6.11 for ROCm 7)",
    packages=["linux-generic-hwe-24.04"],
    _sudo=True,
)

# Everyday CLI + monitoring tools. Deliberate omissions:
#   radeontop — predates RDNA 3.5 / Strix Halo, errors with "no VRAM
#               support"; amdgpu_top is installed further down instead.
#   nvtop     — apt's 3.0.2 doesn't pick up gfx1151; built from source below.
#   btop      — apt's 1.3.x has no AMD GPU support; built from source below.
_BASE_CLI_PACKAGES = [
    "tmux",
    "vim",
    "htop",
    "git",
    "curl",
    "ca-certificates",
    "unzip",
    # add-apt-repository, needed later for the g++-14 PPA
    "software-properties-common",
]
apt.packages(name="Base CLI tools", packages=_BASE_CLI_PACKAGES, _sudo=True)

# Host-side Vulkan diagnostics only — containers ship their own runtime.
apt.packages(
    name="Vulkan host diagnostics",
    packages=["mesa-vulkan-drivers", "vulkan-tools"],
    _sudo=True,
)

# uv (Python package/tool manager), system-wide under /usr/local/bin.
_UV_INSTALL_CMD = (
    "curl -LsSf https://astral.sh/uv/install.sh"
    " | env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL=1 sh"
)
server.shell(
    name="Install / upgrade uv",
    commands=[_UV_INSTALL_CMD],
    _sudo=True,
)

# huggingface_hub CLI for `huggingface-cli download <repo> --local-dir ...`
# Lands at ~/.local/bin/huggingface-cli for the SSH user. Other users can
# repeat the command themselves.
server.shell(
    name="Install / upgrade huggingface_hub CLI",
    commands=["uv tool install --upgrade 'huggingface_hub[cli]'"],
    _sudo=True,
    _sudo_user=SSH_USER,
)
# gpakosz/.tmux ("Oh My Tmux!") configuration for the login user.
TMUX_CONF_DIR = f"/home/{SSH_USER}/.tmux"
_CLONE_TMUX_CONF = (
    f"test -d {TMUX_CONF_DIR}/.git"
    f" || git clone --depth 1 https://github.com/gpakosz/.tmux.git {TMUX_CONF_DIR}"
)
server.shell(
    name="Clone gpakosz/.tmux",
    commands=[_CLONE_TMUX_CONF],
    _sudo=True,
    _sudo_user=SSH_USER,
)
files.link(
    name="Symlink ~/.tmux.conf -> ~/.tmux/.tmux.conf",
    path=f"/home/{SSH_USER}/.tmux.conf",
    target=f"{TMUX_CONF_DIR}/.tmux.conf",
    user=SSH_USER,
    group=SSH_USER,
    _sudo=True,
)
# The user override file is copied once and never overwritten, so local
# customizations on the box survive re-deploys.
_SEED_TMUX_LOCAL = (
    f"test -f /home/{SSH_USER}/.tmux.conf.local"
    f" || cp {TMUX_CONF_DIR}/.tmux.conf.local /home/{SSH_USER}/"
)
server.shell(
    name="Seed ~/.tmux.conf.local (if missing)",
    commands=[_SEED_TMUX_LOCAL],
    _sudo=True,
    _sudo_user=SSH_USER,
)
# --- Tailscale ---------------------------------------------------------------
# Tailscale's noble repo works fine on later Ubuntus — the package is
# self-contained Go binaries.
_TAILSCALE_BASE = "https://pkgs.tailscale.com/stable/ubuntu"
files.download(
    name="Fetch Tailscale apt key",
    src=f"{_TAILSCALE_BASE}/noble.noarmor.gpg",
    dest="/usr/share/keyrings/tailscale-archive-keyring.gpg",
    mode="644",
    _sudo=True,
)
files.download(
    name="Fetch Tailscale apt list",
    src=f"{_TAILSCALE_BASE}/noble.tailscale-keyring.list",
    dest="/etc/apt/sources.list.d/tailscale.list",
    mode="644",
    _sudo=True,
)
apt.update(name="apt update (Tailscale)", _sudo=True)
apt.packages(name="Install Tailscale", packages=["tailscale"], _sudo=True)
systemd.service(
    name="Enable tailscaled",
    service="tailscaled",
    running=True,
    enabled=True,
    _sudo=True,
)
# `sudo tailscale up` needs interactive browser auth — run it manually once.
# --- Docker -----------------------------------------------------------------
# docker.io + docker-compose-v2 are the distro packages — no Docker Inc.
# apt repo needed for this use case.
apt.packages(
    name="Install Docker + compose plugin",
    packages=["docker.io", "docker-compose-v2"],
    _sudo=True,
)
systemd.service(
    name="Enable docker daemon",
    service="docker",
    running=True,
    enabled=True,
    _sudo=True,
)
# lazydocker (TUI for docker). System-wide install via official script.
# DIR=/usr/local/bin tells the installer script where to place the binary.
server.shell(
    name="Install / upgrade lazydocker",
    commands=[
        "curl -fsSL "
        "https://raw.githubusercontent.com/jesseduffield/lazydocker/master/scripts/install_update_linux.sh "
        "| DIR=/usr/local/bin bash",
    ],
    _sudo=True,
)
# --- GPU access (host kernel/driver bits only) ------------------------------
# AMD's amdgpu-install package adds the ROCm apt repo; we use it just to
# get rocminfo for host-side diagnostics. Containers ship
# their own ROCm.
files.directory(
    name="apt keyring dir",
    path="/etc/apt/keyrings",
    mode="755",
    _sudo=True,
)
amdgpu_deb = f"/tmp/{AMDGPU_INSTALL_DEB}"
amdgpu_url = (
    f"https://repo.radeon.com/amdgpu-install/{ROCM_VERSION}/ubuntu/noble/"
    f"{AMDGPU_INSTALL_DEB}"
)
# Download to a .part file and rename only on success. Previously an
# interrupted curl could leave a truncated {amdgpu_deb}, which the
# `test -f` guard would then trust on every later run, wedging the deploy
# on a corrupt .deb.
server.shell(
    name="Fetch amdgpu-install .deb",
    commands=[
        f"test -f {amdgpu_deb} || "
        f"(curl -fsSL {amdgpu_url} -o {amdgpu_deb}.part && "
        f"mv {amdgpu_deb}.part {amdgpu_deb})",
    ],
    _sudo=True,
)
server.shell(
    name="Install amdgpu-install package",
    # `apt install` on an already-installed .deb is a cheap no-op, so this
    # is safe to run every time.
    commands=[f"apt install -y {amdgpu_deb}"],
    _sudo=True,
)
# Idempotent cleanup: if a prior run installed the full ROCm userspace
# (~25 GB), tear it down before installing the diagnostic-only subset.
# On a fresh box this is a no-op.
server.shell(
    name="Remove full ROCm install if present",
    commands=[
        "if dpkg -l rocm-dev 2>/dev/null | grep -q '^ii'; then "
        " amdgpu-install -y --uninstall || true; "
        "fi",
    ],
    _sudo=True,
)
apt.packages(
    name="ROCm host diagnostics (rocminfo, librocm-smi-dev)",
    # rocminfo is the stable diagnostic. The SMI tool's package name has
    # churned across ROCm releases (rocm-smi-lib → amd-smi-lib in 7.x);
    # install on demand if you need it.
    # librocm-smi-dev provides librocm_smi64.so + headers; btop dlopens
    # it at runtime for AMD GPU monitoring (compiled against headers,
    # loaded dynamically). Cheap to install (~50 MB), no full ROCm tail.
    packages=["rocminfo", "librocm-smi-dev"],
    _sudo=True,
)
# amdgpu_top — fetch + install the .deb if not already at the pinned
# version. Replaces radeontop for newer AMD cards (Strix Halo / gfx1151).
amdgpu_top_deb = f"/tmp/{AMDGPU_TOP_DEB}"
amdgpu_top_url = (
    f"https://github.com/Umio-Yasuno/amdgpu_top/releases/download/"
    f"v{AMDGPU_TOP_VERSION.split('-')[0]}/{AMDGPU_TOP_DEB}"
)
server.shell(
    name="Install / upgrade amdgpu_top",
    commands=[
        # Three fixes over the naive form:
        #  - `$`-anchor the version match so e.g. "0.11.4-1" doesn't also
        #    accept an installed "0.11.4-10";
        #  - download to .part + mv so an interrupted curl can't leave a
        #    truncated .deb that `test -f` would then trust;
        #  - chain with `&&` (not `;`) so apt never runs on a failed fetch.
        f"dpkg -s amdgpu-top 2>/dev/null | grep -q '^Version: {AMDGPU_TOP_VERSION}$' || "
        f"((test -f {amdgpu_top_deb} || "
        f"(curl -fsSL {amdgpu_top_url} -o {amdgpu_top_deb}.part && "
        f"mv {amdgpu_top_deb}.part {amdgpu_top_deb})) && "
        f"apt install -y {amdgpu_top_deb})",
    ],
    _sudo=True,
)
# nvtop from source. Build deps + clone + cmake + install to /usr/local
# (which wins over /usr/bin in $PATH). Idempotent — only rebuilds if
# /usr/local/bin/nvtop's version doesn't match NVTOP_VERSION. Run
# `sudo nvtop` to see container processes (otherwise a non-root user
# only sees its own /proc/<pid>/fdinfo entries).
apt.packages(
    name="nvtop build deps",
    packages=[
        "cmake",
        "build-essential",
        "libncurses-dev",
        "libdrm-dev",
        "libudev-dev",
        "libsystemd-dev",
    ],
    _sudo=True,
)
server.shell(
    name=f"Build & install nvtop {NVTOP_VERSION} from source",
    commands=[
        # Bare `make -j` spawns unbounded jobs and can exhaust RAM; cap
        # parallelism at the core count instead.
        f"/usr/local/bin/nvtop --version 2>/dev/null | grep -q 'version {NVTOP_VERSION}' && exit 0; "
        f"rm -rf /tmp/nvtop-build && "
        f"git clone --depth 1 --branch {NVTOP_VERSION} "
        f"https://github.com/Syllo/nvtop.git /tmp/nvtop-build && "
        f"cmake -S /tmp/nvtop-build -B /tmp/nvtop-build/build "
        f"-DAMDGPU_SUPPORT=ON -DCMAKE_INSTALL_PREFIX=/usr/local && "
        f'make -C /tmp/nvtop-build/build -j"$(nproc)" && '
        f"make -C /tmp/nvtop-build/build install && "
        f"rm -rf /tmp/nvtop-build",
    ],
    _sudo=True,
)
# btop from source. apt's 1.3.x has no AMD GPU support; 1.4+ requires
# C++23 (g++-14), which 24.04 doesn't ship by default — add the
# ubuntu-toolchain-r/test PPA. Build with GPU_SUPPORT=true so btop
# dlopens librocm_smi64 (provided by librocm-smi-dev installed above).
# Idempotent — only rebuilds if installed version doesn't match.
server.shell(
    name="Add ubuntu-toolchain-r/test PPA (g++-14 on 24.04)",
    commands=[
        "grep -rq ubuntu-toolchain-r /etc/apt/sources.list.d/ 2>/dev/null || "
        "add-apt-repository -y ppa:ubuntu-toolchain-r/test",
    ],
    _sudo=True,
)
apt.update(name="apt update (post-toolchain PPA)", _sudo=True)
apt.packages(
    name="btop build deps (g++-14 + ncurses)",
    packages=["g++-14", "libncurses-dev"],
    _sudo=True,
)
server.shell(
    name=f"Build & install btop {BTOP_VERSION} from source",
    commands=[
        # Cap parallelism at the core count — bare `make -j` on a C++23
        # build can exhaust RAM and stall/kill the box.
        f"/usr/local/bin/btop --version 2>/dev/null | grep -q '{BTOP_VERSION}' && exit 0; "
        f"rm -rf /tmp/btop-build && "
        f"git clone --depth 1 --branch v{BTOP_VERSION} "
        f"https://github.com/aristocratos/btop.git /tmp/btop-build && "
        f'make -C /tmp/btop-build GPU_SUPPORT=true CXX=g++-14 -j"$(nproc)" && '
        f"make -C /tmp/btop-build install PREFIX=/usr/local && "
        f"rm -rf /tmp/btop-build",
    ],
    _sudo=True,
)
# Group membership for /dev/kfd + /dev/dri access (needed for GPU passthrough
# into containers, and for unprivileged host-side rocminfo).
server.group(name="ensure render group", group="render", _sudo=True)
server.group(name="ensure video group", group="video", _sudo=True)
server.user(
    name="Add login user to render/video/docker",
    user=SSH_USER,
    groups=["render", "video", "docker"],
    # append: add these groups without removing any existing membership
    append=True,
    _sudo=True,
)
# Kernel cmdline tuning per Gygeek/Framework-strix-halo-llm-setup:
# - amd_iommu=off — ~6 % memory-read improvement on Strix Halo
# - amdgpu.gttsize=117760 — ~115 GB GTT ceiling so the GPU can borrow
# most of system RAM dynamically. Acts as a
# ceiling, not an allocation. See ../../StrixHaloMemory.md
# for the UMA-vs-GTT trade-off discussion.
# Requires a reboot to take effect; pyinfra leaves that to you.
# NOTE(review): this regex-replace overwrites the whole
# GRUB_CMDLINE_LINUX_DEFAULT line, dropping any options added by hand on
# the box — presumably intentional (repo as source of truth); confirm.
files.line(
    name="GRUB cmdline (amd_iommu, gttsize)",
    path="/etc/default/grub",
    line=r"^GRUB_CMDLINE_LINUX_DEFAULT=.*",
    replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=117760"',
    _sudo=True,
)
server.shell(
    name="Regenerate GRUB config",
    commands=["update-grub"],
    _sudo=True,
)
# --- Storage layout ---------------------------------------------------------
files.directory(
    name="/models root",
    path=MODELS_DIR,
    user=SSH_USER,
    group=SSH_USER,
    mode="755",
    _sudo=True,
)
# Per-vendor model subdirectories, plus "ollama" — Ollama bind-mounts its
# content-addressed store there.
for _model_sub in ("moonshotai", "qwen", "deepseek", "zai", "mistralai", "ollama"):
    files.directory(
        name=f"{MODELS_DIR}/{_model_sub}",
        path=f"{MODELS_DIR}/{_model_sub}",
        user=SSH_USER,
        group=SSH_USER,
        mode="755",
        _sudo=True,
    )
# --- Compose files for inference services ----------------------------------
files.directory(
    name="Compose root",
    path=COMPOSE_DIR,
    group="docker",
    # setgid bit: files created inside inherit the docker group
    mode="2775",
    _sudo=True,
)
# Idempotent teardown of the two locations earlier deploys used.
for _legacy_name, _legacy_path in (
    ("Remove old /srv/compose", "/srv/compose"),
    ("Remove old ~/docker compose dir", f"/home/{SSH_USER}/docker"),
):
    files.directory(
        name=_legacy_name,
        path=_legacy_path,
        present=False,
        _sudo=True,
    )
# One directory + one docker-compose.yml per service, synced from the
# repo's compose/ tree.
_COMPOSE_SERVICES = (
    "llama",
    "vllm",
    "ollama",
    "openwebui",
    "beszel",
    "openlit",
    "phoenix",
    "openhands",
    "homepage",
    "whisper",
    "piper",
    "faster-whisper",
    "kokoro",
)
for _svc in _COMPOSE_SERVICES:
    files.directory(
        name=f"compose/{_svc} dir",
        path=f"{COMPOSE_DIR}/{_svc}",
        group="docker",
        mode="2775",
        _sudo=True,
    )
    files.put(
        name=f"compose/{_svc}/docker-compose.yml",
        src=f"compose/{_svc}.yml",
        dest=f"{COMPOSE_DIR}/{_svc}/docker-compose.yml",
        group="docker",
        mode="664",
        _sudo=True,
    )
# OpenWebUI persistent state (users, conversations, uploaded docs,
# RAG vector index) — bind-mounted into the container.
files.directory(
    name="OpenWebUI data dir",
    path=f"{COMPOSE_DIR}/openwebui/data",
    group="docker",
    mode="2775",
    _sudo=True,
)
# Beszel persistent state (admin account, system list, metric history).
files.directory(
    name="Beszel data dir",
    path=f"{COMPOSE_DIR}/beszel/data",
    group="docker",
    mode="2775",
    _sudo=True,
)
# Sibling .env file Docker Compose auto-reads for variable interpolation.
# Pyinfra creates it empty if missing and never overwrites — the user
# fills in BESZEL_TOKEN= and BESZEL_KEY= from the hub UI on first setup.
# Mode 640 root:docker so docker group members can read it at compose
# parse time but it isn't world-readable (it holds credentials).
files.file(
    name="Beszel .env (placeholder)",
    path=f"{COMPOSE_DIR}/beszel/.env",
    present=True,
    mode="640",
    user="root",
    group="docker",
    _sudo=True,
)
# Tear down the previous /etc/default/beszel-agent location if a prior
# deploy created it (idempotent — no-op on a fresh box).
files.file(
    name="Remove legacy /etc/default/beszel-agent",
    path="/etc/default/beszel-agent",
    present=False,
    _sudo=True,
)
# Observability state dirs, all 2775 root:docker:
#   OpenLIT — Next.js app config + ClickHouse trace store.
#   Phoenix — SQLite-backed trace store.
for _obs_name, _obs_path in (
    ("OpenLIT data dir", f"{COMPOSE_DIR}/openlit/data"),
    ("OpenLIT ClickHouse data dir", f"{COMPOSE_DIR}/openlit/clickhouse"),
    ("Phoenix data dir", f"{COMPOSE_DIR}/phoenix/data"),
):
    files.directory(
        name=_obs_name,
        path=_obs_path,
        group="docker",
        mode="2775",
        _sudo=True,
    )
# OpenHands state + workspace. Owned by the SSH user (UID 1000) so the
# sandbox containers — which run as SANDBOX_USER_ID=1000 — can write to
# the shared workspace without root-owned files leaking out.
for _oh_name, _oh_path in (
    ("OpenHands state dir", f"{COMPOSE_DIR}/openhands/state"),
    ("OpenHands workspace dir", f"{COMPOSE_DIR}/openhands/workspace"),
):
    files.directory(
        name=_oh_name,
        path=_oh_path,
        user=SSH_USER,
        group=SSH_USER,
        mode="2775",
        _sudo=True,
    )
# Homepage config. The compose loop above only copies homepage.yml; the
# YAML config files live in compose/homepage/ on the source side and at
# /srv/docker/homepage/config/ on the box. Source-of-truth is the repo —
# `./run.sh` syncs the config files, so edit here in the repo, not on
# the box: pyinfra will overwrite drift.
files.directory(
    name="Homepage config dir",
    path=f"{COMPOSE_DIR}/homepage/config",
    group="docker",
    mode="2775",
    _sudo=True,
)
_HOMEPAGE_CONFIGS = (
    "services.yaml",
    "settings.yaml",
    "widgets.yaml",
    "docker.yaml",
    "bookmarks.yaml",
)
for _cfg in _HOMEPAGE_CONFIGS:
    files.put(
        name=f"homepage config: {_cfg}",
        src=f"compose/homepage/{_cfg}",
        dest=f"{COMPOSE_DIR}/homepage/config/{_cfg}",
        group="docker",
        mode="664",
        _sudo=True,
    )
# Voice stack. Wyoming-protocol Whisper (STT) and Piper (TTS), plus the
# OpenAI-compatible servers used by OpenWebUI's Audio settings (and via
# it, Conduit on Android). Models download on first start; bind-mounting
# these dirs lets them survive container recreation.
for _voice_name, _voice_path in (
    ("Whisper data dir", f"{COMPOSE_DIR}/whisper/data"),
    ("Piper data dir", f"{COMPOSE_DIR}/piper/data"),
    ("faster-whisper cache dir", f"{COMPOSE_DIR}/faster-whisper/cache"),
):
    files.directory(
        name=_voice_name,
        path=_voice_path,
        group="docker",
        mode="2775",
        _sudo=True,
    )
files.directory(
    name="Kokoro models dir",
    # The kokoro-fastapi image runs as UID 1000 (non-root) and downloads
    # models into this dir on first start, so it must be writable by 1000;
    # 2775 root:docker isn't enough since the container isn't in the
    # docker group. Owning it as 1000:1000 matches the image's user.
    path=f"{COMPOSE_DIR}/kokoro/models",
    user="1000",
    group="1000",
    mode="0755",
    _sudo=True,
)
# --- Cleanup of artifacts from the prior native-build deploy ----------------
# All idempotent — `present=False` is a no-op when the target is absent.
server.shell(
    name="Stop & disable old native llama-server.service",
    commands=[
        # `|| true` keeps this green when the unit never existed.
        "systemctl disable --now llama-server.service 2>/dev/null || true",
    ],
    _sudo=True,
)
files.file(
    name="Remove old llama-server.service",
    path="/etc/systemd/system/llama-server.service",
    present=False,
    _sudo=True,
)
files.link(
    name="Remove old llama-server-vulkan symlink",
    path="/usr/local/bin/llama-server-vulkan",
    present=False,
    _sudo=True,
)
files.link(
    name="Remove old llama-server-rocm symlink",
    path="/usr/local/bin/llama-server-rocm",
    present=False,
    _sudo=True,
)
files.directory(
    name="Remove old llama.cpp checkout",
    path="/opt/llama.cpp",
    present=False,
    _sudo=True,
)
# The native Ollama installer created a unit, a binary, and a dedicated
# system user; remove all three (each guarded to no-op when absent).
server.shell(
    name="Stop & remove native Ollama install",
    commands=[
        "systemctl disable --now ollama.service 2>/dev/null || true",
        "rm -f /etc/systemd/system/ollama.service /usr/local/bin/ollama",
        "userdel ollama 2>/dev/null || true",
    ],
    _sudo=True,
)
# Pick up the unit-file removals above in systemd's view of the world.
systemd.daemon_reload(name="systemctl daemon-reload", _sudo=True)