pyinfra/framework/deploy.py

"""
pyinfra deploy for Framework Desktop (Ryzen AI Max+ 395 / Strix Halo).

Containerized layout: inference engines (llama.cpp, vLLM, Ollama) run as
docker compose services, each shipping its own ROCm/Vulkan stack. The
host stays minimal — kernel + driver + diagnostics + Docker.

Run with:
    ./run.sh                # equivalent to pyinfra inventory.py deploy.py

Idempotent — re-running only does work that's actually needed. Includes
one-shot cleanup steps for artifacts left over from the earlier native
build (llama.cpp git checkout, symlinks, systemd unit, full ROCm).
"""

from io import StringIO

from pyinfra.operations import apt, files, server, systemd
from pyinfra import host

# --- Tunables ----------------------------------------------------------------

# Latest stable as of 2026-04-30. Verify at https://repo.radeon.com/amdgpu-install/.
ROCM_VERSION = "7.2.3"
# The .deb filename has an opaque build suffix — find the exact name at
# the URL above and paste it here.
AMDGPU_INSTALL_DEB = "amdgpu-install_7.2.3.70203-1_all.deb"

# amdgpu_top — modern GPU monitor that handles new AMD cards / APUs
# (Strix Halo / gfx1151). Replaces radeontop, which doesn't know about
# RDNA 3.5. Verify at https://github.com/Umio-Yasuno/amdgpu_top/releases.
AMDGPU_TOP_VERSION = "0.11.4-1"
AMDGPU_TOP_DEB = f"amdgpu-top_without_gui_{AMDGPU_TOP_VERSION}_amd64.deb"

SSH_USER = host.data.get("ssh_user", "noise")
MODELS_DIR = "/models"
# /srv is the FHS-blessed location for "data and configuration for
# services this system provides." Owned root:docker with setgid +
# group-write so any docker-group member can manage compose stacks.
COMPOSE_DIR = "/srv/docker"

# --- Phase 1 — Base OS basics ------------------------------------------------

apt.update(name="apt update", _sudo=True)

apt.packages(
    name="HWE kernel (>=6.11 for ROCm 7)",
    packages=["linux-generic-hwe-24.04"],
    _sudo=True,
)

# User basics + monitoring tools.
apt.packages(
    name="Base CLI tools",
    # radeontop intentionally omitted — it predates RDNA 3.5 / Strix Halo
    # and just errors with "no VRAM support". amdgpu_top installed below.
    packages=[
        "tmux",
        "vim",
        "htop",
        "btop",
        "nvtop",
        "git",
        "curl",
        "ca-certificates",
        "unzip",
    ],
    _sudo=True,
)

# Vulkan diagnostics on the host (containers ship their own runtime).
apt.packages(
    name="Vulkan host diagnostics",
    packages=["mesa-vulkan-drivers", "vulkan-tools"],
    _sudo=True,
)

# uv (Python package/tool manager).
server.shell(
    name="Install / upgrade uv",
    commands=[
        "curl -LsSf https://astral.sh/uv/install.sh | "
        "env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL=1 sh",
    ],
    _sudo=True,
)

# huggingface_hub CLI for `huggingface-cli download <repo> --local-dir ...`
# Lands at ~/.local/bin/huggingface-cli for the SSH user. Other users can
# repeat the command themselves.
server.shell(
    name="Install / upgrade huggingface_hub CLI",
    commands=["uv tool install --upgrade 'huggingface_hub[cli]'"],
    _sudo=True,
    _sudo_user=SSH_USER,
)

# gpakosz/.tmux config (Oh My Tmux!).
TMUX_CONF_DIR = f"/home/{SSH_USER}/.tmux"
server.shell(
    name="Clone gpakosz/.tmux",
    commands=[
        f"test -d {TMUX_CONF_DIR}/.git || "
        f"git clone --depth 1 https://github.com/gpakosz/.tmux.git {TMUX_CONF_DIR}",
    ],
    _sudo=True,
    _sudo_user=SSH_USER,
)
files.link(
    name="Symlink ~/.tmux.conf -> ~/.tmux/.tmux.conf",
    path=f"/home/{SSH_USER}/.tmux.conf",
    target=f"{TMUX_CONF_DIR}/.tmux.conf",
    user=SSH_USER,
    group=SSH_USER,
    _sudo=True,
)
# Seed the user override file only if absent — preserves customizations.
server.shell(
    name="Seed ~/.tmux.conf.local (if missing)",
    commands=[
        f"test -f /home/{SSH_USER}/.tmux.conf.local || "
        f"cp {TMUX_CONF_DIR}/.tmux.conf.local /home/{SSH_USER}/",
    ],
    _sudo=True,
    _sudo_user=SSH_USER,
)

# --- Tailscale ---------------------------------------------------------------

# Tailscale's noble repo works fine on later Ubuntus — the package is
# self-contained Go binaries.
files.download(
    name="Fetch Tailscale apt key",
    src="https://pkgs.tailscale.com/stable/ubuntu/noble.noarmor.gpg",
    dest="/usr/share/keyrings/tailscale-archive-keyring.gpg",
    mode="644",
    _sudo=True,
)
files.download(
    name="Fetch Tailscale apt list",
    src="https://pkgs.tailscale.com/stable/ubuntu/noble.tailscale-keyring.list",
    dest="/etc/apt/sources.list.d/tailscale.list",
    mode="644",
    _sudo=True,
)
apt.update(name="apt update (Tailscale)", _sudo=True)
apt.packages(name="Install Tailscale", packages=["tailscale"], _sudo=True)
systemd.service(
    name="Enable tailscaled",
    service="tailscaled",
    running=True,
    enabled=True,
    _sudo=True,
)
# `sudo tailscale up` is interactive (browser auth) — run manually once.

# --- Docker -----------------------------------------------------------------

apt.packages(
    name="Install Docker + compose plugin",
    packages=["docker.io", "docker-compose-v2"],
    _sudo=True,
)
systemd.service(
    name="Enable docker daemon",
    service="docker",
    running=True,
    enabled=True,
    _sudo=True,
)

# lazydocker (TUI for docker). System-wide install via official script.
server.shell(
    name="Install / upgrade lazydocker",
    commands=[
        "curl -fsSL "
        "https://raw.githubusercontent.com/jesseduffield/lazydocker/master/scripts/install_update_linux.sh "
        "| DIR=/usr/local/bin bash",
    ],
    _sudo=True,
)

# --- GPU access (host kernel/driver bits only) ------------------------------

# AMD's amdgpu-install package adds the ROCm apt repo; we use it just to
# get rocminfo for host-side diagnostics. Containers ship
# their own ROCm.
files.directory(
    name="apt keyring dir",
    path="/etc/apt/keyrings",
    mode="755",
    _sudo=True,
)

amdgpu_deb = f"/tmp/{AMDGPU_INSTALL_DEB}"
amdgpu_url = (
    f"https://repo.radeon.com/amdgpu-install/{ROCM_VERSION}/ubuntu/noble/"
    f"{AMDGPU_INSTALL_DEB}"
)
server.shell(
    name="Fetch amdgpu-install .deb",
    commands=[f"test -f {amdgpu_deb} || curl -fsSL {amdgpu_url} -o {amdgpu_deb}"],
    _sudo=True,
)
server.shell(
    name="Install amdgpu-install package",
    commands=[f"apt install -y {amdgpu_deb}"],
    _sudo=True,
)

# Idempotent cleanup: if a prior run installed the full ROCm userspace
# (~25 GB), tear it down before installing the diagnostic-only subset.
# On a fresh box this is a no-op.
server.shell(
    name="Remove full ROCm install if present",
    commands=[
        "if dpkg -l rocm-dev 2>/dev/null | grep -q '^ii'; then "
        "  amdgpu-install -y --uninstall || true; "
        "fi",
    ],
    _sudo=True,
)
apt.packages(
    name="ROCm host diagnostics (rocminfo)",
    # rocminfo is the stable diagnostic. The SMI tool's package name has
    # churned across ROCm releases (rocm-smi-lib → amd-smi-lib in 7.x);
    # install on demand if you need it.
    packages=["rocminfo"],
    _sudo=True,
)

# amdgpu_top — fetch + install the .deb if not already at the pinned
# version. Replaces radeontop for newer AMD cards (Strix Halo / gfx1151).
amdgpu_top_deb = f"/tmp/{AMDGPU_TOP_DEB}"
amdgpu_top_url = (
    f"https://github.com/Umio-Yasuno/amdgpu_top/releases/download/"
    f"v{AMDGPU_TOP_VERSION.split('-')[0]}/{AMDGPU_TOP_DEB}"
)
server.shell(
    name="Install / upgrade amdgpu_top",
    commands=[
        f"dpkg -s amdgpu-top 2>/dev/null | grep -q '^Version: {AMDGPU_TOP_VERSION}' || "
        f"(test -f {amdgpu_top_deb} || curl -fsSL {amdgpu_top_url} -o {amdgpu_top_deb}; "
        f"apt install -y {amdgpu_top_deb})",
    ],
    _sudo=True,
)

# Group membership for /dev/kfd + /dev/dri access (needed for GPU passthrough
# into containers, and for unprivileged host-side rocminfo).
server.group(name="ensure render group", group="render", _sudo=True)
server.group(name="ensure video group", group="video", _sudo=True)
server.user(
    name="Add login user to render/video/docker",
    user=SSH_USER,
    groups=["render", "video", "docker"],
    append=True,
    _sudo=True,
)

# Kernel cmdline tuning per Gygeek/Framework-strix-halo-llm-setup:
#   - amd_iommu=off       — ~6 % memory-read improvement on Strix Halo
#   - amdgpu.gttsize=117760 — ~115 GB GTT ceiling so the GPU can borrow
#                            most of system RAM dynamically. Acts as a
#                            ceiling, not an allocation. See ../../StrixHaloMemory.md
#                            for the UMA-vs-GTT trade-off discussion.
# Requires a reboot to take effect; pyinfra leaves that to you.
files.line(
    name="GRUB cmdline (amd_iommu, gttsize)",
    path="/etc/default/grub",
    line=r"^GRUB_CMDLINE_LINUX_DEFAULT=.*",
    replace='GRUB_CMDLINE_LINUX_DEFAULT="quiet splash amd_iommu=off amdgpu.gttsize=117760"',
    _sudo=True,
)
server.shell(
    name="Regenerate GRUB config",
    commands=["update-grub"],
    _sudo=True,
)

# --- Storage layout ---------------------------------------------------------

files.directory(
    name="/models root",
    path=MODELS_DIR,
    user=SSH_USER,
    group=SSH_USER,
    mode="755",
    _sudo=True,
)
for sub in ("moonshotai", "qwen", "deepseek", "zai", "mistralai"):
    files.directory(
        name=f"{MODELS_DIR}/{sub}",
        path=f"{MODELS_DIR}/{sub}",
        user=SSH_USER,
        group=SSH_USER,
        mode="755",
        _sudo=True,
    )
# Ollama bind-mounts its content-addressed store here.
files.directory(
    name=f"{MODELS_DIR}/ollama",
    path=f"{MODELS_DIR}/ollama",
    user=SSH_USER,
    group=SSH_USER,
    mode="755",
    _sudo=True,
)

# --- Compose files for inference services ----------------------------------

files.directory(
    name="Compose root",
    path=COMPOSE_DIR,
    group="docker",
    mode="2775",          # setgid: new files inherit docker group
    _sudo=True,
)
# Idempotent cleanup of earlier locations.
files.directory(
    name="Remove old /srv/compose",
    path="/srv/compose",
    present=False,
    _sudo=True,
)
files.directory(
    name="Remove old ~/docker compose dir",
    path=f"/home/{SSH_USER}/docker",
    present=False,
    _sudo=True,
)
for svc in (
    "llama",
    "vllm",
    "ollama",
    "openwebui",
    "beszel",
    "openlit",
    "phoenix",
    "openhands",
    "homepage",
    "whisper",
    "piper",
    "faster-whisper",
    "kokoro",
):
    files.directory(
        name=f"compose/{svc} dir",
        path=f"{COMPOSE_DIR}/{svc}",
        group="docker",
        mode="2775",
        _sudo=True,
    )
    files.put(
        name=f"compose/{svc}/docker-compose.yml",
        src=f"compose/{svc}.yml",
        dest=f"{COMPOSE_DIR}/{svc}/docker-compose.yml",
        group="docker",
        mode="664",
        _sudo=True,
    )

# OpenWebUI persistent state (users, conversations, uploaded docs,
# RAG vector index) — bind-mounted into the container.
files.directory(
    name="OpenWebUI data dir",
    path=f"{COMPOSE_DIR}/openwebui/data",
    group="docker",
    mode="2775",
    _sudo=True,
)

# Beszel persistent state (admin account, system list, metric history).
files.directory(
    name="Beszel data dir",
    path=f"{COMPOSE_DIR}/beszel/data",
    group="docker",
    mode="2775",
    _sudo=True,
)
# Sibling .env file Docker Compose auto-reads for variable interpolation.
# Pyinfra creates it empty if missing and never overwrites — the user
# fills in BESZEL_TOKEN= and BESZEL_KEY= from the hub UI on first setup.
# Mode 640 root:docker so docker group members can read it at compose
# parse time but it isn't world-readable.
files.file(
    name="Beszel .env (placeholder)",
    path=f"{COMPOSE_DIR}/beszel/.env",
    present=True,
    mode="640",
    user="root",
    group="docker",
    _sudo=True,
)
# Tear down the previous /etc/default/beszel-agent location if a prior
# deploy created it (idempotent — no-op on a fresh box).
files.file(
    name="Remove legacy /etc/default/beszel-agent",
    path="/etc/default/beszel-agent",
    present=False,
    _sudo=True,
)

# OpenLIT persistent state — Next.js app config + ClickHouse trace store.
files.directory(
    name="OpenLIT data dir",
    path=f"{COMPOSE_DIR}/openlit/data",
    group="docker",
    mode="2775",
    _sudo=True,
)
files.directory(
    name="OpenLIT ClickHouse data dir",
    path=f"{COMPOSE_DIR}/openlit/clickhouse",
    group="docker",
    mode="2775",
    _sudo=True,
)

# Phoenix persistent state (SQLite-backed trace store).
files.directory(
    name="Phoenix data dir",
    path=f"{COMPOSE_DIR}/phoenix/data",
    group="docker",
    mode="2775",
    _sudo=True,
)

# OpenHands state + workspace. Owned by the SSH user (UID 1000) so the
# sandbox containers — which run as SANDBOX_USER_ID=1000 — can write to
# the shared workspace without root-owned files leaking out.
files.directory(
    name="OpenHands state dir",
    path=f"{COMPOSE_DIR}/openhands/state",
    user=SSH_USER,
    group=SSH_USER,
    mode="2775",
    _sudo=True,
)
files.directory(
    name="OpenHands workspace dir",
    path=f"{COMPOSE_DIR}/openhands/workspace",
    user=SSH_USER,
    group=SSH_USER,
    mode="2775",
    _sudo=True,
)

# Homepage config. The compose loop above only copies homepage.yml; the
# YAML config files live in compose/homepage/ on the source side and at
# /srv/docker/homepage/config/ on the box. Source-of-truth is the repo —
# `./run.sh` syncs the config files. Edits should happen here in the
# repo, not on the box, since pyinfra will overwrite drift.
files.directory(
    name="Homepage config dir",
    path=f"{COMPOSE_DIR}/homepage/config",
    group="docker",
    mode="2775",
    _sudo=True,
)
for cfg in (
    "services.yaml",
    "settings.yaml",
    "widgets.yaml",
    "docker.yaml",
    "bookmarks.yaml",
):
    files.put(
        name=f"homepage config: {cfg}",
        src=f"compose/homepage/{cfg}",
        dest=f"{COMPOSE_DIR}/homepage/config/{cfg}",
        group="docker",
        mode="664",
        _sudo=True,
    )

# Voice stack — Wyoming-protocol Whisper (STT) and Piper (TTS). Models
# are downloaded on first start; bind-mounting these dirs survives
# container recreation.
files.directory(
    name="Whisper data dir",
    path=f"{COMPOSE_DIR}/whisper/data",
    group="docker",
    mode="2775",
    _sudo=True,
)
files.directory(
    name="Piper data dir",
    path=f"{COMPOSE_DIR}/piper/data",
    group="docker",
    mode="2775",
    _sudo=True,
)

# OpenAI-compatible voice servers — alternative path to Wyoming, used by
# OpenWebUI's Audio settings (and through it, Conduit on Android).
files.directory(
    name="faster-whisper cache dir",
    path=f"{COMPOSE_DIR}/faster-whisper/cache",
    group="docker",
    mode="2775",
    _sudo=True,
)
files.directory(
    name="Kokoro models dir",
    path=f"{COMPOSE_DIR}/kokoro/models",
    group="docker",
    mode="2775",
    _sudo=True,
)

# --- Cleanup of artifacts from the prior native-build deploy ----------------
# All idempotent — `present=False` is a no-op when the target is absent.

server.shell(
    name="Stop & disable old native llama-server.service",
    commands=[
        "systemctl disable --now llama-server.service 2>/dev/null || true",
    ],
    _sudo=True,
)
files.file(
    name="Remove old llama-server.service",
    path="/etc/systemd/system/llama-server.service",
    present=False,
    _sudo=True,
)
files.link(
    name="Remove old llama-server-vulkan symlink",
    path="/usr/local/bin/llama-server-vulkan",
    present=False,
    _sudo=True,
)
files.link(
    name="Remove old llama-server-rocm symlink",
    path="/usr/local/bin/llama-server-rocm",
    present=False,
    _sudo=True,
)
files.directory(
    name="Remove old llama.cpp checkout",
    path="/opt/llama.cpp",
    present=False,
    _sudo=True,
)
server.shell(
    name="Stop & remove native Ollama install",
    commands=[
        "systemctl disable --now ollama.service 2>/dev/null || true",
        "rm -f /etc/systemd/system/ollama.service /usr/local/bin/ollama",
        "userdel ollama 2>/dev/null || true",
    ],
    _sudo=True,
)
systemd.daemon_reload(name="systemctl daemon-reload", _sudo=True)