diff --git a/TODO.md b/TODO.md index 203b6fe..28161b7 100644 --- a/TODO.md +++ b/TODO.md @@ -2,13 +2,11 @@ ## ROCm / vLLM on Strix Halo (gfx1151) -The Framework Desktop runs **Ubuntu 26.04 LTS**; AMD only ships ROCm -7.2.3 packages for jammy (22.04) and noble (24.04). We installed the -noble repo but pulled only `rocminfo` + `rocm-smi-lib` for host-side -diagnostics — all heavy ROCm work runs in containers, which ship their -own ROCm stack. This sidesteps the host-side libxml2 ABI mismatch (noble -ships `libxml2.so.2`, 26.04 ships `libxml2.so.16`) that broke the native -HIP toolchain. +The Framework Desktop runs **Ubuntu 24.04 LTS (noble)**, which aligns +with AMD's ROCm 7.x packaging. The deploy installs `rocminfo` and +`librocm-smi-dev` host-side; heavier ROCm bits (full HIP toolchain, +device-mapped libraries) still run inside containers that ship their +own ROCm stack. The host stays slim by design. ### Open questions @@ -17,21 +15,15 @@ HIP toolchain. gfx1151 (RDNA 3.5 consumer) is a different ISA. If the stock image doesn't initialize the device, try `rocm/vllm-dev:nightly` or build from source against ROCm 7.x with `-DAMDGPU_TARGETS=gfx1151`. -- **AMD support for 26.04** — watch https://repo.radeon.com/amdgpu-install//ubuntu/ - for a directory matching the box's codename. AMD historically lags - Ubuntu LTS by 6–12 months for ROCm packaging. -### When 26.04 ROCm packages land +### If you ever want full host-side ROCm -If you ever want to do native ROCm work on the host (rather than via -containers): -1. Bump `ROCM_VERSION` and `AMDGPU_INSTALL_DEB` in `pyinfra/deploy.py` - to the new release. -2. Update the apt source URL path in `deploy.py` if AMD adds a new - release codename (currently hardcoded to `noble`). -3. Add a step that runs `amdgpu-install -y --usecase=rocm --no-dkms` - (the current deploy explicitly avoids this to stay slim). -4. `./run.sh`. +For native ROCm work on the host (compiling HIP kernels, full toolchain): +1. 
Bump `ROCM_VERSION` and `AMDGPU_INSTALL_DEB` in + `pyinfra/framework/deploy.py` to the latest release. +2. Add a step that runs `amdgpu-install -y --usecase=rocm --no-dkms` + (currently avoided to stay slim — ~25 GB toolchain). +3. `./run.sh`. For container-only workflows (current default), no action is needed — container images update independently of the host. diff --git a/pyinfra/framework/README.md b/pyinfra/framework/README.md index e69023f..9460606 100644 --- a/pyinfra/framework/README.md +++ b/pyinfra/framework/README.md @@ -80,10 +80,15 @@ Top of `deploy.py`: the version; find it at https://repo.radeon.com/amdgpu-install/. - `AMDGPU_TOP_VERSION` — bump when a newer release lands at https://github.com/Umio-Yasuno/amdgpu_top/releases. -- `NVTOP_VERSION` — built from source because Ubuntu 26.04's apt - package (3.0.2) predates gfx1151 detection. Bump when a newer release - lands at https://github.com/Syllo/nvtop/releases. Run `sudo nvtop` to - see all GPU processes (non-root only sees the calling user's own). +- `NVTOP_VERSION` — built from source because apt's nvtop predates + gfx1151 detection. Bump when a newer release lands at + https://github.com/Syllo/nvtop/releases. Run `sudo nvtop` to see all + GPU processes (non-root only sees the calling user's own). +- `BTOP_VERSION` — built from source because apt's btop has no AMD GPU + support. 1.4+ requires C++23, hence the ubuntu-toolchain-r/test PPA + for g++-14. At runtime btop dlopens `librocm_smi64.so` (installed by the `librocm-smi-dev` package) for AMD GPU monitoring. + Bump at https://github.com/aristocratos/btop/releases. In btop, Esc + → Options → "show_gpu_info" → On to enable the GPU panel. 
Compose images in `compose/{llama,vllm,ollama,openwebui,beszel,openlit,phoenix,openhands,homepage}.yml` diff --git a/pyinfra/framework/deploy.py b/pyinfra/framework/deploy.py index 7068198..7db71bd 100644 --- a/pyinfra/framework/deploy.py +++ b/pyinfra/framework/deploy.py @@ -32,12 +32,19 @@ AMDGPU_INSTALL_DEB = "amdgpu-install_7.2.3.70203-1_all.deb" AMDGPU_TOP_VERSION = "0.11.4-1" AMDGPU_TOP_DEB = f"amdgpu-top_without_gui_{AMDGPU_TOP_VERSION}_amd64.deb" -# nvtop — htop-like GPU monitor with multi-vendor support. Ubuntu 26.04 -# ships 3.0.2 in apt, which predates the gfx1151 sysfs detection -# improvements; we build 3.2.x from source instead. Verify at +# nvtop — htop-like GPU monitor with multi-vendor support. Ubuntu's apt +# package predates the gfx1151 sysfs detection improvements; we build +# 3.2.x from source instead. Verify at # https://github.com/Syllo/nvtop/releases. NVTOP_VERSION = "3.2.0" +# btop — system monitor with optional GPU panel. AMD GPU support +# requires building with GPU_SUPPORT=true against librocm-smi-dev. +# Ubuntu 24.04's apt package is 1.3.x (no GPU support). 1.4+ requires +# C++23, hence g++-14 from the ubuntu-toolchain-r/test PPA. Verify at +# https://github.com/aristocratos/btop/releases. +BTOP_VERSION = "1.4.7" + SSH_USER = host.data.get("ssh_user", "noise") MODELS_DIR = "/models" # /srv is the FHS-blessed location for "data and configuration for @@ -60,17 +67,19 @@ apt.packages( name="Base CLI tools", # radeontop intentionally omitted — predates RDNA 3.5 / Strix Halo, # errors with "no VRAM support". amdgpu_top installed below. - # nvtop from apt intentionally omitted — Ubuntu 26.04 ships 3.0.2, - # which doesn't pick up gfx1151. Built from source below instead. + # nvtop from apt intentionally omitted — apt's 3.0.2 doesn't pick up + # gfx1151. Built from source below instead. + # btop from apt intentionally omitted — apt's 1.3.x has no AMD GPU + # support. Built from source below. 
packages=[ "tmux", "vim", "htop", - "btop", "git", "curl", "ca-certificates", "unzip", + "software-properties-common", # for add-apt-repository (g++-14 PPA) ], _sudo=True, ) @@ -228,11 +237,14 @@ server.shell( _sudo=True, ) apt.packages( - name="ROCm host diagnostics (rocminfo)", + name="ROCm host diagnostics (rocminfo, librocm-smi-dev)", # rocminfo is the stable diagnostic. The SMI tool's package name has # churned across ROCm releases (rocm-smi-lib → amd-smi-lib in 7.x); # install on demand if you need it. - packages=["rocminfo"], + # librocm-smi-dev provides librocm_smi64.so + headers; btop dlopens + # it at runtime for AMD GPU monitoring (compiled against headers, + # loaded dynamically). Cheap to install (~50 MB), no full ROCm tail. + packages=["rocminfo", "librocm-smi-dev"], _sudo=True, ) @@ -286,6 +298,39 @@ server.shell( _sudo=True, ) +# btop from source. apt's 1.3.x has no AMD GPU support; 1.4+ requires +# C++23 (g++-14), which 24.04 doesn't ship by default — add the +# ubuntu-toolchain-r/test PPA. Build with GPU_SUPPORT=true so btop +# dlopens librocm_smi64 (provided by librocm-smi-dev installed above). +# Idempotent — only rebuilds if installed version doesn't match. 
+server.shell( + name="Add ubuntu-toolchain-r/test PPA (g++-14 on 24.04)", + commands=[ + "grep -rq ubuntu-toolchain-r /etc/apt/sources.list.d/ 2>/dev/null || " + "add-apt-repository -y ppa:ubuntu-toolchain-r/test", + ], + _sudo=True, +) +apt.update(name="apt update (post-toolchain PPA)", _sudo=True) +apt.packages( + name="btop build deps (g++-14 + ncurses)", + packages=["g++-14", "libncurses-dev"], + _sudo=True, +) +server.shell( + name=f"Build & install btop {BTOP_VERSION} from source", + commands=[ + f"/usr/local/bin/btop --version 2>/dev/null | grep -q '{BTOP_VERSION}' && exit 0; " + f"rm -rf /tmp/btop-build && " + f"git clone --depth 1 --branch v{BTOP_VERSION} " + f"https://github.com/aristocratos/btop.git /tmp/btop-build && " + f"make -C /tmp/btop-build GPU_SUPPORT=true CXX=g++-14 -j && " + f"make -C /tmp/btop-build install PREFIX=/usr/local && " + f"rm -rf /tmp/btop-build", + ], + _sudo=True, +) + # Group membership for /dev/kfd + /dev/dri access (needed for GPU passthrough # into containers, and for unprivileged host-side rocminfo). server.group(name="ensure render group", group="render", _sudo=True)