progress 235b

2026-06-08 15:31:50 +01:00
parent a29793032d
commit de1635872f
25 changed files with 1598 additions and 53 deletions
--- a/pyinfra/framework/compose/comfyui.yml
+++ b/pyinfra/framework/compose/comfyui.yml
@@ -0,0 +1,77 @@
+# ComfyUI on Strix Halo gfx1151 via kyuz0/amd-strix-halo-comfyui.
+#
+# Toolbox-style image (Fedora rawhide + ROCm) with /bin/bash as CMD.
+# We override entrypoint to launch ComfyUI's main.py with the flag set
+# gfx1151 needs (--disable-mmap because mmap >64 GB is slow on ROCm;
+# --bf16-vae avoids VAE OOM; --cache-none keeps unified-memory pressure
+# manageable).
+#
+# Coexistence with other services. ComfyUI competes for GPU with
+# kimi-linear (always-resident) and ollama (loads-on-demand). To avoid
+# silent contention this stack is NOT set to restart automatically —
+# bring it up manually (`docker compose up -d`) when you need image gen,
+# and run `docker compose down` afterwards. Mid-term we'll add a
+# load-shed/coordination layer; until then this note is the binding rule.
+#
+# Pin: kyuz0/amd-strix-halo-comfyui:20260213-143435 (sha-7242b4d). Bump
+# deliberately after re-validating Flux/HiDream/LTX2 still work.
+services:
+  comfyui:
+    image: kyuz0/amd-strix-halo-comfyui:20260213-143435
+    container_name: comfyui
+    restart: "no"  # deliberately manual — GPU contention, see file header
+    devices:
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    cap_add:
+      - SYS_PTRACE
+    security_opt:
+      - seccomp=unconfined
+    # Numeric GIDs of the host's video (44) and render (991) groups. The
+    # group names don't exist inside the Fedora-rawhide base, but the GIDs
+    # must match the host for /dev/kfd + /dev/dri access.
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 16g
+    ipc: host
+    environment:
+      # Same unified-memory recipe as kimi-linear.yml: BIOS UMA=0.5 GB +
+      # ttm.pages_limit=33554432 cmdline + this triple. Without these,
+      # PyTorch's HIP allocator only sees the tiny 0.5 GB UMA pool and
+      # can't reach GTT. The kyuz0 image is built against native gfx1151
+      # so HSA_OVERRIDE_GFX_VERSION isn't needed.
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
+      - PYTORCH_HIP_ALLOC_CONF=backend:native,expandable_segments:True,garbage_collection_threshold:0.9
+    volumes:
+      # All ComfyUI state lives under /srv/docker/comfyui/ on the host.
+      # Image's $HOME is /root (Fedora rawhide). Models go in subdirs
+      # under comfy-models/ (text_encoders/, vae/, checkpoints/,
+      # diffusion_models/, unet/, loras/, clip_vision/) — kyuz0's image
+      # populates extra_model_paths.yaml pointing at $HOME/comfy-models.
+      - /srv/docker/comfyui/models:/root/comfy-models
+      - /srv/docker/comfyui/output:/root/comfy-outputs
+      - /srv/docker/comfyui/custom_nodes:/opt/ComfyUI/custom_nodes
+      - /srv/docker/comfyui/workflows:/opt/ComfyUI/user/default/workflows
+    ports:
+      # 8188 = standard ComfyUI port. kyuz0's banner alias uses 8000 but
+      # that would collide with vLLM (compose/kimi-linear.yml).
+      - "8188:8188"
+    # bash -lc loads /etc/profile.d/01-rocm-envs.sh (TORCH_ROCM_AOTRITON,
+    # TORCH_BLAS_PREFER_HIPBLASLT) — without a login shell those don't
+    # apply and ROCm perf regresses.
+    entrypoint: ["/bin/bash", "-lc"]
+    # set_extra_paths.sh writes /opt/ComfyUI/extra_model_paths.yaml so
+    # ComfyUI finds models under $HOME/comfy-models (idempotent; without
+    # it the UI's model dropdowns are empty and templates report "missing
+    # model"). `exec` swaps the login shell for the Python process so stop
+    # signals from Docker go straight to ComfyUI, not to a bash wrapper.
+    command:
+      - >-
+        /opt/set_extra_paths.sh &&
+        cd /opt/ComfyUI && exec python main.py
+        --listen 0.0.0.0 --port 8188
+        --output-directory /root/comfy-outputs
+        --disable-mmap --gpu-only --disable-smart-memory
+        --cache-none --bf16-vae