Initial commit: localgenai stack

Containerized local LLM stack for the Framework Desktop / Strix Halo, plus the OpenCode harness on the Mac side. - pyinfra/framework/: pyinfra deploy targeting the box - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override for gfx1151), OpenWebUI - Beszel (host + container + AMD GPU dashboard via sysfs) - OpenLIT (LLM fleet metrics) - Phoenix (per-trace agent waterfall) - OpenHands (autonomous agent in a Docker sandbox) - opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter) - install.sh deploys to ~/.config/opencode/ - StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md: documentation and planning - testing/qwen3-coder-30b/: small evaluation harness Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00
commit 2c4bfefa95
36 changed files with 5265 additions and 0 deletions
--- a/pyinfra/framework/compose/llama.yml
+++ b/pyinfra/framework/compose/llama.yml
@@ -0,0 +1,51 @@
+# llama.cpp server, gfx1151-optimized via kyuz0's Strix Halo toolboxes.
+# https://github.com/kyuz0/amd-strix-halo-toolboxes
+#
+# Tag options on docker.io/kyuz0/amd-strix-halo-toolboxes:
+#   vulkan-radv      — most stable, recommended default (this one)
+#   vulkan-amdvlk    — alternate Vulkan driver, sometimes faster
+#   rocm-7.2.2       — ROCm 7.x; needs /dev/kfd + render group_add (see vllm.yml pattern)
+#   rocm-6.4.4       — ROCm 6.x fallback
+#   rocm7-nightlies  — avoid: caps memory allocation to 64 GB (May 2026)
+#
+# Toolbox images use a shell entrypoint, so we override to launch
+# llama-server directly. Edit the --model path before `docker compose up -d`.
+services:
+  llama:
+    image: kyuz0/amd-strix-halo-toolboxes:vulkan-radv
+    container_name: llama
+    restart: unless-stopped
+    # Pass the host's DRM device nodes through for the Vulkan backend.
+    devices:
+      - /dev/dri:/dev/dri
+    # Host model directory, mounted read-only inside the container.
+    volumes:
+      - /models:/models:ro
+    # Published on all host interfaces; must match --port below.
+    ports:
+      - "8080:8080"
+    entrypoint: ["llama-server"]
+    command:
+      - --model
+      - /models/REPLACE/ME/model.gguf
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8080"
+      # Deliberately oversized layer count, i.e. offload every layer.
+      - --n-gpu-layers
+      - "999"
+      - --ctx-size
+      - "32768"
+      # Required for GPU backends on Strix Halo per Gygeek's setup
+      # guide. Forces full load into GPU memory rather than mmap.
+      - --no-mmap
+      # Flash attention — works on Vulkan too; the big win is on the
+      # ROCm tag where kyuz0's build has rocWMMA acceleration.
+      # Current llama-server takes a value here (on|off|auto); a bare
+      # flag would swallow the next argument as its value and abort.
+      - --flash-attn
+      - "on"
+      # Expose Prometheus metrics at /metrics — scraped by OpenLIT for
+      # tokens/sec, KV-cache use, queue depth, and request latency.
+      - --metrics