added qwable and ornith

2026-06-26 11:33:35 -04:00
parent 224afbb3a6
commit 705421470a
6 changed files with 384 additions and 7 deletions
--- a/pyinfra/framework/compose/ornith.yml
+++ b/pyinfra/framework/compose/ornith.yml
@@ -0,0 +1,119 @@
+# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
+# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
+# toolbox. Same image + unified-memory recipe as compose/llama.yml and
+# compose/qwable.yml; deltas are model path, port, alias.
+# https://github.com/kyuz0/amd-strix-halo-toolboxes
+# Model:   https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
+# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
+#
+# What it's for. A purpose-built *agentic coding* model — strong on
+# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
+# opens with a <think> reasoning block. Candidate daily-driver coder to
+# A/B against Ollama's qwen3-coder:30b.
+#
+# Why it's a great Strix Halo fit. MoE with only ~3B active params per
+# token (256 routed experts, 8 active + shared, 40 layers) — so on this
+# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
+# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
+# coding quality at interactive speed. Quant DOES move decode speed here
+# (speed ∝ 1/active-bytes-per-token): Q4_K_M is the fast default; bump to
+# Q6_K only if quality disappoints (~2x slower).
+#
+# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
+# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
+# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
+# `ornith` target.
+# `restart: "no"`: you bring it up deliberately via swap-model.
+#
+# Weights. Single-file GGUF (not sharded). Download path on the box
+# (see compose/ornith/README.md):
+#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
+#       'ornith-1.0-35b-Q4_K_M.gguf' \
+#       --local-dir /models/qwen/Ornith-1.0-35B
+# Verify exact filename in the HF repo before downloading.
+#
+# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
+# qwable (8082).
+services:
+  ornith:
+    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
+    container_name: ornith
+    # Manual start only — see the header's "Coexistence" note about memory
+    # contention with the big models. swap-model brings it up/down.
+    restart: "no"
+    devices:
+      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
+      # only needs dri. Don't drop kfd when on the rocm-* tag.
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    cap_add:
+      - SYS_PTRACE
+    security_opt:
+      - seccomp=unconfined
+    # Numeric GIDs of host's video (44) and render (991) groups —
+    # required for /dev/kfd + /dev/dri access from inside the container.
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 8g  # NOTE(review): probably a no-op with ipc: host — confirm
+    ipc: host
+    environment:
+      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
+      # qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline →
+      # these flags merge the rocminfo pools into one ~110 GB arena.
+      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
+    volumes:
+      - /models:/models:ro
+    ports:
+      - "8083:8083"
+    entrypoint: ["llama-server"]
+    command:
+      - --model
+      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
+      # OpenAI-compatible served name (matches what opencode/curl request
+      # as "model"). Provider-side name lives in opencode.json.
+      - --alias
+      - ornith
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8083"
+      # Push all layers to GPU. "999" = all available. A 35B-A3B Q4
+      # (~21.2 GB) fits the merged arena with huge headroom.
+      - --n-gpu-layers
+      - "999"
+      # 64K to match the other llama.cpp stacks — keeps opencode
+      # auto-compaction behaviour consistent across providers. Ornith's
+      # native context is 262144; ramp --ctx-size toward that if a
+      # long-repo workflow needs it (see compose/ornith/README.md).
+      - --ctx-size
+      - "65536"
+      # No-mmap is the Strix Halo standard — forces full GPU load.
+      - --no-mmap
+      # Flash attention — required for q8_0 KV cache; modern llama-server
+      # takes a value (on/off/auto), bare --flash-attn is deprecated.
+      - --flash-attn
+      - "on"
+      # Quantize KV cache to 8-bit (q8_0) — roughly halves KV memory vs
+      # f16 at minor/no quality loss. Matches the other llama.cpp stacks.
+      - --cache-type-k
+      - q8_0
+      - --cache-type-v
+      - q8_0
+      # Use the model's embedded jinja chat template — Ornith inherits
+      # Qwen3.5's chat format (think-block + tool-call grammar) that the
+      # RL fine-tune relies on. Required for tool_calls to parse.
+      - --jinja
+      # Recommended sampling for Ornith (temp 0.6 / top_p 0.95 /
+      # top_k 20). Server-side defaults; opencode can still override
+      # per-request.
+      - --temp
+      - "0.6"
+      - --top-p
+      - "0.95"
+      - --top-k
+      - "20"
+      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
+      - --metrics