added qwable and ornith

2026-06-26 11:33:35 -04:00
parent 224afbb3a6
commit 705421470a
6 changed files with 384 additions and 7 deletions
--- a/opencode/opencode.json
+++ b/opencode/opencode.json
@@ -56,6 +56,42 @@
          "tool_call": true
        }
      }
+    },
+    "framework-qwable": {
+      "npm": "@ai-sdk/openai-compatible",
+      "name": "Framework Desktop (Strix Halo) — Qwable (Fable-style, llama.cpp)",
+      "options": {
+        "baseURL": "http://10.0.0.70:8082/v1",
+        "apiKey": "dummy"
+      },
+      "models": {
+        "qwable": {
+          "name": "Qwable-3.6-27B (Fable-style reasoning, ~10-15 tok/s, swap-model qwable)",
+          "limit": {
+            "context": 65536,
+            "output": 16384
+          },
+          "tool_call": true
+        }
+      }
+    },
+    "framework-ornith": {
+      "npm": "@ai-sdk/openai-compatible",
+      "name": "Framework Desktop (Strix Halo) — Ornith (agentic coding, llama.cpp)",
+      "options": {
+        "baseURL": "http://10.0.0.70:8083/v1",
+        "apiKey": "dummy"
+      },
+      "models": {
+        "ornith": {
+          "name": "Ornith-1.0-35B-A3B (agentic coding MoE, ~80-100 tok/s, swap-model ornith)",
+          "limit": {
+            "context": 65536,
+            "output": 16384
+          },
+          "tool_call": true
+        }
+      }
    }
  },
  "mcp": {
--- a/pyinfra/framework/compose/ornith.yml
+++ b/pyinfra/framework/compose/ornith.yml
@@ -0,0 +1,119 @@
+# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
+# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
+# toolbox. Same image + unified-memory recipe as compose/llama.yml and
+# compose/qwable.yml; deltas are model path, port, alias.
+# https://github.com/kyuz0/amd-strix-halo-toolboxes
+# Model:   https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
+# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
+#
+# What it's for. A purpose-built *agentic coding* model — strong on
+# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
+# opens with a <think> reasoning block. Candidate daily-driver coder to
+# A/B against Ollama's qwen3-coder:30b.
+#
+# Why it's a great Strix Halo fit. MoE with only ~3B active params per
+# token (256 routed experts, 8 active + shared, 40 layers) — so on this
+# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
+# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
+# coding quality at interactive speed. Quant DOES move decode speed here
+# (speed ∝ active bytes/token): Q4_K_M is the fast default; bump to Q6_K
+# only if quality disappoints (~1.3x slower; Q8_0 is ~2x slower).
+#
+# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
+# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
+# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
+# `ornith` target.
+# `restart: "no"`: you bring it up deliberately via swap-model.
+#
+# Weights. Single-file GGUF (not sharded). Download path on the box
+# (see compose/ornith/README.md):
+#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
+#       'ornith-1.0-35b-Q4_K_M.gguf' \
+#       --local-dir /models/qwen/Ornith-1.0-35B
+# Verify exact filename in the HF repo before downloading.
+#
+# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
+# qwable (8082).
+services:
+  ornith:
+    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
+    container_name: ornith
+    # Manual start only — see header note about GPU contention with
+    # the big models. swap-model brings it up/down.
+    restart: "no"
+    devices:
+      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
+      # only needs dri. Don't drop kfd when on the rocm-* tag.
+      - /dev/kfd:/dev/kfd
+      - /dev/dri:/dev/dri
+    cap_add:
+      - SYS_PTRACE
+    security_opt:
+      - seccomp=unconfined
+    # Numeric GIDs of host's video (44) and render (991) groups —
+    # required for /dev/kfd + /dev/dri access from inside the container.
+    group_add:
+      - "44"
+      - "991"
+    shm_size: 8g
+    ipc: host
+    environment:
+      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
+      # qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline →
+      # these flags merge the rocminfo pools into one ~110 GB arena.
+      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
+      - HSA_XNACK=1
+      - HSA_FORCE_FINE_GRAIN_PCIE=1
+    volumes:
+      - /models:/models:ro
+    ports:
+      - "8083:8083"
+    entrypoint: ["llama-server"]
+    command:
+      - --model
+      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
+      # OpenAI-compatible served name (matches what opencode/curl request
+      # as "model"). Provider-side name lives in opencode.json.
+      - --alias
+      - ornith
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8083"
+      # Push all layers to GPU. "999" = all available. A 35B-A3B Q4
+      # (~21.2 GB) fits the merged arena with huge headroom.
+      - --n-gpu-layers
+      - "999"
+      # 64K to match the other llama.cpp stacks — keeps opencode
+      # auto-compaction behaviour consistent across providers. Ornith's
+      # native context is 262144; ramp --ctx-size toward that if a
+      # long-repo workflow needs it (see compose/ornith/README.md).
+      - --ctx-size
+      - "65536"
+      # No-mmap is the Strix Halo standard — forces full GPU load.
+      - --no-mmap
+      # Flash attention — required for q8_0 KV cache; modern llama-server
+      # takes a value (on/off/auto), bare --flash-attn is deprecated.
+      - --flash-attn
+      - "on"
+      # Quantize KV cache to int8 — halves KV memory at minor/no quality
+      # loss. Matches the other llama.cpp stacks.
+      - --cache-type-k
+      - q8_0
+      - --cache-type-v
+      - q8_0
+      # Use the model's embedded jinja chat template — Ornith inherits
+      # Qwen3.5's chat format (think-block + tool-call grammar) that the
+      # RL fine-tune relies on. Required for tool_calls to parse.
+      - --jinja
+      # Recommended sampling for Ornith (temp 0.6 / top_p 0.95 /
+      # top_k 20). Server-side defaults; opencode can still override
+      # per-request.
+      - --temp
+      - "0.6"
+      - --top-p
+      - "0.95"
+      - --top-k
+      - "20"
+      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
+      - --metrics
--- a/pyinfra/framework/compose/ornith/README.md
+++ b/pyinfra/framework/compose/ornith/README.md
@@ -0,0 +1,152 @@
+# ornith
+
+Ornith-1.0-35B on Strix Halo via `kyuz0:rocm-7.2.2`. DeepReinforce's
+MIT-licensed **agentic-coding** model — a self-improving RL fine-tune of
+**Qwen3.5-35B-A3B** that co-trains its own task scaffolds with the policy.
+Strong on Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style
+`tool_calls`, opens each answer with a `<think>` reasoning block.
+
+OpenAI-compatible endpoint at `http://framework:8083` once running.
+
+## MoE, not dense (read first — this is why it's worth a slot)
+
+Despite "35B" in the name, Ornith-1.0-35B is **MoE with only ~3B active
+params per token** (256 routed experts, 8 active + a shared expert, 40
+layers). On this bandwidth-bound box (256 GB/s) decode speed tracks
+*active* params, so it runs like the 30B-A3B workhorse (**~80-100 tok/s**),
+not like a dense 27/31B (~10-15 tok/s). That's the whole point: near
+frontier-class agentic-coding quality at interactive speed. Candidate to
+replace `qwen3-coder:30b` (Ollama) as the opencode daily driver — A/B
+before promoting.
+
+## Quant choice moves speed here
+
+For MoE, decode bandwidth ∝ *active bytes per token*, so quant tier
+changes t/s (~2x across the range), unlike a model where everything is
+read every token:
+
+| Quant | Size | When |
+|---|---|---|
+| **Q4_K_M** | **21.2 GB** | **default** — fastest, huge arena headroom |
+| Q6_K | 28.5 GB | bump here only if Q4 quality disappoints (~1.3x slower) |
+| Q8_0 | 36.9 GB | max quality, ~half the decode speed — rarely worth it for A3B |
+
+## Coexistence notes
+
+At ~21.2 GB (Q4_K_M) Ornith fits the merged arena easily:
+
+| Concurrent service | Coexists? |
+|---|---|
+| `llama` (Qwen3-Coder-30B, 8080) | ✅ yes |
+| `ollama` (11434) | ✅ yes |
+| `kimi-linear` (vLLM, 8000) | ✅ yes |
+| `qwable` (8082) | ✅ yes (~38 GB total) |
+| `qwen3-235b` (88.8 GB, 8081) | ❌ no — swap-model stops it |
+| `comfyui` (8188) | ❌ no — swap-model stops it |
+
+`restart: "no"`: you bring it up deliberately (via `swap-model ornith`),
+it won't auto-start after a reboot and surprise-collide with a big model.
+
+## Prereqs
+
+- Pyinfra deploy has run (creates `/srv/docker/ornith/` with right perms).
+- BIOS UMA at 0.5 GB + `ttm.pages_limit=33554432` kernel cmdline active.
+  Verify: `cat /proc/cmdline | grep ttm.pages_limit`.
+
+## Download weights (~21.2 GB, single file)
+
+```sh
+# /models/qwen exists via pyinfra; just create the model subdir.
+mkdir -p /models/qwen/Ornith-1.0-35B
+
+hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
+    'ornith-1.0-35b-Q4_K_M.gguf' \
+    --local-dir /models/qwen/Ornith-1.0-35B
+
+# File lands at:
+#   /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf  (~21.2 GB)
+```
+
+Single-file GGUF (not sharded) — point `--model` straight at it. Disk:
+needs ~22 GB free on `/models`. Verify the exact filename in the HF repo
+before downloading (casing matters).
+
+## Bring up
+
+Easy path — `swap-model` handles stop-conflicting-services + waits for
+`/health`:
+
+```sh
+ssh framework swap-model ornith     # ~1-2 min cold load (21.2 GB)
+ssh framework /srv/docker/ornith/smoke.sh    # /health + perf
+```
+
+Manual equivalent (first-ever bring-up, before the image is cached):
+
+```sh
+cd /srv/docker/ornith
+docker compose pull       # already-cached image if you ran llama first
+docker compose up -d
+docker compose logs -f    # wait for "server is listening on http://0.0.0.0:8083"
+
+./smoke.sh                # /health + tiny generation + perf
+```
+
+If `./smoke.sh` reports `predicted_per_second` in the ~80-100 tok/s band,
+it's healthy. <30 tok/s = investigate (likely arena < 100 GB — see
+qwen3-235b/README.md "Troubleshooting" for the arena checks).
+
+## Reasoning + tool calls
+
+Ornith emits a `<think>...</think>` block before the final answer and
+OpenAI-style `tool_calls`. `--jinja` (set in the compose file) uses the
+model's embedded Qwen3.5 chat template, which both rely on. If opencode
+shows raw `<think>` content in responses, the box's llama.cpp build is
+too old to split reasoning — bump the `kyuz0` image tag or add the
+build's reasoning-format flag. Recommended sampling (set server-side):
+temp 0.6 / top_p 0.95 / top_k 20.
+
+## Ramping context
+
+Defaults to 64K to match the other llama.cpp stacks (keeps opencode
+auto-compaction consistent across providers). Ornith's native context is
+262144, and the model is small relative to the arena, so there's room to
+push far higher:
+
+| Stage | `--ctx-size` | Margin in arena |
+|---|---|---|
+| **Current default** | **65536** | huge |
+| Stretch | 131072 | comfortable |
+| Native max | 262144 | watch KV cache size (q8_0 KV helps) |
+
+Edit `--ctx-size` in `docker-compose.yml`, run
+`docker compose down && docker compose up -d`, then re-run `./smoke.sh`.
+
+## Operations
+
+```sh
+docker compose logs -f                  # tail
+docker compose down                     # stop
+docker compose exec ornith bash         # shell in
+./smoke.sh                              # health + perf
+amdgpu_top                              # GPU view on host
+```
+
+## Pin manifest
+
+| Component | Pin |
+|---|---|
+| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` (shared with `llama`/`qwable`) |
+| Weights | `deepreinforce-ai/Ornith-1.0-35B-GGUF` → `ornith-1.0-35b-Q4_K_M.gguf` (~21.2 GB) |
+| Base | Qwen3.5-35B-A3B (MoE: 256 experts, 8 active + shared, 40 layers) |
+| Default port | 8083 |
+| Default context | 65536 (native 262144) |
+| KV cache type | q8_0 (k and v) |
+| License | MIT (model); Qwen3.5 base license also applies |
+
+## Status
+
+Compose artifacts written; awaiting box-side weight pull + bring-up.
+Wired as a `swap-model ornith` target and as the `framework-ornith`
+opencode provider. A/B against `qwen3-coder:30b`; promote to opencode
+default if the agentic-coding quality proves out.
--- a/pyinfra/framework/compose/ornith/smoke.sh
+++ b/pyinfra/framework/compose/ornith/smoke.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Smoke-test the running ornith llama-server (port 8083): /health for
+# liveness, a tiny OpenAI-compatible chat completion, then decode/prompt
+# throughput from the /completion "timings" object (missing fields print
+# as nan). MoE 35B-A3B (~3B active) → expect ~80-100 tok/s decode.
+set -euo pipefail
+
+HOST="${ORNITH_HOST:-127.0.0.1:8083}"
+MODEL="${ORNITH_MODEL:-ornith}"
+
+echo "[smoke] GET /health on $HOST"
+curl -fsS "http://$HOST/health" | python3 -m json.tool
+
+echo
+echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
+# Ornith opens with a <think> block; ask for a terse final answer.
+curl -fsS "http://$HOST/v1/chat/completions" \
+    -H 'Content-Type: application/json' \
+    -d "{
+        \"model\": \"$MODEL\",
+        \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
+        \"max_tokens\": 256,
+        \"temperature\": 0.0
+    }" | python3 -m json.tool
+
+echo
+echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)"
+curl -fsS "http://$HOST/completion" \
+    -H 'Content-Type: application/json' \
+    -d '{
+        "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
+        "n_predict": 128,
+        "temperature": 0.0,
+        "stream": false
+    }' | python3 -c "
+import json, sys
+r = json.load(sys.stdin)
+t = r.get('timings', {})
+print(f'predicted_per_second:  {float(t.get(\"predicted_per_second\", \"nan\")):.2f} tok/s')
+print(f'prompt_per_second:     {float(t.get(\"prompt_per_second\", \"nan\")):.2f} tok/s')
+print(f'predicted_n:           {t.get(\"predicted_n\", \"?\")}')
+print(f'prompt_n:              {t.get(\"prompt_n\", \"?\")}')
+"
+
+echo
+echo "[smoke] passed — expected band ~80-100 tok/s decode (35B-A3B MoE Q4)."
+echo "          <30 tok/s = investigate arena (see qwen3-235b/README.md)."
--- a/pyinfra/framework/deploy.py
+++ b/pyinfra/framework/deploy.py
@@ -436,6 +436,7 @@ for svc in (
    "kimi-linear",
    "qwen3-235b",
    "qwable",
+    "ornith",
    "litellm",
    "comfyui",
    "openwebui",
@@ -726,6 +727,23 @@ for asset, mode in (
        _sudo=True,
    )

+# Ornith operator assets. Same image as llama (kyuz0 rocm-7.2.2); MoE
+# 35B-A3B agentic-coding fine-tune of Qwen3.5. Weights live at
+# /models/qwen/Ornith-1.0-35B via manual `hf download` per the README.
+# swap-model `ornith` target.
+for asset, mode in (
+    ("smoke.sh", "0775"),
+    ("README.md", "0664"),
+):
+    files.put(
+        name=f"ornith: {asset}",
+        src=f"compose/ornith/{asset}",
+        dest=f"{COMPOSE_DIR}/ornith/{asset}",
+        group="docker",
+        mode=mode,
+        _sudo=True,
+    )
+
 # LiteLLM router assets. config.yaml is the source-of-truth model
 # routing table — pyinfra syncs it on every run; edits on the box get
 # overwritten. The .env file holds LITELLM_MASTER_KEY + LITELLM_SALT_KEY
--- a/pyinfra/framework/scripts/swap-model
+++ b/pyinfra/framework/scripts/swap-model
@@ -14,6 +14,7 @@
 #   swap-model 235b         # Qwen3-235B-A22B via llama.cpp (long-task)
 #   swap-model kimi         # Kimi-Linear-48B-A3B via vLLM (long-context)
 #   swap-model qwable       # Qwable-3.6-27B via llama.cpp (Fable-style)
+#   swap-model ornith       # Ornith-1.0-35B via llama.cpp (agentic coding)
 #   swap-model comfyui      # ComfyUI (image generation)
 #   swap-model none         # everything down — free the GPU
 #   swap-model status       # show what's currently up
@@ -45,6 +46,7 @@ declare -A SVC_DIR=(
    [kimi]=kimi-linear
    [235b]=qwen3-235b
    [qwable]=qwable
+    [ornith]=ornith
    [comfyui]=comfyui
 )
 declare -A SVC_HEALTH=(
@@ -53,6 +55,7 @@ declare -A SVC_HEALTH=(
    [kimi]="http://127.0.0.1:8000/v1/models"
    [235b]="http://127.0.0.1:8081/health"
    [qwable]="http://127.0.0.1:8082/health"
+    [ornith]="http://127.0.0.1:8083/health"
    [comfyui]="http://127.0.0.1:8188/"
 )

@@ -68,8 +71,9 @@ plan() {
        235b)    UP=(235b)    ; DOWN=(ollama llama kimi qwable comfyui) ;;
        kimi)    UP=(kimi)    ; DOWN=(235b comfyui) ;;
        qwable)  UP=(qwable)  ; DOWN=(235b comfyui) ;;
-        comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable) ;;
-        none)    UP=()        ; DOWN=(ollama llama kimi 235b qwable comfyui) ;;
+        ornith)  UP=(ornith)  ; DOWN=(235b comfyui) ;;
+        comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable ornith) ;;
+        none)    UP=()        ; DOWN=(ollama llama kimi 235b qwable ornith comfyui) ;;
        *)       return 1 ;;
    esac
 }
@@ -127,7 +131,7 @@ up_svc() {

 show_status() {
    echo "Inference services:"
-    for svc in ollama llama kimi 235b qwable comfyui; do
+    for svc in ollama llama kimi 235b qwable ornith comfyui; do
        local container="${SVC_DIR[$svc]}" state="down" health=""
        if is_running "$container"; then
            state="up"
@@ -150,6 +154,7 @@ Usage:
  swap-model 235b         # Qwen3-235B-A22B  (llama.cpp, long-task, ~5-10 tok/s)
  swap-model kimi         # Kimi-Linear-48B  (vLLM, long-context chat)
  swap-model qwable       # Qwable-3.6-27B   (llama.cpp, Fable-style, ~10-15 tok/s)
+  swap-model ornith       # Ornith-1.0-35B   (llama.cpp, agentic coding MoE, ~80-100 tok/s)
  swap-model comfyui      # ComfyUI          (image generation)
  swap-model none         # everything down  (free the GPU arena)
  swap-model status       # show current state
@@ -158,16 +163,16 @@ Behaviour: stops conflicting services (frees the 110 GB GPU arena),
 starts the target, polls its /health until it returns 200. Wait timeout
 defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT.

-Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with
-each other. 235B and comfyui coexist with nothing. See
-compose/qwen3-235b/README.md for arena math.
+Coexistence: ollama(30B), kimi, qwable(27B, 16.5 GB), and ornith(35B-A3B,
+21 GB) coexist with each other. 235B and comfyui coexist with nothing.
+See compose/qwen3-235b/README.md for arena math.
 EOF
 }

 # --- Main --------------------------------------------------------------------
 TARGET="${1:-}"
 case "$TARGET" in
-    coder|235b|kimi|qwable|comfyui|none) ;;
+    coder|235b|kimi|qwable|ornith|comfyui|none) ;;
    status)         show_status ; exit 0 ;;
    -h|--help|help|"") usage ; exit 0 ;;
    *)