From 705421470ac9725d8a17f861880aa1b75f72acd2 Mon Sep 17 00:00:00 2001 From: noisedestroyers Date: Fri, 26 Jun 2026 11:33:35 -0400 Subject: [PATCH] added qwable and ornith --- opencode/opencode.json | 36 +++++ pyinfra/framework/compose/ornith.yml | 119 ++++++++++++++++ pyinfra/framework/compose/ornith/README.md | 152 +++++++++++++++++++++ pyinfra/framework/compose/ornith/smoke.sh | 47 +++++++ pyinfra/framework/deploy.py | 18 +++ pyinfra/framework/scripts/swap-model | 19 ++- 6 files changed, 384 insertions(+), 7 deletions(-) create mode 100644 pyinfra/framework/compose/ornith.yml create mode 100644 pyinfra/framework/compose/ornith/README.md create mode 100755 pyinfra/framework/compose/ornith/smoke.sh diff --git a/opencode/opencode.json b/opencode/opencode.json index 44f4c0f..8b035f8 100644 --- a/opencode/opencode.json +++ b/opencode/opencode.json @@ -56,6 +56,42 @@ "tool_call": true } } + }, + "framework-qwable": { + "npm": "@ai-sdk/openai-compatible", + "name": "Framework Desktop (Strix Halo) — Qwable (Fable-style, llama.cpp)", + "options": { + "baseURL": "http://10.0.0.70:8082/v1", + "apiKey": "dummy" + }, + "models": { + "qwable": { + "name": "Qwable-3.6-27B (Fable-style reasoning, ~10-15 tok/s, swap-model qwable)", + "limit": { + "context": 65536, + "output": 16384 + }, + "tool_call": true + } + } + }, + "framework-ornith": { + "npm": "@ai-sdk/openai-compatible", + "name": "Framework Desktop (Strix Halo) — Ornith (agentic coding, llama.cpp)", + "options": { + "baseURL": "http://10.0.0.70:8083/v1", + "apiKey": "dummy" + }, + "models": { + "ornith": { + "name": "Ornith-1.0-35B-A3B (agentic coding MoE, ~80-100 tok/s, swap-model ornith)", + "limit": { + "context": 65536, + "output": 16384 + }, + "tool_call": true + } + } } }, "mcp": { diff --git a/pyinfra/framework/compose/ornith.yml b/pyinfra/framework/compose/ornith.yml new file mode 100644 index 0000000..b29fc37 --- /dev/null +++ b/pyinfra/framework/compose/ornith.yml @@ -0,0 +1,119 @@ +# Ornith-1.0-35B
(DeepReinforce's agentic-coding MoE — a self-improving +# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo +# toolbox. Same image + unified-memory recipe as compose/llama.yml and +# compose/qwable.yml; deltas are model path, port, alias. +# https://github.com/kyuz0/amd-strix-halo-toolboxes +# Model: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B +# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT) +# +# What it's for. A purpose-built *agentic coding* model — strong on +# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls, +# opens with a reasoning block. Candidate daily-driver coder to +# A/B against Ollama's qwen3-coder:30b. +# +# Why it's a great Strix Halo fit. MoE with only ~3B active params per +# token (256 routed experts, 8 active + shared, 40 layers) — so on this +# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse +# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish +# coding quality at interactive speed. Quant DOES move decode speed here +# (speed ∝ active bytes/token): Q4_K_M is the fast default; bump to Q6_K +# only if quality disappoints (~2x slower). +# +# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena +# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside +# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the +# `ornith` target. +# `restart: "no"`: you bring it up deliberately via swap-model. +# +# Weights. Single-file GGUF (not sharded). Download path on the box +# (see compose/ornith/README.md): +# hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \ +# 'ornith-1.0-35b-Q4_K_M.gguf' \ +# --local-dir /models/qwen/Ornith-1.0-35B +# Verify exact filename in the HF repo before downloading. +# +# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081), +# qwable (8082). 
+services: + ornith: + image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2 + container_name: ornith + # Manual start only — see header note about GPU contention with + # the big models. swap-model brings it up/down. + restart: "no" + devices: + # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan + # only needs dri. Don't drop kfd when on the rocm-* tag. + - /dev/kfd:/dev/kfd + - /dev/dri:/dev/dri + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + # Numeric GIDs of host's video (44) and render (991) groups — + # required for /dev/kfd + /dev/dri access from inside the container. + group_add: + - "44" + - "991" + shm_size: 8g + ipc: host + environment: + # Unified-memory recipe (same as compose/llama.yml + kimi-linear + + # qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline → + # these flags merge the rocminfo pools into one ~110 GB arena. + # kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION. + - HSA_XNACK=1 + - HSA_FORCE_FINE_GRAIN_PCIE=1 + volumes: + - /models:/models:ro + ports: + - "8083:8083" + entrypoint: ["llama-server"] + command: + - --model + - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf + # OpenAI-compatible served name (matches what opencode/curl request + # as "model"). Provider-side name lives in opencode.json. + - --alias + - ornith + - --host + - 0.0.0.0 + - --port + - "8083" + # Push all layers to GPU. "999" = all available. A 35B-A3B Q4 + # (~21.2 GB) fits the merged arena with huge headroom. + - --n-gpu-layers + - "999" + # 64K to match the other llama.cpp stacks — keeps opencode + # auto-compaction behaviour consistent across providers. Ornith's + # native context is 262144; ramp --ctx-size toward that if a + # long-repo workflow needs it (see compose/ornith/README.md). + - --ctx-size + - "65536" + # No-mmap is the Strix Halo standard — forces full GPU load. 
+ - --no-mmap + # Flash attention — required for q8_0 KV cache; modern llama-server + # takes a value (on/off/auto), bare --flash-attn is deprecated. + - --flash-attn + - "on" + # Quantize KV cache to int8 — halves KV memory at minor/no quality + # loss. Matches the other llama.cpp stacks. + - --cache-type-k + - q8_0 + - --cache-type-v + - q8_0 + # Use the model's embedded jinja chat template — Ornith inherits + # Qwen3.5's chat format (think-block + tool-call grammar) that the + # RL fine-tune relies on. Required for tool_calls to parse. + - --jinja + # Recommended sampling for Ornith (temp 0.6 / top_p 0.95 / + # top_k 20). Server-side defaults; opencode can still override + # per-request. + - --temp + - "0.6" + - --top-p + - "0.95" + - --top-k + - "20" + # Expose Prometheus metrics at /metrics — scraped by OpenLIT. + - --metrics diff --git a/pyinfra/framework/compose/ornith/README.md b/pyinfra/framework/compose/ornith/README.md new file mode 100644 index 0000000..5c816ff --- /dev/null +++ b/pyinfra/framework/compose/ornith/README.md @@ -0,0 +1,152 @@ +# ornith + +Ornith-1.0-35B on Strix Halo via `kyuz0:rocm-7.2.2`. DeepReinforce's +MIT-licensed **agentic-coding** model — a self-improving RL fine-tune of +**Qwen3.5-35B-A3B** that co-trains its own task scaffolds with the policy. +Strong on Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style +`tool_calls`, opens each answer with a `<think>` reasoning block. + +OpenAI-compatible endpoint at `http://framework:8083` once running. + +## MoE, not dense (read first — this is why it's worth a slot) + +Despite "35B" in the name, Ornith-1.0-35B is **MoE with only ~3B active +params per token** (256 routed experts, 8 active + a shared expert, 40 +layers). On this bandwidth-bound box (256 GB/s) decode speed tracks +*active* params, so it runs like the 30B-A3B workhorse (**~80-100 tok/s**), +not like a dense 27/31B (~10-15 tok/s). That's the whole point: near +frontier-class agentic-coding quality at interactive speed.
Candidate to +replace `qwen3-coder:30b` (Ollama) as the opencode daily driver — A/B +before promoting. + +## Quant choice moves speed here + +For MoE, decode bandwidth ∝ *active bytes per token*, so quant tier +changes t/s (~2x across the range), unlike a model where everything is +read every token: + +| Quant | Size | When | +|---|---|---| +| **Q4_K_M** | **21.2 GB** | **default** — fastest, huge arena headroom | +| Q6_K | 28.5 GB | bump here only if Q4 quality disappoints (~slower) | +| Q8_0 | 36.9 GB | max quality, ~half the decode speed — rarely worth it for A3B | + +## Coexistence notes + +At ~21.2 GB (Q4_K_M) Ornith fits the merged arena easily: + +| Concurrent service | Coexists? | +|---|---| +| `llama` (Qwen3-Coder-30B, 8080) | ✅ yes | +| `ollama` (11434) | ✅ yes | +| `kimi-linear` (vLLM, 8000) | ✅ yes | +| `qwable` (8082) | ✅ yes (~38 GB total) | +| `qwen3-235b` (88.8 GB, 8081) | ❌ no — swap-model stops it | +| `comfyui` (8188) | ❌ no — swap-model stops it | + +`restart: "no"`: you bring it up deliberately (via `swap-model ornith`), +it won't auto-start after a reboot and surprise-collide with a big model. + +## Prereqs + +- Pyinfra deploy has run (creates `/srv/docker/ornith/` with right perms). +- BIOS UMA at 0.5 GB + `ttm.pages_limit=33554432` kernel cmdline active. + Verify: `cat /proc/cmdline | grep ttm.pages_limit`. + +## Download weights (~21.2 GB, single file) + +```sh +# /models/qwen exists via pyinfra; just create the model subdir. +mkdir -p /models/qwen/Ornith-1.0-35B + +hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \ + 'ornith-1.0-35b-Q4_K_M.gguf' \ + --local-dir /models/qwen/Ornith-1.0-35B + +# File lands at: +# /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf (~21.2 GB) +``` + +Single-file GGUF (not sharded) — point `--model` straight at it. Disk: +needs ~22 GB free on `/models`. Verify the exact filename in the HF repo +before downloading (casing matters). 
+ +## Bring up + +Easy path — `swap-model` handles stop-conflicting-services + waits for +`/health`: + +```sh +ssh framework swap-model ornith # ~1-2 min cold load (21.2 GB) +ssh framework /srv/docker/ornith/smoke.sh # /health + perf +``` + +Manual equivalent (first-ever bring-up, before the image is cached): + +```sh +cd /srv/docker/ornith +docker compose pull # already-cached image if you ran llama first +docker compose up -d +docker compose logs -f # wait for "server is listening on http://0.0.0.0:8083" + +./smoke.sh # /health + tiny generation + perf +``` + +If `./smoke.sh` reports `predicted_per_second` in the ~80-100 tok/s band, +it's healthy. <30 tok/s = investigate (likely arena < 100 GB — see +qwen3-235b/README.md "Troubleshooting" for the arena checks). + +## Reasoning + tool calls + +Ornith emits a `<think>` reasoning block before the final answer and +OpenAI-style `tool_calls`. `--jinja` (set in the compose file) uses the +model's embedded Qwen3.5 chat template, which both rely on. If opencode +shows raw `<think>` content in responses, the box's llama.cpp build is +too old to split reasoning — bump the `kyuz0` image tag or add the +build's reasoning-format flag. Recommended sampling (set server-side): +temp 0.6 / top_p 0.95 / top_k 20. + +## Ramping context + +Defaults to 64K to match the other llama.cpp stacks (keeps opencode +auto-compaction consistent across providers). Ornith's native context is +262144, and the model is small relative to the arena, so there's room to +push far higher: + +| Stage | `--ctx-size` | Margin in arena | +|---|---|---| +| **Current default** | **65536** | huge | +| Stretch | 131072 | comfortable | +| Native max | 262144 | watch KV cache size (q8_0 KV helps) | + +Edit `--ctx-size` in `docker-compose.yml`, `docker compose down && docker compose up -d`, +re-run `./smoke.sh`.
+ +## Operations + +```sh +docker compose logs -f # tail +docker compose down # stop +docker compose exec ornith bash # shell in +./smoke.sh # health + perf +amdgpu_top # GPU view on host +``` + +## Pin manifest + +| Component | Pin | +|---|---| +| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` (shared with `llama`/`qwable`) | +| Weights | `deepreinforce-ai/Ornith-1.0-35B-GGUF` → `ornith-1.0-35b-Q4_K_M.gguf` (~21.2 GB) | +| Base | Qwen3.5-35B-A3B (MoE: 256 experts, 8 active + shared, 40 layers) | +| Default port | 8083 | +| Default context | 65536 (native 262144) | +| KV cache type | q8_0 (k and v) | +| License | MIT (model); Qwen3.5 base license also applies | + +## Status + +Compose artifacts written; awaiting box-side weight pull + bring-up. +Wired as a `swap-model ornith` target and as the `framework-ornith` +opencode provider. A/B against `qwen3-coder:30b`; promote to opencode +default if the agentic-coding quality proves out. diff --git a/pyinfra/framework/compose/ornith/smoke.sh b/pyinfra/framework/compose/ornith/smoke.sh new file mode 100755 index 0000000..eed9e40 --- /dev/null +++ b/pyinfra/framework/compose/ornith/smoke.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Smoke-test the running ornith llama-server (port 8083). Hits /health +# for liveness, then a tiny OpenAI-compatible chat completion, then +# measures eval_tps via /completion. MoE 35B-A3B (~3B active) → expect +# ~80-100 tok/s, like the 30B-A3B workhorse (NOT a dense 27/31B). +set -euo pipefail + +HOST="${ORNITH_HOST:-127.0.0.1:8083}" +MODEL="${ORNITH_MODEL:-ornith}" + +echo "[smoke] GET /health on $HOST" +curl -fsS "http://$HOST/health" | python3 -m json.tool + +echo +echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation" +# Ornith opens with a <think> block; ask for a terse final answer.
+curl -fsS "http://$HOST/v1/chat/completions" \ + -H 'Content-Type: application/json' \ + -d "{ + \"model\": \"$MODEL\", + \"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}], + \"max_tokens\": 256, + \"temperature\": 0.0 + }" | python3 -m json.tool + +echo +echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)" +curl -fsS "http://$HOST/completion" \ + -H 'Content-Type: application/json' \ + -d '{ + "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.", + "n_predict": 128, + "temperature": 0.0, + "stream": false + }' | python3 -c " +import json, sys +r = json.load(sys.stdin) +t = r.get('timings', {}) +print(f'predicted_per_second: {t.get(\"predicted_per_second\", float(\"nan\")):.2f} tok/s') +print(f'prompt_per_second: {t.get(\"prompt_per_second\", float(\"nan\")):.2f} tok/s') +print(f'predicted_n: {t.get(\"predicted_n\", \"?\")}') +print(f'prompt_n: {t.get(\"prompt_n\", \"?\")}') +" + +echo +echo "[smoke] passed — expected band ~80-100 tok/s decode (35B-A3B MoE Q4)." +echo " <30 tok/s = investigate arena (see qwen3-235b/README.md)." diff --git a/pyinfra/framework/deploy.py b/pyinfra/framework/deploy.py index ada3f12..b57045e 100644 --- a/pyinfra/framework/deploy.py +++ b/pyinfra/framework/deploy.py @@ -436,6 +436,7 @@ for svc in ( "kimi-linear", "qwen3-235b", "qwable", + "ornith", "litellm", "comfyui", "openwebui", @@ -726,6 +727,23 @@ for asset, mode in ( _sudo=True, ) +# Ornith operator assets. Same image as llama (kyuz0 rocm-7.2.2); MoE +# 35B-A3B agentic-coding fine-tune of Qwen3.5. Weights live at +# /models/qwen/Ornith-1.0-35B via manual `hf download` per the README. +# swap-model `ornith` target. +for asset, mode in ( + ("smoke.sh", "0775"), + ("README.md", "0664"), +): + files.put( + name=f"ornith: {asset}", + src=f"compose/ornith/{asset}", + dest=f"{COMPOSE_DIR}/ornith/{asset}", + group="docker", + mode=mode, + _sudo=True, + ) + # LiteLLM router assets.
config.yaml is the source-of-truth model # routing table — pyinfra syncs it on every run; edits on the box get # overwritten. The .env file holds LITELLM_MASTER_KEY + LITELLM_SALT_KEY diff --git a/pyinfra/framework/scripts/swap-model b/pyinfra/framework/scripts/swap-model index 5078ba4..7e33c47 100755 --- a/pyinfra/framework/scripts/swap-model +++ b/pyinfra/framework/scripts/swap-model @@ -14,6 +14,7 @@ # swap-model 235b # Qwen3-235B-A22B via llama.cpp (long-task) # swap-model kimi # Kimi-Linear-48B-A3B via vLLM (long-context) # swap-model qwable # Qwable-3.6-27B via llama.cpp (Fable-style) +# swap-model ornith # Ornith-1.0-35B via llama.cpp (agentic coding) # swap-model comfyui # ComfyUI (image generation) # swap-model none # everything down — free the GPU # swap-model status # show what's currently up @@ -45,6 +46,7 @@ declare -A SVC_DIR=( [kimi]=kimi-linear [235b]=qwen3-235b [qwable]=qwable + [ornith]=ornith [comfyui]=comfyui ) declare -A SVC_HEALTH=( @@ -53,6 +55,7 @@ declare -A SVC_HEALTH=( [kimi]="http://127.0.0.1:8000/v1/models" [235b]="http://127.0.0.1:8081/health" [qwable]="http://127.0.0.1:8082/health" + [ornith]="http://127.0.0.1:8083/health" [comfyui]="http://127.0.0.1:8188/" ) @@ -68,8 +71,9 @@ plan() { - 235b) UP=(235b) ; DOWN=(ollama llama kimi qwable comfyui) ;; + 235b) UP=(235b) ; DOWN=(ollama llama kimi qwable ornith comfyui) ;; kimi) UP=(kimi) ; DOWN=(235b comfyui) ;; qwable) UP=(qwable) ; DOWN=(235b comfyui) ;; - comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable) ;; - none) UP=() ; DOWN=(ollama llama kimi 235b qwable comfyui) ;; + ornith) UP=(ornith) ; DOWN=(235b comfyui) ;; + comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable ornith) ;; + none) UP=() ; DOWN=(ollama llama kimi 235b qwable ornith comfyui) ;; *) return 1 ;; esac } @@ -127,7 +131,7 @@ up_svc() { show_status() { echo "Inference services:" - for svc in ollama llama kimi 235b qwable ornith comfyui; do local container="${SVC_DIR[$svc]}" state="down" health="" if is_running "$container"; then state="up" @@
-150,6 +154,7 @@ Usage: swap-model 235b # Qwen3-235B-A22B (llama.cpp, long-task, ~5-10 tok/s) swap-model kimi # Kimi-Linear-48B (vLLM, long-context chat) swap-model qwable # Qwable-3.6-27B (llama.cpp, Fable-style, ~10-15 tok/s) + swap-model ornith # Ornith-1.0-35B (llama.cpp, agentic coding MoE, ~80-100 tok/s) swap-model comfyui # ComfyUI (image generation) swap-model none # everything down (free the GPU arena) swap-model status # show current state @@ -158,16 +163,16 @@ Behaviour: stops conflicting services (frees the 110 GB GPU arena), starts the target, polls its /health until it returns 200. Wait timeout defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT. -Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with -each other. 235B and comfyui coexist with nothing. See -compose/qwen3-235b/README.md for arena math. +Coexistence: ollama(30B), kimi, qwable(27B, 16.5 GB), and ornith(35B-A3B, +21 GB) coexist with each other. 235B and comfyui coexist with nothing. +See compose/qwen3-235b/README.md for arena math. EOF } # --- Main -------------------------------------------------------------------- TARGET="${1:-}" case "$TARGET" in - coder|235b|kimi|qwable|comfyui|none) ;; + coder|235b|kimi|qwable|ornith|comfyui|none) ;; status) show_status ; exit 0 ;; -h|--help|help|"") usage ; exit 0 ;; *)