added qwable and ornith
This commit is contained in:
@@ -56,6 +56,42 @@
|
||||
"tool_call": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"framework-qwable": {
|
||||
"npm": "@ai-sdk/openai-compatible",
|
||||
"name": "Framework Desktop (Strix Halo) — Qwable (Fable-style, llama.cpp)",
|
||||
"options": {
|
||||
"baseURL": "http://10.0.0.70:8082/v1",
|
||||
"apiKey": "dummy"
|
||||
},
|
||||
"models": {
|
||||
"qwable": {
|
||||
"name": "Qwable-3.6-27B (Fable-style reasoning, ~10-15 tok/s, swap-model qwable)",
|
||||
"limit": {
|
||||
"context": 65536,
|
||||
"output": 16384
|
||||
},
|
||||
"tool_call": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"framework-ornith": {
|
||||
"npm": "@ai-sdk/openai-compatible",
|
||||
"name": "Framework Desktop (Strix Halo) — Ornith (agentic coding, llama.cpp)",
|
||||
"options": {
|
||||
"baseURL": "http://10.0.0.70:8083/v1",
|
||||
"apiKey": "dummy"
|
||||
},
|
||||
"models": {
|
||||
"ornith": {
|
||||
"name": "Ornith-1.0-35B-A3B (agentic coding MoE, ~80-100 tok/s, swap-model ornith)",
|
||||
"limit": {
|
||||
"context": 65536,
|
||||
"output": 16384
|
||||
},
|
||||
"tool_call": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"mcp": {
|
||||
|
||||
119
pyinfra/framework/compose/ornith.yml
Normal file
119
pyinfra/framework/compose/ornith.yml
Normal file
@@ -0,0 +1,119 @@
|
||||
# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
|
||||
# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
|
||||
# toolbox. Same image + unified-memory recipe as compose/llama.yml and
|
||||
# compose/qwable.yml; deltas are model path, port, alias.
|
||||
# https://github.com/kyuz0/amd-strix-halo-toolboxes
|
||||
# Model: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
|
||||
# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
|
||||
#
|
||||
# What it's for. A purpose-built *agentic coding* model — strong on
|
||||
# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
|
||||
# opens with a <think> reasoning block. Candidate daily-driver coder to
|
||||
# A/B against Ollama's qwen3-coder:30b.
|
||||
#
|
||||
# Why it's a great Strix Halo fit. MoE with only ~3B active params per
|
||||
# token (256 routed experts, 8 active + shared, 40 layers) — so on this
|
||||
# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
|
||||
# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
|
||||
# coding quality at interactive speed. Quant DOES move decode speed here
|
||||
# (speed ∝ active bytes/token): Q4_K_M is the fast default; bump to Q6_K
|
||||
# only if quality disappoints (slower: ~1.3x the bytes/token; Q8_0 is ~2x).
|
||||
#
|
||||
# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
|
||||
# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
|
||||
# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
|
||||
# `ornith` target.
|
||||
# `restart: "no"`: you bring it up deliberately via swap-model.
|
||||
#
|
||||
# Weights. Single-file GGUF (not sharded). Download path on the box
|
||||
# (see compose/ornith/README.md):
|
||||
# hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
|
||||
# 'ornith-1.0-35b-Q4_K_M.gguf' \
|
||||
# --local-dir /models/qwen/Ornith-1.0-35B
|
||||
# Verify exact filename in the HF repo before downloading.
|
||||
#
|
||||
# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
|
||||
# qwable (8082).
|
||||
services:
|
||||
ornith:
|
||||
image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
|
||||
container_name: ornith
|
||||
# Manual start only — see header note about GPU contention with
|
||||
# the big models. swap-model brings it up/down.
|
||||
restart: "no"
|
||||
devices:
|
||||
# ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
|
||||
# only needs dri. Don't drop kfd when on the rocm-* tag.
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri:/dev/dri
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- seccomp=unconfined
|
||||
# Numeric GIDs of host's video (44) and render (991) groups —
|
||||
# required for /dev/kfd + /dev/dri access from inside the container.
|
||||
group_add:
|
||||
- "44"
|
||||
- "991"
|
||||
shm_size: 8g
|
||||
ipc: host
|
||||
environment:
|
||||
# Unified-memory recipe (same as compose/llama.yml + kimi-linear +
|
||||
# qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline →
|
||||
# these flags merge the rocminfo pools into one ~110 GB arena.
|
||||
# kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
|
||||
- HSA_XNACK=1
|
||||
- HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
volumes:
|
||||
- /models:/models:ro
|
||||
ports:
|
||||
- "8083:8083"
|
||||
entrypoint: ["llama-server"]
|
||||
command:
|
||||
- --model
|
||||
- /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
|
||||
# OpenAI-compatible served name (matches what opencode/curl request
|
||||
# as "model"). Provider-side name lives in opencode.json.
|
||||
- --alias
|
||||
- ornith
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --port
|
||||
- "8083"
|
||||
# Push all layers to GPU. "999" = all available. A 35B-A3B Q4
|
||||
# (~21.2 GB) fits the merged arena with huge headroom.
|
||||
- --n-gpu-layers
|
||||
- "999"
|
||||
# 64K to match the other llama.cpp stacks — keeps opencode
|
||||
# auto-compaction behaviour consistent across providers. Ornith's
|
||||
# native context is 262144; ramp --ctx-size toward that if a
|
||||
# long-repo workflow needs it (see compose/ornith/README.md).
|
||||
- --ctx-size
|
||||
- "65536"
|
||||
# No-mmap is the Strix Halo standard — forces full GPU load.
|
||||
- --no-mmap
|
||||
# Flash attention — required for q8_0 KV cache; modern llama-server
|
||||
# takes a value (on/off/auto), bare --flash-attn is deprecated.
|
||||
- --flash-attn
|
||||
- "on"
|
||||
# Quantize KV cache to int8 — halves KV memory at minor/no quality
|
||||
# loss. Matches the other llama.cpp stacks.
|
||||
- --cache-type-k
|
||||
- q8_0
|
||||
- --cache-type-v
|
||||
- q8_0
|
||||
# Use the model's embedded jinja chat template — Ornith inherits
|
||||
# Qwen3.5's chat format (think-block + tool-call grammar) that the
|
||||
# RL fine-tune relies on. Required for tool_calls to parse.
|
||||
- --jinja
|
||||
# Recommended sampling for Ornith (temp 0.6 / top_p 0.95 /
|
||||
# top_k 20). Server-side defaults; opencode can still override
|
||||
# per-request.
|
||||
- --temp
|
||||
- "0.6"
|
||||
- --top-p
|
||||
- "0.95"
|
||||
- --top-k
|
||||
- "20"
|
||||
# Expose Prometheus metrics at /metrics — scraped by OpenLIT.
|
||||
- --metrics
|
||||
152
pyinfra/framework/compose/ornith/README.md
Normal file
152
pyinfra/framework/compose/ornith/README.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# ornith
|
||||
|
||||
Ornith-1.0-35B on Strix Halo via `kyuz0:rocm-7.2.2`. DeepReinforce's
|
||||
MIT-licensed **agentic-coding** model — a self-improving RL fine-tune of
|
||||
**Qwen3.5-35B-A3B** that co-trains its own task scaffolds with the policy.
|
||||
Strong on Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style
|
||||
`tool_calls`, opens each answer with a `<think>` reasoning block.
|
||||
|
||||
OpenAI-compatible endpoint at `http://framework:8083` once running.
|
||||
|
||||
## MoE, not dense (read first — this is why it's worth a slot)
|
||||
|
||||
Despite "35B" in the name, Ornith-1.0-35B is **MoE with only ~3B active
|
||||
params per token** (256 routed experts, 8 active + a shared expert, 40
|
||||
layers). On this bandwidth-bound box (256 GB/s) decode speed tracks
|
||||
*active* params, so it runs like the 30B-A3B workhorse (**~80-100 tok/s**),
|
||||
not like a dense 27/31B (~10-15 tok/s). That's the whole point: near
|
||||
frontier-class agentic-coding quality at interactive speed. Candidate to
|
||||
replace `qwen3-coder:30b` (Ollama) as the opencode daily driver — A/B
|
||||
before promoting.
|
||||
|
||||
## Quant choice moves speed here
|
||||
|
||||
For MoE, decode bandwidth ∝ *active bytes per token*, so quant tier
|
||||
changes t/s (~2x across the range), unlike a model where everything is
|
||||
read every token:
|
||||
|
||||
| Quant | Size | When |
|
||||
|---|---|---|
|
||||
| **Q4_K_M** | **21.2 GB** | **default** — fastest, huge arena headroom |
|
||||
| Q6_K | 28.5 GB | bump here only if Q4 quality disappoints (somewhat slower — ~1.3x the bytes per token) |
|
||||
| Q8_0 | 36.9 GB | max quality, ~half the decode speed — rarely worth it for A3B |
|
||||
|
||||
## Coexistence notes
|
||||
|
||||
At ~21.2 GB (Q4_K_M) Ornith fits the merged arena easily:
|
||||
|
||||
| Concurrent service | Coexists? |
|
||||
|---|---|
|
||||
| `llama` (Qwen3-Coder-30B, 8080) | ✅ yes |
|
||||
| `ollama` (11434) | ✅ yes |
|
||||
| `kimi-linear` (vLLM, 8000) | ✅ yes |
|
||||
| `qwable` (8082) | ✅ yes (~38 GB total) |
|
||||
| `qwen3-235b` (88.8 GB, 8081) | ❌ no — swap-model stops it |
|
||||
| `comfyui` (8188) | ❌ no — swap-model stops it |
|
||||
|
||||
`restart: "no"`: you bring it up deliberately (via `swap-model ornith`),
|
||||
it won't auto-start after a reboot and surprise-collide with a big model.
|
||||
|
||||
## Prereqs
|
||||
|
||||
- Pyinfra deploy has run (creates `/srv/docker/ornith/` with right perms).
|
||||
- BIOS UMA at 0.5 GB + `ttm.pages_limit=33554432` kernel cmdline active.
|
||||
Verify: `cat /proc/cmdline | grep ttm.pages_limit`.
|
||||
|
||||
## Download weights (~21.2 GB, single file)
|
||||
|
||||
```sh
|
||||
# /models/qwen exists via pyinfra; just create the model subdir.
|
||||
mkdir -p /models/qwen/Ornith-1.0-35B
|
||||
|
||||
hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
|
||||
'ornith-1.0-35b-Q4_K_M.gguf' \
|
||||
--local-dir /models/qwen/Ornith-1.0-35B
|
||||
|
||||
# File lands at:
|
||||
# /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf (~21.2 GB)
|
||||
```
|
||||
|
||||
Single-file GGUF (not sharded) — point `--model` straight at it. Disk:
|
||||
needs ~22 GB free on `/models`. Verify the exact filename in the HF repo
|
||||
before downloading (casing matters).
|
||||
|
||||
## Bring up
|
||||
|
||||
Easy path — `swap-model` handles stop-conflicting-services + waits for
|
||||
`/health`:
|
||||
|
||||
```sh
|
||||
ssh framework swap-model ornith # ~1-2 min cold load (21.2 GB)
|
||||
ssh framework /srv/docker/ornith/smoke.sh # /health + perf
|
||||
```
|
||||
|
||||
Manual equivalent (first-ever bring-up, before the image is cached):
|
||||
|
||||
```sh
|
||||
cd /srv/docker/ornith
|
||||
docker compose pull # quick if the image is already cached (e.g. you ran llama first)
|
||||
docker compose up -d
|
||||
docker compose logs -f # wait for "server is listening on http://0.0.0.0:8083"
|
||||
|
||||
./smoke.sh # /health + tiny generation + perf
|
||||
```
|
||||
|
||||
If `./smoke.sh` reports `predicted_per_second` in the ~80-100 tok/s band,
|
||||
it's healthy. <30 tok/s = investigate (likely arena < 100 GB — see
|
||||
qwen3-235b/README.md "Troubleshooting" for the arena checks).
|
||||
|
||||
## Reasoning + tool calls
|
||||
|
||||
Ornith emits a `<think>...</think>` block before the final answer and
|
||||
OpenAI-style `tool_calls`. `--jinja` (set in the compose file) uses the
|
||||
model's embedded Qwen3.5 chat template, which both rely on. If opencode
|
||||
shows raw `<think>` content in responses, the box's llama.cpp build is
|
||||
too old to split reasoning — bump the `kyuz0` image tag or add the
|
||||
build's reasoning-format flag. Recommended sampling (set server-side):
|
||||
temp 0.6 / top_p 0.95 / top_k 20.
|
||||
|
||||
## Ramping context
|
||||
|
||||
Defaults to 64K to match the other llama.cpp stacks (keeps opencode
|
||||
auto-compaction consistent across providers). Ornith's native context is
|
||||
262144, and the model is small relative to the arena, so there's room to
|
||||
push far higher:
|
||||
|
||||
| Stage | `--ctx-size` | Margin in arena |
|
||||
|---|---|---|
|
||||
| **Current default** | **65536** | huge |
|
||||
| Stretch | 131072 | comfortable |
|
||||
| Native max | 262144 | watch KV cache size (q8_0 KV helps) |
|
||||
|
||||
Edit `--ctx-size` in `docker-compose.yml`, run `docker compose down && docker compose up -d`,
|
||||
re-run `./smoke.sh`.
|
||||
|
||||
## Operations
|
||||
|
||||
```sh
|
||||
docker compose logs -f # tail
|
||||
docker compose down # stop
|
||||
docker compose exec ornith bash # shell in
|
||||
./smoke.sh # health + perf
|
||||
amdgpu_top # GPU view on host
|
||||
```
|
||||
|
||||
## Pin manifest
|
||||
|
||||
| Component | Pin |
|
||||
|---|---|
|
||||
| Image | `kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2` (shared with `llama`/`qwable`) |
|
||||
| Weights | `deepreinforce-ai/Ornith-1.0-35B-GGUF` → `ornith-1.0-35b-Q4_K_M.gguf` (~21.2 GB) |
|
||||
| Base | Qwen3.5-35B-A3B (MoE: 256 experts, 8 active + shared, 40 layers) |
|
||||
| Default port | 8083 |
|
||||
| Default context | 65536 (native 262144) |
|
||||
| KV cache type | q8_0 (k and v) |
|
||||
| License | MIT (model); Qwen3.5 base license also applies |
|
||||
|
||||
## Status
|
||||
|
||||
Compose artifacts written; awaiting box-side weight pull + bring-up.
|
||||
Wired as a `swap-model ornith` target and as the `framework-ornith`
|
||||
opencode provider. A/B against `qwen3-coder:30b`; promote to opencode
|
||||
default if the agentic-coding quality proves out.
|
||||
47
pyinfra/framework/compose/ornith/smoke.sh
Executable file
47
pyinfra/framework/compose/ornith/smoke.sh
Executable file
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
|
||||
# Smoke-test the running ornith llama-server (port 8083). Hits /health
|
||||
# for liveness, then a tiny OpenAI-compatible chat completion, then
|
||||
# measures eval_tps via /completion. MoE 35B-A3B (~3B active) → expect
|
||||
# ~80-100 tok/s, like the 30B-A3B workhorse (NOT a dense 27/31B).
|
||||
set -euo pipefail
|
||||
|
||||
HOST="${ORNITH_HOST:-127.0.0.1:8083}"
|
||||
MODEL="${ORNITH_MODEL:-ornith}"
|
||||
|
||||
echo "[smoke] GET /health on $HOST"
|
||||
curl -fsS "http://$HOST/health" | python3 -m json.tool
|
||||
|
||||
echo
|
||||
echo "[smoke] POST /v1/chat/completions ($MODEL) — tiny generation"
|
||||
# Ornith opens with a <think> block; ask for a terse final answer.
|
||||
curl -fsS "http://$HOST/v1/chat/completions" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d "{
|
||||
\"model\": \"$MODEL\",
|
||||
\"messages\": [{\"role\": \"user\", \"content\": \"Reply with exactly: ok\"}],
|
||||
\"max_tokens\": 256,
|
||||
\"temperature\": 0.0
|
||||
}" | python3 -m json.tool
|
||||
|
||||
echo
|
||||
echo "[smoke] perf measure — eval_tps and prompt_tps (n_predict=128)"
|
||||
# Perf probe: POST a fixed prompt to the server's /completion endpoint
# (non-streaming, n_predict=128) and print the throughput figures found in
# the "timings" object of the JSON response.
#
# The inline Python is passed in single quotes so the shell does not touch
# it. Missing or non-numeric timing values print as "?" instead of aborting
# the script: applying a ".2f" format spec to the "?" placeholder string
# raises ValueError, which under `set -euo pipefail` would fail the smoke
# test for the wrong reason.
curl -fsS "http://$HOST/completion" \
  -H 'Content-Type: application/json' \
  -d '{
    "prompt": "Write a Python function that computes the Fibonacci sequence iteratively. Include type hints and a brief docstring.",
    "n_predict": 128,
    "temperature": 0.0,
    "stream": false
  }' | python3 -c '
import json, sys

timings = json.load(sys.stdin).get("timings", {})


def rate(key):
    # Two-decimal rendering for numbers; "?" for anything else.
    value = timings.get(key)
    if isinstance(value, (int, float)):
        return f"{value:.2f}"
    return "?"


for key in ("predicted_per_second", "prompt_per_second"):
    print(f"{key}: {rate(key)} tok/s")
for key in ("predicted_n", "prompt_n"):
    count = timings.get(key, "?")
    print(f"{key}: {count}")
'
|
||||
|
||||
echo
|
||||
echo "[smoke] passed — expected band ~80-100 tok/s decode (35B-A3B MoE Q4)."
|
||||
echo " <30 tok/s = investigate arena (see qwen3-235b/README.md)."
|
||||
@@ -436,6 +436,7 @@ for svc in (
|
||||
"kimi-linear",
|
||||
"qwen3-235b",
|
||||
"qwable",
|
||||
"ornith",
|
||||
"litellm",
|
||||
"comfyui",
|
||||
"openwebui",
|
||||
@@ -726,6 +727,23 @@ for asset, mode in (
|
||||
_sudo=True,
|
||||
)
|
||||
|
||||
# Ornith operator assets. Same image as llama (kyuz0 rocm-7.2.2); MoE
|
||||
# 35B-A3B agentic-coding fine-tune of Qwen3.5. Weights live at
|
||||
# /models/qwen/Ornith-1.0-35B via manual `hf download` per the README.
|
||||
# swap-model `ornith` target.
|
||||
for asset, mode in (
|
||||
("smoke.sh", "0775"),
|
||||
("README.md", "0664"),
|
||||
):
|
||||
files.put(
|
||||
name=f"ornith: {asset}",
|
||||
src=f"compose/ornith/{asset}",
|
||||
dest=f"{COMPOSE_DIR}/ornith/{asset}",
|
||||
group="docker",
|
||||
mode=mode,
|
||||
_sudo=True,
|
||||
)
|
||||
|
||||
# LiteLLM router assets. config.yaml is the source-of-truth model
|
||||
# routing table — pyinfra syncs it on every run; edits on the box get
|
||||
# overwritten. The .env file holds LITELLM_MASTER_KEY + LITELLM_SALT_KEY
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
# swap-model 235b # Qwen3-235B-A22B via llama.cpp (long-task)
|
||||
# swap-model kimi # Kimi-Linear-48B-A3B via vLLM (long-context)
|
||||
# swap-model qwable # Qwable-3.6-27B via llama.cpp (Fable-style)
|
||||
# swap-model ornith # Ornith-1.0-35B via llama.cpp (agentic coding)
|
||||
# swap-model comfyui # ComfyUI (image generation)
|
||||
# swap-model none # everything down — free the GPU
|
||||
# swap-model status # show what's currently up
|
||||
@@ -45,6 +46,7 @@ declare -A SVC_DIR=(
|
||||
[kimi]=kimi-linear
|
||||
[235b]=qwen3-235b
|
||||
[qwable]=qwable
|
||||
[ornith]=ornith
|
||||
[comfyui]=comfyui
|
||||
)
|
||||
declare -A SVC_HEALTH=(
|
||||
@@ -53,6 +55,7 @@ declare -A SVC_HEALTH=(
|
||||
[kimi]="http://127.0.0.1:8000/v1/models"
|
||||
[235b]="http://127.0.0.1:8081/health"
|
||||
[qwable]="http://127.0.0.1:8082/health"
|
||||
[ornith]="http://127.0.0.1:8083/health"
|
||||
[comfyui]="http://127.0.0.1:8188/"
|
||||
)
|
||||
|
||||
@@ -68,8 +71,9 @@ plan() {
|
||||
235b) UP=(235b) ; DOWN=(ollama llama kimi qwable comfyui) ;;
|
||||
kimi) UP=(kimi) ; DOWN=(235b comfyui) ;;
|
||||
qwable) UP=(qwable) ; DOWN=(235b comfyui) ;;
|
||||
comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable) ;;
|
||||
none) UP=() ; DOWN=(ollama llama kimi 235b qwable comfyui) ;;
|
||||
ornith) UP=(ornith) ; DOWN=(235b comfyui) ;;
|
||||
comfyui) UP=(comfyui) ; DOWN=(235b kimi qwable ornith) ;;
|
||||
none) UP=() ; DOWN=(ollama llama kimi 235b qwable ornith comfyui) ;;
|
||||
*) return 1 ;;
|
||||
esac
|
||||
}
|
||||
@@ -127,7 +131,7 @@ up_svc() {
|
||||
|
||||
show_status() {
|
||||
echo "Inference services:"
|
||||
for svc in ollama llama kimi 235b qwable comfyui; do
|
||||
for svc in ollama llama kimi 235b qwable ornith comfyui; do
|
||||
local container="${SVC_DIR[$svc]}" state="down" health=""
|
||||
if is_running "$container"; then
|
||||
state="up"
|
||||
@@ -150,6 +154,7 @@ Usage:
|
||||
swap-model 235b # Qwen3-235B-A22B (llama.cpp, long-task, ~5-10 tok/s)
|
||||
swap-model kimi # Kimi-Linear-48B (vLLM, long-context chat)
|
||||
swap-model qwable # Qwable-3.6-27B (llama.cpp, Fable-style, ~10-15 tok/s)
|
||||
swap-model ornith # Ornith-1.0-35B (llama.cpp, agentic coding MoE, ~80-100 tok/s)
|
||||
swap-model comfyui # ComfyUI (image generation)
|
||||
swap-model none # everything down (free the GPU arena)
|
||||
swap-model status # show current state
|
||||
@@ -158,16 +163,16 @@ Behaviour: stops conflicting services (frees the 110 GB GPU arena),
|
||||
starts the target, polls its /health until it returns 200. Wait timeout
|
||||
defaults to ${WAIT_TIMEOUT}s; override with SWAP_WAIT_TIMEOUT.
|
||||
|
||||
Coexistence: ollama(30B), kimi, and qwable(27B, 16.5 GB) coexist with
|
||||
each other. 235B and comfyui coexist with nothing. See
|
||||
compose/qwen3-235b/README.md for arena math.
|
||||
Coexistence: ollama(30B), kimi, qwable(27B, 16.5 GB), and ornith(35B-A3B,
|
||||
21 GB) coexist with each other. 235B and comfyui coexist with nothing.
|
||||
See compose/qwen3-235b/README.md for arena math.
|
||||
EOF
|
||||
}
|
||||
|
||||
# --- Main --------------------------------------------------------------------
|
||||
TARGET="${1:-}"
|
||||
case "$TARGET" in
|
||||
coder|235b|kimi|qwable|comfyui|none) ;;
|
||||
coder|235b|kimi|qwable|ornith|comfyui|none) ;;
|
||||
status) show_status ; exit 0 ;;
|
||||
-h|--help|help|"") usage ; exit 0 ;;
|
||||
*)
|
||||
|
||||
Reference in New Issue
Block a user