added qwable and ornith
This commit is contained in:
119
pyinfra/framework/compose/ornith.yml
Normal file
119
pyinfra/framework/compose/ornith.yml
Normal file
@@ -0,0 +1,119 @@
# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
# toolbox. Same image + unified-memory recipe as compose/llama.yml and
# compose/qwable.yml; deltas are model path, port, alias.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
#
# What it's for. A purpose-built *agentic coding* model — strong on
# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
# opens with a <think> reasoning block. Candidate daily-driver coder to
# A/B against Ollama's qwen3-coder:30b.
#
# Why it's a great Strix Halo fit. MoE with only ~3B active params per
# token (256 routed experts, 8 active + shared, 40 layers) — so on this
# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
# coding quality at interactive speed. Quant DOES move decode speed here
# (speed ∝ active bytes/token): Q4_K_M is the fast default; bump to Q6_K
# only if quality disappoints (~2x slower).
#
# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
# `ornith` target.
# `restart: "no"`: you bring it up deliberately via swap-model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/ornith/README.md):
#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
#     'ornith-1.0-35b-Q4_K_M.gguf' \
#     --local-dir /models/qwen/Ornith-1.0-35B
# Verify exact filename in the HF repo before downloading.
#
# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
# qwable (8082).
services:
  # llama-server serving the Ornith-1.0-35B Q4_K_M GGUF as an
  # OpenAI-compatible endpoint on port 8083 (alias "ornith").
  ornith:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: ornith
    # Manual start only — see header note about GPU contention with
    # the big models. swap-model brings it up/down.
    restart: "no"
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # NOTE(review): ptrace capability + unconfined seccomp are presumably
    # needed by the ROCm stack in this image — confirm before tightening.
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host IPC
    # namespace, so shm_size likely has no effect here — TODO confirm;
    # both kept as in the sibling compose files.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
      # qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline →
      # these flags merge the rocminfo pools into one ~110 GB arena.
      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Host model store, mounted read-only.
      - /models:/models:ro
    ports:
      # Quoted host:container mapping; must match --port below.
      - "8083:8083"
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
      # OpenAI-compatible served name (matches what opencode/curl request
      # as "model"). Provider-side name lives in opencode.json.
      - --alias
      - ornith
      # Listen on all interfaces inside the container so the published
      # port is reachable.
      - --host
      - 0.0.0.0
      - --port
      - "8083"
      # Push all layers to GPU. "999" = all available. A 35B-A3B Q4
      # (~21.2 GB) fits the merged arena with huge headroom.
      - --n-gpu-layers
      - "999"
      # 64K to match the other llama.cpp stacks — keeps opencode
      # auto-compaction behaviour consistent across providers. Ornith's
      # native context is 262144; ramp --ctx-size toward that if a
      # long-repo workflow needs it (see compose/ornith/README.md).
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — forces full GPU load.
      - --no-mmap
      # Flash attention — required for q8_0 KV cache; modern llama-server
      # takes a value (on/off/auto), bare --flash-attn is deprecated.
      - --flash-attn
      - "on"
      # Quantize KV cache to int8 — halves KV memory at minor/no quality
      # loss. Matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template — Ornith inherits
      # Qwen3.5's chat format (think-block + tool-call grammar) that the
      # RL fine-tune relies on. Required for tool_calls to parse.
      - --jinja
      # Recommended sampling for Ornith (temp 0.6 / top_p 0.95 /
      # top_k 20). Server-side defaults; opencode can still override
      # per-request.
      - --temp
      - "0.6"
      - --top-p
      - "0.95"
      - --top-k
      - "20"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
      - --metrics
|
||||
Reference in New Issue
Block a user