# File: localgenai/pyinfra/framework/compose/ornith.yml
# (viewer metadata: 120 lines, 4.8 KiB, YAML)

# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
# toolbox. Same image + unified-memory recipe as compose/llama.yml and
# compose/qwable.yml; deltas are model path, port, alias.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
#
# What it's for. A purpose-built *agentic coding* model — strong on
# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
# opens with a <think> reasoning block. Candidate daily-driver coder to
# A/B against Ollama's qwen3-coder:30b.
#
# Why it's a great Strix Halo fit. MoE with only ~3B active params per
# token (256 routed experts, 8 active + shared, 40 layers) — so on this
# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
# coding quality at interactive speed. Quant DOES move decode speed here
# (speed ∝ 1 / active bytes per token, so a larger quant decodes slower):
# Q4_K_M is the fast default; bump to Q6_K only if quality disappoints
# (~2x slower).
#
# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
# `ornith` target.
# `restart: "no"` means Docker never restarts the container on its own;
# you bring it up deliberately via swap-model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/ornith/README.md):
#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
#     'ornith-1.0-35b-Q4_K_M.gguf' \
#     --local-dir /models/qwen/Ornith-1.0-35B
# Verify exact filename in the HF repo before downloading.
#
# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
# qwable (8082).
# Compose service definition: a single llama.cpp `llama-server` container
# serving the Ornith GGUF on port 8083 under the alias `ornith`.
services:
  ornith:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: ornith
    # Manual start only — see header note about GPU contention with
    # the big models. swap-model brings it up/down.
    restart: "no"
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    # Quoted so they stay strings rather than being typed as integers.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's
    # /dev/shm, so `shm_size` likely has no effect — confirm before
    # relying on the 8g figure. Both are kept as in the sibling stacks.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
      # qwen3-235b + qwable). BIOS UMA=0.5 GB + ttm.pages_limit cmdline →
      # these flags merge the rocminfo pools into one ~110 GB arena.
      # kyuz0's image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Weights are mounted read-only; the server never writes to /models.
      - /models:/models:ro
    ports:
      # Quoted host:container mapping (avoids YAML 1.1 implicit typing).
      - "8083:8083"
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
      # OpenAI-compatible served name (matches what opencode/curl request
      # as "model"). Provider-side name lives in opencode.json.
      - --alias
      - ornith
      - --host
      - 0.0.0.0
      # Must match the container side of the `ports` mapping above.
      - --port
      - "8083"
      # Push all layers to GPU. "999" = all available. A 35B-A3B Q4
      # (~21.2 GB) fits the merged arena with huge headroom.
      - --n-gpu-layers
      - "999"
      # 64K to match the other llama.cpp stacks — keeps opencode
      # auto-compaction behaviour consistent across providers. Ornith's
      # native context is 262144; ramp --ctx-size toward that if a
      # long-repo workflow needs it (see compose/ornith/README.md).
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — forces full GPU load.
      - --no-mmap
      # Flash attention — required for q8_0 KV cache; modern llama-server
      # takes a value (on/off/auto), bare --flash-attn is deprecated.
      # "on" is quoted so YAML 1.1 parsers don't read it as boolean true.
      - --flash-attn
      - "on"
      # Quantize KV cache to int8 — halves KV memory at minor/no quality
      # loss. Matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template — Ornith inherits
      # Qwen3.5's chat format (think-block + tool-call grammar) that the
      # RL fine-tune relies on. Required for tool_calls to parse.
      - --jinja
      # Recommended sampling for Ornith (temp 0.6 / top_p 0.95 /
      # top_k 20). Server-side defaults; opencode can still override
      # per-request.
      - --temp
      - "0.6"
      - --top-p
      - "0.95"
      - --top-k
      - "20"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
      - --metrics