# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
# RL fine-tune of Qwen3.5-35B-A3B) served via the kyuz0 rocm-7.2.2 Strix
# Halo toolbox. Same image + unified-memory recipe as compose/llama.yml
# and compose/qwable.yml; the only deltas are model path, port, and alias.
# Toolbox: https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model:   https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
#
# What it's for. A purpose-built *agentic coding* model — strong on
# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
# and opens each reply with a <think> reasoning block. Candidate
# daily-driver coder to A/B against Ollama's qwen3-coder:30b.
#
# Why it's a great Strix Halo fit. MoE with only ~3B active params per
# token (256 routed experts, 8 active + shared, 40 layers) — so on this
# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
# coding quality at interactive speed. Quant DOES move decode speed here
# (decode speed ∝ 1 / active bytes per token): Q4_K_M is the fast
# default; bump to Q6_K (~2x slower) only if quality disappoints.
#
# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
# qwen3-235b (88.8 GB — 88.8 + 21.2 GB already consumes the whole arena
# before any KV cache) or comfyui, so swap-model tears those down for the
# `ornith` target.
# `restart: "no"`: you bring it up deliberately via swap-model.
#
# Weights. Single-file GGUF (not sharded). Download command to run on the
# box (see compose/ornith/README.md):
#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
#     'ornith-1.0-35b-Q4_K_M.gguf' \
#     --local-dir /models/qwen/Ornith-1.0-35B
# Verify the exact filename in the HF repo before downloading.
#
# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081), and
# qwable (8082).
services:
  ornith:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: ornith
    # Never auto-started: this service contends for the GPU with the big
    # models (see header note), so swap-model is what brings it up and down.
    restart: "no"
    devices:
      # The rocm-* image tag needs both kfd (kernel fusion driver) and dri
      # (DRM). A Vulkan build would get by with dri alone — keep kfd here.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Host GIDs of the video (44) and render (991) groups, given numerically;
    # the container needs them to access /dev/kfd and /dev/dri.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host IPC
    # namespace, so `shm_size` may have no effect — confirm before tuning it.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe, shared with compose/llama.yml, kimi-linear,
      # qwen3-235b and qwable. Given BIOS UMA=0.5 GB plus the
      # ttm.pages_limit kernel cmdline, these flags merge the rocminfo pools
      # into a single ~110 GB arena. The kyuz0 image is built natively for
      # gfx1151, so HSA_OVERRIDE_GFX_VERSION is not needed.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      - /models:/models:ro
    ports:
      - "8083:8083"
    entrypoint:
      - llama-server
    # Arguments come in flag/value pairs; each pair is documented above its
    # flag.
    command:
      - --model
      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
      # Served model name on the OpenAI-compatible API — what opencode/curl
      # send as "model". The provider-side name lives in opencode.json.
      - --alias
      - ornith
      - --host
      - 0.0.0.0
      - --port
      - "8083"
      # Offload every layer to the GPU ("999" = all available). The 35B-A3B
      # Q4 weights (~21.2 GB) fit the merged arena with huge headroom.
      - --n-gpu-layers
      - "999"
      # 64K context, same as the other llama.cpp stacks, so opencode
      # auto-compaction behaves consistently across providers. Ornith's
      # native context is 262144; raise --ctx-size toward that if a
      # long-repo workflow needs it (see compose/ornith/README.md).
      - --ctx-size
      - "65536"
      # Strix Halo standard: no mmap, which forces a full GPU load.
      - --no-mmap
      # Flash attention is required for the q8_0 KV cache. Modern
      # llama-server takes a value (on/off/auto); bare --flash-attn is
      # deprecated. Keep "on" quoted so YAML does not read it as a boolean.
      - --flash-attn
      - "on"
      # int8 (q8_0) KV cache: halves KV memory at minor/no quality loss.
      # Matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the jinja chat template embedded in the model. Ornith inherits
      # Qwen3.5's chat format (think-block + tool-call grammar), which the
      # RL fine-tune relies on. Required for tool_calls to parse.
      - --jinja
      # Ornith's recommended sampling (temp 0.6 / top_p 0.95 / top_k 20),
      # set as server-side defaults; opencode can still override them
      # per-request.
      - --temp
      - "0.6"
      - --top-p
      - "0.95"
      - --top-k
      - "20"
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
      - --metrics