# localgenai/pyinfra/framework/compose/ornith.yml

# Ornith-1.0-35B (DeepReinforce's agentic-coding MoE — a self-improving
# RL fine-tune of Qwen3.5-35B-A3B) via the kyuz0 rocm-7.2.2 Strix Halo
# toolbox. Same image + unified-memory recipe as compose/llama.yml and
# compose/qwable.yml; deltas are model path, port, alias.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model:   https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B
# Weights: https://huggingface.co/deepreinforce-ai/Ornith-1.0-35B-GGUF (MIT)
#
# What it's for. A purpose-built *agentic coding* model — strong on
# Terminal-Bench 2.1 / SWE-Bench Verified, emits OpenAI-style tool_calls,
# opens with a <think> reasoning block. Candidate daily-driver coder to
# A/B against Ollama's qwen3-coder:30b.
#
# Why it's a great Strix Halo fit. MoE with only ~3B active params per
# token (256 routed experts, 8 active + shared, 40 layers) — so on this
# bandwidth-bound box (256 GB/s) it decodes like the 30B-A3B workhorse
# (~80-100 tok/s), NOT like a dense 27/31B (~10-15 tok/s). Frontier-ish
# coding quality at interactive speed. Quant DOES move decode speed here
# (speed ∝ active bytes/token): Q4_K_M is the fast default; bump to Q6_K
# only if quality disappoints (~2x slower).
#
# Coexistence. At ~21.2 GB (Q4_K_M) it fits the ~110 GB merged arena
# alongside llama 30B (8080), Ollama, or Kimi. It does NOT fit alongside
# qwen3-235b (88.8 GB) or comfyui — swap-model tears those down for the
# `ornith` target.
# `restart: "no"`: you bring it up deliberately via swap-model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/ornith/README.md):
#   hf download deepreinforce-ai/Ornith-1.0-35B-GGUF \
#       'ornith-1.0-35b-Q4_K_M.gguf' \
#       --local-dir /models/qwen/Ornith-1.0-35B
# Verify exact filename in the HF repo before downloading.
#
# Port 8083 — distinct from llama 30B (8080), qwen3-235b (8081),
# qwable (8082).
services:
  ornith:
    container_name: ornith
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    # Never auto-restarted: this stack is started and stopped on purpose
    # through swap-model (the file header explains why it cannot share
    # the GPU with the big models).
    restart: 'no'
    # Host model store, mounted read-only.
    volumes:
      - /models:/models:ro
    ports:
      - '8083:8083'
    # GPU device nodes. The ROCm build needs kfd (kernel fusion driver)
    # in addition to dri (DRM); a Vulkan build gets by with dri alone.
    # Keep kfd for as long as the image tag is rocm-*.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Host GIDs, given numerically: video (44) and render (991). The
    # container needs both to open /dev/kfd and /dev/dri.
    group_add:
      - '44'
      - '991'
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    ipc: host
    # NOTE(review): with `ipc: host` the container presumably shares the
    # host's /dev/shm, in which case shm_size has no effect — confirm
    # before relying on the 8g figure.
    shm_size: 8g
    environment:
      # Unified-memory recipe, identical to compose/llama.yml,
      # kimi-linear, qwen3-235b and qwable. Together with BIOS
      # UMA=0.5 GB and the ttm.pages_limit kernel cmdline, these two
      # flags merge the rocminfo pools into a single ~110 GB arena.
      # No HSA_OVERRIDE_GFX_VERSION: kyuz0's image is native gfx1151.
      HSA_XNACK: '1'
      HSA_FORCE_FINE_GRAIN_PCIE: '1'
    entrypoint:
      - llama-server
    command:
      - --model
      - /models/qwen/Ornith-1.0-35B/ornith-1.0-35b-Q4_K_M.gguf
      # Model name served over the OpenAI-compatible API, i.e. what
      # opencode/curl put in the "model" field. The provider-side name
      # is configured in opencode.json.
      - --alias
      - ornith
      - --host
      - 0.0.0.0
      - --port
      - '8083'
      # Offload every layer to the GPU ("999" means all available).
      # The 35B-A3B Q4 weights (~21.2 GB) leave huge headroom in the
      # merged arena.
      - --n-gpu-layers
      - '999'
      # 64K context, same as the other llama.cpp stacks, so opencode
      # auto-compaction behaves the same across providers. Ornith's
      # native context is 262144; raise --ctx-size toward that when a
      # long-repo workflow needs it (see compose/ornith/README.md).
      - --ctx-size
      - '65536'
      # Strix Halo standard: disable mmap to force a full GPU load.
      - --no-mmap
      # Flash attention is required for the q8_0 KV cache. Modern
      # llama-server expects an explicit value (on/off/auto); the bare
      # --flash-attn form is deprecated.
      - --flash-attn
      - 'on'
      # 8-bit (q8_0) KV cache for both K and V: halves KV memory at
      # minor/no quality loss, and matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Apply the jinja chat template embedded in the model. Ornith
      # inherits Qwen3.5's chat format (think-block + tool-call
      # grammar), which the RL fine-tune relies on; tool_calls do not
      # parse without it.
      - --jinja
      # Sampling recommended for Ornith: temp 0.6, top_p 0.95,
      # top_k 20. These are server-side defaults only; opencode can
      # still override them per request.
      - --temp
      - '0.6'
      - --top-p
      - '0.95'
      - --top-k
      - '20'
      # Serve Prometheus metrics at /metrics (scraped by OpenLIT).
      - --metrics