# localgenai/pyinfra/framework/compose/qwable.yml

# Qwable-3.6-27B (Qwen3.6-27B fine-tuned on Fable-5-style reasoning
# traces — "Qwen + Fable") via the kyuz0 rocm-7.2.2 Strix Halo toolbox.
# Same image + unified-memory recipe as compose/llama.yml; deltas are
# model path, port, alias.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model: https://huggingface.co/Mia-AiLab/Qwable-3.6-27b (MIT)
#
# What it's for. A "thinks-like-Fable-5" interactive model — structured,
# step-by-step explanatory output. Dense 27B (NOT MoE), so it's slower
# per token than the 30B-A3B MoE workhorses despite being smaller on
# disk: all 27B weights load per token. Bandwidth math: 256 GB/s ÷
# ~16.5 GB ≈ 15.5 tok/s theoretical ceiling, so expect ~10-15 tok/s
# decode. Interactive but not snappy.
#
# Coexistence. At ~16.5 GB (Q4_K_M) it's the smallest GPU resident here
# and fits alongside llama 30B (port 8080), Ollama, or Kimi in the
# ~110 GB merged arena. It does NOT fit alongside qwen3-235b (88.8 GB)
# or comfyui — swap-model tears those down for the `qwable` target.
# `restart: "no"`: you bring it up deliberately via swap-model; it will
# not auto-start after a reboot and surprise-collide with a big model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/qwable/README.md):
#   hf download Mia-AiLab/Qwable-3.6-27b \
#       'Qwable-27b_Q4_K_M.gguf' \
#       --local-dir /models/qwen/Qwable-3.6-27b
# Verify exact filename in the HF repo before downloading.
#
# Port 8082 — distinct from llama 30B (8080) and qwen3-235b (8081).
services:
  qwable:
    container_name: qwable
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    # Never restarted automatically. This service competes with the big
    # models for the GPU (see the file header), so it is only started and
    # stopped on purpose through swap-model.
    restart: 'no'

    # The rocm-* image tag needs /dev/kfd (kernel fusion driver) as well
    # as /dev/dri (DRM). A Vulkan build would get by with dri alone, so
    # keep kfd for as long as this stays on a ROCm tag.
    devices:
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # Host group IDs, given numerically: 44 = video, 991 = render.
    # Membership is what allows /dev/kfd and /dev/dri to be opened from
    # inside the container.
    group_add:
      - '44'
      - '991'
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined

    ipc: host
    # NOTE(review): with `ipc: host` the container presumably uses the
    # host's /dev/shm, in which case shm_size is ignored — confirm
    # before tuning this value.
    shm_size: 8g

    environment:
      # Unified-memory recipe, shared with compose/llama.yml, kimi-linear
      # and qwen3-235b. Given BIOS UMA=0.5 GB and the ttm.pages_limit
      # kernel cmdline, these two flags merge the rocminfo pools into a
      # single ~110 GB arena. HSA_OVERRIDE_GFX_VERSION is not needed
      # because kyuz0's image is native gfx1151.
      HSA_XNACK: '1'
      HSA_FORCE_FINE_GRAIN_PCIE: '1'

    volumes:
      - /models:/models:ro
    ports:
      - '8082:8082'

    entrypoint:
      - llama-server
    command:
      # Weights file, read from the read-only /models mount.
      - --model
      - /models/qwen/Qwable-3.6-27b/Qwable-27b_Q4_K_M.gguf
      # Name that clients (opencode, curl) send in the OpenAI-compatible
      # "model" field. If this is ever wired up as an opencode provider,
      # the provider-side name is set in opencode.json, not here.
      - --alias
      - qwable
      # Listen address and port; the port matches the published mapping.
      - --host
      - '0.0.0.0'
      - --port
      - '8082'
      # Offload every layer to the GPU ("999" = all available). At
      # ~16.5 GB the 27B Q4 leaves huge headroom in the merged arena.
      - --n-gpu-layers
      - '999'
      # 64K context, the same as llama and qwen3-235b, so opencode's
      # auto-compaction behaves alike across providers. The model is
      # small, so this can be raised a lot if a workflow calls for it;
      # see compose/qwable/README.md.
      - --ctx-size
      - '65536'
      # Strix Halo standard: no mmap, which forces a full GPU load.
      - --no-mmap
      # Flash attention is a prerequisite for the q8_0 KV cache below.
      # Modern llama-server expects an explicit on/off/auto value; the
      # bare --flash-attn form is deprecated.
      - --flash-attn
      - 'on'
      # 8-bit (q8_0) K and V caches: half the KV memory for minor or no
      # quality loss, consistent with the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Apply the jinja chat template embedded in the model. Qwable
      # keeps Qwen3.6's chat format, and the Fable-trace fine-tune
      # relies on it.
      - --jinja
      # Serve Prometheus metrics on /metrics for OpenLIT to scrape.
      - --metrics