# Qwable-3.6-27B (Qwen3.6-27B fine-tuned on Fable-5-style reasoning
# traces — "Qwen + Fable") via the kyuz0 rocm-7.2.2 Strix Halo toolbox.
# Same image + unified-memory recipe as compose/llama.yml; deltas are
# model path, port, alias.
#   https://github.com/kyuz0/amd-strix-halo-toolboxes
#   Model: https://huggingface.co/Mia-AiLab/Qwable-3.6-27b (MIT)
#
# What it's for. A "thinks-like-Fable-5" interactive model — structured,
# step-by-step explanatory output. Dense 27B (NOT MoE), so it's slower
# per token than the 30B-A3B MoE workhorses despite being smaller on
# disk: all 27B weights are read for every token. Bandwidth math
# (256 GB/s ÷ ~16.5 GB) → ~10-15 tok/s decode. Interactive but not snappy.
#
# Coexistence. At ~16.5 GB (Q4_K_M) it's the smallest GPU resident here
# and fits alongside llama 30B (port 8080), Ollama, or Kimi in the
# ~110 GB merged arena. It does NOT fit alongside qwen3-235b (88.8 GB)
# or comfyui — swap-model tears those down for the `qwable` target.
# `restart: "no"`: you bring it up deliberately via swap-model, it won't
# auto-start after a reboot and surprise-collide with a big model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/qwable/README.md):
#   hf download Mia-AiLab/Qwable-3.6-27b \
#     'Qwable-27b_Q4_K_M.gguf' \
#     --local-dir /models/qwen/Qwable-3.6-27b
# Verify exact filename in the HF repo before downloading.
#
# Port 8082 — distinct from llama 30B (8080) and qwen3-235b (8081).
services:
  # Single llama-server container serving the Qwable GGUF on port 8082.
  qwable:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: qwable
    # Manual start only — see header note about GPU contention with
    # the big models. swap-model brings it up/down. Quoted so YAML 1.1
    # parsers don't read the value as boolean false.
    restart: "no"
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    # NOTE(review): these GIDs are host-specific; confirm with
    # `getent group video render` when moving to another box.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container presumably shares the
    # host's /dev/shm, in which case shm_size has no effect — confirm
    # before tuning it.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
      # qwen3-235b). BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these
      # flags merge the rocminfo pools into one ~110 GB arena. kyuz0's
      # image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Read-only: the server only ever reads weights from /models.
      - /models:/models:ro
    ports:
      # host:container — must stay in sync with the --port value below.
      - "8082:8082"
    entrypoint: ["llama-server"]
    # Each flag and each value is its own list item: every item becomes
    # exactly one argv element passed to llama-server.
    command:
      - --model
      - /models/qwen/Qwable-3.6-27b/Qwable-27b_Q4_K_M.gguf
      # OpenAI-compatible served name (matches what opencode/curl request
      # as "model"). Provider-side name lives in opencode.json if/when
      # this gets wired as a provider.
      - --alias
      - qwable
      - --host
      - 0.0.0.0
      - --port
      - "8082"
      # Push all layers to GPU. "999" = all available. A 27B Q4 (~16.5 GB)
      # fits the merged arena with huge headroom.
      - --n-gpu-layers
      - "999"
      # 64K to match llama/qwen3-235b — keeps opencode auto-compaction
      # behaviour consistent across providers. Tons of arena headroom
      # here (model is small), so this can ramp far higher if a workflow
      # needs it; see compose/qwable/README.md.
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — forces full GPU load.
      - --no-mmap
      # Flash attention — required for q8_0 KV cache; modern llama-server
      # takes a value (on/off/auto), bare --flash-attn is deprecated.
      - --flash-attn
      - "on"
      # Quantize KV cache to 8-bit (q8_0) — roughly halves KV memory vs
      # f16 at minor/no quality loss. Matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template — Qwable inherits
      # Qwen3.6's chat format, which the Fable-trace fine-tune relies on.
      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
      - --metrics