# Qwable-3.6-27B (Qwen3.6-27B fine-tuned on Fable-5-style reasoning
# traces — "Qwen + Fable") via the kyuz0 rocm-7.2.2 Strix Halo toolbox.
# Same image + unified-memory recipe as compose/llama.yml; the deltas
# are model path, port, and alias.
# https://github.com/kyuz0/amd-strix-halo-toolboxes
# Model: https://huggingface.co/Mia-AiLab/Qwable-3.6-27b (MIT)
#
# What it's for. A "thinks-like-Fable-5" interactive model — structured,
# step-by-step explanatory output. Dense 27B (NOT MoE), so it's slower
# per token than the 30B-A3B MoE workhorses despite being smaller on
# disk: all 27B weights are read for every token. Bandwidth math
# (256 GB/s ÷ ~16.5 GB) → ~10-15 tok/s decode. Interactive but not
# snappy.
#
# Coexistence. At ~16.5 GB (Q4_K_M) it's the smallest GPU resident here
# and fits alongside llama 30B (port 8080), Ollama, or Kimi in the
# ~110 GB merged arena. It does NOT fit alongside qwen3-235b (88.8 GB)
# or comfyui — swap-model tears those down for the `qwable` target.
# `restart: "no"`: you bring it up deliberately via swap-model, so it
# won't auto-start after a reboot and surprise-collide with a big model.
#
# Weights. Single-file GGUF (not sharded). Download path on the box
# (see compose/qwable/README.md):
#   hf download Mia-AiLab/Qwable-3.6-27b \
#     'Qwable-27b_Q4_K_M.gguf' \
#     --local-dir /models/qwen/Qwable-3.6-27b
# Verify the exact filename in the HF repo before downloading.
#
# Port 8082 — distinct from llama 30B (8080) and qwen3-235b (8081).

services:
  qwable:
    image: kyuz0/amd-strix-halo-toolboxes:rocm-7.2.2
    container_name: qwable
    # Manual start only — see the header note about GPU contention with
    # the big models. swap-model brings it up/down. Quoted so YAML 1.1
    # parsers don't read the value as boolean false.
    restart: "no"
    devices:
      # ROCm needs both kfd (kernel fusion driver) and dri (DRM); Vulkan
      # only needs dri. Don't drop kfd when on the rocm-* tag.
      - /dev/kfd:/dev/kfd
      - /dev/dri:/dev/dri
    # NOTE(review): SYS_PTRACE + seccomp=unconfined are presumably
    # carried over from the shared recipe in compose/llama.yml — confirm
    # they are still required before tightening either one.
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    # Numeric GIDs of host's video (44) and render (991) groups —
    # required for /dev/kfd + /dev/dri access from inside the container.
    group_add:
      - "44"
      - "991"
    # NOTE(review): with `ipc: host` the container shares the host's IPC
    # namespace (including /dev/shm), so shm_size likely has no effect
    # here — confirm before relying on the 8g figure.
    shm_size: 8g
    ipc: host
    environment:
      # Unified-memory recipe (same as compose/llama.yml + kimi-linear +
      # qwen3-235b). BIOS UMA=0.5 GB + ttm.pages_limit cmdline → these
      # flags merge the rocminfo pools into one ~110 GB arena. kyuz0's
      # image is native gfx1151 so no HSA_OVERRIDE_GFX_VERSION.
      - HSA_XNACK=1
      - HSA_FORCE_FINE_GRAIN_PCIE=1
    volumes:
      # Weights are mounted read-only; the server never writes to them.
      - /models:/models:ro
    ports:
      # Published on every host interface. Prefix with "127.0.0.1:" if
      # the box is reachable from networks you don't trust.
      - "8082:8082"
    entrypoint: ["llama-server"]
    command:
      - --model
      - /models/qwen/Qwable-3.6-27b/Qwable-27b_Q4_K_M.gguf
      # OpenAI-compatible served name (matches what opencode/curl request
      # as "model"). Provider-side name lives in opencode.json if/when
      # this gets wired as a provider.
      - --alias
      - qwable
      # Listen on all interfaces inside the container so the published
      # port can reach the server.
      - --host
      - 0.0.0.0
      - --port
      - "8082"
      # Push all layers to GPU. "999" = all available. A 27B Q4 (~16.5 GB)
      # fits the merged arena with huge headroom.
      - --n-gpu-layers
      - "999"
      # 64K to match llama/qwen3-235b — keeps opencode auto-compaction
      # behaviour consistent across providers. Tons of arena headroom
      # here (model is small), so this can ramp far higher if a workflow
      # needs it; see compose/qwable/README.md.
      - --ctx-size
      - "65536"
      # No-mmap is the Strix Halo standard — forces full GPU load.
      - --no-mmap
      # Flash attention — required for q8_0 KV cache; modern llama-server
      # takes a value (on/off/auto), bare --flash-attn is deprecated.
      # Quoted so "on" stays a string rather than a YAML 1.1 boolean.
      - --flash-attn
      - "on"
      # Quantize KV cache to 8-bit (q8_0) — halves KV memory at minor/no
      # quality loss. Matches the other llama.cpp stacks.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0
      # Use the model's embedded jinja chat template — Qwable inherits
      # Qwen3.6's chat format, which the Fable-trace fine-tune relies on.
      - --jinja
      # Expose Prometheus metrics at /metrics — scraped by OpenLIT.
      - --metrics