Snapshot of where opencode + Qwen3-Coder + MCPs + Kimi-Linear + voice + Phoenix tracing land today, plus in-flight (oc-tree, kimi-linear context ramp) and next (ComfyUI) items with pointers to per-project NEXT_STEPS.md guides.
name: framework
version: 0.0.1
schema: v1

# Continue.dev config for the framework-backed local LLM stack. Drop at
# ~/.continue/config.yaml (per-machine) or <repo>/.continue/config.yaml
# (repo-scoped). One-time prep on the box: `ollama pull nomic-embed-text`.
#
# Two chat models, manually switched via Continue's model dropdown.
# Qwen3-Coder is the daily driver; Kimi-Linear is the long-context play
# and only works once P0 verification passes on the box.

models:
  - name: Qwen3-Coder 30B (Ollama)
    provider: ollama
    model: qwen3-coder:30b # Ollama tag (the "A3B" is the MoE active count, descriptive only)
    apiBase: http://framework:11434 # NO /v1 — Ollama provider speaks /api/* directly
    roles: [chat, edit, apply]
    defaultCompletionOptions:
      contextLength: 65536 # matches OLLAMA_CONTEXT_LENGTH in ollama.yml

  - name: Kimi-Linear 48B-A3B (vLLM)
    provider: openai # vLLM exposes OpenAI-compatible /v1
    model: kimi-linear # served-model-name in compose/kimi-linear.yml
    apiBase: http://framework:8000/v1
    apiKey: dummy # vLLM doesn't enforce; Continue requires non-empty
    roles: [chat, edit, apply]
    defaultCompletionOptions:
      contextLength: 32768 # P0 narrow start; bump as --max-model-len ramps

  - name: nomic-embed-text
    provider: ollama
    model: nomic-embed-text
    apiBase: http://framework:11434
    roles: [embed]

context:
  - provider: code
  - provider: diff
  - provider: terminal
  - provider: problems
  - provider: folder
  - provider: codebase # uses the embed-role model above
  - provider: file
  - provider: open

# Tab autocomplete intentionally omitted — Qwen3-Coder-30B is too slow
# for inline FIM. Add a small FIM-trained model later (qwen2.5-coder:1.5b
# or starcoder2:3b) and wire as roles: [autocomplete].
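#
# A sketch of that later step (hypothetical and untested here; model tags
# taken from the suggestions above). The autocomplete model would be one
# more entry under `models:`, shaped like the chat entries:
#
#   - name: Qwen2.5-Coder 1.5B FIM (Ollama)
#     provider: ollama
#     model: qwen2.5-coder:1.5b # prep: `ollama pull qwen2.5-coder:1.5b`
#     apiBase: http://framework:11434
#     roles: [autocomplete]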