fix(#49): allow-anonymous mode must ignore unrecognized keys, not 401

Regression from #49: the auth middleware rejected ANY present-but-unresolvable bearer token with 401 invalid_api_key, even when require_auth=false. But OpenAI-compatible clients (opencode, Open WebUI, Agent Zero, litellm) send a placeholder bearer by default — so enabling the build broke every existing client even though the operator never opted into auth. Pre-#49 the bearer was never inspected at all. Fix: in allow-anonymous mode (require_auth=false, the default) an unrecognized key is now ignored and the request is served anonymously, restoring pre-#49 behaviour. A bad key only 401s when require_auth=true. A valid key is still resolved + metered in both modes. Test renamed/split: unrecognized_key_is_ignored_when_auth_not_required (now 200, served anonymously) + invalid_key_is_401_when_auth_required. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Merge #56 (phase 3): fail-fast prompt pre-validation + advisory hints
2026-06-17 21:40:34 +03:00 · 2026-06-17 20:57:55 +03:00 · 2026-06-17 20:51:26 +03:00 · 2026-06-17 20:50:38 +03:00 · 2026-06-17 20:45:50 +03:00 · 2026-06-17 20:40:25 +03:00
214 changed files with 62627 additions and 852 deletions
--- a/.gitea/workflows/build-prerelease.yml
+++ b/.gitea/workflows/build-prerelease.yml
@@ -0,0 +1,618 @@
+name: build-prerelease
+
+# Builds CUDA-flavoured neuron binaries (plus one cortex and one
+# helexa-bench binary), packages each as a Fedora RPM, signs them,
+# and publishes to the `unstable` channel at rpm.lair.cafe.
+#
+# Change-aware: the `prepare` job diffs HEAD against the git sha
+# embedded in the most recently *published* unstable RPM (per package)
+# and skips builds whose inputs didn't change. Docs-only commits build
+# nothing; gateway-only commits skip the 3 CUDA builds (and, via
+# deploy.yml's own check-update gate, the neuron restarts + model
+# cold-loads). Diffing against the published sha — not the previous
+# push — means a failed run can never cause a change to be missed.
+#
+# Lint (fmt+clippy) and test run here as parallel jobs and gate
+# `publish`; ci.yml no longer runs on pushes to main (see its trigger
+# comment), so the two workflows stop competing for the same runners.
+#
+# The published packages are versioned as e.g.
+#   helexa-neuron-blackwell-0.1.16-0.1.20260518140530.gitabcdef0.fc43.x86_64
+#                                      ^^^^^^^^^^^^^^    ^^^^^^^
+#                                      commit time (s)   commit sha
+# so they sort BELOW the eventual 0.1.16-1 stable release, and so two
+# commits on the same day are still strictly ordered by their commit
+# timestamps (rather than by RPM-vercmp's alpha-vs-digit precedence
+# on the SHA fragment).
+
+on:
+  # Auto-build on every push to main so the unstable channel tracks
+  # head without a manual dispatch step.
+  push:
+    branches: [main]
+  # Manual dispatch still available to build from a non-main ref.
+  # Dispatched runs skip change detection and build everything.
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: "Git ref to build (branch / tag / commit). Defaults to the workflow's branch."
+        required: false
+        default: ""
+
+# Coalesce same-ref pushes: a newer push cancels the older in-flight
+# run — the newest commit is the one we want on the fleet. The publish
+# job keeps its own `rpm-publish` group (cancel=false) so an in-flight
+# repo update is never interrupted. Runners are ephemeral (one VM per
+# job) so concurrent runs no longer race on a shared workspace; the
+# old shared `cortex-runner-pool` group with ci.yml is gone.
+concurrency:
+  group: build-prerelease-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  CARGO_INCREMENTAL: "0"
+  CARGO_TERM_COLOR: "always"
+
+jobs:
+  prepare:
+    name: Resolve version stamps + change detection
+    timeout-minutes: 10
+    runs-on: rust
+    outputs:
+      version: ${{ steps.info.outputs.version }}
+      release: ${{ steps.info.outputs.release }}
+      short_sha: ${{ steps.info.outputs.short_sha }}
+      commit_timestamp: ${{ steps.info.outputs.commit_timestamp }}
+      build_cortex: ${{ steps.changes.outputs.build_cortex }}
+      build_neuron: ${{ steps.changes.outputs.build_neuron }}
+      build_bench: ${{ steps.changes.outputs.build_bench }}
+      check_rust: ${{ steps.changes.outputs.check_rust }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+          fetch-depth: 0
+
+      - id: info
+        run: |
+          set -eux
+          VERSION=$(awk -F\" '/^version[[:space:]]*=/ { print $2; exit }' Cargo.toml)
+          SHORT_SHA=$(git rev-parse --short=7 HEAD)
+          # Second-precise commit timestamp gives the release stamp a
+          # strictly monotonic numeric prefix. The earlier %Y%m%d-only
+          # form let same-day builds be ordered by RPM's rpmvercmp
+          # rules over the SHA, which is non-chronological — e.g.
+          # "git602e8e1" sorts newer than "gitf9f5fa4" purely because
+          # rpmvercmp ranks digit-prefixed segments above alpha ones.
+          # The SHA stays only as a debug identifier; sort order is
+          # decided entirely by the timestamp.
+          COMMIT_TIMESTAMP=$(git log -1 --format=%cd --date=format:%Y%m%d%H%M%S HEAD)
+          RELEASE="0.1.${COMMIT_TIMESTAMP}.git${SHORT_SHA}"
+          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+          echo "release=${RELEASE}" >> "$GITHUB_OUTPUT"
+          echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
+          echo "commit_timestamp=${COMMIT_TIMESTAMP}" >> "$GITHUB_OUTPUT"
+
+      - id: changes
+        run: |
+          set -ux
+          # Default: build everything. Detection only ever narrows
+          # this, and any failure along the way (manifest unreachable,
+          # unparsable, sha not in history after a force-push) leaves
+          # the full build in place. Manual dispatches always build
+          # everything — predictable when building odd refs.
+          BUILD_CORTEX=true
+          BUILD_NEURON=true
+          BUILD_BENCH=true
+          CHECK_RUST=true
+
+          if [ "${GITHUB_EVENT_NAME}" = "push" ]; then
+            MANIFEST_URL="https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json"
+            if curl -fsS --max-time 20 -o /tmp/packages.json "$MANIFEST_URL"; then
+              # Latest published sha per package, by buildTime.
+              base_for() {
+                python3 - "$1" <<'PY'
+          import json, re, sys
+          name = sys.argv[1]
+          try:
+              with open("/tmp/packages.json") as f:
+                  pkgs = json.load(f)["packages"]
+              cands = [p for p in pkgs if p.get("name") == name]
+              if cands:
+                  latest = max(cands, key=lambda p: p.get("buildTime", 0))
+                  m = re.search(r"git\.?([0-9a-f]{7,40})", latest.get("release", ""))
+                  if m:
+                      print(m.group(1))
+          except Exception:
+              pass
+          PY
+              }
+
+              # true if no usable base, else true iff the diff since
+              # the published sha touches the given path pattern.
+              decide() {
+                local base="$1" pattern="$2"
+                if [ -z "$base" ] \
+                   || ! git cat-file -e "${base}^{commit}" 2>/dev/null \
+                   || ! git merge-base --is-ancestor "$base" HEAD 2>/dev/null; then
+                  echo true; return
+                fi
+                if git diff --name-only "${base}..HEAD" | grep -qE "$pattern"; then
+                  echo true
+                else
+                  echo false
+                fi
+              }
+
+              # cortex-core is shared by both binaries; Cargo.{toml,lock}
+              # affect both; this workflow file affects both.
+              NEURON_RE='^crates/neuron/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-neuron-prerelease\.spec$|^data/neuron|^neuron\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              CORTEX_RE='^crates/cortex-gateway/|^crates/cortex-cli/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/cortex-prerelease\.spec$|^data/cortex|^cortex\.example\.toml$|^models\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              BENCH_RE='^crates/helexa-bench/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-bench-prerelease\.spec$|^data/helexa-bench|^helexa-bench\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              # Any Rust change (incl. crates not packaged here, e.g.
+              # helexa-acp) still needs lint+test on main.
+              RUST_RE='\.rs$|^crates/|Cargo\.toml$|^Cargo\.lock$'
+
+              CORTEX_BASE=$(base_for cortex)
+              NEURON_BASE=$(base_for helexa-neuron-blackwell)
+              BENCH_BASE=$(base_for helexa-bench)
+              BUILD_CORTEX=$(decide "$CORTEX_BASE" "$CORTEX_RE")
+              BUILD_NEURON=$(decide "$NEURON_BASE" "$NEURON_RE")
+              BUILD_BENCH=$(decide "$BENCH_BASE" "$BENCH_RE")
+              if [ "$BUILD_CORTEX" = "true" ] || [ "$BUILD_NEURON" = "true" ] || [ "$BUILD_BENCH" = "true" ]; then
+                CHECK_RUST=true
+              else
+                CHECK_RUST=$(decide "$CORTEX_BASE" "$RUST_RE")
+              fi
+            fi
+          fi
+
+          echo "build_cortex=${BUILD_CORTEX}" >> "$GITHUB_OUTPUT"
+          echo "build_neuron=${BUILD_NEURON}" >> "$GITHUB_OUTPUT"
+          echo "build_bench=${BUILD_BENCH}" >> "$GITHUB_OUTPUT"
+          echo "check_rust=${CHECK_RUST}" >> "$GITHUB_OUTPUT"
+          echo "### change detection: build_cortex=${BUILD_CORTEX} build_neuron=${BUILD_NEURON} build_bench=${BUILD_BENCH} check_rust=${CHECK_RUST}"
+
+  # fmt + clippy + test moved here from ci.yml for main pushes so the
+  # two workflows stop queueing against each other (ci.yml's checks
+  # used to delay build-cortex by ~12 minutes on the shared runner
+  # pool). They run in parallel with the builds and gate `publish`,
+  # not the builds themselves — a clippy warning still can't reach the
+  # fleet, but it also doesn't serialize the pipeline.
+  lint:
+    name: Lint (fmt + clippy)
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.check_rust == 'true'
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+      - run: cargo fmt --check --all
+      # Failure-aware sccache escalation lives in the shared script: a
+      # signal death (rustc SIGSEGV / OOM-kill) keeps the cache and fails
+      # fast instead of triggering a slower uncached rebuild; only a real
+      # sccache fault drops the cache. See script/ci-cargo-escalate.sh.
+      - name: Clippy (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings
+
+  test:
+    name: Test
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.check_rust == 'true'
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Test (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo test --workspace
+
+  build-cortex:
+    name: Build cortex binary
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.build_cortex == 'true'
+    # runner-rust image already provides rust/cargo/clippy/rustfmt via
+    # dnf — no rustup install step needed.
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Build cortex (release, sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo build --release -p cortex-cli
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/cortex artifacts/cortex
+          ./artifacts/cortex --version || true
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: cortex-fc43
+          path: artifacts/cortex
+          retention-days: 1
+
+  build-bench:
+    name: Build helexa-bench binary
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.build_bench == 'true'
+    # Pure-Rust, non-CUDA binary — same runner as cortex.
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Build helexa-bench (release, sccache escalation)
+        run: |
+          # Stamp the SHA helexa-bench records as bench_sha against every
+          # run (option_env! in sweep.rs reads it at compile time).
+          export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
+          script/ci-cargo-escalate.sh cargo build --release -p helexa-bench
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/helexa-bench artifacts/helexa-bench
+          ./artifacts/helexa-bench --version || true
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: bench-fc43
+          path: artifacts/helexa-bench
+          retention-days: 1
+
+  build-neuron:
+    name: Build neuron-${{ matrix.flavour }}
+    timeout-minutes: 35
+    needs: prepare
+    if: needs.prepare.outputs.build_neuron == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - flavour: ampere
+            compute_cap: "86"
+            runner: cuda-13.0
+            cuda_home: /usr/local/cuda-13.0
+            build_jobs: 8
+            nvcc_threads: 4
+            cargo_features: "cuda cudnn"
+          - flavour: ada
+            compute_cap: "89"
+            runner: cuda-13.0
+            cuda_home: /usr/local/cuda-13.0
+            build_jobs: 8
+            nvcc_threads: 4
+            cargo_features: "cuda cudnn"
+          - flavour: blackwell
+            compute_cap: "120"
+            runner: cuda-13.0
+            cuda_home: /usr/local/cuda-13.0
+            build_jobs: 8
+            nvcc_threads: 4
+            cargo_features: "cuda cudnn"
+    runs-on: ${{ matrix.runner }}
+    env:
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      # sccache handling + failure classification lives in
+      # script/ci-cargo-escalate.sh: it probes for sccache (the CUDA
+      # image may not ship it — a missing binary degrades to an uncached
+      # build rather than failing at `sccache rustc -vV`), and a rustc
+      # SIGSEGV / OOM-kill keeps the cache and fails fast instead of
+      # escalating to a slower uncached rebuild. The cache covers the
+      # ~600-crate host-side dep tree (the bulk of the 10-14 min build),
+      # shared across all three flavours, so even one run seeds the next.
+      - name: Build neuron with CUDA (${{ matrix.flavour }})
+        run: |
+          export PATH="${{ matrix.cuda_home }}/bin:${PATH}"
+          export LD_LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LD_LIBRARY_PATH:-}"
+          export LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LIBRARY_PATH:-}"
+          # Pin the build SHA neuron reports from GET /version. The git
+          # fallback in build.rs would also work on a full checkout, but
+          # injecting the exact checked-out commit is unambiguous under
+          # shallow/detached states and makes the artifact self-describing.
+          export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
+          script/ci-cargo-escalate.sh cargo build --release -p neuron --features "${{ matrix.cargo_features }}"
+        env:
+          CUDA_COMPUTE_CAP: ${{ matrix.compute_cap }}
+          CARGO_BUILD_JOBS: ${{ matrix.build_jobs }}
+          NVCC_THREADS: ${{ matrix.nvcc_threads }}
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/neuron artifacts/neuron-${{ matrix.flavour }}
+          file "artifacts/neuron-${{ matrix.flavour }}"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: neuron-${{ matrix.flavour }}-fc43
+          path: artifacts/neuron-${{ matrix.flavour }}
+          retention-days: 1
+
+  package-cortex:
+    name: Package cortex RPM
+    timeout-minutes: 20
+    needs: [prepare, build-cortex]
+    runs-on: rpm
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: cortex-fc43
+          path: artifacts/
+
+      - name: Build RPM
+        run: |
+          set -eux
+          rm -f ~/.rpmmacros
+          rpmdev-setuptree
+          cp artifacts/cortex ~/rpmbuild/SOURCES/
+          cp data/cortex.service ~/rpmbuild/SOURCES/
+          cp data/cortex-sysusers.conf ~/rpmbuild/SOURCES/
+          cp data/cortex-firewalld.xml ~/rpmbuild/SOURCES/
+          cp cortex.example.toml ~/rpmbuild/SOURCES/
+          cp models.example.toml ~/rpmbuild/SOURCES/
+          cp LICENSE ~/rpmbuild/SOURCES/
+          rpmbuild -bb rpm/cortex-prerelease.spec \
+            --define "cortex_version ${{ needs.prepare.outputs.version }}" \
+            --define "cortex_prerelease ${{ needs.prepare.outputs.release }}" \
+            --undefine dist \
+            --define "dist .fc43"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: rpm-cortex-fc43
+          path: ~/rpmbuild/RPMS/x86_64/*.rpm
+          retention-days: 7
+
+  package-bench:
+    name: Package helexa-bench RPM
+    timeout-minutes: 20
+    needs: [prepare, build-bench]
+    runs-on: rpm
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: bench-fc43
+          path: artifacts/
+
+      - name: Build RPM
+        run: |
+          set -eux
+          rm -f ~/.rpmmacros
+          rpmdev-setuptree
+          cp artifacts/helexa-bench ~/rpmbuild/SOURCES/
+          cp data/helexa-bench.service ~/rpmbuild/SOURCES/
+          cp data/helexa-bench-sysusers.conf ~/rpmbuild/SOURCES/
+          cp data/helexa-bench-firewalld.xml ~/rpmbuild/SOURCES/
+          cp helexa-bench.example.toml ~/rpmbuild/SOURCES/
+          cp LICENSE ~/rpmbuild/SOURCES/
+          rpmbuild -bb rpm/helexa-bench-prerelease.spec \
+            --define "bench_version ${{ needs.prepare.outputs.version }}" \
+            --define "bench_prerelease ${{ needs.prepare.outputs.release }}" \
+            --undefine dist \
+            --define "dist .fc43"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: rpm-bench-fc43
+          path: ~/rpmbuild/RPMS/x86_64/*.rpm
+          retention-days: 7
+
+  package-neuron:
+    name: Package helexa-neuron-${{ matrix.flavour }} RPM
+    timeout-minutes: 20
+    needs: [prepare, build-neuron]
+    runs-on: rpm
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - flavour: ampere
+          - flavour: ada
+          - flavour: blackwell
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: neuron-${{ matrix.flavour }}-fc43
+          path: artifacts/
+
+      - name: Build RPM
+        run: |
+          set -eux
+          rm -f ~/.rpmmacros
+          rpmdev-setuptree
+          cp artifacts/neuron-${{ matrix.flavour }} ~/rpmbuild/SOURCES/
+          cp data/neuron.service ~/rpmbuild/SOURCES/
+          cp data/neuron-sysusers.conf ~/rpmbuild/SOURCES/
+          cp data/neuron-firewalld.xml ~/rpmbuild/SOURCES/
+          cp neuron.example.toml ~/rpmbuild/SOURCES/
+          cp LICENSE ~/rpmbuild/SOURCES/
+          rpmbuild -bb rpm/helexa-neuron-prerelease.spec \
+            --define "neuron_version ${{ needs.prepare.outputs.version }}" \
+            --define "neuron_flavour ${{ matrix.flavour }}" \
+            --define "neuron_prerelease ${{ needs.prepare.outputs.release }}" \
+            --undefine dist \
+            --define "dist .fc43"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: rpm-neuron-${{ matrix.flavour }}-fc43
+          path: ~/rpmbuild/RPMS/x86_64/*.rpm
+          retention-days: 7
+
+  publish:
+    name: Publish to rpm.lair.cafe (unstable)
+    timeout-minutes: 25
+    needs: [lint, test, package-cortex, package-neuron, package-bench]
+    # Runs when at least one package was built and nothing failed.
+    # lint/test may be skipped (docs-only refs never get here because
+    # no packages build), but a real failure in any blocks the
+    # fleet from receiving the RPMs.
+    if: >-
+      ${{
+        !cancelled()
+        && (needs.lint.result == 'success' || needs.lint.result == 'skipped')
+        && (needs.test.result == 'success' || needs.test.result == 'skipped')
+        && (needs.package-cortex.result == 'success' || needs.package-neuron.result == 'success' || needs.package-bench.result == 'success')
+        && needs.package-cortex.result != 'failure'
+        && needs.package-neuron.result != 'failure'
+        && needs.package-bench.result != 'failure'
+      }}
+    runs-on: rpm
+    concurrency:
+      group: rpm-publish
+      cancel-in-progress: false
+    env:
+      RPM_REPO_HOST: oolon.kosherinata.internal
+      FEDORA_VERSION: "43"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Download all built RPMs
+        uses: actions/download-artifact@v3
+        with:
+          path: rpms/
+          pattern: rpm-*-fc43
+
+      - name: Flatten RPM artifacts
+        run: |
+          set -eux
+          find rpms/ -name '*.rpm' -exec mv --target-directory=rpms/ {} +
+          find rpms/ -mindepth 1 -type d -empty -delete
+          ls -la rpms/
+
+      - name: Check for sequoia-sq
+        run: |
+          if ! command -v sq &> /dev/null; then
+            echo "ERROR: sequoia-sq is not installed. Install with: sudo dnf install sequoia-sq"
+            exit 1
+          fi
+
+      - name: Import signing key
+        env:
+          # Pass secrets via env so values stay out of the rendered shell
+          # script (which Gitea includes in step logs). Template
+          # expansion of ${{ secrets.X }} inside `run:` writes the literal
+          # value into the script and depends on Gitea's log masker to
+          # scrub it — fragile for multi-line keys.
+          RPM_SIGNING_KEY: ${{ secrets.RPM_SIGNING_KEY }}
+          RPM_SIGNING_KEY_ID: ${{ secrets.RPM_SIGNING_KEY_ID }}
+        run: |
+          echo "$RPM_SIGNING_KEY" | gpg --batch --import
+          fpr=$(gpg --batch --with-colons --list-keys "$RPM_SIGNING_KEY_ID" | awk -F: '/^fpr:/ { print $10; exit }')
+          echo "${fpr}:6:" | gpg --batch --import-ownertrust
+          sed "s/@GPG_NAME@/$RPM_SIGNING_KEY_ID/" rpm/rpmmacros > ~/.rpmmacros
+
+      - name: Sign RPMs
+        run: |
+          set -eux
+          for rpm in rpms/*.rpm; do
+            echo "signing ${rpm}..."
+            rpm --addsign "${rpm}"
+          done
+
+      - name: Set up SSH for rsync
+        run: |
+          install --directory --mode 700 ~/.ssh
+          echo "${RSYNC_SSH_KEY}" | install --mode 600 /dev/stdin ~/.ssh/id_ed25519
+        env:
+          RSYNC_SSH_KEY: ${{ secrets.RSYNC_SSH_KEY }}
+
+      - name: Test SSH connectivity
+        run: |
+          ssh -o StrictHostKeyChecking=accept-new "gitea_ci@${RPM_REPO_HOST}" exit
+
+      - name: Ensure unstable repo directory exists
+        run: |
+          ssh "gitea_ci@${RPM_REPO_HOST}" \
+            "mkdir --parents /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable"
+
+      - name: Sync RPMs to unstable repo
+        run: |
+          rsync \
+            --archive \
+            --verbose \
+            --chmod D755,F644 \
+            rpms/*.rpm \
+            "gitea_ci@${RPM_REPO_HOST}:/var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/"
+
+      - name: Update unstable repo metadata
+        run: |
+          ssh "gitea_ci@${RPM_REPO_HOST}" \
+            "cd /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable && createrepo_c --update ."
+
+      - name: Generate packages.json manifest
+        run: |
+          scp script/generate-packages-json.py "gitea_ci@${RPM_REPO_HOST}:/tmp/"
+          ssh "gitea_ci@${RPM_REPO_HOST}" \
+            "python3 /tmp/generate-packages-json.py \
+              --repodata-dir /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/repodata \
+              --output /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/packages.json \
+              --base-url https://rpm.lair.cafe/fedora/${FEDORA_VERSION}/x86_64/unstable"
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -1,12 +1,26 @@
 name: CI

+# Pushes to main are deliberately excluded: build-prerelease.yml runs
+# its own lint/test jobs there (gating publish), and running both
+# workflows on the same push made them queue against each other on the
+# same runner labels — ~12 minutes of added latency per deploy. Feature
+# branches, PRs to main, and release tags keep the full gate here.
 on:
  push:
-    branches: ["**"]
+    branches-ignore: [main]
    tags: ["v*"]
  pull_request:
    branches: [main]

+# Coalesce same-ref pushes; a newer push supersedes the in-flight run.
+# (The old shared `cortex-runner-pool` group with build-prerelease.yml
+# is gone — the workflows no longer trigger on the same refs, and
+# ephemeral one-VM-per-job runners removed the shared-workspace race
+# that group existed to serialize.)
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
  CARGO_INCREMENTAL: "0"
  RUSTC_WRAPPER: sccache
@@ -16,56 +30,103 @@ env:
  SCCACHE_S3_USE_SSL: "false"
  AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+  # fmt, clippy, and test all run in parallel on the same `rust` runner
+  # and would otherwise share /root/.cache/act/<hash>/hostexecutor/target/,
+  # racing each other's cargo temp files (.tmpXXXXXX) and failing builds
+  # mid-compile. Give each job its own target directory so the invocations
+  # don't collide. sccache still backs the actual rustc cache, so the
+  # rebuild penalty is small.
+  CARGO_TARGET_DIR: target-${{ github.job }}

 jobs:
-  check:
-    name: Format, lint, build, test
-    runs-on: fedora
+  fmt:
+    name: Format
+    timeout-minutes: 15
+    runs-on: rust
    steps:
      - uses: actions/checkout@v4
+      - run: cargo fmt --check --all

-      - name: Cache cargo registry and target
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/bin
-            ~/.cargo/registry/index
-            ~/.cargo/registry/cache
-            ~/.cargo/git/db
-            target
-          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
-          restore-keys: |
-            ${{ runner.os }}-cargo-
+  clippy:
+    name: Clippy
+    timeout-minutes: 25
+    runs-on: rust
+    steps:
+      - uses: actions/checkout@v4
+      # Failure-aware sccache escalation lives in the shared script (kept
+      # in sync with build-prerelease.yml): a signal death (rustc SIGSEGV
+      # / OOM-kill) keeps the cache and fails fast instead of an uncached
+      # rebuild; only a real sccache fault drops the cache.
+      - name: Clippy (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings

-      - name: Ensure sccache with S3 support
-        env:
-          RUSTC_WRAPPER: ""
+  test:
+    name: Test
+    timeout-minutes: 25
+    runs-on: rust
+    steps:
+      - uses: actions/checkout@v4
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Test (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo test --workspace
+
+  # Type-check the CUDA-only code path. Borrow-check-only — we
+  # never run the tests here (the runner has no GPU). This catches
+  # the category of bug where a refactor compiles fine under the
+  # default feature set (which is what the `clippy` and `test` jobs
+  # exercise) but fails inside a `#[cfg(feature = "cuda")]` block.
+  # `runs-on: cuda-13.0` selects the runner that ships nvcc /
+  # cudarc's build prerequisites. The generic `rust` and `rpm`
+  # runners don't have them (the previous label `rpm` was tried
+  # first and tripped cudarc's `nvcc --version` build script —
+  # see commit history).
+  cuda-check:
+    name: CUDA type-check
+    timeout-minutes: 35
+    runs-on: cuda-13.0
+    # The workflow-level env sets `RUSTC_WRAPPER: sccache`
+    # unconditionally, which hard-fails cargo if the CUDA image
+    # doesn't ship sccache. Clear it at job level; the shared
+    # script/ci-cargo-escalate.sh opts back in only after probing
+    # for the binary. SCCACHE_*/AWS creds stay set — harmless when
+    # the wrapper is off, required when it's on.
+    env:
+      RUSTC_WRAPPER: ""
+      # candle-kernels' build script falls back to `nvidia-smi` for
+      # compute-cap detection when this is unset — and the GPU-less
+      # builder image doesn't ship nvidia-smi. Any valid cap works for
+      # a borrow-check; the real per-flavour caps live in
+      # build-prerelease.yml's matrix.
+      CUDA_COMPUTE_CAP: "86"
+    steps:
+      - uses: actions/checkout@v4
+      # sccache probing + failure classification lives in the shared
+      # script (see build-prerelease.yml's neuron build for the same
+      # pattern). It probes for sccache and, on a rustc SIGSEGV / OOM,
+      # keeps the cache and fails fast rather than rebuilding uncached.
+      - name: cargo check --features cuda (sccache escalation)
        run: |
-          if sccache --version 2>/dev/null && sccache --show-stats 2>/dev/null; then
-            echo "sccache with S3 support already installed"
-          else
-            cargo install sccache --features s3 --locked
-          fi
-
-      - name: Check formatting
-        run: cargo fmt --check --all
-
-      - name: Clippy
-        run: cargo clippy --workspace -- -D warnings
-
-      - name: Test
-        run: cargo test --workspace
-
-      - name: Show sccache stats
-        run: sccache --show-stats
+          # act launches the step shell without /etc/profile, so the
+          # gitea_runner user's inherited PATH lacks /usr/local/cuda-13.0/bin.
+          # cudarc's build.rs shells out to `nvcc --version` (the neuron
+          # crate enables cuda-version-from-build-system) and panics with
+          # ENOENT if nvcc isn't resolvable — keep this export in sync
+          # with build-prerelease.yml.
+          export PATH="/usr/local/cuda-13.0/bin:${PATH}"
+          export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
+          export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
+          script/ci-cargo-escalate.sh cargo check -p neuron --features cuda --all-targets

  srpm-cortex:
    name: Build cortex SRPM
-    runs-on: fedora
-    needs: check
+    timeout-minutes: 25
+    runs-on: rpm
+    needs: [fmt, clippy, test, cuda-check]
    if: startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Determine version
        id: version
@@ -79,6 +140,12 @@ jobs:
          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
          sed -i "s/^Version:.*/Version:        ${VERSION}/" cortex.spec

+      - name: Generate changelog entry
+        uses: https://git.lair.cafe/actions/rpm-changelog@v1
+        with:
+          spec: cortex.spec
+          version: ${{ steps.version.outputs.VERSION }}
+
      - name: Generate source tarball
        run: |
          set -ex
@@ -113,11 +180,14 @@ jobs:

  srpm-neuron:
    name: Build neuron SRPM
-    runs-on: fedora
-    needs: check
+    timeout-minutes: 25
+    runs-on: rpm
+    needs: [fmt, clippy, test, cuda-check]
    if: startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Determine version
        id: version
@@ -129,31 +199,37 @@ jobs:
        run: |
          VERSION="${{ steps.version.outputs.VERSION }}"
          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
-          sed -i "s/^Version:.*/Version:        ${VERSION}/" neuron.spec
+          sed -i "s/^Version:.*/Version:        ${VERSION}/" helexa-neuron.spec
+
+      - name: Generate changelog entry
+        uses: https://git.lair.cafe/actions/rpm-changelog@v1
+        with:
+          spec: helexa-neuron.spec
+          version: ${{ steps.version.outputs.VERSION }}

      - name: Generate source tarball
        run: |
          set -ex
          VERSION="${{ steps.version.outputs.VERSION }}"
-          tar czf /tmp/neuron-${VERSION}.tar.gz \
-            --transform "s,^\.,neuron-${VERSION}," \
+          tar czf /tmp/helexa-neuron-${VERSION}.tar.gz \
+            --transform "s,^\.,helexa-neuron-${VERSION}," \
            --exclude='./target' \
            --exclude='./.git' \
            --exclude='*.tar.gz' \
            --exclude='*.src.rpm' \
            .
-          mv /tmp/neuron-${VERSION}.tar.gz .
+          mv /tmp/helexa-neuron-${VERSION}.tar.gz .

      - name: Vendor Rust dependencies
        run: |
          VERSION="${{ steps.version.outputs.VERSION }}"
          cargo vendor vendor/
-          tar czf neuron-${VERSION}-vendor.tar.gz vendor/
+          tar czf helexa-neuron-${VERSION}-vendor.tar.gz vendor/
          rm -rf vendor/

      - name: Build SRPM
        run: |
-          rpmbuild -bs neuron.spec \
+          rpmbuild -bs helexa-neuron.spec \
            --define "_sourcedir $(pwd)" \
            --define "_srcrpmdir $(pwd)"

@@ -165,7 +241,8 @@ jobs:

  copr-cortex:
    name: Publish cortex to COPR
-    runs-on: fedora
+    timeout-minutes: 60
+    runs-on: fedora-43
    needs: srpm-cortex
    steps:
      - name: Download SRPM
@@ -173,17 +250,17 @@ jobs:
        with:
          name: srpm-cortex

-      - name: Configure copr-cli
-        run: |
-          mkdir -p ~/.config
-          echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
-
-      - name: Submit build to COPR
-        run: copr-cli build helexa/cortex *.src.rpm
+      - name: Publish to COPR
+        uses: https://git.lair.cafe/actions/copr-publish@v1
+        with:
+          project: helexa/helexa
+          srpm: "*.src.rpm"
+          copr-config: ${{ secrets.COPR_CONFIG }}

  copr-neuron:
    name: Publish neuron to COPR
-    runs-on: fedora
+    timeout-minutes: 60
+    runs-on: fedora-43
    needs: srpm-neuron
    steps:
      - name: Download SRPM
@@ -191,37 +268,59 @@ jobs:
        with:
          name: srpm-neuron

-      - name: Configure copr-cli
-        run: |
-          mkdir -p ~/.config
-          echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
-
-      - name: Submit build to COPR
-        run: copr-cli build helexa/neuron *.src.rpm
+      - name: Publish to COPR
+        uses: https://git.lair.cafe/actions/copr-publish@v1
+        with:
+          project: helexa/helexa
+          srpm: "*.src.rpm"
+          copr-config: ${{ secrets.COPR_CONFIG }}

  bump-version:
    name: Bump version in source
-    runs-on: fedora
+    timeout-minutes: 15
+    runs-on: rust
    needs: [copr-cortex, copr-neuron]
    steps:
      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

-      - name: Stamp version and push
+      - name: Determine version
+        id: version
+        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
+
+      - name: Stamp version
+        run: |
+          VERSION="${{ steps.version.outputs.VERSION }}"
+          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
+          sed -i "s/^Version:.*/Version:        ${VERSION}/" cortex.spec
+          sed -i "s/^Version:.*/Version:        ${VERSION}/" helexa-neuron.spec
+          cargo check --workspace 2>/dev/null || true
+
+      - name: Generate cortex changelog entry
+        uses: https://git.lair.cafe/actions/rpm-changelog@v1
+        with:
+          spec: cortex.spec
+          version: ${{ steps.version.outputs.VERSION }}
+
+      - name: Generate helexa-neuron changelog entry
+        uses: https://git.lair.cafe/actions/rpm-changelog@v1
+        with:
+          spec: helexa-neuron.spec
+          version: ${{ steps.version.outputs.VERSION }}
+
+      - name: Commit and push
        env:
          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
        run: |
-          VERSION="${GITHUB_REF#refs/tags/v}"
-          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
-          sed -i "s/^Version:.*/Version:        ${VERSION}/" cortex.spec
-          sed -i "s/^Version:.*/Version:        ${VERSION}/" neuron.spec
-          cargo check --workspace 2>/dev/null || true
+          VERSION="${{ steps.version.outputs.VERSION }}"
          git config user.name "Gitea Actions"
          git config user.email "actions@git.lair.cafe"
-          git add Cargo.toml Cargo.lock cortex.spec neuron.spec
+          git add Cargo.toml Cargo.lock cortex.spec helexa-neuron.spec
          if git diff --cached --quiet; then
-            echo "Version already at ${VERSION}"
+            echo "Nothing to commit for ${VERSION}"
          else
            git commit -m "chore: bump version to ${VERSION}"
-            git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
+            git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/${{ github.repository }}.git"
            git push origin HEAD:main
          fi
--- a/.gitea/workflows/deploy-dev.yml
+++ b/.gitea/workflows/deploy-dev.yml
@@ -0,0 +1,136 @@
+name: deploy-dev
+
+# Fast-path iteration deploy for a SINGLE neuron host: build one CUDA
+# flavour, copy the raw binary to the host, restart neuron.service.
+# Skips the other two flavours, all RPM packaging, signing, repo
+# publish, and dnf — push-to-testable drops from ~20 min to roughly
+# one CUDA build plus a service restart.
+#
+# This is a DEV convenience, not a release path:
+#   - the binary overwrites /usr/bin/neuron behind RPM's back; the
+#     next deploy.yml run that finds a newer published RPM reconciles
+#     the host back to the packaged binary (dnf upgrades over it).
+#     `rpm -V helexa-neuron-<flavour>` flagging a modified
+#     /usr/bin/neuron in the interim is expected.
+#   - nothing is published; other hosts are untouched.
+#   - requires the `install` sudoers rule from
+#     asset/sudoers.d/neuron-host.conf (re-run script/infra-setup.sh
+#     after updating it).
+#
+# Trigger from the Gitea UI: Actions → deploy-dev → Run workflow,
+# pick the target host. Defaults to the ref you dispatch from, so it
+# works from feature branches without touching main.
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: "neuron host to deploy to"
+        required: true
+        type: choice
+        options: [beast, benjy, quadbrat]
+        default: beast
+
+# One dev deploy per host at a time; a newer dispatch for the same host cancels the in-flight one.
+concurrency:
+  group: deploy-dev-${{ inputs.target }}
+  cancel-in-progress: true
+
+env:
+  CARGO_INCREMENTAL: "0"
+  CARGO_TERM_COLOR: "always"
+
+jobs:
+  build:
+    name: Build neuron (${{ inputs.target }})
+    runs-on: cuda-13.0
+    outputs:
+      flavour: ${{ steps.map.outputs.flavour }}
+    steps:
+      - uses: actions/checkout@v4
+
+      # host → flavour → compute cap. Keep in sync with the
+      # build-neuron matrix in build-prerelease.yml and the
+      # deploy-neurons matrix in deploy.yml.
+      - id: map
+        run: |
+          case "${{ inputs.target }}" in
+            beast)    flavour=blackwell cap=120 ;;
+            benjy)    flavour=ada       cap=89  ;;
+            quadbrat) flavour=ampere    cap=86  ;;
+            *) echo "unknown target ${{ inputs.target }}"; exit 1 ;;
+          esac
+          echo "flavour=${flavour}" >> "$GITHUB_OUTPUT"
+          echo "cap=${cap}" >> "$GITHUB_OUTPUT"
+
+      - name: Build neuron with CUDA
+        run: |
+          set -eux
+          export PATH="/usr/local/cuda-13.0/bin:${PATH}"
+          export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
+          export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
+          cargo build --release -p neuron --features "cuda cudnn"
+        env:
+          CUDA_COMPUTE_CAP: ${{ steps.map.outputs.cap }}
+          CARGO_BUILD_JOBS: "8"
+          NVCC_THREADS: "4"
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/neuron artifacts/neuron-dev
+          file artifacts/neuron-dev
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: neuron-dev-${{ inputs.target }}
+          path: artifacts/neuron-dev
+          retention-days: 1
+
+  deploy:
+    name: Deploy to ${{ inputs.target }}
+    needs: build
+    runs-on: fedora-43
+    env:
+      DEPLOY_KEY: |
+        ${{ secrets.RSYNC_SSH_KEY }}
+      TARGET_HOST: ${{ inputs.target }}.hanzalova.internal
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              "gitea_ci@${TARGET_HOST}" 'hostname -f'
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: neuron-dev-${{ inputs.target }}
+          path: artifacts/
+
+      - name: Copy binary to host
+        run: |
+          scp artifacts/neuron-dev "gitea_ci@${TARGET_HOST}:/var/lib/gitea_ci/neuron-dev"
+
+      - name: Install binary and restart neuron.service
+        run: |
+          ssh "gitea_ci@${TARGET_HOST}" '
+            set -eu
+            if systemctl is-active --quiet neuron.service; then
+              sudo /usr/bin/systemctl stop neuron.service
+            fi
+            # Exact command form required by the sudoers rule in
+            # asset/sudoers.d/neuron-host.conf — change both together.
+            sudo /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
+            # enable --now so a dev deploy also leaves the unit enabled
+            # for boot, consistent with deploy.yml.
+            sudo /usr/bin/systemctl enable --now neuron.service
+            rm -f /var/lib/gitea_ci/neuron-dev'
+
+      - name: Capture neuron.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh "gitea_ci@${TARGET_HOST}" \
+              'journalctl --unit neuron.service -I --no-pager'
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -0,0 +1,448 @@
+name: deploy
+
+# Roll the freshly-published unstable RPMs onto the helexa fleet:
+# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
+# and helexa-bench on bob (the bench host).
+#
+# Triggered automatically after `build-prerelease` succeeds (by which
+# point the new RPMs are live on rpm.lair.cafe/unstable), and also
+# re-runnable manually from the Gitea UI.
+#
+# Each host self-gates: if the installed package already matches the
+# newest entry in the published packages.json manifest, the service is
+# left alone — no stop, no restart, no model cold-load. Combined with
+# build-prerelease's change detection, a docs- or gateway-only push never
+# restarts the neurons (a restart costs ~5 min of 27B cold-load, see issue #1).
+#
+# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
+# sudoers drop-in) lives in script/infra-setup.sh — run that once per
+# host before this workflow can succeed.
+
+on:
+  workflow_run:
+    workflows: [build-prerelease]
+    types: [completed]
+  workflow_dispatch:
+
+# Serialize deploys. Overlapping runs would race on dnf metadata
+# refresh and service-restart timing; queueing keeps the fleet
+# predictable. Don't cancel an in-flight deploy — a half-applied dnf
+# transaction is worse than a slightly stale deploy.
+concurrency:
+  group: deploy
+  cancel-in-progress: false
+
+env:
+  DEPLOY_KEY: |
+    ${{ secrets.RSYNC_SSH_KEY }}
+
+jobs:
+  deploy-cortex:
+    runs-on: fedora-43
+    # Two trigger paths: manual dispatch always runs; workflow_run
+    # only runs if the upstream `build-prerelease` actually succeeded.
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@hanzalova.internal 'hostname -f'
+
+      # Gating compares `rpm -q` against the packages.json manifest the
+      # publish job maintains — NOT unprivileged `dnf check-update`,
+      # which proved unreliable as the gitea_ci user (hung on metadata
+      # locks on one host, silently reported "no updates" on others).
+      # An unreadable/unparsable manifest fails open: deploy proceeds.
+      - name: Deploy cortex (skips when already current)
+        run: |
+          ssh gitea_ci@hanzalova.internal 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=cortex
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            echo "${pkg}-${installed} already current — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} — deploying"
+          if systemctl is-active --quiet cortex.service; then
+            sudo /usr/bin/systemctl stop cortex.service
+          fi
+          if rpm -q "${pkg}" >/dev/null 2>&1; then
+            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+          else
+            sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
+          fi
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # fleet self-heals after a host reboot.
+          sudo /usr/bin/systemctl enable --now cortex.service
+          DEPLOY
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture cortex.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@hanzalova.internal \
+              'journalctl --unit cortex.service -I --no-pager'
+
+  deploy-neurons:
+    needs: [deploy-cortex]
+    runs-on: fedora-43
+    strategy:
+      # One neuron failing must not cancel the others. Cortex is up
+      # already; a partial neuron deploy is strictly better than
+      # rolling back to zero.
+      fail-fast: false
+      matrix:
+        include:
+          # load_timeout: how long to wait for default_models to finish
+          # loading after a restart. beast cold-loads Qwen3.6-27B Q6K
+          # TP=2 (~5-6 min typical, see #1); benjy/quadbrat load small
+          # single-GPU models in well under a minute.
+          #
+          # max_prompt_tokens: per-model context cap, written to the
+          # neuron.service.d/model.conf drop-in (NEURON_MAX_PROMPT_TOKENS).
+          # A change here restarts the neuron even with no new RPM. Values
+          # are VRAM-safe ceilings derived per model — see
+          # doc/context-limits.md. beast (Qwen3.6-27B, hybrid linear, 2x
+          # 32GB) has ample KV headroom; benjy (Qwen3-8B dense, ~6GB free)
+          # is VRAM-bound and stays at the default; quadbrat (Qwen3-1.7B)
+          # likewise conservative.
+          - host: beast.hanzalova.internal
+            flavour: blackwell
+            load_timeout: 900
+            max_prompt_tokens: 131072
+          - host: benjy.hanzalova.internal
+            flavour: ada
+            load_timeout: 300
+            max_prompt_tokens: 16384
+          - host: quadbrat.hanzalova.internal
+            flavour: ampere
+            load_timeout: 300
+            max_prompt_tokens: 16384
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@${{ matrix.host }} 'hostname -f'
+
+      # See deploy-cortex for why gating uses the publish manifest and
+      # not unprivileged `dnf check-update`.
+      - name: Deploy helexa-neuron-${{ matrix.flavour }} (skips when already current)
+        run: |
+          ssh gitea_ci@${{ matrix.host }} 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=helexa-neuron-${{ matrix.flavour }}
+          max_prompt_tokens="${{ matrix.max_prompt_tokens }}"
+
+          # ── Desired per-model systemd drop-in ─────────────────────────
+          # model.conf carries NEURON_MAX_PROMPT_TOKENS so the context cap
+          # is deterministic per host and rolled out (with a restart) by
+          # this workflow, not hand-edited. It sorts after local.conf, so a
+          # deploy-managed value wins over any manual local override of the
+          # same variable. See doc/context-limits.md.
+          conf=/etc/systemd/system/neuron.service.d/model.conf
+          config_changed=0
+          if [ -n "${max_prompt_tokens}" ]; then
+            desired=$(printf '%s\n%s\n%s\n%s' \
+              "# Managed by .gitea/workflows/deploy.yml - do not edit by hand." \
+              "# Per-model context cap; see doc/context-limits.md." \
+              "[Service]" \
+              "Environment=NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}")
+            [ "${desired}" = "$(cat "${conf}" 2>/dev/null || true)" ] || config_changed=1
+          fi
+
+          # ── Package version gate (manifest rationale: see deploy-cortex) ──
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          pkg_changed=1
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            pkg_changed=0
+          fi
+
+          # Skip only when BOTH the package and the drop-in are unchanged —
+          # a context-cap change must restart the neuron even with no new RPM.
+          if [ "${pkg_changed}" -eq 0 ] && [ "${config_changed}" -eq 0 ]; then
+            echo "${pkg}-${installed} current; NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens:-<unset>} unchanged — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} pkg_changed=${pkg_changed} config_changed=${config_changed} — deploying"
+
+          # Write the drop-in (staged in gitea_ci's dir, installed root-owned).
+          if [ "${config_changed}" -eq 1 ]; then
+            printf '%s\n' "${desired}" > /var/lib/gitea_ci/model.conf
+            sudo /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf "${conf}"
+            rm -f /var/lib/gitea_ci/model.conf
+            echo "applied ${conf}: NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}"
+          fi
+
+          if systemctl is-active --quiet neuron.service; then
+            sudo /usr/bin/systemctl stop neuron.service
+          fi
+          if [ "${pkg_changed}" -eq 1 ]; then
+            if rpm -q "${pkg}" >/dev/null 2>&1; then
+              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
+            else
+              sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
+            fi
+          fi
+          # daemon-reload picks up both a new unit (dnf) and the drop-in.
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # fleet self-heals after a host reboot.
+          sudo /usr/bin/systemctl enable --now neuron.service
+
+          # ── Post-deploy validation ────────────────────────────────
+          # A deploy only goes green if the neuron (a) finishes loading
+          # its default models and (b) answers a trivial prompt like an
+          # LLM should. Catches the class of bug where the binary
+          # starts fine but model load or inference is broken — which
+          # previously surfaced only when a human noticed. The wait
+          # polls /health activation (the structured source of the
+          # "loaded default model" journal line, plus per-model failure
+          # detail); the journal-capture step below still runs for
+          # forensics either way.
+          load_timeout=${{ matrix.load_timeout }}
+          echo "waiting for default models (timeout ${load_timeout}s)"
+          deadline=$(( $(date +%s) + load_timeout ))
+          health=""
+          while :; do
+            health=$(curl -fsS --max-time 5 http://localhost:13131/health 2>/dev/null || true)
+            state=$(printf %s "${health}" | python3 -c '
+          import json, sys
+          try:
+              print(json.load(sys.stdin).get("activation", {}).get("state", ""))
+          except Exception:
+              print("")
+          ')
+            if [ "${state}" = "ready" ]; then
+              break
+            fi
+            if [ "$(date +%s)" -ge "${deadline}" ]; then
+              echo "FAIL: activation not ready within ${load_timeout}s (last state: ${state:-unreachable})"
+              exit 1
+            fi
+            sleep 10
+          done
+
+          model=$(printf %s "${health}" | python3 -c '
+          import json, sys
+          a = json.load(sys.stdin).get("activation", {})
+          failed = a.get("failed", [])
+          if failed:
+              for f in failed:
+                  msg = "FAILED " + str(f.get("model_id")) + ": " + str(f.get("error", ""))[:400]
+                  sys.stderr.write(msg + chr(10))
+              sys.exit(1)
+          completed = a.get("completed", [])
+          print(completed[0] if completed else "")
+          ')
+          if [ -z "${model}" ]; then
+            echo "no default models configured — skipping LLM probe"
+            exit 0
+          fi
+
+          echo "LLM probe against ${model}"
+          probe_body=$(printf '{"model":"%s","messages":[{"role":"user","content":"Reply with exactly one word: pineapple"}],"max_tokens":512,"temperature":0}' "${model}")
+          resp=$(curl -fsS --max-time 180 -H "content-type: application/json" \
+            -d "${probe_body}" http://localhost:13131/v1/chat/completions) || {
+            echo "FAIL: probe request errored"
+            exit 1
+          }
+          if printf %s "${resp}" | grep -qi pineapple; then
+            echo "LLM probe passed"
+          else
+            echo "FAIL: probe response missing expected token"
+            printf %s "${resp}" | head -c 2000
+            echo
+            exit 1
+          fi
+          DEPLOY
+
+      - name: Ensure firewalld allows helexa-neuron
+        run: |
+          ssh gitea_ci@${{ matrix.host }} '
+            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
+              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
+              sudo /usr/bin/firewall-cmd --reload
+            fi'
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture neuron.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@${{ matrix.host }} \
+              'journalctl --unit neuron.service -I --no-pager'
+
+  # helexa-bench is a separate package on a separate host (bob), and it
+  # only consumes the fleet's HTTP APIs — it has no deploy-ordering
+  # dependency on cortex or the neurons (the sweep loop is version-aware
+  # and picks up whatever each neuron reports whenever). So it runs
+  # alongside the cortex→neurons chain rather than after it.
+  deploy-bench:
+    runs-on: fedora-43
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@bob.hanzalova.internal 'hostname -f'
+
+      # See deploy-cortex for why gating uses the publish manifest and
+      # not unprivileged `dnf check-update`.
+      - name: Deploy helexa-bench (skips when already current)
+        run: |
+          ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=helexa-bench
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            echo "${pkg}-${installed} already current — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} — deploying"
+          if systemctl is-active --quiet helexa-bench.service; then
+            sudo /usr/bin/systemctl stop helexa-bench.service
+          fi
+          if rpm -q "${pkg}" >/dev/null 2>&1; then
+            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
+          else
+            sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
+          fi
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # bench resumes collecting after a host reboot.
+          sudo /usr/bin/systemctl enable --now helexa-bench.service
+
+          # ── Post-deploy validation ────────────────────────────────
+          # The bench serves a read-only API on :13132 alongside the
+          # outbound sweep loop. Probe the API over localhost (bypasses
+          # firewalld) — catches a crash-on-start or a bad bind. Bail
+          # early if the unit drops out of active (Restart backoff).
+          echo "waiting for bench API on :13132"
+          deadline=$(( $(date +%s) + 30 ))
+          while :; do
+            if curl -fsS --max-time 5 http://localhost:13132/api/health >/dev/null 2>&1; then
+              echo "bench API healthy"
+              break
+            fi
+            if ! systemctl is-active --quiet helexa-bench.service; then
+              echo "FAIL: helexa-bench.service is not active"
+              systemctl --no-pager status helexa-bench.service | head -20 || true
+              exit 1
+            fi
+            if [ "$(date +%s)" -ge "${deadline}" ]; then
+              echo "FAIL: bench API not healthy within 30s"
+              exit 1
+            fi
+            sleep 3
+          done
+          DEPLOY
+
+      - name: Ensure firewalld allows helexa-bench
+        run: |
+          ssh gitea_ci@bob.hanzalova.internal '
+            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-bench --quiet 2>/dev/null; then
+              sudo /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
+              sudo /usr/bin/firewall-cmd --reload
+            fi'
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture helexa-bench.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@bob.hanzalova.internal \
+              'journalctl --unit helexa-bench.service -I --no-pager'
+
+  # Build the bench UI and publish it to the public nginx vhost on the
+  # gateway (https://bench.helexa.ai). The vhost + Let's Encrypt cert are
+  # one-time host setup (script/infra-setup.sh); this job just refreshes
+  # the static assets. nginx reverse-proxies /api to the bob API, so the
+  # SPA is built same-origin (no VITE_API_BASE). Independent of the other
+  # deploy jobs.
+  deploy-bench-ui:
+    runs-on: fedora-43
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Build UI
+        run: |
+          cd bench
+          npm ci
+          npm run build
+
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@hanzalova.internal 'hostname -f'
+
+      - name: Rsync built UI to gateway webroot
+        run: |
+          rsync --archive --compress --delete \
+            --rsync-path 'sudo rsync' \
+            bench/dist/ \
+            gitea_ci@hanzalova.internal:/var/www/bench.helexa.ai/
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,12 @@
 /target
+/bench/node_modules
+/bench/dist
 *.swp
 *.swo
 .idea/
 .vscode/
 cortex.toml
+models.toml
 doc/plan/*
+/target-cuda/
+.claude/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,268 @@
+# AGENTS.md — helexa/cortex
+
+## Project Overview
+
+helexa is a self-hosted LLM serving stack for multi-node GPU inference clusters. It has two components:
+
+- **cortex** — the per-operator control plane and LLM proxy. A Rust reverse-proxy that sits in front of the fleet and presents a unified OpenAI + Anthropic compatible API surface. It handles model routing, lifecycle management (load/unload/evict), request translation, and metrics collection.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU host, serving candle-based in-process inference and managing local hardware discovery and model lifecycle.
+
+## Repository Layout
+
+```
+helexa/
+├── Cargo.toml              # workspace root (Rust 2024 edition, GPL-3.0)
+├── cortex.example.toml     # example gateway config
+├── models.example.toml     # example model catalogue
+├── neuron.example.toml     # example neuron config
+├── README.md               # public-facing documentation
+├── CLAUDE.md               # detailed design rationale and implementation history
+├── AGENTS.md               # ← you are here
+├── cortex.spec             # RPM spec for cortex
+├── helexa-neuron.spec      # RPM spec for neuron (renamed to avoid Fedora collision)
+├── rpm/                    # prerelease RPM specs
+│   ├── cortex-prerelease.spec
+│   ├── helexa-neuron-prerelease.spec
+│   └── helexa-bench-prerelease.spec
+├── data/                   # systemd units and example configs for packaging
+│   ├── cortex.service
+│   ├── neuron.service
+│   ├── cortex.example.toml
+│   ├── neuron.example.toml
+│   └── models.example.toml
+└── crates/
+    ├── cortex-core/            # shared types, config, envelopes
+    │   └── src/
+    │       ├── lib.rs
+    │       ├── build_info.rs   # BuildInfo type for /version endpoint
+    │       ├── config.rs       # figment-based config structs
+    │       ├── catalogue.rs    # ModelProfile, placement matching
+    │       ├── discovery.rs    # DeviceInfo, DiscoveryResponse
+    │       ├── harness.rs      # Harness trait, HarnessConfig, HarnessHealth
+    │       ├── node.rs         # NodeState, ModelStatus
+    │       ├── openai.rs       # OpenAI request/response types
+    │       ├── anthropic.rs    # Anthropic request/response types
+    │       ├── translate.rs    # OpenAI <-> Anthropic translation
+    │       └── metrics.rs      # RequestMetrics, histogram helpers
+    ├── cortex-gateway/         # the HTTP proxy server
+    │   └── src/
+    │       ├── lib.rs
+    │       ├── state.rs        # CortexState: Arc<RwLock<...>>
+    │       ├── router.rs       # model -> node routing logic
+    │       ├── proxy.rs        # streaming HTTP proxy to backends
+    │       ├── evictor.rs      # LRU/priority eviction logic
+    │       ├── poller.rs       # background task polling neuron status
+    │       ├── handlers.rs     # axum handlers (chat, completions, models, etc.)
+    │       └── metrics.rs      # prometheus exporter endpoint
+    ├── cortex-cli/             # CLI entrypoint
+    │   └── src/main.rs         # binary: `cortex`
+    ├── neuron/                 # per-host LLM daemon (replaces cortex-agent)
+    │   ├── Cargo.toml          # features: cuda, cudnn, flash-attn, cuda-integration
+    │   ├── build.rs            # compiles CUDA kernels, emits build metadata
+    │   └── src/
+    │       ├── main.rs         # binary: `neuron`
+    │       ├── discovery.rs    # nvidia-smi parsing, device enumeration
+    │       ├── health.rs       # runtime GPU polling
+    │       ├── api.rs          # HTTP handlers for /discovery, /models, etc.
+    │       ├── version.rs      # GET /version endpoint with BuildInfo
+    │       ├── models.rs       # local model lifecycle orchestration
+    │       └── harness/        # in-process candle inference
+    │           ├── device_worker/  # per-device CUDA worker threads
+    │           │   ├── mod.rs      # canonical narrative for worker architecture
+    │           │   ├── jobs.rs     # Job enum, dispatch handlers
+    │           │   └── dispatch.rs # DeviceWorkerState struct
+    │           ├── candle.rs       # candle model implementation
+    │           └── tp/             # tensor parallelism
+    │               └── worker.rs   # TP worker subprocesses
+    ├── helexa-acp/             # Agent Client Protocol bridge (Apache-2.0)
+    │   └── src/main.rs         # binary: `helexa-acp`, self-contained (no workspace deps)
+    └── helexa-bench/           # benchmark harness
+        └── src/main.rs         # binary: `helexa-bench`, SQLite-backed, version-aware
+```
+
+## Key Design Decisions
+
+### Architecture
+- **cortex** is the control plane. It exposes the unified API, routes requests, manages model lifecycle across the fleet, and collects metrics.
+- **neuron** is the node plane. One instance runs on every GPU host. It discovers local hardware, manages in-process candle inference, handles NCCL tensor parallelism, and reports runtime state.
+- cortex never shells out to `nvidia-smi`, never touches systemd units, and never talks directly to a harness. It talks only to neurons via HTTP API on port 13131.
+
+### Per-device worker thread (neuron)
+Every CUDA device gets one dedicated OS thread that owns its `CudaContext` for the daemon's lifetime. All CUDA operations route through this thread via a `std::sync::mpsc` job channel. Tensors never escape the worker thread alive. Inference replies carry `Vec<f32>` CPU-side logits; sampled tokens come back as `u32`. The opaque `ArchHandle(u64)` and `TpHandle(u64)` are indices into the worker's state slab, not pointers.
+
+CPU loads (`Device::Cpu` fallback) keep the legacy `tokio::task::spawn_blocking + Arc<Mutex<ModelArch>>` path — there's no context to own and the channel hop would only add latency. Four `spawn_blocking` references in `harness/candle.rs` are deliberate CPU fallback.
+
+### candle-native (not mistral.rs)
+neuron builds directly on [candle](https://github.com/huggingface/candle). Every model architecture it serves is implemented in this repository, ported against the HuggingFace reference. No external inference server to babysit. The Harness trait remains as an internal seam for adding future engines (vision/audio/diffusion) but its only implementation is in-process candle.
+
+### Streaming proxy
+Chat completions are proxied as SSE streams. The gateway must:
+1. Parse the inbound request to extract the model name
+2. Route to the correct backend neuron
+3. Stream the response back, capturing token timing for metrics
+4. NOT buffer the full response — true streaming passthrough
+
+### Anthropic translation
+When a request arrives at `/v1/messages` (Anthropic format), the gateway translates it to OpenAI format before proxying to neuron, then translates the response back. This is a stateless envelope transformation. The non-streaming round-trip is implemented; streaming SSE translation is deferred.
+
+### Eviction
+The evictor runs as a background task. Before loading a model on a node where VRAM is tight:
+1. Check if the model is already loaded elsewhere → route there instead
+2. Find the LRU model on the target node (excluding pinned models)
+3. Call `POST {neuron}/models/unload` on that model
+4. The incoming request's lazy-load triggers the new model load
+
+### Metrics
+Per-request: model, node, prompt_tokens, completion_tokens, total_tokens, tok_per_sec, time_to_first_token_ms, total_latency_ms. Exposed as Prometheus histograms/counters on a separate port (31314).
+
+## Tech Stack
+
+- **Rust 2024 edition** — workspace with 6 crates
+- **Axum 0.8** — HTTP framework
+- **reqwest** — HTTP client for proxying to backends
+- **figment** — config loading (TOML + env vars)
+- **tokio** — async runtime
+- **metrics + metrics-exporter-prometheus** — observability
+- **tracing** — structured logging
+- **candle** — in-process inference engine (neuron only, with CUDA support)
+- **cudarc** — patched for neuron's needs (see workspace `[patch]`)
+- **clap** — CLI parsing
+- **rusqlite** (bundled) — helexa-bench SQLite system-of-record
+
+## Build Commands
+
+```sh
+cargo build --release           # build all crates
+cargo run -p cortex-cli -- serve    # run the gateway
+cargo test                      # run all tests
+cargo clippy --workspace        # lint
+```
+
+### neuron Features
+- `cuda`: Enables CUDA acceleration in candle and cudarc/nccl bindings. Without it, falls back to CPU.
+- `cudnn`: Use cuDNN for convolution/attention kernels (requires `cuda`).
+- `flash-attn`: FlashAttention kernels (requires `cuda`).
+- `cuda-integration`: Reserved for GPU-only integration tests (requires multiple CUDA devices + libnccl).
+
+### Build Scripts
+- `neuron/build.rs`: Compiles CUDA kernels (`src/cuda/*.cu`) using `cudaforge::KernelBuilder` when the `cuda` feature is enabled. Handles compute capability checks (compute capabilities below sm_80 disable bf16 intrinsics). Also captures build metadata: git SHA, dirty flag, timestamp, rustc version, profile, features, candle-core version.
+
+## CI
+
+Gitea Actions runs on every push to any branch. All three checks must pass before merging:
+
+```sh
+cargo fmt --check --all                    # formatting
+cargo clippy --workspace -- -D warnings   # lint (warnings are errors)
+cargo test --workspace                     # tests
+```
+
+Run these locally before pushing. `cargo fmt --all` fixes formatting automatically. Clippy warnings must be resolved, not suppressed with `#[allow(...)]` unless there is a clear rationale.
+
+Tagged releases (`v*`) build SRPMs for `cortex`, `helexa-neuron`, and `helexa-bench` and publish to COPR (`helexa/helexa`). Build metadata SHA injection: CI sets `HELEXA_BUILD_SHA=$(git rev-parse HEAD)`.
+
+## Environment
+
+- Targets Fedora 43 (systemd, SELinux enforcing)
+- Nodes communicate over a private network (e.g. WireGuard mesh)
+- cortex listens on port 31313 (API) and 31314 (metrics)
+- neuron listens on port 13131 on each GPU host
+- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
+
+## Conventions
+
+- Error handling: `anyhow` for binaries, `thiserror` for library crates
+- No `unwrap()` in library code; `expect()` only with clear rationale
+- All public types derive `Debug, Clone, Serialize, Deserialize` where sensible
+- Config structs use `figment` with TOML as primary source, env vars as override
+- Prefer `Arc<RwLock<...>>` for shared fleet state; minimize lock duration
+- SSE streaming uses `tokio_stream` + `eventsource-stream` for parsing
+- Log at `info` for request routing, `debug` for proxy details, `warn` for eviction and node health, `error` for proxy failures
+
+## Testing
+
+### Gateway tests
+Use mock neurons spawned via axum in `crates/cortex-gateway/tests/common/mod.rs`. Helpers: `spawn_mock_backend()`, `spawn_gateway()`.
+
+### neuron integration tests
+- Numerical reference tests (`numerical_reference.rs`) require `NEURON_REF_MODEL_PATH` env var pointing to a HF snapshot directory. Fixtures are f32-based for precision validation against HuggingFace transformers.
+- CUDA integration tests (`tp_worker_lifecycle_cuda.rs`) gated behind `cuda-integration` feature; requires 2+ CUDA devices (e.g., 2x RTX 5090).
+
+### Metrics testing
+Use `install_test_recorder()` in test code to capture metrics without the HTTP listener.
+
+## helexa-bench
+
+A continuous, version-aware benchmark harness. Hits each neuron directly on `:13131`, exercises each warm model with a Scenario suite (chat-latency family), and records results into SQLite stamped with the neuron's full `BuildInfo`. The loop is version-aware: skips any (target, build SHA, model, scenario) cell already at `samples_per_version`.
+
+Packaged as `helexa-bench` RPM (prebuilt-binary spec). One systemd unit, typically on the metrics host.
+
+## helexa-acp
+
+Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default. Intentionally self-contained: no workspace crate dependencies. Uses `agent-client-protocol` with `unstable_session_model` feature for Zed model picker support. Licensed Apache-2.0 (workspace is GPL-3.0).
+
+## RPM Packaging
+
+- `cortex.spec` — installs the `cortex` binary
+- `helexa-neuron.spec` — installs the `neuron` binary under package name `helexa-neuron` (renamed to avoid a collision with Fedora's NEURON neural-simulation package)
+- Systemd units in `data/cortex.service`, `data/neuron.service`
+- Example configs: `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
+
+Install:
+```sh
+dnf copr enable helexa/helexa
+dnf install cortex                # gateway host
+dnf install helexa-neuron         # GPU nodes
+```
+
+## Configuration Files
+
+### cortex.toml (gateway)
+```toml
+[gateway]
+listen = "0.0.0.0:31313"
+metrics_listen = "0.0.0.0:31314"
+
+[eviction]
+strategy = "lru"          # lru | priority
+defrag_after_cycles = 50
+
+[[neurons]]
+name = "beast"
+endpoint = "http://beast.internal:13131"
+```
+
+### models.toml (catalogue)
+```toml
+[[models]]
+id = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
+harness = "candle"
+quant = "Q4_K_M"
+vram_mb = 19000
+min_devices = 2
+min_device_vram_mb = 10000
+pinned_on = ["beast"]       # optional: never evict from these neurons
+```
+
+### neuron.toml (per-host)
+Configured via figment + env override. See `neuron.example.toml` for reference.
+
+## neuron API Endpoints
+
+```
+GET  /discovery        → hardware discovery (hostname, OS, CUDA, devices, harnesses)
+GET  /health           → runtime GPU stats (VRAM, utilization, temperature)
+GET  /models           → loaded/unloaded models with VRAM usage
+POST /models/load      → load a model with spec (quant, TP, devices)
+POST /models/unload    → unload a model, freeing device memory
+GET  /models/{id}/endpoint → inference URL for a model
+GET  /version          → build metadata (SHA, features, candle version, etc.)
+```
+
+## Sources of Truth
+
+When prose documentation conflicts with code, trust:
+1. Executable configuration (`*.toml`, `Cargo.toml` features)
+2. Type definitions in `cortex-core/`
+3. Test files in `crates/*/tests/` and `*/src/**/*_test.rs`
+4. `CLAUDE.md` for historical design rationale
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,16 +1,26 @@
-# CLAUDE.md — cortex
+# CLAUDE.md — helexa

 ## Project overview

-cortex is a Rust reverse-proxy that sits in front of multiple
-mistral.rs inference nodes and presents a unified OpenAI + Anthropic
-compatible API surface. It handles model routing, lifecycle management
-(load/unload/evict), request translation, and metrics collection.
+helexa is a self-hosted LLM serving stack for multi-node GPU inference
+clusters. It has two components:
+
+- **cortex** — the per-operator control plane and LLM proxy. A Rust
+  reverse-proxy that sits in front of the fleet and presents a unified
+  OpenAI + Anthropic compatible API surface. It handles model routing,
+  lifecycle management (load/unload/evict), request translation, and
+  metrics collection.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU
+  host, serving candle-based in-process inference and managing local
+  hardware discovery and model lifecycle.
+
+(Historical note: cortex originally proxied to mistral.rs nodes; neuron
+replaced that — see the 2026-05-18 candle-native addendum below.)

 ## Repository layout

 ```
-cortex/
+helexa/
 ├── Cargo.toml              # workspace root
 ├── cortex.toml      # example gateway config
 ├── README.md
@@ -84,6 +94,63 @@ Per-request: model, node, prompt_tokens, completion_tokens, total_tokens,
 tok_per_sec, time_to_first_token_ms, total_latency_ms.
 Exposed as Prometheus histograms/counters on a separate port.

+### Per-device worker thread (neuron)
+The neuron daemon dedicates one OS thread per CUDA device it loads
+onto. That thread binds the device's `CudaContext` once at startup and
+owns it for the daemon's lifetime; every model load, forward step,
+KV-cache reset, VRAM query, NCCL init/sanity, NCCL all_reduce, and
+model drop on that device routes through this thread via a
+`std::sync::mpsc` job channel. Replies cross back via
+`tokio::sync::oneshot`.
+
+Three properties this gives us, in order of weight:
+
+1. **Context locality.** cudarc binds the CUDA context per OS thread
+   via `cuCtxSetCurrent`. Before this refactor, ad-hoc
+   `tokio::task::spawn_blocking` calls bound the context onto a
+   different thread per request — and `device_vram_mb()` from an
+   async task bound it onto whichever tokio worker happened to be
+   running. Pinning the context to one named thread ends that.
+2. **Drop safety.** Every `CudaSlice` in a `Tensor`, every
+   `cudarc::nccl::Comm`, and the `CudaContext` itself call `cuMemFree` /
+   `ncclCommDestroy` / `cuCtxDestroy` during `Drop` — and require the
+   right context current. With the worker owning the model slab,
+   `Drop` always runs on the right thread. The cudarc Drop constraint
+   is structurally enforced.
+3. **Poisoning blast radius.** When a CUDA driver error makes the
+   context unrecoverable, the poison flag lives on the
+   `DeviceWorkerHandle` itself. Subsequent `submit()` calls fast-reject
+   at the channel boundary with a clear "device worker is poisoned"
+   error before any further CUDA work is attempted. The thread doesn't
+   exit (dropping the slab would re-touch the broken context) — it
+   enters a drain-only mode and replies error to everything until the
+   daemon restarts.
+
+Tensors never escape the worker thread alive. Inference replies carry
+`Vec<f32>` CPU-side logits; the async caller wraps them in a CPU
+candle tensor and runs `apply_repeat_penalty` + `LogitsProcessor::sample`
+without ever rebinding the device context. Sampled tokens come back as
+`u32`; VRAM queries as `(u64, u64)`. The opaque `ArchHandle(u64)` and
+`TpHandle(u64)` are the only "references" callers hold to loaded
+models — they're indices into the worker's state slab, not pointers.
+
+The TP worker subprocesses in `harness/tp/worker.rs` are the same
+pattern out-of-process — a dedicated context-owning process per
+non-zero NCCL rank. The in-process worker in `harness/device_worker/`
+brings the discipline to rank 0.
+
+CPU loads (`Device::Cpu` fallback when CUDA is unavailable) keep the
+legacy `tokio::task::spawn_blocking + Arc<Mutex<ModelArch>>` path —
+there's no context to own and the channel hop would only add latency.
+Four `spawn_blocking` references in `harness/candle.rs` are deliberate
+CPU fallback.
+
+Canonical narrative lives in
+`crates/neuron/src/harness/device_worker/mod.rs`'s module
+doc-comment; touch points (the `Job` enum, the dispatch handlers, the
+`DeviceWorkerState` struct) are in the sibling `jobs.rs` and
+`dispatch.rs`.
+
 ## Tech stack

 - **Rust 2024 edition** — workspace with 4 crates
@@ -125,7 +192,8 @@ automatically. Clippy warnings must be resolved, not suppressed with
  - One or more GPU nodes running mistral.rs on port 8080
  - Optionally a metrics-only node (no GPU) for Prometheus/Grafana
 - Each node runs `mistralrs serve` on port 8080
- Gateway listens on port 8000 (API) and 9100 (metrics)
+- Gateway listens on port 31313 (API) and 31314 (metrics)
+- neuron listens on port 13131 on each GPU host
 - TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard

 ## Conventions
@@ -380,7 +448,7 @@ processes (one process per loaded model, each on its own port).

 ## neuron API

-neuron exposes an HTTP API on port 9090 that cortex polls and calls.
+neuron exposes an HTTP API on port 13131 that cortex polls and calls.

 ```
 GET  /discovery
@@ -424,8 +492,8 @@ endpoint. cortex.toml shrinks to:

 ```toml
 [gateway]
-listen = "0.0.0.0:8000"
-metrics_listen = "0.0.0.0:9100"
+listen = "0.0.0.0:31313"
+metrics_listen = "0.0.0.0:31314"

 [eviction]
 strategy = "lru"
@@ -433,15 +501,15 @@ defrag_after_cycles = 50

 [[neurons]]
 name = "beast"
-endpoint = "http://beast.hanzalova.internal:9090"
+endpoint = "http://beast.hanzalova.internal:13131"

 [[neurons]]
 name = "benjy"
-endpoint = "http://benjy.kosherinata.internal:9090"
+endpoint = "http://benjy.hanzalova.internal:13131"

 [[neurons]]
 name = "quadbrat"
-endpoint = "http://quadbrat.hanzalova.internal:9090"
+endpoint = "http://quadbrat.hanzalova.internal:13131"
 ```

 On startup and periodically, cortex calls `GET /discovery` and
@@ -490,7 +558,7 @@ and the hardcoded `vram_mb` per node.
 ## Revised repository layout

 ```
-cortex/
+helexa/
 ├── Cargo.toml
 ├── cortex.toml                 # gateway config (neurons only)
 ├── models.toml                 # model catalogue
@@ -521,7 +589,7 @@ cortex/
 │   │       └── metrics.rs      # prometheus exporter (unchanged)
 │   ├── neuron/                 # node plane (replaces cortex-agent)
 │   │   └── src/
-│   │       ├── main.rs         # binary entrypoint, axum server on :9090
+│   │       ├── main.rs         # binary entrypoint, axum server on :13131
 │   │       ├── discovery.rs    # nvidia-smi, device enumeration
 │   │       ├── health.rs       # runtime GPU polling
 │   │       ├── api.rs          # HTTP handlers for /discovery, /models, etc.
@@ -595,70 +663,140 @@ placement matching can be added incrementally.
 Completed. Both packages have RPM specs, systemd units, and example configs.
 CI builds parallel SRPMs on tag push and publishes to separate COPR repos.

- `cortex.spec` → `helexa/cortex` COPR: binary, systemd unit, config files
- `neuron.spec` → `helexa/neuron` COPR: binary, systemd unit, config
+- `cortex.spec` — installs the `cortex` binary. Package name keeps the
+  short `cortex` because no Fedora package collides with it.
+- `helexa-neuron.spec` — installs the `neuron` binary under package name
+  `helexa-neuron`. Renamed from bare `neuron` to avoid collision with
+  Fedora's NEURON neural-simulation package
+  (https://src.fedoraproject.org/rpms/neuron); binary, systemd unit,
+  system user, and config dir all stay named `neuron` since those are
+  project-local contexts.
 - `data/cortex.service`, `data/neuron.service` — systemd units
 - `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR publish
+- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR
+  publish to a single project `helexa/helexa` hosting both packages.

 Install:
 ```sh
-dnf copr enable helexa/cortex && dnf install cortex    # gateway host
-dnf copr enable helexa/neuron && dnf install neuron    # GPU nodes
+dnf copr enable helexa/helexa
+dnf install cortex                # gateway host
+dnf install helexa-neuron         # GPU nodes
 ```

-### Phase 11: llama.cpp harness stub
+## 2026-05-18 addendum: candle-native pivot

-**Goal:** Prove the harness abstraction works with a second engine.
+Phases 11 (llama.cpp harness) and 12 (mistral.rs COPR) below are
+**superseded**. The project no longer treats mistral.rs or llama.cpp as
+dependencies — both are conceptually out of scope. neuron becomes a
+candle-native inference daemon, with `Harness` retained as an
+internal seam for adding future engines (vision/audio/diffusion) but
+its only implementation being in-process candle.

-**Steps:**
-1. `crates/neuron/src/harness/llamacpp.rs` — implement the `Harness`
-   trait for llama.cpp's `llama-server`.
-   - `start()` — launch `llama-server` with the correct model path,
-     `--port`, `--n-gpu-layers`, `--tensor-split` args. Track the
-     child process.
-   - `stop()` — send SIGTERM to the child process.
-   - `list_models()` — llama-server serves one model per process, so
-     return a single-element list.
-   - `load_model()` — start a new llama-server process for this model.
-   - `unload_model()` — stop the process.
-   - `inference_endpoint()` — return `http://localhost:{assigned_port}`.
-2. Port allocation: neuron assigns ports from a range (e.g. 8100-8199)
-   to llama-server instances.
-3. Register in `HarnessRegistry` when configured:
-   ```toml
-   [[harnesses]]
-   name = "llamacpp"
-   binary = "/usr/local/bin/llama-server"
-   port_range = [8100, 8199]
-   ```
-4. Tests: mock llama-server (simple HTTP server returning canned
-   responses), test load/unload/endpoint lifecycle.
+The full staged plan for this pivot lives at
+`~/.claude/plans/create-a-more-aggressive-calm-naur.md`. Summary:

-**Done when:** A model with `harness = "llamacpp"` in `models.toml` can
-be loaded and served through cortex. Tests pass with mock llama-server.
+- **Stage 1 (this commit):** delete `mistralrs.rs` and `llamacpp.rs`,
+  scaffold inert `CandleHarness`, drop `endpoint`/`systemd_unit` from
+  `HarnessConfig`, default no-op `start`/`stop` on the `Harness` trait.
+- **Stages 2–4:** wire up candle model load/unload (quantized Qwen3
+  first), add OpenAI-compatible inference endpoint in neuron, then SSE
+  streaming.
+- **Stages 5–6:** load-on-activation (default models in config) and
+  unload-on-deactivation (graceful shutdown).
+- **Stages 7–8:** multi-GPU tensor parallelism and broader model/quant
+  coverage.

-### Phase 12 (lower priority): mistral.rs COPR packaging
+Sections of this document that describe mistral.rs HTTP behaviour
+("mistral.rs API gotchas") are retained as historical context for
+Phases 1–10 — they document what was true while the project depended
+on mistral.rs. They do not describe current behaviour.

-**Goal:** Fedora RPMs for mistral.rs built against specific CUDA versions.
+---

-**Steps:**
-1. `mistralrs-cuda.spec` — RPM spec that clones a pinned mistral.rs git
-   tag, builds with `--features cuda`, links against the system CUDA
-   toolkit. Produces `mistralrs-cuda13-server` (CUDA 13.x / sm_120) and
-   `mistralrs-cuda12-server` (CUDA 12.x / sm_89). Install binary to
-   `/usr/local/bin/mistralrs`.
-2. COPR build config: enable the NVIDIA CUDA repo as a build dependency.
-   Pin the CUDA toolkit version in `BuildRequires`.
-3. Gitea Actions or manual workflow: bump the mistral.rs tag in the spec,
-   trigger COPR rebuild.
-4. neuron's mistralrs harness config references which binary/package
-   provides the mistral.rs binary. neuron could warn at startup if the
-   installed mistral.rs CUDA version doesn't match the discovered driver.
+### Phase 11 (superseded): llama.cpp harness stub

-**Done when:** `dnf install mistralrs-cuda13-server` on beast provides a
-working `mistralrs` binary built for Blackwell GPUs. `dnf install
-mistralrs-cuda12-server` on benjy provides one built for Ada GPUs.
+~~Originally planned as a second engine to prove the harness
+abstraction.~~ Replaced by the candle harness work in the 2026-05-18
+addendum above. llama.cpp's any-model/any-hardware breadth is no
+longer in scope for helexa.

-This is a separate repo/spec — not part of the cortex workspace — but
-tightly coupled operationally. Track it as a sibling project.
+### Phase 12 (superseded): mistral.rs COPR packaging
+
+~~Originally planned to ship CUDA-versioned mistral.rs RPMs.~~ Replaced
+by the candle harness work in the 2026-05-18 addendum above. With
+mistral.rs out of the dependency tree, there is nothing to package.
+
+## 2026-05-27 addendum: per-device worker thread
+
+Replaced the ad-hoc `tokio::task::spawn_blocking` pattern that drove
+every leader-side CUDA op with one dedicated OS thread per CUDA device,
+permanently bound to that device's `CudaContext`. All leader-side
+inference work (GGUF + dense + TP shard load, forward, kv-cache clear,
+NCCL init/sanity, NCCL all_reduce, VRAM query, model drop) routes
+through the worker via a `std::sync::mpsc` channel; tensors never
+escape the worker thread alive. See "Per-device worker thread (neuron)"
+above and `crates/neuron/src/harness/device_worker/mod.rs` for the
+canonical narrative.
+
+Motivated by the 2026-05-26 silent-hang on beast: a CUDA OOM cascade
+poisoned the device context on whichever spawn_blocking thread caught
+it, and subsequent requests stalled invisibly on the pool lock. After
+the refactor, the same failure mode shows up in journalctl as
+`prefill sample failed; logits unhealthy nan: 248320/248320` followed
+by `failed, model marked poisoned`. The thread stays alive and rejects
+subsequent requests at the channel boundary.
+
+Landed in four PRs:
+
+- **Phase 1** (`081b532`) — device_worker module + 8 VRAM-query sites
+  route through the worker. CPU build only; smoke on beast confirmed
+  a persistent `cuda-dev-0` thread.
+- **Phase 2** (`b179204`) — single-GPU forward + clear_kv + drop via
+  the worker. `LoadedModel.arch_handle: Option<ArchHandle>` replaces
+  `Arc<Mutex<ModelArch>>` for CUDA loads. CPU keeps the legacy path.
+- **Phase 3** (`76ab24d`) — TP forward + NCCL init/sanity + leader
+  KV-clear routed through the worker. `WorkerPool.leader_nccl` moves
+  into the worker's state. `TpLoadedModel.leader_handle: TpHandle`
+  replaces `Arc<Mutex<TpLeaderModel>>`. CUDA-only TP smoke deferred to
+  next deploy.
+- **Phase 4** (`b4f3576`) — GGUF + dense + TP shard loads move onto
+  the worker. The `Job::TransferIn` / `Job::CloneLeaderComm` bridges
+  from Phases 2/3 deleted; `SendComm` newtype no longer needed in the
+  load path. `grep -rn spawn_blocking crates/neuron/src/harness/`
+  returns only deliberate CPU-fallback hits after this PR.
+
+## 2026-06-13 addendum: build metadata + helexa-bench
+
+Two coupled additions so fleet performance can be tracked automatically
+across neuron updates instead of by hand-running `script/bench.py` and
+editing `doc/benchmarks.md`.
+
+**neuron build metadata + `GET /version`.** neuron's `build.rs` now also
+captures build identity (`HELEXA_GIT_SHA` — preferring a CI/RPM-injected
+`HELEXA_BUILD_SHA`, falling back to git, else `unknown` — plus dirty
+flag, build timestamp, rustc version, profile, enabled cargo features,
+and a best-effort `candle-core` version from `Cargo.lock`). These are
+exposed as `cortex_core::build_info::BuildInfo` (new module) from a new
+`GET /version` endpoint (`neuron/src/version.rs`, wired in `api.rs`) and
+in clap's `--version` long form. The SHA is injected in CI
+(`build-prerelease.yml` build-neuron step: `export HELEXA_BUILD_SHA=$(git
+rev-parse HEAD)`) and via `--define helexa_commit` in the source-build
+spec, so tarball-built RPMs report the real SHA. `/version` is now the
+canonical "which build is live" probe (supersedes the per-host RPM-sha
+check in the fleet-validation flow).
+
+**`crates/helexa-bench`** — a new binary: a continuous, version-aware
+benchmark harness (one systemd unit, typically on the metrics host). It
+hits each neuron **directly** on `:13131`, exercises each **warm**
+(`status == "loaded"`) model with an extensible `Scenario` suite (phase
+1: the chat-latency family ported verbatim from `bench.py` — synthetic
+128/4096-tok prompts, `/no_think`, streamed TTFT + decode-window
+tok/s), and records each run into a SQLite system-of-record stamped with
+the neuron's full `BuildInfo`. The loop is **version-aware**: it skips
+any (target, build SHA, model, scenario) cell already at
+`samples_per_version`, so a steady fleet costs only cheap `/version` +
+`/models` polls until a new SHA ships. `helexa-bench report` regenerates
+the `benchmarks.md`-style table from the DB. `kind = "openai"` targets
+(mistral.rs/llama.cpp comparison) are scaffolded but not yet wired.
+Packaged as the `helexa-bench` RPM (prebuilt-binary spec, outbound-only
+so no firewalld service) via the same `build-prerelease.yml` pipeline.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,13 +5,15 @@ members = [
    "crates/cortex-gateway",
    "crates/cortex-cli",
    "crates/neuron",
+    "crates/helexa-acp",
+    "crates/helexa-bench",
 ]

 [workspace.package]
-version = "0.1.2"
+version = "0.1.16"
 edition = "2024"
 license = "GPL-3.0-or-later"
-repository = "https://git.lair.cafe/helexa/cortex"
+repository = "https://git.lair.cafe/helexa/helexa"

 [workspace.dependencies]
 # async runtime
@@ -27,7 +29,7 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 toml = "0.8"

-# http client (for proxying to mistralrs backends)
+# http client (for proxying to neuron backends)
 reqwest = { version = "0.12", features = ["json", "stream"] }

 # observability
@@ -60,3 +62,12 @@ eventsource-stream = "0.2"
 # workspace crates
 cortex-core = { path = "crates/cortex-core" }
 cortex-gateway = { path = "crates/cortex-gateway" }
+
+# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
+# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
+# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
+# hang-recovery (abort a wedged collective from another thread, then
+# rebuild the comm). Pinned to a fork revision pending upstream review
+# (grenade/cudarc @ nccl-comm-abort).
+[patch.crates-io]
+cudarc = { git = "https://github.com/grenade/cudarc", rev = "63327a256059f8252641ae46c6bb9eefe707f382" }
--- a/README.md
+++ b/README.md
@@ -1,24 +1,68 @@
-# cortex
+# helexa

-A Rust reverse-proxy and fleet management layer for multi-node
-[mistral.rs](https://github.com/EricLBuehler/mistral.rs) inference clusters.
+**Near-frontier AI for mortals.**

-## Problem
+helexa is a self-hosted LLM serving stack, written in Rust, for people
+who run open-weight models on their own consumer GPUs. It has two
+components:

-Running local LLMs across multiple GPU nodes (different VRAM tiers, different
-model affinities) requires a unified API surface that:
+- **cortex** — the per-operator control plane and LLM proxy. It sits in
+  front of your GPU fleet and presents a unified OpenAI + Anthropic
+  compatible API surface, handling model routing, lifecycle management
+  (load / unload / evict), request translation, and metrics.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU
+  host, serving candle-based in-process inference and managing local
+  hardware discovery and model lifecycle.

- Presents a **single `/v1/models` catalogue** merging every model across every
-  node.
- **Routes requests** to the correct node based on where a model is loaded (or
-  *can* be loaded).
- Manages **model lifecycle** — unload cold models, reload on demand, pin
-  critical ones — using the mistral.rs
-  `/v1/models/{unload,reload,status}` HTTP API (PR #1828+).
- Translates between **OpenAI and Anthropic** request/response envelopes so
-  every client in the homelab speaks whichever dialect it prefers.
- Captures **per-request metrics** (tokens, tok/s, TTFT, latency) and exposes
-  them as Prometheus counters/histograms.
+## Why
+
+Two principles constrain everything in this repository:
+
+1. **Frontier or close to it.** helexa serves the open-weight models
+   that get nearest to frontier capability — not every architecture
+   ever published.
+2. **Consumer hardware.** Everything must run on the cards mortals can
+   actually buy: a 3060 here, a 4090 there, a 5090 if you got lucky.
+   Mixed VRAM tiers across mismatched boxes are the expected topology,
+   not a degraded case.
+
+GPU acquisition is harder than it was a year ago, and the gap between
+what cloud providers charge and what your own silicon costs keeps
+widening. The intersection of those two principles — near-frontier
+models, squeezed onto hardware you own — is helexa's entire niche.
+
+The secondary objective is **predictable consumption**. If you own the
+hardware, your tooling shouldn't break because a cloud provider changed
+billing, deprecated a model, or reshaped an API. cortex's OpenAI and
+Anthropic surfaces are a stability contract: point your editor, agent,
+or CLI at it once, and it keeps working.
+
+## What helexa is not
+
+This is an intentionally different path from vLLM, SGLang, and peers —
+not a smaller version of them. Out of scope, permanently:
+
+- Any-model breadth. Architectures are ported because they're at or
+  near the frontier, not to complete a compatibility matrix.
+- Datacenter-class scheduling. No sophisticated continuous-batching /
+  paged-attention machinery — the workload is a handful of operators
+  and their agents, not 200 QPS.
+- Wrapping external inference engines. neuron builds directly on
+  [candle](https://github.com/huggingface/candle); every model
+  architecture it serves is implemented in this repository, ported
+  against the HuggingFace reference.
+
+One thing that is *not* a principle: CUDA exclusivity. All high-end
+consumer hardware is in scope. helexa is CUDA-only today because
+that's the hardware on the bench — nothing ships untested — and ROCm
+or other consumer accelerators join as soon as there's real hardware
+to build against.
+
+In scope, and where the engineering effort goes: aggressive
+quantization (GGUF Q4_K_M / Q6_K / Q8_0), NCCL tensor parallelism
+across heterogeneous consumer GPUs, careful CUDA failure handling, and
+single-request latency — the performance that one operator at a
+keyboard actually feels.

 ## Architecture

@@ -28,102 +72,119 @@ model affinities) requires a unified API surface that:
 └──────┬───────┘  └─────┬────┘  └──────┬─────┘  └──────┬─────┘
       │                │              │               │
       └────────────────┴──────┬───────┴───────────────┘
-                               │
+                               │  OpenAI + Anthropic APIs
                    ┌──────────▼──────────┐
-                    │   cortex     │
-                    │   (cortex-gateway)      │
+                    │      cortex         │
+                    │  (cortex-gateway)   │
                    │                     │
                    │  Router · Metrics   │
                    │  Evictor · Translate│
                    └──┬──────┬────────┬──┘
                       │      │        │
            ┌──────────▼┐  ┌──▼─────┐  ┌▼──────────┐
-            │ gpu-large │  │gpu-med │  │ gpu-small │
-            │ mistralrs │  │mistral │  │ mistralrs │
-            │ serve     │  │rs serve│  │ serve     │
-            │ :8080     │  │ :8080  │  │  :8080    │
+            │  neuron   │  │ neuron │  │  neuron   │
+            │  :13131   │  │ :13131 │  │  :13131   │
+            │  candle   │  │ candle │  │  candle   │
            └───────────┘  └────────┘  └───────────┘
                  private network (.internal)
 ```

+cortex discovers each neuron's hardware (devices, VRAM, compute
+capability) at runtime and matches it against a model catalogue
+(`models.toml`) to decide placement: which models fit where, what to
+evict when VRAM is tight, where to route a request right now. Adding a
+GPU host to the fleet is one `[[neurons]]` entry — no device specs in
+config.
+
 ### Crates

 | Crate | Purpose |
 |---|---|
-| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic request/response envelopes |
-| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, metrics exporter |
-| `cortex-agent` | Per-node sidecar: polls local mistralrs, reports to gateway, handles restart/defrag |
+| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic envelopes, harness trait, discovery types |
+| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, poller, metrics exporter |
+| `neuron` | Per-host daemon: GPU discovery, in-process candle inference, NCCL tensor parallelism, model lifecycle API |
 | `cortex-cli` | CLI entrypoint (`cortex serve`, `cortex status`, etc.) |
+| `helexa-acp` | Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default |

-## Node setup
+## The engine

-Each GPU node runs `mistralrs serve` with a multi-model config. Models are
-declared but start **unloaded** — mistral.rs lazy-loads on first request and
-the gateway can explicitly unload/reload via the HTTP API.
+neuron runs inference in-process on candle — there is no external
+inference server to babysit. The parts that earn their keep:

-Example node systemd unit:
+- **Per-device worker threads.** Every CUDA device gets one dedicated
+  OS thread that owns its CUDA context for the daemon's lifetime. All
+  loads, forward passes, KV-cache resets, NCCL collectives, VRAM
+  queries, and unloads route through it; tensors never escape it
+  alive. Context binding is pinned to a known thread, the CUDA `Drop`
+  contract is structurally safe, and a driver error poisons one worker
+  — visibly — instead of hanging the whole process.
+- **Tensor parallelism on consumer cards.** Megatron-style row/column
+  parallel layers with NCCL all-reduce, spanning the mismatched GPUs
+  you actually have. A step watchdog aborts wedged collectives instead
+  of letting a request hang forever.
+- **Current model focus: the Qwen3 family** — dense and GGUF-quantized,
+  including the hybrid linear-attention (Gated DeltaNet) generation.
+  Vision support is in progress. Each architecture is ported against
+  its HuggingFace reference implementation.

-```ini
-# /etc/systemd/system/mistralrs.service
-[Unit]
-Description=mistral.rs inference server
-After=network-online.target
-Wants=network-online.target
+See `CLAUDE.md` for design rationale and
+`crates/neuron/src/harness/device_worker/` for the worker narrative.

-[Service]
-Type=simple
-ExecStart=/usr/local/bin/mistralrs serve \
-    --from-config /etc/mistralrs/config.toml \
-    --port 8080
-Restart=on-failure
-RestartSec=5
-Environment=CUDA_VISIBLE_DEVICES=0,1
+## Install

-[Install]
-WantedBy=multi-user.target
+Pre-built RPMs for Fedora:
+
+```sh
+dnf copr enable helexa/helexa
+dnf install cortex            # on the gateway host
+dnf install helexa-neuron     # on each GPU host
+systemctl enable --now cortex   # or neuron, respectively
 ```

-## Gateway config
+## Configure

 ```toml
-# cortex.toml
+# /etc/cortex/cortex.toml
 [gateway]
-listen = "0.0.0.0:8000"
-metrics_listen = "0.0.0.0:9100"
+listen = "0.0.0.0:31313"
+metrics_listen = "0.0.0.0:31314"

 [eviction]
 strategy = "lru"        # lru | priority
 defrag_after_cycles = 50

-[[nodes]]
-name = "gpu-large"
-endpoint = "http://gpu-large.internal:8080"
-vram_mb = 49_152        # e.g. 2x RTX 4090
-pinned = ["your-org/large-model"]
+[[neurons]]
+name = "beast"
+endpoint = "http://beast.internal:13131"

-[[nodes]]
-name = "gpu-medium"
-endpoint = "http://gpu-medium.internal:8080"
-vram_mb = 24_576        # e.g. RTX 4090
-pinned = ["your-org/medium-model"]
-
-[[nodes]]
-name = "gpu-small"
-endpoint = "http://gpu-small.internal:8080"
-vram_mb = 12_288        # e.g. RTX 3060
-pinned = ["your-org/embedding-model"]
+[[neurons]]
+name = "benjy"
+endpoint = "http://benjy.internal:13131"
 ```

-## Building
+Model placement profiles (VRAM requirements, quant, device minimums,
+pinning) live in `models.toml` — see `models.example.toml`.
+
+## Run
+
+```sh
+# start the gateway
+cortex serve --config /etc/cortex/cortex.toml
+
+# check fleet status
+cortex status
+
+# one catalogue across every node
+curl http://localhost:31313/v1/models
+```
+
+## Build from source

 ```sh
 cargo build --release
 ```

-## CI
-
-Every push triggers format, lint, and test checks. Ensure these pass
-locally before pushing:
+CI runs on every push; keep it green locally:

 ```sh
 cargo fmt --check --all                    # must be clean
@@ -131,20 +192,18 @@ cargo clippy --workspace -- -D warnings   # warnings are errors
 cargo test --workspace                     # all tests must pass
 ```

-Tagged releases (`v*`) additionally build an SRPM and publish to COPR.
+Tagged releases (`v*`) build SRPMs for `cortex` and `helexa-neuron`
+and publish to COPR.

-## Running
+## Status

-```sh
-# start the gateway
-cortex serve --config cortex.toml
+Pre-1.0 and moving fast. The gateway path (routing, eviction,
+translation, metrics) is stable and tested; the candle-native engine
+is under active development — expect the supported-model list to track
+the open-weight frontier, deliberately narrowly.

-# check fleet status
-cortex status
-
-# list all models across nodes
-curl http://localhost:8000/v1/models
-```
+Development happens at <https://git.lair.cafe/helexa/helexa>;
+<https://github.com/helexa-ai/helexa> is a read-only mirror.

 ## License

--- a/asset/helexa-bench/bob.toml
+++ b/asset/helexa-bench/bob.toml
@@ -0,0 +1,38 @@
+# helexa-bench config for bob.hanzalova.internal.
+#
+# Synced to /etc/helexa-bench/helexa-bench.toml by script/infra-setup.sh
+# (the helexa-bench RPM ships helexa-bench.example.toml as a
+# %config(noreplace) default; this per-host file overrides it).
+#
+# bob is a client host (it also runs Agent Zero); helexa-bench here hits
+# every neuron on the fleet directly and records build-stamped results
+# into the local SQLite store.
+
+[bench]
+sweep_interval_secs = 1800
+samples_per_version = 5
+iteration_pause_secs = 2
+request_timeout_secs = 600
+db_path = "/var/lib/helexa-bench/bench.sqlite"
+
+[scenarios]
+prompt_sizes = [128, 4096]
+max_tokens = 256
+
+# Read-only JSON API consumed by the bench UI (hosted separately) and for
+# programmatic access. Served alongside the sweep loop.
+[api]
+enabled = true
+listen = "0.0.0.0:13132"
+
+[[targets]]
+name = "beast"
+endpoint = "http://beast.hanzalova.internal:13131"
+
+[[targets]]
+name = "benjy"
+endpoint = "http://benjy.hanzalova.internal:13131"
+
+[[targets]]
+name = "quadbrat"
+endpoint = "http://quadbrat.hanzalova.internal:13131"
--- a/asset/neuron/beast.toml
+++ b/asset/neuron/beast.toml
@@ -0,0 +1,24 @@
+# neuron.toml for beast.hanzalova.internal
+#
+# 2x RTX 5090 (32 GB each) — TP-2 capable. Pre-warms Qwen3.6-27B with
+# q6k ISQ across both GPUs at activation (see [[default_models]] below).
+# Same model and TP degree as the validate-neuron invocation
+# `validate-neuron.sh beast.hanzalova.internal Qwen/Qwen3.6-27B q5k 2`,
+# which passes q5k rather than the q6k pre-warmed here.
+#
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. Edits
+# take effect after the next deploy workflow run restarts the service
+# (default_models is read at activation).
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3.6-27B"
+harness = "candle"
+quant = "q6k"
+tensor_parallel = 2
+devices = [0, 1]
--- a/asset/neuron/benjy.toml
+++ b/asset/neuron/benjy.toml
@@ -0,0 +1,19 @@
+# neuron.toml for benjy.hanzalova.internal
+#
+# 1x RTX 4090 (24 GB) — largest single-GPU host on the fleet. Pre-warms
+# Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on
+# moderate-length contexts.
+#
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3-8B"
+harness = "candle"
+devices = [0]
--- a/asset/neuron/quadbrat.toml
+++ b/asset/neuron/quadbrat.toml
@@ -0,0 +1,19 @@
+# neuron.toml for quadbrat.hanzalova.internal
+#
+# 1x RTX 3060 (12 GB) — small / quantised tier. Pre-warms Qwen3-1.7B
+# (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small
+# model still have plenty of room.
+#
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3-1.7B"
+harness = "candle"
+devices = [0]
--- a/asset/nginx/bench.helexa.ai.bootstrap.conf
+++ b/asset/nginx/bench.helexa.ai.bootstrap.conf
@@ -0,0 +1,15 @@
+# Bootstrap vhost for bench.helexa.ai — http-only, used ONLY to obtain
+# the initial Let's Encrypt cert via the webroot challenge (the full TLS
+# vhost can't load before the cert file exists). script/infra-setup.sh
+# installs this, runs certbot, then swaps in bench.helexa.ai.conf.
+server {
+    listen 80;
+    server_name bench.helexa.ai;
+
+    location /.well-known/acme-challenge/ {
+        root /var/www/bench.helexa.ai;
+    }
+    location / {
+        try_files $uri $uri/ =404;
+    }
+}
--- a/asset/nginx/bench.helexa.ai.conf
+++ b/asset/nginx/bench.helexa.ai.conf
@@ -0,0 +1,56 @@
+# Public, auth-less bench UI at https://bench.helexa.ai.
+#
+# Serves the static SPA from /var/www/bench.helexa.ai (rsynced by
+# .gitea/workflows/deploy.yml's deploy-bench-ui job) and reverse-proxies
+# /api to the helexa-bench read API on bob over the WireGuard mesh — so
+# the browser stays same-origin (no CORS) and the internal API never
+# needs to be exposed publicly.
+#
+# TLS via Let's Encrypt; the cert is obtained/renewed by certbot
+# (bootstrapped one-time in script/infra-setup.sh). Mirrors the
+# dev.swym.hanzalova.internal vhost convention on this host.
+
+server {
+    listen 80;
+    server_name bench.helexa.ai;
+
+    # Keep serving the ACME webroot so certbot can renew.
+    location /.well-known/acme-challenge/ {
+        root /var/www/bench.helexa.ai;
+    }
+    location / {
+        return 301 https://$host$request_uri;
+    }
+}
+
+server {
+    listen 443 ssl;
+    http2 on;
+    server_name bench.helexa.ai;
+
+    ssl_certificate     /etc/letsencrypt/live/bench.helexa.ai/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/bench.helexa.ai/privkey.pem;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers HIGH:!aNULL:!MD5;
+    ssl_prefer_server_ciphers on;
+    ssl_session_cache shared:SSL:10m;
+
+    root /var/www/bench.helexa.ai;
+    index index.html;
+
+    # Bench read API on bob (internal WireGuard); browser stays same-origin.
+    location /api/ {
+        proxy_pass http://bob.hanzalova.internal:13132;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 60s;
+    }
+
+    # SPA fallback — client-side routes (/trends, /runs) resolve to index.html.
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+}
--- a/asset/nginx/bench.internal.conf
+++ b/asset/nginx/bench.internal.conf
@@ -0,0 +1,34 @@
+# Internal bench UI vhost — https://bench.internal, reachable from inside
+# the WireGuard mesh (the public bench.helexa.ai dead-ends at the OPNsense
+# LAN interface, which only port-forwards :443 from the WAN). Same SPA +
+# /api→bob proxy as bench.helexa.ai, but with an internal-CA cert
+# (smallstep "lair", renewed by step@bench.timer). Mirrors the
+# *.internal vhost convention on oolon.kosherinata.internal.
+server {
+    server_name bench.internal;
+    listen 443 ssl;
+    http2 on;
+
+    ssl_certificate /etc/nginx/tls/cert/bench.internal.pem;
+    ssl_certificate_key /etc/nginx/tls/key/bench.internal.pem;
+    ssl_trusted_certificate /etc/pki/ca-trust/source/anchors/root-internal.pem;
+    ssl_protocols TLSv1.3;
+
+    # Shared webroot with the public vhost — same built SPA.
+    root /var/www/bench.helexa.ai;
+    index index.html;
+
+    location /api/ {
+        proxy_pass http://bob.hanzalova.internal:13132;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 60s;
+    }
+
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+}
--- a/asset/sudoers.d/bench-host.conf
+++ b/asset/sudoers.d/bench-host.conf
@@ -0,0 +1,25 @@
+# Install on the bench host (bob) as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@bob to roll out helexa-bench package upgrades
+# and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+#
+# helexa-bench polls the neuron fleet (outbound) and serves a read-only
+# JSON API on tcp/13132 for the bench UI — hence the firewall-cmd grants.
+
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/helexa-bench/helexa-bench.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
--- a/asset/sudoers.d/cortex-host.conf
+++ b/asset/sudoers.d/cortex-host.conf
@@ -0,0 +1,23 @@
+# Install on the cortex gateway host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<gateway> to roll out cortex package upgrades
+# and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
+# deploy-bench-ui rsyncs the built bench SPA into the nginx webroot.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /var/www/bench.helexa.ai/
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
--- a/asset/sudoers.d/neuron-host.conf
+++ b/asset/sudoers.d/neuron-host.conf
@@ -0,0 +1,43 @@
+# Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<neuron-host> to roll out helexa-neuron-<flavour>
+# package upgrades and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+#
+# All three CUDA flavours are listed because a host's flavour can change
+# (e.g. GPU swap) and we don't want the sudoers file to need to change
+# in lockstep. Only one flavour can be installed at a time (the packages
+# Conflict: with each other), so the attack surface is bounded to "wrong
+# flavour installed" — vandalism, not privilege escalation.
+
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
+# deploy.yml writes the per-model systemd drop-in carrying
+# NEURON_MAX_PROMPT_TOKENS: gitea_ci stages it in its own dir, then
+# installs it root-owned. Exact source/dest paths; see doc/context-limits.md.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf /etc/systemd/system/neuron.service.d/model.conf
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
+# deploy-dev.yml fast path: install a freshly-built dev binary over the
+# packaged one. Exact source path + args; the workflow must use this
+# command form verbatim. The next deploy.yml run reconciles the host
+# back to the RPM-owned binary.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
--- a/asset/systemd/step@.service
+++ b/asset/systemd/step@.service
@@ -0,0 +1,20 @@
+# Internal-CA cert renewal for %i.internal, driven by step@%i.timer.
+# Replicated from oolon.kosherinata.internal (the kosherinata DC proxy).
+# Renews an EXISTING cert via mTLS (step ca renew) — the initial cert
+# must be issued once with a provisioner (see script/infra-setup.sh).
+# Installed to /etc/systemd/system/step@.service.
+[Unit]
+Description=step cert renew for %i.internal
+Documentation=https://smallstep.com/docs/step-ca/renewal
+
+[Service]
+Type=oneshot
+ExecCondition=/usr/bin/step certificate needs-renewal \
+    /etc/nginx/tls/cert/%i.internal.pem
+ExecStart=/usr/bin/step ca renew \
+    --force \
+    --ca-url https://ca.internal \
+    --root /etc/pki/ca-trust/source/anchors/root-internal.pem \
+    /etc/nginx/tls/cert/%i.internal.pem \
+    /etc/nginx/tls/key/%i.internal.pem
+ExecStartPost=/usr/bin/systemctl reload nginx.service
--- a/asset/systemd/step@.timer
+++ b/asset/systemd/step@.timer
@@ -0,0 +1,15 @@
+# Periodic internal-cert renewal for %i.internal (every 15 min, jittered).
+# Replicated from oolon.kosherinata.internal. Installed to
+# /etc/systemd/system/step@.timer; enable per-cert with
+# `systemctl enable --now step@bench.timer`.
+[Unit]
+Description=step cert renew timer for %i.internal
+
+[Timer]
+Persistent=true
+OnCalendar=*:1/15
+AccuracySec=1us
+RandomizedDelaySec=5m
+
+[Install]
+WantedBy=timers.target
--- a/bench/.gitignore
+++ b/bench/.gitignore
@@ -0,0 +1,3 @@
+node_modules
+dist
+*.local
--- a/bench/README.md
+++ b/bench/README.md
@@ -0,0 +1,45 @@
+# helexa bench UI
+
+A Vite + React (SWC, TypeScript) app that visualises the fleet benchmark
+data collected by `helexa-bench`. It reads the read-only JSON API the
+bench daemon serves (`crates/helexa-bench/src/api.rs`, default
+`:13132` on bob).
+
+Stack: React Router, react-bootstrap, Recharts.
+
+## Pages
+
+- **Overview** — latest median results per (host, model, scenario) cell.
+- **Trends** — decode-tok/s and TTFT plotted across neuron build SHAs as
+  releases roll out (the headline view). Pick host / model / scenario.
+- **Runs** — filterable raw-run explorer.
+
+## Develop
+
+```sh
+cd bench
+npm install
+npm run dev      # http://localhost:5173
+```
+
+`vite.config.ts` proxies `/api` → `http://bob.hanzalova.internal:13132`,
+so the dev server talks to the live bench API with no CORS fuss. Point
+the proxy elsewhere (or run a local `helexa-bench serve`) to develop
+against other data.
+
+## Production hosting
+
+Public at **https://bench.helexa.ai** — nginx on the gateway
+(`hanzalova.internal`) serves the static `dist/` and reverse-proxies
+`/api` to the bench API on bob over WireGuard, so the SPA is same-origin
+(no CORS) and the internal API stays off the public internet.
+
+- `npm run build` is run with **no** `VITE_API_BASE` (the app calls
+  `/api/...` on its own origin; nginx proxies it to bob).
+- `.gitea/workflows/deploy.yml` (`deploy-bench-ui`) builds and rsyncs
+  `dist/` to `/var/www/bench.helexa.ai` on every deploy.
+- The nginx vhost (`asset/nginx/bench.helexa.ai.conf`) and the
+  Let's Encrypt cert are one-time host setup in `script/infra-setup.sh`.
+
+To host elsewhere instead, build with
+`VITE_API_BASE=<bob-api-origin>` and serve the static `dist/`.
--- a/bench/index.html
+++ b/bench/index.html
@@ -0,0 +1,12 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>helexa bench</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
--- a/bench/package-lock.json
+++ b/bench/package-lock.json
--- a/bench/package.json
+++ b/bench/package.json
@@ -0,0 +1,28 @@
+{
+  "name": "helexa-bench-ui",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "description": "Visualisation app for helexa-bench fleet benchmark data.",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "bootstrap": "^5.3.3",
+    "react": "^18.3.1",
+    "react-bootstrap": "^2.10.5",
+    "react-dom": "^18.3.1",
+    "react-router-dom": "^6.26.2",
+    "recharts": "^2.12.7"
+  },
+  "devDependencies": {
+    "@types/node": "^20.14.0",
+    "@types/react": "^18.3.5",
+    "@types/react-dom": "^18.3.0",
+    "@vitejs/plugin-react-swc": "^3.7.0",
+    "typescript": "^5.5.4",
+    "vite": "^5.4.0"
+  }
+}
--- a/bench/src/App.tsx
+++ b/bench/src/App.tsx
@@ -0,0 +1,30 @@
+import { Container, Nav, Navbar } from "react-bootstrap";
+import { NavLink, Outlet } from "react-router-dom";
+
+/**
+ * Application shell: a dark top navbar linking to the three pages, with
+ * the matched child route rendered underneath through <Outlet />.
+ */
+export default function App() {
+  // Top-level pages in display order. Only the root link uses `end`, so it
+  // is not also marked active while a nested path such as /trends is open.
+  const pages = [
+    { to: "/", label: "Overview" },
+    { to: "/trends", label: "Trends" },
+    { to: "/runs", label: "Runs" },
+  ];
+
+  const navLinks = pages.map(({ to, label }) => (
+    <Nav.Link key={to} as={NavLink} to={to} end={to === "/"}>
+      {label}
+    </Nav.Link>
+  ));
+
+  return (
+    <>
+      <Navbar bg="dark" variant="dark" expand="md">
+        <Container>
+          <Navbar.Brand as={NavLink} to="/">
+            helexa&nbsp;bench
+          </Navbar.Brand>
+          <Nav className="me-auto">{navLinks}</Nav>
+        </Container>
+      </Navbar>
+      <Container className="py-4">
+        <Outlet />
+      </Container>
+    </>
+  );
+}
--- a/bench/src/api.ts
+++ b/bench/src/api.ts
@@ -0,0 +1,45 @@
+import type { Dimensions, ReportRow, RunRow, SeriesPoint } from "./types";
+
+// Empty default → `fetch('/api/...')` hits the dev proxy (vite.config.ts)
+// or the same origin. For a separately-hosted build, set VITE_API_BASE to
+// the bob API origin (e.g. http://bob.hanzalova.internal:13132).
+const BASE = import.meta.env.VITE_API_BASE ?? "";
+
+async function getJson<T>(path: string): Promise<T> {
+  const res = await fetch(`${BASE}${path}`);
+  if (!res.ok) {
+    throw new Error(`${res.status} ${res.statusText}: ${await res.text()}`);
+  }
+  return res.json() as Promise<T>;
+}
+
+export const getDimensions = () => getJson<Dimensions>("/api/dimensions");
+export const getSummary = () => getJson<ReportRow[]>("/api/summary");
+
+// host is resolved server-side (each model maps to one host today), so the
+// public UI selects by model + scenario alone.
+export const getSeries = (model: string, scenario: string) =>
+  getJson<SeriesPoint[]>(
+    `/api/series?model=${encodeURIComponent(model)}&scenario=${encodeURIComponent(scenario)}`,
+  );
+
+export interface RunsParams {
+  host?: string;
+  model?: string;
+  scenario?: string;
+  sha?: string;
+  ok?: boolean;
+  limit?: number;
+}
+
+export const getRuns = (p: RunsParams = {}) => { // GET /api/runs with only the filters that are set
+  const q = new URLSearchParams();
+  if (p.host) q.set("host", p.host); // empty string is treated as "no filter"
+  if (p.model) q.set("model", p.model);
+  if (p.scenario) q.set("scenario", p.scenario);
+  if (p.sha) q.set("sha", p.sha);
+  if (p.ok !== undefined) q.set("ok", String(p.ok)); // false is a real filter value, so test against undefined
+  if (p.limit) q.set("limit", String(p.limit)); // NOTE(review): a limit of 0 is dropped here — confirm intended
+  const qs = q.toString();
+  return getJson<RunRow[]>(`/api/runs${qs ? `?${qs}` : ""}`); // no "?" when nothing is set
+};
--- a/bench/src/baseline.ts
+++ b/bench/src/baseline.ts
@@ -0,0 +1,52 @@
+// Pre-helexa-bench baseline, transcribed verbatim from doc/benchmarks.md.
+//
+// IMPORTANT — different measurement regime. These were measured by
+// script/bench.py *through the cortex gateway* (so TTFT/total include a
+// proxy hop), reported as medians only, before helexa-bench existed.
+// helexa-bench measures each neuron *directly*. So these points are an
+// honest historical anchor, NOT apples-to-apples with the live series —
+// the Trends view renders them dashed + labelled, never merged into the
+// live line.
+//
+// Host is inferred from the model via the doc's Fleet table
+// (beast=27B, benjy=8B, quadbrat=1.7B). Timestamps are the two 2026-06-12
+// snapshots in the doc, ordered (08:00 = pre-#11, 16:00 = post-#11) so
+// they sort before the bench era on the shared time axis.
+
+export interface BaselinePoint {
+  host: string;
+  model: string;
+  scenario: string;
+  git_sha: string;
+  build_timestamp: string;
+  ttft_s: number;
+  decode_tps: number;
+  total_s: number;
+}
+
+/** Source: bench.py via cortex gateway — see doc/benchmarks.md. */
+export const BASELINE_SOURCE = "bench.py · via cortex gateway";
+
+export const BASELINE: BaselinePoint[] = [
+  // ── 8f6f1d3 — baseline (2026-06-12) ────────────────────────────────
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.658, decode_tps: 35.0, total_s: 8.981 },
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 7.067, decode_tps: 33.7, total_s: 14.63 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.884, decode_tps: 62.4, total_s: 4.938 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.818, decode_tps: 46.5, total_s: 7.27 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.685, decode_tps: 81.3, total_s: 3.741 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 2.743, decode_tps: 35.4, total_s: 9.884 },
+  // ── a1952a4 — post prefix-KV-cache (#11, 2026-06-12) ───────────────
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.355, decode_tps: 45.8, total_s: 4.147 },
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.431, decode_tps: 43.3, total_s: 4.387 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.886, decode_tps: 78.6, total_s: 2.478 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.824, decode_tps: 58.3, total_s: 3.969 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.702, decode_tps: 104.8, total_s: 1.895 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 2.749, decode_tps: 44.9, total_s: 5.534 },
+];
+
+/** Baseline points for one (model, scenario) cell, oldest first. */
+export function baselineFor(model: string, scenario: string): BaselinePoint[] {
+  return BASELINE.filter(
+    (b) => b.model === model && b.scenario === scenario,
+  ).sort((a, b) => a.build_timestamp.localeCompare(b.build_timestamp));
+}
--- a/bench/src/main.tsx
+++ b/bench/src/main.tsx
@@ -0,0 +1,22 @@
+import React from "react";
+import ReactDOM from "react-dom/client";
+import { BrowserRouter, Route, Routes } from "react-router-dom";
+import "bootstrap/dist/css/bootstrap.min.css";
+import App from "./App";
+import Overview from "./pages/Overview";
+import Trends from "./pages/Trends";
+import Runs from "./pages/Runs";
+
+ReactDOM.createRoot(document.getElementById("root")!).render(
+  <React.StrictMode>
+    <BrowserRouter>
+      <Routes>
+        <Route path="/" element={<App />}>
+          <Route index element={<Overview />} />
+          <Route path="trends" element={<Trends />} />
+          <Route path="runs" element={<Runs />} />
+        </Route>
+      </Routes>
+    </BrowserRouter>
+  </React.StrictMode>,
+);
--- a/bench/src/pages/Overview.tsx
+++ b/bench/src/pages/Overview.tsx
@@ -0,0 +1,64 @@
+import { useEffect, useState } from "react";
+import { Alert, Spinner, Table } from "react-bootstrap";
+import { getSummary } from "../api";
+import type { ReportRow } from "../types";
+
+const f = (n: number | null, p = 2) => (n == null ? "—" : n.toFixed(p));
+
+export default function Overview() {
+  const [rows, setRows] = useState<ReportRow[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    getSummary()
+      .then(setRows)
+      .catch((e) => setErr(String(e)))
+      .finally(() => setLoading(false));
+  }, []);
+
+  if (loading) return <Spinner animation="border" />;
+  if (err) return <Alert variant="danger">{err}</Alert>;
+
+  return (
+    <>
+      <h3 className="mb-3">Latest results per cell</h3>
+      <p className="text-muted">
+        Median of each cell's samples on the most recent build seen for that
+        (host, model, scenario).
+      </p>
+      <Table striped bordered hover responsive size="sm">
+        <thead>
+          <tr>
+            <th>GPU</th>
+            <th>model</th>
+            <th className="text-end">prompt tok</th>
+            <th className="text-end">TTFT (s)</th>
+            <th className="text-end">decode tok/s</th>
+            <th className="text-end">total (s)</th>
+            <th>build</th>
+            <th className="text-end">n</th>
+          </tr>
+        </thead>
+        <tbody>
+          {rows.map((r, i) => (
+            <tr key={i}>
+              <td>{r.gpu ?? r.target_name}</td>
+              <td>{r.model_id}</td>
+              <td className="text-end">
+                {r.prompt_tokens ?? `~${r.prompt_size_approx}`}
+              </td>
+              <td className="text-end">{f(r.ttft_s_median, 3)}</td>
+              <td className="text-end">{f(r.decode_tps_median, 1)}</td>
+              <td className="text-end">{f(r.total_s_median, 3)}</td>
+              <td>
+                <code>{r.git_sha}</code>
+              </td>
+              <td className="text-end">{r.samples}</td>
+            </tr>
+          ))}
+        </tbody>
+      </Table>
+    </>
+  );
+}
--- a/bench/src/pages/Runs.tsx
+++ b/bench/src/pages/Runs.tsx
@@ -0,0 +1,157 @@
+import { useEffect, useState } from "react";
+import { Alert, Badge, Col, Form, Row, Spinner, Table } from "react-bootstrap";
+import { getDimensions, getRuns } from "../api";
+import type { Dimensions, RunRow } from "../types";
+
+// Fixed-precision formatter for table cells; null/undefined renders as "—".
+const f = (n: number | null, p = 2) => (n == null ? "—" : n.toFixed(p));
+
+/** Labelled <select> with a leading "(all)" option whose value is "". */
+function Picker({
+  label,
+  value,
+  set,
+  options,
+}: {
+  label: string;
+  value: string;
+  set: (v: string) => void;
+  options: string[];
+}) {
+  return (
+    <Form.Group as={Col}>
+      <Form.Label>{label}</Form.Label>
+      <Form.Select value={value} onChange={(e) => set(e.target.value)}>
+        <option value="">(all)</option>
+        {options.map((o) => (
+          <option key={o} value={o}>
+            {o}
+          </option>
+        ))}
+      </Form.Select>
+    </Form.Group>
+  );
+}
+
+/** Runs page: filter pickers plus a table of up to 200 matching runs. */
+export default function Runs() {
+  const [dims, setDims] = useState<Dimensions | null>(null);
+  const [host, setHost] = useState("");
+  const [model, setModel] = useState("");
+  const [scenario, setScenario] = useState("");
+  const [rows, setRows] = useState<RunRow[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+  const [loading, setLoading] = useState(false);
+
+  useEffect(() => {
+    getDimensions()
+      .then(setDims)
+      .catch((e) => setErr(String(e)));
+  }, []);
+
+  useEffect(() => {
+    // Filters can change while a request is in flight. The cleanup marks
+    // that request stale so a late response cannot overwrite newer rows or
+    // hide the spinner while the current request is still pending.
+    let stale = false;
+    setLoading(true);
+    getRuns({
+      host: host || undefined,
+      model: model || undefined,
+      scenario: scenario || undefined,
+      limit: 200,
+    })
+      .then((r) => {
+        if (!stale) setRows(r);
+      })
+      .catch((e) => {
+        if (!stale) setErr(String(e));
+      })
+      .finally(() => {
+        if (!stale) setLoading(false);
+      });
+    return () => {
+      stale = true;
+    };
+  }, [host, model, scenario]);
+
+  if (err) return <Alert variant="danger">{err}</Alert>;
+
+  return (
+    <>
+      <h3 className="mb-3">Runs</h3>
+      {dims && (
+        <Row className="g-3 mb-3">
+          {/* GPU filter — labelled by GPU, but filters by the underlying host. */}
+          <Form.Group as={Col}>
+            <Form.Label>GPU</Form.Label>
+            <Form.Select value={host} onChange={(e) => setHost(e.target.value)}>
+              <option value="">(all)</option>
+              {dims.hosts.map((h) => (
+                <option key={h} value={h}>
+                  {dims.host_gpus[h] ?? h}
+                </option>
+              ))}
+            </Form.Select>
+          </Form.Group>
+          <Picker
+            label="Model"
+            value={model}
+            set={setModel}
+            options={dims.models}
+          />
+          <Picker
+            label="Scenario"
+            value={scenario}
+            set={setScenario}
+            options={dims.scenarios}
+          />
+        </Row>
+      )}
+      {loading ? (
+        <Spinner animation="border" />
+      ) : (
+        <Table striped bordered hover responsive size="sm">
+          <thead>
+            <tr>
+              <th>ts</th>
+              <th>GPU</th>
+              <th>model</th>
+              <th>scenario</th>
+              <th>build</th>
+              <th className="text-end">TTFT</th>
+              <th className="text-end">tok/s</th>
+              <th className="text-end">total</th>
+              <th>ok</th>
+            </tr>
+          </thead>
+          <tbody>
+            {rows.map((r) => (
+              <tr key={r.id}>
+                <td>{r.ts}</td>
+                <td>{r.gpu ?? r.host}</td>
+                <td>{r.model_id}</td>
+                <td>{r.scenario_id}</td>
+                <td>
+                  <code>{r.git_sha}</code>
+                </td>
+                <td className="text-end">{f(r.ttft_s, 3)}</td>
+                <td className="text-end">{f(r.decode_tps, 1)}</td>
+                <td className="text-end">{f(r.total_s, 3)}</td>
+                <td>
+                  {r.ok ? (
+                    <Badge bg="success">ok</Badge>
+                  ) : (
+                    <Badge bg="danger" title={r.error ?? ""}>
+                      fail
+                    </Badge>
+                  )}
+                </td>
+              </tr>
+            ))}
+          </tbody>
+        </Table>
+      )}
+    </>
+  );
+}
--- a/bench/src/pages/Trends.tsx
+++ b/bench/src/pages/Trends.tsx
@@ -0,0 +1,232 @@
+import { useEffect, useMemo, useState } from "react";
+import { Alert, Col, Form, Row, Spinner } from "react-bootstrap";
+import {
+  CartesianGrid,
+  Legend,
+  Line,
+  LineChart,
+  ReferenceLine,
+  ResponsiveContainer,
+  Tooltip,
+  XAxis,
+  YAxis,
+} from "recharts";
+import { getDimensions, getSeries } from "../api";
+import type { Dimensions, SeriesPoint } from "../types";
+import { BASELINE_SOURCE, baselineFor } from "../baseline";
+
+/** Labelled <select> over the given options (no "(all)" entry). */
+function Picker({
+  label,
+  value,
+  set,
+  options,
+}: {
+  label: string;
+  value: string;
+  set: (v: string) => void;
+  options: string[];
+}) {
+  return (
+    <Form.Group as={Col}>
+      <Form.Label>{label}</Form.Label>
+      <Form.Select value={value} onChange={(e) => set(e.target.value)}>
+        {options.map((o) => (
+          <option key={o} value={o}>
+            {o}
+          </option>
+        ))}
+      </Form.Select>
+    </Form.Group>
+  );
+}
+
+/** Trends page: per-build decode tok/s and TTFT charts for one (model, scenario). */
+export default function Trends() {
+  const [dims, setDims] = useState<Dimensions | null>(null);
+  const [model, setModel] = useState("");
+  const [scenario, setScenario] = useState("");
+  const [series, setSeries] = useState<SeriesPoint[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+
+  useEffect(() => {
+    getDimensions()
+      .then((d) => {
+        setDims(d);
+        if (d.models[0]) setModel(d.models[0]);
+        if (d.scenarios[0]) setScenario(d.scenarios[0]);
+      })
+      .catch((e) => setErr(String(e)));
+  }, []);
+
+  useEffect(() => {
+    if (!model || !scenario) return;
+    // Ignore a response that arrives after the selection changed, so the
+    // chart never shows one cell's series under another cell's pickers.
+    let stale = false;
+    getSeries(model, scenario)
+      .then((s) => {
+        if (!stale) setSeries(s);
+      })
+      .catch((e) => {
+        if (!stale) setErr(String(e));
+      });
+    return () => {
+      stale = true;
+    };
+  }, [model, scenario]);
+
+  // Prepend the pre-helexa-bench baseline (dashed, separate keys) so it
+  // anchors the timeline without being merged into the live line. Different
+  // measurement regime — see baseline.ts / doc/benchmarks.md.
+  const base = useMemo(
+    () => baselineFor(model, scenario),
+    [model, scenario],
+  );
+  const data = useMemo(
+    () => [
+      ...base.map((p) => ({
+        label: p.git_sha,
+        baseTtft: p.ttft_s,
+        baseDecode: p.decode_tps,
+        baseTotal: p.total_s,
+      })),
+      ...series.map((p) => ({
+        label: p.git_sha,
+        ttft: p.ttft_s_median,
+        decode: p.decode_tps_median,
+        total: p.total_s_median,
+      })),
+    ],
+    [series, base],
+  );
+
+  // Divider marking the boundary between the two regimes (drawn at the
+  // first live build, with baseline points to its left).
+  const firstLive = series[0]?.git_sha;
+  const showDivider = base.length > 0 && series.length > 0;
+
+  if (err) return <Alert variant="danger">{err}</Alert>;
+  if (!dims) return <Spinner animation="border" />;
+
+  return (
+    <>
+      <h3 className="mb-3">Trends over builds</h3>
+      <Row className="g-3 mb-4">
+        <Picker
+          label="Model"
+          value={model}
+          set={setModel}
+          options={dims.models}
+        />
+        <Picker
+          label="Scenario"
+          value={scenario}
+          set={setScenario}
+          options={dims.scenarios}
+        />
+      </Row>
+
+      {dims.model_gpus[model] && (
+        <p className="text-muted mb-3">
+          Measured on <strong>{dims.model_gpus[model]}</strong>.
+        </p>
+      )}
+
+      {data.length === 0 ? (
+        <Alert variant="info">No data for this selection yet.</Alert>
+      ) : (
+        <>
+          {base.length > 0 && (
+            <p className="text-muted small mb-3">
+              Dashed = pre-helexa-bench baseline ({BASELINE_SOURCE}); solid =
+              helexa-bench (direct to neuron). Different measurement regimes —
+              see <code>doc/benchmarks.md</code>.
+            </p>
+          )}
+          <h5 className="mt-3">decode tok/s (higher is better)</h5>
+          <ResponsiveContainer width="100%" height={280}>
+            <LineChart data={data} margin={{ top: 8, right: 24, bottom: 8, left: 0 }}>
+              <CartesianGrid strokeDasharray="3 3" />
+              <XAxis dataKey="label" />
+              <YAxis />
+              <Tooltip />
+              <Legend />
+              {showDivider && firstLive && (
+                <ReferenceLine
+                  x={firstLive}
+                  stroke="#bbb"
+                  strokeDasharray="3 3"
+                  label={{
+                    value: "bench.py → helexa-bench",
+                    position: "top",
+                    fill: "#999",
+                    fontSize: 11,
+                  }}
+                />
+              )}
+              <Line
+                type="monotone"
+                dataKey="decode"
+                name="decode tok/s"
+                stroke="#0d6efd"
+                connectNulls
+              />
+              {base.length > 0 && (
+                <Line
+                  type="monotone"
+                  dataKey="baseDecode"
+                  name="baseline (bench.py · gateway)"
+                  stroke="#888"
+                  strokeDasharray="5 5"
+                  connectNulls
+                />
+              )}
+            </LineChart>
+          </ResponsiveContainer>
+
+          <h5 className="mt-4">TTFT seconds (lower is better)</h5>
+          <ResponsiveContainer width="100%" height={280}>
+            <LineChart data={data} margin={{ top: 8, right: 24, bottom: 8, left: 0 }}>
+              <CartesianGrid strokeDasharray="3 3" />
+              <XAxis dataKey="label" />
+              <YAxis />
+              <Tooltip />
+              <Legend />
+              {showDivider && firstLive && (
+                <ReferenceLine
+                  x={firstLive}
+                  stroke="#bbb"
+                  strokeDasharray="3 3"
+                  label={{
+                    value: "bench.py → helexa-bench",
+                    position: "top",
+                    fill: "#999",
+                    fontSize: 11,
+                  }}
+                />
+              )}
+              <Line
+                type="monotone"
+                dataKey="ttft"
+                name="TTFT (s)"
+                stroke="#dc3545"
+                connectNulls
+              />
+              {base.length > 0 && (
+                <Line
+                  type="monotone"
+                  dataKey="baseTtft"
+                  name="baseline (bench.py · gateway)"
+                  stroke="#888"
+                  strokeDasharray="5 5"
+                  connectNulls
+                />
+              )}
+            </LineChart>
+          </ResponsiveContainer>
+        </>
+      )}
+    </>
+  );
+}
--- a/bench/src/types.ts
+++ b/bench/src/types.ts
@@ -0,0 +1,69 @@
+// Mirrors the JSON served by helexa-bench's read API (crates/helexa-bench/src/api.rs).
+
+export interface BuildRef {
+  git_sha: string;
+  build_timestamp: string | null;
+  package_version: string | null;
+}
+
+export interface Dimensions {
+  hosts: string[];
+  models: string[];
+  scenarios: string[];
+  builds: BuildRef[];
+  /** host → GPU label, e.g. "2× RTX 5090". */
+  host_gpus: Record<string, string>;
+  /** model → GPU label (model maps to one host today). */
+  model_gpus: Record<string, string>;
+}
+
+/** Latest-SHA-per-cell medians (the report table). */
+export interface ReportRow {
+  target_name: string;
+  model_id: string;
+  scenario_id: string;
+  prompt_size_approx: number;
+  git_sha: string;
+  prompt_tokens: number | null;
+  ttft_s_median: number | null;
+  decode_tps_median: number | null;
+  total_s_median: number | null;
+  samples: number;
+  /** Public-facing resource name (the host's GPU(s)). */
+  gpu: string | null;
+}
+
+/** One point in a per-build time-series for a (host, model, scenario) cell. */
+export interface SeriesPoint {
+  git_sha: string;
+  build_timestamp: string | null;
+  package_version: string | null;
+  ttft_s_median: number | null;
+  decode_tps_median: number | null;
+  total_s_median: number | null;
+  samples: number;
+}
+
+export interface RunRow {
+  id: number;
+  ts: string;
+  host: string;
+  /** Public-facing resource name (the host's GPU(s)). */
+  gpu: string | null;
+  hostname: string | null;
+  git_sha: string;
+  build_timestamp: string | null;
+  package_version: string;
+  model_id: string;
+  harness: string;
+  scenario_id: string;
+  prompt_size_approx: number;
+  prompt_tokens_actual: number | null;
+  max_tokens: number;
+  ttft_s: number | null;
+  decode_tps: number | null;
+  total_s: number | null;
+  completion_tokens: number | null;
+  ok: boolean;
+  error: string | null;
+}
--- a/bench/src/vite-env.d.ts
+++ b/bench/src/vite-env.d.ts
@@ -0,0 +1,9 @@
+/// <reference types="vite/client" />
+
+interface ImportMetaEnv {
+  /** Base origin of the bench API. Empty → use the dev proxy / same origin. */
+  readonly VITE_API_BASE?: string;
+}
+interface ImportMeta {
+  readonly env: ImportMetaEnv;
+}
--- a/bench/tsconfig.json
+++ b/bench/tsconfig.json
@@ -0,0 +1,22 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "useDefineForClassFields": true,
+    "lib": ["ES2022", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "types": ["node", "vite/client"]
+  },
+  "include": ["src", "vite.config.ts"]
+}
--- a/bench/vite.config.ts
+++ b/bench/vite.config.ts
@@ -0,0 +1,18 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react-swc";
+
+// Dev server proxies /api to the bench API on bob so `fetch('/api/...')`
+// works without CORS/mixed-origin fuss during local development.
+// For a production build hosted elsewhere, set VITE_API_BASE to the bob
+// API origin (e.g. http://bob.hanzalova.internal:13132) instead.
+export default defineConfig({
+  plugins: [react()],
+  server: {
+    proxy: {
+      "/api": {
+        target: "http://bob.hanzalova.internal:13132",
+        changeOrigin: true,
+      },
+    },
+  },
+});
--- a/cortex.example.toml
+++ b/cortex.example.toml
@@ -3,22 +3,27 @@
 # Copy to cortex.toml and adjust for your environment.
 #
 # Environment variable overrides use CORTEX_ prefix with __ separators:
-#   CORTEX_GATEWAY__LISTEN=0.0.0.0:9000
+#   CORTEX_GATEWAY__LISTEN=0.0.0.0:31313
+
+# Path to the model catalogue (limits, cost, pinning, aliases, feasibility).
+# Defaults to the packaged location below; uncomment to override for a
+# non-packaged / local run.
+# models_config = "/etc/cortex/models.toml"

 [gateway]
-listen = "0.0.0.0:8000"
-metrics_listen = "0.0.0.0:9100"
+listen = "0.0.0.0:31313"
+metrics_listen = "0.0.0.0:31314"

 [eviction]
 strategy = "lru"
-# Restart mistralrs after this many load/unload cycles to defragment VRAM.
+# Restart neurons after this many load/unload cycles to defragment VRAM.
 # Set to 0 to disable.
 defrag_after_cycles = 50

 # -- Nodes ---------------------------------------------------------------
-# Each [[nodes]] entry declares a mistral.rs instance in the fleet.
-# Models are discovered by polling the node's /v1/models endpoint.
-# Pinned models are never evicted.
+# Each [[nodes]] entry declares a neuron daemon in the fleet.
+# Models are discovered by polling the neuron's /models endpoint.
+# Pinned models (see models.toml) are never evicted.

 [[nodes]]
 name = "gpu-large"
@@ -43,3 +48,45 @@ vram_mb = 12288           # e.g. RTX 3060 (12 GB)
 pinned = [
    "your-org/embedding-model",
 ]
+
+# -- Entitlements (multi-tenant governance, #47) -------------------------
+# Identity + per-key token budgets. Omit this section entirely for the
+# legacy single-operator behaviour: requests are anonymous and uncapped.
+#
+# The local/static provider below is the source of truth for accounts,
+# keys, and hard caps until the upstream clearing house exists. Identity
+# rides standard bearer auth only — clients send
+#   Authorization: Bearer <key>
+# no custom headers or body fields.
+
+[entitlements]
+# When true, a missing or unrecognized key gets 401 invalid_api_key. Leave
+# false (such requests are served anonymously) during rollout; flip to true once keys are issued.
+require_auth = false
+
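+# One entry per API key. Keys are bearer secrets: make this file non-world-readable (e.g. 0640 root:cortex) once real keys are added.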
+[[entitlements.keys]]
+key = "sk-example-rolling"        # the bearer token the client sends
+account_id = "team-research"      # billable account (keys may share one)
+key_id = "research-ci"            # stable label for ledger/metrics (optional)
+hard_cap = 5_000_000              # hard token cap over the window
+# Rolling window that resets — over-cap requests get 429 rate_limit_exceeded
+# + Retry-After, so well-behaved clients (opencode/AI SDK) back off and retry.
+window = { kind = "rolling", seconds = 3600 }
+
+[[entitlements.keys]]
+key = "sk-example-balance"
+account_id = "team-research"
+key_id = "research-prepaid"
+hard_cap = 20_000_000
+# Hard balance, no reset — exhaustion returns 429 insufficient_quota
+# (the client surfaces and stops). This is the default when `window` is
+# omitted. Never 402.
+window = { kind = "balance" }
+
+[[entitlements.keys]]
+key = "sk-example-infra"
+account_id = "operator"
+key_id = "infra"
+# No hard_cap → uncapped operator infra key (own fleet, own use). Still
+# metered for visibility.
--- a/cortex.spec
+++ b/cortex.spec
@@ -1,10 +1,10 @@
 Name:           cortex
-Version:        0.1.2
+Version:        0.1.16
 Release:        1%{?dist}
 Summary:        Inference gateway for multi-node GPU clusters

 License:        GPL-3.0-or-later
-URL:            https://git.lair.cafe/helexa/cortex
+URL:            https://git.lair.cafe/helexa/helexa
 Source0:        %{name}-%{version}.tar.gz
 Source1:        %{name}-%{version}-vendor.tar.gz

@@ -21,6 +21,16 @@ BuildRequires:  systemd-rpm-macros

 Requires(pre):  shadow-utils
 Requires:       systemd
+Requires:       firewalld-filesystem
+
+# systemd-rpm-macros ships a unit dep generator that parses User=/Group=
+# from our .service file and emits Requires: user(cortex)/group(cortex).
+# rpm's sysusers provides-generator emits the unversioned form for groups
+# but only a versioned user(cortex) = <base64> for users with GECOS/home/
+# shell. Provide the unversioned user(cortex) explicitly so dnf can resolve
+# the auto-generated Requires. Without this, dnf5 silently filters the
+# package and reports "Nothing to do".
+Provides:       user(cortex)

 %description
 Cortex is a Rust reverse-proxy that sits in front of multiple inference
@@ -47,9 +57,10 @@ cargo build --release -p cortex-cli
 install -Dm755 target/release/cortex %{buildroot}%{_bindir}/cortex
 install -Dm644 data/cortex.service %{buildroot}%{_unitdir}/cortex.service
 install -Dm644 data/cortex-sysusers.conf %{buildroot}%{_sysusersdir}/cortex.conf
-install -dm750 %{buildroot}%{_sysconfdir}/cortex
-install -Dm640 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
-install -Dm640 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
+install -Dm644 data/cortex-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/cortex.xml
+install -dm755 %{buildroot}%{_sysconfdir}/cortex
+install -Dm644 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
+install -Dm644 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml

 %pre
 %sysusers_create_compat %{_builddir}/%{name}-%{version}/data/cortex-sysusers.conf
@@ -63,16 +74,53 @@ install -Dm640 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
 %postun
 %systemd_postun_with_restart cortex.service

+%posttrans
+# Migration: older cortex packages shipped the firewalld service as
+# `helexa-cortex` and (in some build streams) with wrong port numbers
+# (9301/9302/9304). Operators who enabled that legacy service in their
+# zone end up with the wrong-port override taking precedence over the
+# vendor `cortex.xml` now in /usr/lib/firewalld/services/. Clean up the
+# stale /etc/ override here and migrate any zone bindings to the new
+# service name.
+if [ -f /etc/firewalld/services/helexa-cortex.xml ]; then
+    rm -f /etc/firewalld/services/helexa-cortex.xml
+fi
+if [ -x /usr/bin/firewall-cmd ] && /usr/bin/firewall-cmd --state >/dev/null 2>&1; then
+    # Drop the legacy service name from every *active* zone where it was
+    # enabled and add the new `cortex` service in its place. Operators who
+    # never ran firewall-cmd against either name see no zone change.
+    for zone in $(/usr/bin/firewall-cmd --get-active-zones 2>/dev/null \
+        | awk '!/^[[:space:]]/ {print $1}'); do
+        if /usr/bin/firewall-cmd --permanent --zone="$zone" --query-service=helexa-cortex >/dev/null 2>&1; then
+            /usr/bin/firewall-cmd --permanent --zone="$zone" --remove-service=helexa-cortex >/dev/null 2>&1 || :
+            /usr/bin/firewall-cmd --permanent --zone="$zone" --add-service=cortex >/dev/null 2>&1 || :
+        fi
+    done
+    /usr/bin/firewall-cmd --reload >/dev/null 2>&1 || :
+fi
+:
+
 %files
 %license LICENSE
 %doc README.md
 %{_bindir}/cortex
 %{_unitdir}/cortex.service
 %{_sysusersdir}/cortex.conf
-%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
-%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/cortex.toml
-%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/models.toml
+%{_prefix}/lib/firewalld/services/cortex.xml
+%dir %{_sysconfdir}/cortex
+%config(noreplace) %{_sysconfdir}/cortex/cortex.toml
+%config(noreplace) %{_sysconfdir}/cortex/models.toml

 %changelog
-* Tue Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
+* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.16-1
+- chore: ignore local deploy script
+- chore: move default ports out of common-collision ranges
+- ci: drop actions/cache for cargo registry and target
+
+* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.14-1
+- ci: publish both packages to a single helexa/helexa COPR project
+- fix(rpm): rename neuron package to helexa-neuron
+- ci: commit generated %changelog entries back to main
+
+* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
 - Initial package
--- a/crates/cortex-cli/src/main.rs
+++ b/crates/cortex-cli/src/main.rs
@@ -5,7 +5,7 @@ use tracing_subscriber::EnvFilter;

 #[derive(Parser)]
 #[command(name = "cortex")]
-#[command(about = "Unified inference gateway for multi-node mistral.rs clusters")]
+#[command(about = "Unified inference gateway for multi-node GPU clusters")]
 #[command(version)]
 struct Cli {
    #[command(subcommand)]
@@ -23,7 +23,7 @@ enum Commands {
    /// Print the fleet status (models, nodes, health).
    Status {
        /// Gateway API endpoint to query.
-        #[arg(short, long, default_value = "http://localhost:8000")]
+        #[arg(short, long, default_value = "http://localhost:31313")]
        endpoint: String,
    },
 }
--- a/crates/cortex-core/src/anthropic.rs
+++ b/crates/cortex-core/src/anthropic.rs
@@ -2,7 +2,7 @@
 //!
 //! These mirror the `/v1/messages` format used by the Anthropic API.
 //! The gateway accepts these, translates to OpenAI format, proxies to
-//! mistral.rs, then translates the response back.
+//! the inference backend (neuron), then translates the response back.

 use serde::{Deserialize, Serialize};
 use serde_json::Value;
--- a/crates/cortex-core/src/build_info.rs
+++ b/crates/cortex-core/src/build_info.rs
@@ -0,0 +1,119 @@
+//! Build/version metadata shared between cortex and neuron.
+//!
+//! neuron captures these facts at compile time in its `build.rs`
+//! (git SHA, enabled cargo features, rustc/candle versions, …) and
+//! serves them from `GET /version`. cortex and `helexa-bench`
+//! deserialize the same struct so a benchmark run can be attributed to
+//! the exact daemon build that produced it — not just the host's CUDA
+//! and driver versions that `/discovery` already reports.
+//!
+//! Every field beyond the always-present package version is
+//! `#[serde(default)]` so a newer reader stays compatible with an
+//! older neuron that omits a field (and vice versa) — the same
+//! forward/backward-compat discipline as
+//! [`crate::discovery::ActivationStatus`].
+
+use serde::{Deserialize, Serialize};
+
+/// Build-time identity of a neuron daemon.
+///
+/// Returned by `GET /version`. The `git_sha` is the canonical "which
+/// build is live" key — benchmark records are bucketed by it, so a
+/// regression can be pinned to a daemon change rather than a host
+/// change. When neuron is built from a source tarball with no git
+/// metadata available (and no `HELEXA_BUILD_SHA` injected by CI/RPM),
+/// `git_sha` is the string `"unknown"`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct BuildInfo {
+    /// Crate version from `CARGO_PKG_VERSION` (e.g. `"0.1.16"`).
+    pub package_version: String,
+    /// Short git SHA, or `"unknown"` when unavailable at build time.
+    #[serde(default = "unknown")]
+    pub git_sha: String,
+    /// Full 40-char git SHA when available.
+    #[serde(default)]
+    pub git_sha_long: Option<String>,
+    /// Whether the working tree had uncommitted changes at build time.
+    /// `false` when the SHA is unknown (tarball build).
+    #[serde(default)]
+    pub git_dirty: bool,
+    /// RFC3339 build timestamp.
+    #[serde(default)]
+    pub build_timestamp: Option<String>,
+    /// `rustc --version` output of the compiler used.
+    #[serde(default)]
+    pub rustc_version: Option<String>,
+    /// Cargo build profile: `"release"` or `"debug"`.
+    #[serde(default)]
+    pub profile: Option<String>,
+    /// Target triple the binary was compiled for.
+    #[serde(default)]
+    pub target: Option<String>,
+    /// Enabled cargo features (e.g. `["cuda", "cudnn"]`). These define
+    /// the performance envelope, so they are recorded against every
+    /// benchmark run.
+    #[serde(default)]
+    pub features: Vec<String>,
+    /// Locked `candle-core` version, best-effort from `Cargo.lock`.
+    #[serde(default)]
+    pub candle_version: Option<String>,
+}
+
+fn unknown() -> String {
+    "unknown".to_string()
+}
+
+impl BuildInfo {
+    /// A placeholder used by non-neuron benchmark targets (and tests)
+    /// that have no build metadata to report.
+    pub fn unknown() -> Self {
+        BuildInfo {
+            package_version: env!("CARGO_PKG_VERSION").to_string(),
+            git_sha: unknown(),
+            git_sha_long: None,
+            git_dirty: false,
+            build_timestamp: None,
+            rustc_version: None,
+            profile: None,
+            target: None,
+            features: Vec::new(),
+            candle_version: None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn round_trips_full() {
+        let info = BuildInfo {
+            package_version: "0.1.16".into(),
+            git_sha: "30d50d6".into(),
+            git_sha_long: Some("30d50d6abc123".into()),
+            git_dirty: true,
+            build_timestamp: Some("2026-06-13T10:00:00+00:00".into()),
+            rustc_version: Some("rustc 1.85.0".into()),
+            profile: Some("release".into()),
+            target: Some("x86_64-unknown-linux-gnu".into()),
+            features: vec!["cuda".into(), "cudnn".into()],
+            candle_version: Some("0.10.2".into()),
+        };
+        let json = serde_json::to_string(&info).unwrap();
+        let back: BuildInfo = serde_json::from_str(&json).unwrap();
+        assert_eq!(info, back);
+    }
+
+    #[test]
+    fn deserializes_minimal_payload() {
+        // An older neuron might send only the package version; every
+        // other field must default rather than fail.
+        let back: BuildInfo = serde_json::from_str(r#"{"package_version":"0.1.0"}"#).unwrap();
+        assert_eq!(back.package_version, "0.1.0");
+        assert_eq!(back.git_sha, "unknown");
+        assert!(!back.git_dirty);
+        assert!(back.features.is_empty());
+        assert!(back.candle_version.is_none());
+    }
+}
--- a/crates/cortex-core/src/catalogue.rs
+++ b/crates/cortex-core/src/catalogue.rs
@@ -1,6 +1,9 @@
 //! Model catalogue — profiles describing how to serve each model.

+use crate::discovery::DeviceInfo;
+use crate::harness::{ModelCost, ModelLimit};
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 use std::path::Path;

 /// A model serving profile loaded from models.toml.
@@ -22,6 +25,32 @@ pub struct ModelProfile {
    /// Neurons where this model should never be evicted.
    #[serde(default)]
    pub pinned_on: Vec<String>,
+    /// Source scheme this profile's weights come from. When set, the
+    /// router prefixes `id` with `scheme:` before forwarding the load
+    /// request to neuron, ensuring the daemon fetches from the right
+    /// registry regardless of which entry happens to match `id`.
+    ///
+    /// `None` lets neuron substitute its own `default_source` (typically
+    /// `huggingface`). Set to `"helexa"` when the model is hosted in
+    /// the helexa registry — operator-procurement-grade audit relies
+    /// on this being explicit per model rather than implicit.
+    #[serde(default)]
+    pub source: Option<String>,
+
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Per-model token budget. When present, advertised in `/v1/models`
+    /// so clients can size and compact their context automatically.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing (USD per 1M tokens). `0.0` for self-hosted.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// Static capability flags the operator wants to advertise even
+    /// before the model is loaded on any neuron (e.g. `"reasoning"`,
+    /// `"tool_call"`). Runtime-detected capabilities from the harness
+    /// are unioned with this set in the gateway's `/v1/models` response.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
 }

 fn default_min_devices() -> u32 {
@@ -33,6 +62,14 @@ fn default_min_devices() -> u32 {
 pub struct ModelCatalogue {
    #[serde(default)]
    pub models: Vec<ModelProfile>,
+    /// Tier aliases — clients can send a request with `model: "helexa/small"`
+    /// and the gateway transparently rewrites + routes to the concrete
+    /// model id this maps to. Lets operators define latency/quality
+    /// tiers (`small`/`balanced`/`large`, `fast`/`thinking`, etc.)
+    /// without imposing knowledge of specific model ids on clients.
+    /// Loaded from the `[aliases]` table in models.toml.
+    #[serde(default)]
+    pub aliases: HashMap<String, String>,
 }

 impl ModelCatalogue {
@@ -64,4 +101,165 @@ impl ModelCatalogue {
            .iter()
            .any(|p| p.id == model_id && p.pinned_on.contains(&neuron_name.to_string()))
    }
+
+    /// Find a profile by model id.
+    pub fn get(&self, model_id: &str) -> Option<&ModelProfile> {
+        self.models.iter().find(|p| p.id == model_id)
+    }
+
+    /// Resolve an alias to its concrete model id. Returns `id` verbatim
+    /// when it isn't an alias. Aliases never chain — operator config
+    /// is treated as flat — so this is a single lookup.
+    pub fn resolve_alias<'a>(&'a self, id: &'a str) -> &'a str {
+        self.aliases.get(id).map(String::as_str).unwrap_or(id)
+    }
+}
+
+impl ModelProfile {
+    /// True iff this profile's placement constraints can be satisfied
+    /// by the named neuron with the given device topology.
+    ///
+    /// Constraints checked:
+    /// - `pinned_on`: non-empty → neuron must be on the list.
+    /// - `min_devices`: neuron must have at least this many devices.
+    /// - `min_device_vram_mb`: at least `min_devices` of the neuron's
+    ///   devices must each meet this VRAM floor.
+    pub fn is_feasible_on(&self, neuron_name: &str, devices: &[DeviceInfo]) -> bool {
+        if !self.pinned_on.is_empty() && !self.pinned_on.iter().any(|n| n == neuron_name) {
+            return false;
+        }
+        if (devices.len() as u32) < self.min_devices {
+            return false;
+        }
+        if let Some(min_vram) = self.min_device_vram_mb {
+            let big_enough = devices
+                .iter()
+                .filter(|d| d.vram_total_mb >= min_vram)
+                .count() as u32;
+            if big_enough < self.min_devices {
+                return false;
+            }
+        }
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::discovery::DeviceInfo;
+
+    fn device(idx: u32, vram_mb: u64) -> DeviceInfo {
+        DeviceInfo {
+            index: idx,
+            name: format!("DEV-{idx}"),
+            vram_total_mb: vram_mb,
+            compute_capability: "8.6".into(),
+        }
+    }
+
+    fn profile() -> ModelProfile {
+        ModelProfile {
+            id: "Qwen/Qwen3.6-27B".into(),
+            harness: "candle".into(),
+            quant: None,
+            vram_mb: Some(45_000),
+            min_devices: 2,
+            min_device_vram_mb: Some(24_000),
+            pinned_on: vec![],
+            source: None,
+            limit: None,
+            cost: None,
+            capabilities: vec![],
+        }
+    }
+
+    #[test]
+    fn feasible_when_two_devices_meet_vram_floor() {
+        let p = profile();
+        let devices = [device(0, 32_000), device(1, 32_000)];
+        assert!(p.is_feasible_on("beast", &devices));
+    }
+
+    #[test]
+    fn infeasible_when_only_one_device() {
+        let p = profile();
+        let devices = [device(0, 64_000)];
+        assert!(!p.is_feasible_on("benjy", &devices));
+    }
+
+    #[test]
+    fn infeasible_when_one_device_underspec() {
+        let p = profile();
+        let devices = [device(0, 32_000), device(1, 12_000)];
+        assert!(!p.is_feasible_on("mixed", &devices));
+    }
+
+    #[test]
+    fn pinned_on_excludes_other_neurons() {
+        let mut p = profile();
+        p.pinned_on = vec!["beast".into()];
+        let devices = [device(0, 32_000), device(1, 32_000)];
+        assert!(p.is_feasible_on("beast", &devices));
+        assert!(!p.is_feasible_on("benjy", &devices));
+    }
+
+    #[test]
+    fn no_vram_floor_just_needs_min_devices() {
+        let mut p = profile();
+        p.min_device_vram_mb = None;
+        let devices = [device(0, 1_000), device(1, 1_000)];
+        assert!(p.is_feasible_on("anywhere", &devices));
+    }
+
+    #[test]
+    fn resolve_alias_returns_target_when_alias_present() {
+        let mut cat = ModelCatalogue::default();
+        cat.aliases
+            .insert("helexa/small".into(), "Qwen/Qwen3-1.7B".into());
+        assert_eq!(cat.resolve_alias("helexa/small"), "Qwen/Qwen3-1.7B");
+    }
+
+    #[test]
+    fn resolve_alias_passes_through_when_not_an_alias() {
+        let mut cat = ModelCatalogue::default();
+        cat.aliases
+            .insert("helexa/small".into(), "Qwen/Qwen3-1.7B".into());
+        assert_eq!(cat.resolve_alias("Qwen/Qwen3-8B"), "Qwen/Qwen3-8B");
+    }
+
+    #[test]
+    fn source_defaults_to_none_when_absent_from_toml() {
+        let src = r#"
+[[models]]
+id = "Qwen/Qwen3-30B"
+harness = "candle"
+"#;
+        let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
+        assert!(cat.models[0].source.is_none());
+    }
+
+    #[test]
+    fn source_round_trips_through_toml() {
+        let src = r#"
+[[models]]
+id = "Helexa/Qwen3.6-27B-Uncensored"
+harness = "candle"
+source = "helexa"
+"#;
+        let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
+        assert_eq!(cat.models[0].source.as_deref(), Some("helexa"));
+    }
+
+    #[test]
+    fn aliases_table_round_trips_through_toml() {
+        let src = r#"
+[aliases]
+"helexa/small" = "Qwen/Qwen3-1.7B"
+"helexa/large" = "Qwen/Qwen3.6-27B"
+"#;
+        let cat: ModelCatalogue = toml::from_str(src).expect("parse aliases table");
+        assert_eq!(cat.resolve_alias("helexa/small"), "Qwen/Qwen3-1.7B");
+        assert_eq!(cat.resolve_alias("helexa/large"), "Qwen/Qwen3.6-27B");
+    }
 }
--- a/crates/cortex-core/src/config.rs
+++ b/crates/cortex-core/src/config.rs
@@ -1,3 +1,4 @@
+use crate::entitlements::CapWindow;
 use figment::{
    Figment,
    providers::{Env, Format, Toml},
@@ -11,20 +12,68 @@ pub struct GatewayConfig {
    pub eviction: EvictionSettings,
    /// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
    pub neurons: Vec<NeuronEndpoint>,
-    /// Path to the model catalogue file (default: "models.toml").
+    /// Path to the model catalogue file. Defaults to the packaged
+    /// location (`/etc/cortex/models.toml`); set explicitly for
+    /// non-packaged / local runs.
    #[serde(default = "default_models_path")]
    pub models_config: String,
+    /// Multi-tenant governance: auth + per-key token budgets (#47). Empty
+    /// by default — anonymous, uncapped — so existing single-operator
+    /// setups keep working until keys are configured.
+    #[serde(default)]
+    pub entitlements: EntitlementsConfig,
+}
+
+/// `[entitlements]` — the local/static [`crate::entitlements::EntitlementProvider`]
+/// source of truth (#50). Accounts, keys, and hard caps live here; the
+/// future upstream client (#57) ignores this section.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct EntitlementsConfig {
+    /// Reject unauthenticated requests with `401 invalid_api_key` when
+    /// true. Default `false` (allow-anonymous) for dev / single-operator
+    /// continuity.
+    #[serde(default)]
+    pub require_auth: bool,
+    /// Static API keys and their budgets, consumed by the local provider.
+    #[serde(default)]
+    pub keys: Vec<ApiKeyConfig>,
+}
+
+/// One configured API key: the bearer token, the account it bills to, and
+/// its hard cap. `[[entitlements.keys]]` in TOML.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ApiKeyConfig {
+    /// The bearer token clients send in `Authorization: Bearer <key>`.
+    pub key: String,
+    /// Billable account. Multiple keys may share one account.
+    pub account_id: String,
+    /// Stable per-key identifier for ledger/metrics labels. Defaults to
+    /// `account_id` when omitted, so the secret is never used as a label.
+    #[serde(default)]
+    pub key_id: Option<String>,
+    /// Hard token cap. `None`/omitted = uncapped (e.g. operator infra key).
+    #[serde(default)]
+    pub hard_cap: Option<u64>,
+    /// Cap-window semantics. Default: a non-resetting [`CapWindow::Balance`].
+    #[serde(default)]
+    pub window: CapWindow,
 }

 fn default_models_path() -> String {
-    "models.toml".into()
+    // Absolute, so the systemd-launched binary finds the catalogue
+    // regardless of its working directory. The RPM installs the catalogue
+    // here (`cortex.spec`); a relative "models.toml" silently resolved to
+    // the service cwd and left the catalogue empty in production
+    // (pinning / aliases / limits all no-ops). Override via `models_config`
+    // in cortex.toml for local runs.
+    "/etc/cortex/models.toml".into()
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct GatewaySettings {
-    /// Address to listen on for API requests (e.g. "0.0.0.0:8000")
+    /// Address to listen on for API requests (e.g. "0.0.0.0:31313")
    pub listen: String,
-    /// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
+    /// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:31314")
    pub metrics_listen: String,
 }

@@ -50,7 +99,7 @@ pub enum EvictionStrategy {
 pub struct NeuronEndpoint {
    /// Human-readable node name (e.g. "beast")
    pub name: String,
-    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:13131")
    pub endpoint: String,
 }

@@ -70,8 +119,8 @@ impl Default for GatewayConfig {
    fn default() -> Self {
        Self {
            gateway: GatewaySettings {
-                listen: "0.0.0.0:8000".into(),
-                metrics_listen: "0.0.0.0:9100".into(),
+                listen: "0.0.0.0:31313".into(),
+                metrics_listen: "0.0.0.0:31314".into(),
            },
            eviction: EvictionSettings {
                strategy: EvictionStrategy::Lru,
@@ -79,6 +128,7 @@ impl Default for GatewayConfig {
            },
            neurons: vec![],
            models_config: default_models_path(),
+            entitlements: EntitlementsConfig::default(),
        }
    }
 }
--- a/crates/cortex-core/src/discovery.rs
+++ b/crates/cortex-core/src/discovery.rs
@@ -22,6 +22,23 @@ pub struct DiscoveryResponse {
    pub driver_version: Option<String>,
    pub devices: Vec<DeviceInfo>,
    pub harnesses: Vec<String>,
+    /// Set when the host has an NVIDIA stack that is currently
+    /// unusable — specifically the userspace↔kernel-module version
+    /// skew after an un-rebooted driver update ("Driver/library
+    /// version mismatch"), where every CUDA call including nvidia-smi
+    /// fails (#19). `None` on healthy hosts AND on hosts with no
+    /// NVIDIA stack at all (CPU-only is not an error). Carries an
+    /// operator-actionable description; cortex can read it to route
+    /// around the node instead of cold-loading into a guaranteed
+    /// failure.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cuda_unavailable_reason: Option<String>,
+    /// The neuron's effective maximum prompt size in tokens
+    /// (`NEURON_MAX_PROMPT_TOKENS`) — the enforced prompt cap on this
+    /// host. `#[serde(default)]` (→ 0) for backward-compat with neurons
+    /// that predate this field; cortex treats 0 as "unknown".
+    #[serde(default)]
+    pub max_prompt_tokens: u64,
 }

 /// Runtime health metrics for a single GPU device.
@@ -36,8 +53,123 @@ pub struct DeviceHealth {

 /// Runtime health response from a neuron endpoint.
 /// Returned by `GET /health`.
+///
+/// `activation` was added on 2026-05-26 to distinguish "process is up
+/// and reachable" from "process is ready to serve traffic". A `Type=simple`
+/// systemd unit reports `active` the moment the binary starts — but a
+/// neuron whose `default_models` list takes minutes to materialise
+/// won't bind its listener (or, in the new flow, won't have any models
+/// loaded) until pre-warm completes. The new field is `#[serde(default)]`
+/// so a pre-2026-05-26 gateway polling a new neuron — or vice versa —
+/// keeps working.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HealthResponse {
    pub uptime_secs: u64,
    pub devices: Vec<DeviceHealth>,
+    #[serde(default)]
+    pub activation: ActivationStatus,
+    /// Per-model admission load (#53): how many requests are running vs.
+    /// queued on each loaded model right now. Cortex's load-aware router
+    /// (#55) reads this to spread traffic across replicas and to propagate
+    /// honest backpressure. `#[serde(default)]` keeps older gateways/neurons
+    /// interoperable (absent → empty → treated as no load info).
+    #[serde(default)]
+    pub models: Vec<ModelLoad>,
+}
+
+/// Live admission load for one loaded model (#53).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLoad {
+    pub id: String,
+    /// Requests currently running (batch-1 → 0 or 1).
+    pub in_flight: usize,
+    /// Requests waiting in the bounded admission queue.
+    pub queue_depth: usize,
+}
+
+#[cfg(test)]
+mod health_load_tests {
+    use super::*;
+
+    #[test]
+    fn health_response_without_models_field_still_deserializes() {
+        // A pre-#53 neuron's /health payload omits `models`; the gateway
+        // must still parse it (serde default → empty).
+        let json = r#"{"uptime_secs":42,"devices":[]}"#;
+        let resp: HealthResponse = serde_json::from_str(json).expect("back-compat parse");
+        assert_eq!(resp.uptime_secs, 42);
+        assert!(resp.models.is_empty());
+    }
+
+    #[test]
+    fn health_response_round_trips_model_load() {
+        let resp = HealthResponse {
+            uptime_secs: 1,
+            devices: vec![],
+            activation: ActivationStatus::default(),
+            models: vec![ModelLoad {
+                id: "Qwen/Qwen3.6-27B".into(),
+                in_flight: 1,
+                queue_depth: 3,
+            }],
+        };
+        let s = serde_json::to_string(&resp).unwrap();
+        let back: HealthResponse = serde_json::from_str(&s).unwrap();
+        assert_eq!(back.models.len(), 1);
+        assert_eq!(back.models[0].in_flight, 1);
+        assert_eq!(back.models[0].queue_depth, 3);
+    }
+}
+
+/// High-level activation state of the neuron daemon. The HTTP listener
+/// is bound during both states; what differs is whether the configured
+/// `default_models` have finished loading.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ActivationState {
+    /// At least one `default_models` entry is still loading. The
+    /// neuron's other endpoints work, but inference against
+    /// not-yet-loaded models will 404.
+    PreWarming,
+    /// Every `default_models` entry has either loaded or failed; the
+    /// neuron is steady-state. Subsequent on-demand loads via
+    /// `/models/load` don't flip back to `PreWarming` — the state
+    /// reflects the activation-time set only.
+    #[default]
+    Ready,
+}
+
+/// Per-model failure record surfaced in [`ActivationStatus::failed`].
+/// The error string is the rendered anyhow chain at the time of the
+/// failure; operators read it from `/health` to decide whether to
+/// retry, edit the spec, or unload+reload.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PreWarmFailure {
+    pub model_id: String,
+    pub error: String,
+}
+
+/// Activation-time progress snapshot. The four progress fields are
+/// populated by the neuron's pre-warm task and read by the `/health`
+/// handler. The snapshot is consistent: a model id appears in exactly
+/// one of `pending`, `in_progress` (an `Option<String>`), `completed`,
+/// or `failed` at any point in time.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct ActivationStatus {
+    pub state: ActivationState,
+    /// Model ids queued but not yet started. Empty in `Ready` state.
+    #[serde(default)]
+    pub pending: Vec<String>,
+    /// Model id currently materialising. None when between models or
+    /// in `Ready` state.
+    #[serde(default)]
+    pub in_progress: Option<String>,
+    /// Model ids that finished loading successfully during this
+    /// activation. Cleared on process restart.
+    #[serde(default)]
+    pub completed: Vec<String>,
+    /// Model ids that failed during this activation, with the rendered
+    /// error chain. Cleared on process restart.
+    #[serde(default)]
+    pub failed: Vec<PreWarmFailure>,
 }
--- a/crates/cortex-core/src/entitlements.rs
+++ b/crates/cortex-core/src/entitlements.rs
@@ -0,0 +1,145 @@
+//! Identity and entitlement primitives for multi-tenant governance (#47).
+//!
+//! Identity is the shared substrate the whole epic hangs off:
+//! `identity (principal) → accounting (spend) → policy → enforcement`. This
+//! module defines the seam — the [`EntitlementProvider`] trait and its data
+//! types — so the local/static provider (operator-config caps, in
+//! cortex-gateway) can land the auth + per-key-cap + amplification fix
+//! *before* any upstream clearing house exists. The future helexa-upstream
+//! client (#57) is just another impl of this trait.
+//!
+//! The provider owns three jobs:
+//! 1. **resolve** a bearer key to a [`Principal`] (drives auth, #49);
+//! 2. **reserve → settle/release** token budget around a request so spend
+//!    can never overshoot a hard cap under concurrency (drives budget
+//!    enforcement, #52);
+//! 3. expose a [`BudgetSnapshot`] for metering/metrics (#51).
+//!
+//! [`BudgetError`] carries the cap-window semantics so the caller can pick
+//! the correct #63 rejection (`rate_limit_exceeded` + `Retry-After` for a
+//! resetting window vs `insufficient_quota` for a hard balance) without the
+//! provider knowing anything about HTTP.
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+/// Internal header carrying the resolved account id from cortex to neuron.
+/// neuron trusts these over the WireGuard link (#54); cortex **strips** any
+/// client-supplied copy before stamping the authoritative value, so a client
+/// can never assert a principal directly.
+pub const HEADER_ACCOUNT_ID: &str = "x-helexa-account-id";
+/// Internal header carrying the resolved key id from cortex to neuron.
+pub const HEADER_KEY_ID: &str = "x-helexa-key-id";
+
+/// Who a request is for. Resolved once at the edge from the bearer key and
+/// carried through the request context. `account_id` is the billable owner
+/// (spendable at any operator, by decision); `key_id` identifies the
+/// specific API key for per-key hard caps and ledger/metrics labels.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Principal {
+    pub account_id: String,
+    pub key_id: String,
+}
+
+/// Cap-window semantics for a key's hard cap. Determines which #63 code an
+/// over-cap reservation maps to.
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")]
+pub enum CapWindow {
+    /// Hard balance — the cap never resets. Exhaustion is permanent
+    /// (`429 insufficient_quota`, no `Retry-After`).
+    #[default]
+    Balance,
+    /// Rolling window of `seconds` that resets. Exhaustion is transient
+    /// (`429 rate_limit_exceeded` + `Retry-After` until reset).
+    Rolling { seconds: u64 },
+}
+
+/// An outstanding budget reservation. The caller holds this opaque handle
+/// between [`EntitlementProvider::reserve`] and exactly one of
+/// [`EntitlementProvider::settle`] / [`EntitlementProvider::release`]. Not
+/// `Clone` — a reservation is consumed once.
+#[derive(Debug)]
+pub struct Reservation {
+    /// Provider-local handle; opaque to the caller.
+    pub id: u64,
+    /// The principal this reservation belongs to.
+    pub principal: Principal,
+    /// Tokens reserved against the cap.
+    pub reserved: u64,
+}
+
+/// A point-in-time view of a key's budget, for metering and metrics (#51).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct BudgetSnapshot {
+    /// Hard cap in tokens. `None` means uncapped (e.g. an operator infra
+    /// key, #58).
+    pub hard_cap: Option<u64>,
+    /// Settled spend in the current window.
+    pub spent: u64,
+    /// Sum of outstanding (un-settled) reservations.
+    pub reserved: u64,
+}
+
+/// Authentication failure — the bearer key could not be resolved. Maps to
+/// `401 invalid_api_key` (#49/#63).
+#[derive(Debug, thiserror::Error)]
+pub enum AuthError {
+    #[error("invalid or unknown API key")]
+    InvalidKey,
+}
+
+/// Why a reservation was refused. Carries enough for the caller to build the
+/// correct #63 envelope without the provider touching HTTP.
+#[derive(Debug, thiserror::Error)]
+pub enum BudgetError {
+    /// A resetting window is exhausted → `429 rate_limit_exceeded` +
+    /// `Retry-After: retry_after_secs`.
+    #[error(
+        "rolling-window budget exhausted ({requested} requested, {available} available); \
+         resets in {retry_after_secs}s"
+    )]
+    RateLimited {
+        requested: u64,
+        available: u64,
+        retry_after_secs: u64,
+    },
+    /// A hard balance is exhausted → `429 insufficient_quota` (no
+    /// `Retry-After`; the client surfaces and stops). Never `402`.
+    #[error("hard balance exhausted ({requested} requested, {available} available)")]
+    InsufficientQuota { requested: u64, available: u64 },
+}
+
+/// The seam between cortex's enforcement and whatever decides entitlement —
+/// a local/static config provider today (#50), the helexa-upstream client
+/// later (#57). All methods are async so the upstream impl can do network
+/// I/O; the local impl resolves in-process.
+#[async_trait]
+pub trait EntitlementProvider: Send + Sync {
+    /// Resolve a bearer API key to its principal. `Err(InvalidKey)` for an
+    /// unknown/empty key.
+    async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError>;
+
+    /// Reserve up to `max_tokens` against the principal's cap. Returns a
+    /// handle on success, or a [`BudgetError`] (which the caller maps to a
+    /// #63 `429`) if the reservation would exceed the cap. Reserving the
+    /// *maximum* a request could consume before dispatch is what prevents
+    /// overshoot under concurrency.
+    async fn reserve(
+        &self,
+        principal: &Principal,
+        max_tokens: u64,
+    ) -> Result<Reservation, BudgetError>;
+
+    /// Settle a reservation with the tokens actually consumed, releasing the
+    /// unused remainder back to the cap.
+    async fn settle(&self, reservation: Reservation, actual_tokens: u64);
+
+    /// Release a reservation in full — e.g. dispatch failed before any
+    /// tokens were consumed.
+    async fn release(&self, reservation: Reservation);
+
+    /// Current budget snapshot for a principal, for metering/metrics.
+    /// `None` if the provider doesn't track this principal.
+    async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot>;
+}
--- a/crates/cortex-core/src/error_envelope.rs
+++ b/crates/cortex-core/src/error_envelope.rs
@@ -0,0 +1,257 @@
+//! The OpenAI-standard error envelope (#60) and the rejection contract
+//! that rides on it (#63).
+//!
+//! Every non-2xx response cortex and neuron emit uses the shape
+//!
+//! ```json
+//! { "error": { "message": "...", "type": "...", "code": "...", "param": null } }
+//! ```
+//!
+//! because OpenAI-compatible clients (opencode, the AI SDK, litellm, the
+//! OpenAI SDKs) read `error.type` / `error.code` to decide what to do —
+//! most importantly `code == "context_length_exceeded"` triggers
+//! auto-compaction, and a `429` with `Retry-After` makes them back off and
+//! retry rather than surfacing an opaque failure. A flat `{"error":"..."}`
+//! string is invisible to that logic.
+//!
+//! This module is the single source of truth for that envelope. It is
+//! deliberately **axum-agnostic** — cortex-core is a pure types crate — so
+//! it carries the response as data (`status`, `body()`, `retry_after_secs`)
+//! and each HTTP crate (cortex-gateway, neuron) owns a tiny adapter that
+//! turns an [`OpenAiError`] into its framework's response type, setting the
+//! `Retry-After` header when present.
+//!
+//! Retryable conditions **must** carry `Retry-After` (per #63). The named
+//! constructors below encode that: [`OpenAiError::rate_limit_exceeded`] takes
+//! a retry hint and [`OpenAiError::service_unavailable`] an optional one;
+//! [`OpenAiError::insufficient_quota`] (hard balance, no reset) and
+//! [`OpenAiError::context_length_exceeded`] / [`OpenAiError::invalid_api_key`]
+//! (permanent) do not. `402 Payment Required` is banned by the contract — use
+//! `429 insufficient_quota` for hard budget exhaustion.
+
+use serde_json::{Map, Value, json};
+
+/// A rejection rendered in the OpenAI error envelope.
+///
+/// Build with [`OpenAiError::new`] (or a named constructor), refine with the
+/// `with_*` builders, then hand to the consuming crate's adapter to turn into
+/// an HTTP response.
+#[derive(Debug, Clone)]
+pub struct OpenAiError {
+    /// HTTP status code (e.g. `401`, `429`, `503`).
+    pub status: u16,
+    /// Broad OpenAI category — `"invalid_request_error"`, `"api_error"`,
+    /// `"rate_limit_error"`, …
+    pub error_type: String,
+    /// Specific machine-readable code clients key on (`"invalid_api_key"`,
+    /// `"rate_limit_exceeded"`, `"context_length_exceeded"`, …). `None`
+    /// renders as JSON `null`.
+    pub code: Option<String>,
+    /// Human-readable, actionable message.
+    pub message: String,
+    /// OpenAI's `param` field — the offending request parameter, if any.
+    pub param: Option<String>,
+    /// Seconds to advertise in the `Retry-After` header. Set only on
+    /// retryable conditions; `None` means no header.
+    pub retry_after_secs: Option<u64>,
+    /// Diagnostic fields merged *inside* the `error` object (e.g.
+    /// `prompt_len`, `max`, `free_mb`) so they don't break the envelope
+    /// shape. Clients ignore unknown keys.
+    pub extra: Map<String, Value>,
+}
+
+impl OpenAiError {
+    /// Construct an envelope with an explicit code. For a `null` code use
+    /// [`OpenAiError::without_code`].
+    pub fn new(
+        status: u16,
+        error_type: impl Into<String>,
+        code: impl Into<String>,
+        message: impl Into<String>,
+    ) -> Self {
+        Self {
+            status,
+            error_type: error_type.into(),
+            code: Some(code.into()),
+            message: message.into(),
+            param: None,
+            retry_after_secs: None,
+            extra: Map::new(),
+        }
+    }
+
+    /// Construct an envelope whose `code` is `null` (e.g. an unclassified
+    /// internal error).
+    pub fn without_code(
+        status: u16,
+        error_type: impl Into<String>,
+        message: impl Into<String>,
+    ) -> Self {
+        Self {
+            status,
+            error_type: error_type.into(),
+            code: None,
+            message: message.into(),
+            param: None,
+            retry_after_secs: None,
+            extra: Map::new(),
+        }
+    }
+
+    /// Advertise a `Retry-After` (seconds). Use on retryable rejections.
+    pub fn with_retry_after(mut self, secs: u64) -> Self {
+        self.retry_after_secs = Some(secs);
+        self
+    }
+
+    /// Set the OpenAI `param` field.
+    pub fn with_param(mut self, param: impl Into<String>) -> Self {
+        self.param = Some(param.into());
+        self
+    }
+
+    /// Merge one diagnostic field into the error object.
+    pub fn with_extra(mut self, key: impl Into<String>, value: Value) -> Self {
+        self.extra.insert(key.into(), value);
+        self
+    }
+
+    /// Fold a whole bag of diagnostic fields into the error object. On a key
+    /// collision the incoming value replaces the one already stored, exactly
+    /// as repeated [`OpenAiError::with_extra`] calls would.
+    pub fn with_extras(mut self, extras: Map<String, Value>) -> Self {
+        self.extra.extend(extras);
+        self
+    }
+
+    /// Render the `{ "error": { … } }` body. Field order is irrelevant to
+    /// clients (they parse JSON); the standard keys are written first, then
+    /// any diagnostic extras.
+    ///
+    /// An extra whose key collides with a standard key (`message`, `type`,
+    /// `code`, `param`) is skipped, so diagnostics cannot clobber the fields
+    /// clients branch on.
+    pub fn body(&self) -> Value {
+        let or_null = |field: &Option<String>| field.clone().map_or(Value::Null, Value::String);
+        let mut error = Map::new();
+        error.insert("message".into(), Value::String(self.message.clone()));
+        error.insert("type".into(), Value::String(self.error_type.clone()));
+        error.insert("code".into(), or_null(&self.code));
+        error.insert("param".into(), or_null(&self.param));
+        for (key, value) in &self.extra {
+            // Vacant slots only: a standard key written above always wins.
+            error.entry(key.clone()).or_insert_with(|| value.clone());
+        }
+        json!({ "error": Value::Object(error) })
+    }
+
+    // ── Named constructors for the #63 standard codes ──────────────────
+
+    /// `401 invalid_api_key` — missing/invalid bearer token (#49). Permanent.
+    pub fn invalid_api_key(message: impl Into<String>) -> Self {
+        Self::new(401, "invalid_request_error", "invalid_api_key", message)
+    }
+
+    /// `429 rate_limit_exceeded` + `Retry-After` — transient overload,
+    /// fair-share/in-flight cap, admission rejection, or a rolling budget
+    /// window that resets (#52/#53/#54/#55). Clients back off and retry.
+    pub fn rate_limit_exceeded(message: impl Into<String>, retry_after_secs: u64) -> Self {
+        Self::new(429, "rate_limit_error", "rate_limit_exceeded", message)
+            .with_retry_after(retry_after_secs)
+    }
+
+    /// `429 insufficient_quota` — hard balance exhausted, no reset (#52).
+    /// No `Retry-After`; the client surfaces and stops. (Never `402`.)
+    pub fn insufficient_quota(message: impl Into<String>) -> Self {
+        Self::new(429, "insufficient_quota", "insufficient_quota", message)
+    }
+
+    /// `400 context_length_exceeded` — prompt exceeds the model's context
+    /// window (#56/#60). Permanent for this request; opencode auto-compacts.
+    pub fn context_length_exceeded(message: impl Into<String>) -> Self {
+        Self::new(
+            400,
+            "invalid_request_error",
+            "context_length_exceeded",
+            message,
+        )
+    }
+
+    /// `503 service_unavailable` — the backend is transiently unavailable
+    /// (no healthy nodes, recovery in progress, fail-closed upstream). The
+    /// `Retry-After` hint is optional: `Some(secs)` advertises it, `None`
+    /// leaves it off.
+    pub fn service_unavailable(message: impl Into<String>, retry_after_secs: Option<u64>) -> Self {
+        let base = Self::new(503, "api_error", "service_unavailable", message);
+        Self { retry_after_secs, ..base }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn body_has_standard_envelope_shape() {
+        let env = OpenAiError::new(429, "rate_limit_error", "rate_limit_exceeded", "slow down");
+        let body = env.body();
+        let error = body.get("error").and_then(Value::as_object).unwrap();
+        assert_eq!(error["message"], "slow down");
+        assert_eq!(error["type"], "rate_limit_error");
+        assert_eq!(error["code"], "rate_limit_exceeded");
+        assert_eq!(error["param"], Value::Null);
+    }
+
+    #[test]
+    fn without_code_renders_null_code() {
+        let env = OpenAiError::without_code(500, "api_error", "kaboom");
+        assert_eq!(env.body()["error"]["code"], Value::Null);
+    }
+
+    #[test]
+    fn extras_ride_inside_the_error_object() {
+        let env = OpenAiError::context_length_exceeded("too long")
+            .with_extra("prompt_len", json!(60_000))
+            .with_extra("max", json!(49_152));
+        let error = &env.body()["error"];
+        assert_eq!(error["prompt_len"], 60_000);
+        assert_eq!(error["max"], 49_152);
+        assert_eq!(error["code"], "context_length_exceeded");
+    }
+
+    #[test]
+    fn rolling_window_rejection_carries_retry_after() {
+        let env = OpenAiError::rate_limit_exceeded("budget window", 30);
+        assert_eq!(env.status, 429);
+        assert_eq!(env.retry_after_secs, Some(30));
+    }
+
+    #[test]
+    fn hard_balance_rejection_has_no_retry_after() {
+        let env = OpenAiError::insufficient_quota("out of credit");
+        assert_eq!(env.status, 429);
+        assert_eq!(env.code.as_deref(), Some("insufficient_quota"));
+        assert_eq!(env.retry_after_secs, None);
+    }
+
+    #[test]
+    fn permanent_rejections_have_no_retry_after() {
+        assert_eq!(OpenAiError::invalid_api_key("nope").retry_after_secs, None);
+        assert_eq!(
+            OpenAiError::context_length_exceeded("too long").retry_after_secs,
+            None
+        );
+    }
+
+    #[test]
+    fn service_unavailable_retry_after_is_optional() {
+        assert_eq!(
+            OpenAiError::service_unavailable("recovering", Some(5)).retry_after_secs,
+            Some(5)
+        );
+        assert_eq!(
+            OpenAiError::service_unavailable("gone", None).retry_after_secs,
+            None
+        );
+    }
+}
--- a/crates/cortex-core/src/harness.rs
+++ b/crates/cortex-core/src/harness.rs
@@ -9,13 +9,13 @@ use async_trait::async_trait;
 use serde::{Deserialize, Serialize};

 /// Configuration for a harness instance on a neuron.
+///
+/// All current harnesses are in-process (candle); per-harness tuning
+/// (cache paths, device policies, etc.) lives in dedicated config
+/// blocks rather than on this struct.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct HarnessConfig {
    pub name: String,
-    /// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
-    pub endpoint: Option<String>,
-    /// Systemd unit name, if the harness is managed via systemd.
-    pub systemd_unit: Option<String>,
 }

 /// Health status of a harness process.
@@ -36,6 +36,44 @@ pub struct ModelSpec {
    pub devices: Option<Vec<u32>>,
 }

+/// Per-model token budget advertised by the catalogue or neuron.
+///
+/// `context` is the hard wall (the served max-seq-len).  `input` is the
+/// compaction trigger — when set, opencode treats it as "usable context =
+/// input − reserved".  When omitted, clients fall back to `context − output`.
+/// `output` is the maximum number of generation tokens.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLimit {
+    /// Hard wall — served max-seq-len in tokens.
+    pub context: usize,
+    /// Compaction trigger / usable input budget.  When absent clients fall
+    /// back to `context − output`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input: Option<usize>,
+    /// Maximum number of generation tokens.
+    pub output: usize,
+}
+
+/// Operator-set pricing in USD per 1M tokens.
+///
+/// Self-hosted deployments typically leave both at `0.0`.  Cache fields are
+/// optional — set when the backend supports a prefix-cache discount tier.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelCost {
+    /// USD per 1M input (prompt) tokens.
+    #[serde(default)]
+    pub input: f64,
+    /// USD per 1M output (completion) tokens.
+    #[serde(default)]
+    pub output: f64,
+    /// USD per 1M cache-hit tokens (optional).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_read: Option<f64>,
+    /// USD per 1M cache-write tokens (optional).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_write: Option<f64>,
+}
+
 /// A model as reported by a harness.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ModelInfo {
@@ -44,19 +82,54 @@ pub struct ModelInfo {
    pub status: String,
    pub devices: Vec<u32>,
    pub vram_used_mb: Option<u64>,
+    /// Modalities this loaded model supports. Today: `["text"]` for
+    /// text-only checkpoints, `["text", "vision"]` for vision-capable
+    /// ones (Stage B7). Clients like litellm / agent0 can gate
+    /// `image_url` submission on the advertised set.
+    ///
+    /// Optional in the wire format so older clients that don't read
+    /// it stay compatible. Default-empty for absent/older data, which
+    /// callers can interpret as "text".
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub capabilities: Vec<String>,
+
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Token budget advertised by the catalogue or discovered at load time.
+    /// `None` when neither the catalogue nor the loaded model can provide it.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// `true` when the model's tokenizer contains recognised tool-call
+    /// marker tokens (`<tool_call>` / `</tool_call>` convention).
+    #[serde(default)]
+    pub tool_call: bool,
+    /// `true` when the model's tokenizer contains recognised reasoning
+    /// marker tokens (`<think>` / `</think>` or similar).
+    #[serde(default)]
+    pub reasoning: bool,
 }

 /// What an inference harness must do, from neuron's perspective.
+///
+/// All current harnesses are in-process — they share neuron's address
+/// space and lifecycle. `start`/`stop` therefore default to no-ops; a
+/// future process-supervising harness would override them.
 #[async_trait]
 pub trait Harness: Send + Sync {
-    /// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
+    /// Human-readable name (e.g. "candle").
    fn name(&self) -> &str;

-    /// Start the harness process if it is not already running.
-    async fn start(&self, config: &HarnessConfig) -> Result<()>;
+    /// Start the harness. Default no-op for in-process harnesses.
+    async fn start(&self, _config: &HarnessConfig) -> Result<()> {
+        Ok(())
+    }

-    /// Stop the harness process gracefully.
-    async fn stop(&self) -> Result<()>;
+    /// Stop the harness. Default no-op for in-process harnesses.
+    async fn stop(&self) -> Result<()> {
+        Ok(())
+    }

    /// Health check. Returns the harness process status.
    async fn health(&self) -> HarnessHealth;
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -1,9 +1,14 @@
 pub mod anthropic;
+pub mod build_info;
 pub mod catalogue;
 pub mod config;
 pub mod discovery;
+pub mod entitlements;
+pub mod error_envelope;
 pub mod harness;
 pub mod metrics;
 pub mod node;
 pub mod openai;
+pub mod responses;
+pub mod source;
 pub mod translate;
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -1,3 +1,5 @@
+use crate::discovery::{ActivationStatus, DiscoveryResponse, ModelLoad};
+use crate::harness::{ModelCost, ModelLimit};
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
@@ -6,13 +8,30 @@ use std::collections::HashMap;
 #[derive(Debug, Clone)]
 pub struct NodeState {
    pub name: String,
-    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:13131").
    pub endpoint: String,
    pub healthy: bool,
    pub models: HashMap<String, ModelEntry>,
    /// Number of load/unload cycles since last process restart.
    pub lifecycle_cycles: u32,
    pub last_poll: Option<DateTime<Utc>>,
+    /// Result of the most recent successful `GET /discovery` against
+    /// this neuron. Cached forever once obtained — device topology is
+    /// invariant for a given neuron process. `None` until the first
+    /// successful poll. Used by the router and `/v1/models` to do
+    /// catalogue × topology feasibility checks.
+    pub discovery: Option<DiscoveryResponse>,
+    /// Last-seen pre-warm progress from this neuron's `/health`
+    /// endpoint. `None` until the first /health poll succeeds. The
+    /// `/v1/models` handler reads `in_progress` + `pending` from here
+    /// to synthesize `Loading` locations so clients see a catalogued
+    /// model that's mid-prewarm as "loading", not "missing".
+    pub activation: Option<ActivationStatus>,
+    /// Last-seen per-model admission load from this neuron's `/health`
+    /// (#53), keyed by model id. The router (#55) reads it to pick the
+    /// least-busy replica when a model is loaded on more than one neuron.
+    /// Empty until the first /health poll reports load.
+    pub model_load: HashMap<String, ModelLoad>,
 }

 /// A model registered on a node, with its runtime status.
@@ -24,25 +43,102 @@ pub struct ModelEntry {
    pub last_accessed: Option<DateTime<Utc>>,
    /// Estimated VRAM usage in MB when loaded.
    pub vram_estimate_mb: Option<u64>,
+    /// Modalities the loaded model advertises (e.g. `["text", "vision"]`),
+    /// copied verbatim from the neuron's `ModelInfo.capabilities` at poll
+    /// time. Empty when the neuron reports none. `#[serde(default)]` keeps
+    /// older persisted/serialised entries deserialisable.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
+    /// Runtime-detected capability flags from the neuron's `/models`
+    /// response (`ModelInfo`). `false` when the neuron predates these
+    /// fields or hasn't reported them yet.
+    #[serde(default)]
+    pub tool_call: bool,
+    #[serde(default)]
+    pub reasoning: bool,
+    /// Self-derived token budget the neuron computed for this loaded
+    /// model (#67), copied from `ModelInfo.limit` at poll time. `None`
+    /// when the neuron doesn't compute one (arch without a context
+    /// profile, or derivation disabled). This is the authoritative
+    /// source the gateway advertises — operator-declared catalogue
+    /// limits are no longer consulted.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
 }

 /// Model lifecycle status.
+///
+/// `Loading` is a gateway-side synthetic status: neurons never emit it
+/// on `/models` (that endpoint only knows about already-loaded handles).
+/// The gateway populates it from a neuron's `/health` activation
+/// snapshot so the unified `/v1/models` can distinguish "model is
+/// catalogued but no one has it" from "model is materialising on
+/// neuron N right now". Other status values are reported verbatim by
+/// neurons.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase")]
 pub enum ModelStatus {
    Loaded,
    Unloaded,
    Reloading,
+    Loading,
+    /// Reported by neuron while a poisoned model auto-recovers via
+    /// unload→reload (#17/#20). Temporarily unservable but NOT
+    /// evicted: the gateway holds the route, answers with a transient
+    /// retry error instead of 404, and must not race a second
+    /// placement elsewhere.
+    Recovering,
 }

 /// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
-/// Includes which node(s) host this model and their status.
+///
+/// The first four fields (`id`, `object`, `created`, `owned_by`) match
+/// OpenAI's `/v1/models` shape verbatim, so existing OpenAI-aware
+/// tooling deserialises this without custom code. The remaining fields
+/// are helexa-specific extensions — OpenAI clients ignore unknown
+/// fields and other consumers can read them for placement / debugging.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct CortexModelEntry {
    pub id: String,
+    /// Always `"model"` per OpenAI's contract.
    pub object: String,
-    /// Which nodes have this model (and their status).
+    /// Unix-second timestamp; cortex stamps this at response time.
+    pub created: u64,
+    /// OpenAI's "publisher" field — `"helexa"` for everything we serve.
+    pub owned_by: String,
+    /// True if any neuron currently has this model loaded. False for
+    /// catalogue entries that are feasible but not yet loaded.
+    pub loaded: bool,
+    /// Neurons whose discovered topology can satisfy this model's
+    /// catalogue placement constraints. Empty for models that are
+    /// loaded somewhere but not present in the catalogue (cortex has
+    /// no feasibility opinion on those).
+    pub feasible_on: Vec<String>,
+    /// Where this model is actually loaded right now. Subset of (or
+    /// disjoint from) `feasible_on` depending on whether the catalogue
+    /// covers this model.
    pub locations: Vec<ModelLocation>,
+    /// Modalities this model supports (e.g. `["text", "vision"]`): seeded
+    /// from the catalogue profile's capabilities when available, then
+    /// unioned with the runtime-detected values advertised by every neuron
+    /// that has the model loaded. Empty only when neither source reports
+    /// any.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Per-model token budget from the catalogue profile or discovered
+    /// at load time. `None` when neither source provides it.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// `true` when any neuron reports this model supports tool calls.
+    #[serde(default)]
+    pub tool_call: bool,
+    /// `true` when any neuron reports this model supports reasoning tokens.
+    #[serde(default)]
+    pub reasoning: bool,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/crates/cortex-core/src/openai.rs
+++ b/crates/cortex-core/src/openai.rs
@@ -3,7 +3,7 @@
 //! These are a subset sufficient for chat completions (streaming + non-streaming).
 //! Fields not relevant to proxying are captured as `serde_json::Value` via
 //! `#[serde(flatten)]` so we forward them without needing to enumerate every
-//! extension field mistral.rs supports.
+//! extension field a backend might support.

 use serde::{Deserialize, Serialize};
 use serde_json::Value;
@@ -22,7 +22,7 @@ pub struct ChatCompletionRequest {
    pub max_tokens: Option<u64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stream: Option<bool>,
-    /// All other fields (tools, response_format, mistral.rs extensions, etc.)
+    /// All other fields (tools, response_format, backend extensions, etc.)
    #[serde(flatten)]
    pub extra: Value,
 }
@@ -71,10 +71,18 @@ pub struct ChatCompletionChoice {

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChatCompletionChunk {
+    #[serde(default)]
    pub id: String,
+    #[serde(default)]
    pub object: String,
+    #[serde(default)]
    pub created: u64,
+    // Lenient deserialization throughout: the gateway parses chunks
+    // from arbitrary OpenAI-compatible upstreams, and some engines
+    // omit fields on special frames (e.g. usage-only final chunks).
+    #[serde(default)]
    pub model: String,
+    #[serde(default)]
    pub choices: Vec<ChunkChoice>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub usage: Option<Usage>,
@@ -98,6 +106,35 @@ pub struct Usage {
     pub prompt_tokens: u64,
     pub completion_tokens: u64,
     pub total_tokens: u64,
+    /// OpenAI-standard breakdown of `completion_tokens`. Optional and
+    /// additive — clients that don't read it are unaffected. Carries
+    /// `reasoning_tokens` for reasoning models (a sub-count of
+    /// `completion_tokens`, never added into `total_tokens`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub completion_tokens_details: Option<CompletionTokensDetails>,
+    /// OpenAI-standard breakdown of `prompt_tokens`. Populated once
+    /// prompt caching lands (#11); `None` until then.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub prompt_tokens_details: Option<PromptTokensDetails>,
+}
+
+/// Sub-counts of `Usage::completion_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CompletionTokensDetails {
+    /// Tokens generated inside the model's reasoning span. Defaults to `0`
+    /// when an upstream's details object omits the key.
+    #[serde(default)]
+    pub reasoning_tokens: u64,
+}
+
+/// Sub-counts of `Usage::prompt_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromptTokensDetails {
+    /// Prompt tokens served from cache (cache-read rate). Populated
+    /// once prompt caching lands (#11). Defaults to `0` when an upstream's
+    /// details object omits the key.
+    #[serde(default)]
+    pub cached_tokens: u64,
 }

 // ── Models list response ─────────────────────────────────────────────
--- a/crates/cortex-core/src/responses.rs
+++ b/crates/cortex-core/src/responses.rs
@@ -0,0 +1,372 @@
+//! OpenAI Responses API (`POST /v1/responses`) envelope types.
+//!
+//! This is OpenAI's newer chat surface, distinct from
+//! `/v1/chat/completions` in three ways that matter for us:
+//!
+//! 1. **Input shape**. Instead of a `messages` array, the request
+//!    carries `input` — either a plain string (single user turn)
+//!    or an array of typed items (messages, function calls,
+//!    function-call outputs, reasoning blocks, …).
+//! 2. **Output shape**. The response carries a single `output`
+//!    array of items, each typed. We always emit one
+//!    `OutputItem::Message` containing the assistant's reply (plus,
+//!    when we get there, separate `function_call` items).
+//! 3. **Streaming events**. Where chat completions stream
+//!    structurally-identical `chat.completion.chunk` frames over
+//!    `data:` lines, Responses streams *named* events
+//!    (`response.created`, `response.output_text.delta`,
+//!    `response.completed`, …) over `event:` + `data:` SSE pairs.
+//!    The wire projector in `neuron::wire::openai_responses` builds
+//!    these from the same [`crate::openai`]-shaped
+//!    `InferenceEvent` stream the chat projector consumes.
+//!
+//! Scope limits for this first cut:
+//!
+//! - **`previous_response_id` is rejected at parse time**. Stateful
+//!   chained conversations need a persistence layer we don't have.
+//! - **Reasoning items are accepted-and-ignored** (no Qwen3
+//!   `<think>` routing yet). Audio and embedded resources are
+//!   rejected as unsupported.
+//! - **Tool calls** (function_call / function_call_output) are
+//!   carried as round-trip types but the candle harness doesn't
+//!   emit them yet — wired so the surface is in place for the
+//!   day we add proper tool-call extraction.
+
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+
+// ── Request ──────────────────────────────────────────────────────────
+
+/// Request body for `POST /v1/responses`; only `model` and `input` are required.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ResponsesRequest {
+    pub model: String,
+    pub input: ResponsesInput,
+    /// System-prompt-style instructions, kept apart from `input` by
+    /// the Responses API so callers need not hand-build a `system`
+    /// message item.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+    #[serde(default)]
+    pub stream: bool,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub max_output_tokens: Option<u64>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f64>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f64>,
+    /// Id of an earlier response to chain from. Responses aren't stored
+    /// server-side yet, so the handler answers 400 when this is `Some`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub previous_response_id: Option<String>,
+    /// Every field not modelled above (tools, tool_choice, reasoning,
+    /// response_format, …), captured via `flatten` so that
+    /// forward-compatible requests parse instead of being rejected.
+    #[serde(flatten)]
+    pub extra: Value,
+}
+
+/// The request's `input`: a bare string or a list of typed items.
+/// Untagged, so serde picks the variant from the JSON shape —
+/// `"input": "hi"` is `Text`, `"input": [{...}]` is `Items`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum ResponsesInput {
+    Text(String),
+    Items(Vec<ResponsesInputItem>),
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ResponsesInputItem {
+    /// One conversation turn (`"type": "message"`): user, assistant or system.
+    Message {
+        role: String,
+        content: ResponsesMessageContent,
+    },
+    /// A tool call the assistant made (`"type": "function_call"`).
+    /// Round-trip only — neuron doesn't synthesise these yet.
+    FunctionCall {
+        call_id: String,
+        name: String,
+        arguments: String,
+    },
+    /// A tool result fed back to the model (`"type": "function_call_output"`).
+    FunctionCallOutput { call_id: String, output: String },
+    /// Reasoning item as emitted by o-series models. Parsed so the
+    /// request is accepted, but not forwarded to the model — neuron's
+    /// candle path doesn't surface reasoning separately yet.
+    Reasoning {
+        #[serde(default)]
+        content: Vec<Value>,
+    },
+}
+
+/// Content of a `Message` item: a plain string or an array of typed
+/// parts (untagged), mirroring the chat-completions Parts shape.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum ResponsesMessageContent {
+    Text(String),
+    Parts(Vec<ResponsesContentPart>),
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ResponsesContentPart {
+    /// Text supplied in a user / system turn (`"type": "input_text"`).
+    InputText { text: String },
+    /// Image input. `image_url` holds either a remote URL or a
+    /// `data:image/png;base64,…` URI; the request translator passes
+    /// the string through unchanged.
+    InputImage {
+        image_url: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        detail: Option<String>,
+    },
+    /// Text from an earlier assistant turn — only needed when the
+    /// caller replays that turn to continue a conversation by hand
+    /// (i.e. without `previous_response_id`).
+    OutputText {
+        text: String,
+        #[serde(default, skip_serializing_if = "Vec::is_empty")]
+        annotations: Vec<Value>,
+    },
+}
+
+// ── Response (non-streaming) ─────────────────────────────────────────
+
+/// Response body returned by `POST /v1/responses`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ResponsesResponse {
+    pub id: String,
+    /// Object type tag; always `"response"`.
+    pub object: String,
+    pub created_at: u64,
+    /// Lifecycle state: `"completed"`, `"incomplete"`, or `"in_progress"`
+    /// on the initial event of a streaming response.
+    pub status: String,
+    pub model: String,
+    pub output: Vec<ResponsesOutputItem>,
+    /// Token accounting; set on completion, `None` (and omitted) while streaming.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub usage: Option<ResponsesUsage>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ResponsesOutputItem {
+    Message {
+        id: String,
+        /// Speaker role; always `"assistant"` for model output.
+        role: String,
+        /// Content parts of the reply. Today this is always a single
+        /// `OutputText`; further part kinds (e.g. generated images)
+        /// would be added here.
+        content: Vec<ResponsesOutputContent>,
+        /// Status of this item: `"in_progress"` while its content
+        /// parts stream, `"completed"` afterwards (the serde default).
+        #[serde(default = "default_item_status")]
+        status: String,
+    },
+    /// Placeholder until tool-call extraction lands. Same fields as
+    /// `ResponsesInputItem::FunctionCall`, plus `id` and `status`.
+    FunctionCall {
+        id: String,
+        call_id: String,
+        name: String,
+        arguments: String,
+        #[serde(default = "default_item_status")]
+        status: String,
+    },
+}
+
+fn default_item_status() -> String {
+    String::from("completed")
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum ResponsesOutputContent {
+    OutputText {
+        text: String,
+        /// Citations / inline annotations. Always empty today (and then
+        /// omitted on the wire); reserved for web search / file search.
+        #[serde(default, skip_serializing_if = "Vec::is_empty")]
+        annotations: Vec<Value>,
+    },
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ResponsesUsage {
+    pub input_tokens: u64,
+    pub output_tokens: u64,
+    pub total_tokens: u64,
+    /// OpenAI-standard breakdown of `output_tokens`; optional and
+    /// additive. Holds `reasoning_tokens` for reasoning models — a
+    /// sub-count of `output_tokens` that is never added to `total_tokens`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub output_tokens_details: Option<OutputTokensDetails>,
+    /// OpenAI-standard breakdown of `input_tokens`. Stays `None`
+    /// until prompt caching lands (#11).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input_tokens_details: Option<InputTokensDetails>,
+}
+
+/// Breakdown of `ResponsesUsage::output_tokens` into its sub-counts.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OutputTokensDetails {
+    /// Output tokens the model produced inside its reasoning span.
+    pub reasoning_tokens: u64,
+}
+
+/// Breakdown of `ResponsesUsage::input_tokens` into its sub-counts.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct InputTokensDetails {
+    /// Input tokens that were served from cache (billed at the
+    /// cache-read rate). Populated once prompt caching lands (#11).
+    pub cached_tokens: u64,
+}
+
+// ── Streaming event names ────────────────────────────────────────────
+
+/// Names of the SSE events the projector emits. Kept as constants
+/// so the projector and the wire shape cannot drift apart through
+/// a typo in a string literal. The values themselves are fixed by
+/// OpenAI's published Responses API.
+pub mod events {
+    pub const CREATED: &str = "response.created";
+    /// Sent after `response.created` and before the first
+    /// output-item event, meaning "request validated, model is
+    /// generating". Some client UIs use it to tell the "warming up"
+    /// state apart from "streaming tokens".
+    pub const IN_PROGRESS: &str = "response.in_progress";
+    pub const OUTPUT_ITEM_ADDED: &str = "response.output_item.added";
+    pub const CONTENT_PART_ADDED: &str = "response.content_part.added";
+    pub const OUTPUT_TEXT_DELTA: &str = "response.output_text.delta";
+    pub const OUTPUT_TEXT_DONE: &str = "response.output_text.done";
+    pub const CONTENT_PART_DONE: &str = "response.content_part.done";
+    pub const OUTPUT_ITEM_DONE: &str = "response.output_item.done";
+    pub const COMPLETED: &str = "response.completed";
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn deserialises_input_string_form() {
+        let raw = r#"{"model": "m", "input": "hello"}"#;
+        let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
+        match req.input {
+            ResponsesInput::Text(s) => assert_eq!(s, "hello"),
+            other => panic!("expected Text, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn deserialises_input_items_form() {
+        let raw = r#"{
+            "model": "m",
+            "input": [
+                {"type": "message", "role": "user", "content": "hi"}
+            ]
+        }"#;
+        let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
+        match req.input {
+            ResponsesInput::Items(items) => {
+                assert_eq!(items.len(), 1);
+                match &items[0] {
+                    ResponsesInputItem::Message { role, content } => {
+                        assert_eq!(role, "user");
+                        match content {
+                            ResponsesMessageContent::Text(t) => assert_eq!(t, "hi"),
+                            other => panic!("expected Text content, got {other:?}"),
+                        }
+                    }
+                    other => panic!("expected Message item, got {other:?}"),
+                }
+            }
+            other => panic!("expected Items, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn deserialises_input_with_image() {
+        let raw = r#"{
+            "model": "m",
+            "input": [
+                {"type": "message", "role": "user", "content": [
+                    {"type": "input_text", "text": "what is this"},
+                    {"type": "input_image", "image_url": "data:image/png;base64,AAA="}
+                ]}
+            ]
+        }"#;
+        let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
+        let items = match req.input {
+            ResponsesInput::Items(i) => i,
+            other => panic!("expected Items, got {other:?}"),
+        };
+        let parts = match &items[0] {
+            ResponsesInputItem::Message {
+                content: ResponsesMessageContent::Parts(p),
+                ..
+            } => p,
+            other => panic!("expected Parts, got {other:?}"),
+        };
+        assert_eq!(parts.len(), 2);
+        assert!(matches!(
+            &parts[0],
+            ResponsesContentPart::InputText { text } if text == "what is this"
+        ));
+        assert!(matches!(
+            &parts[1],
+            ResponsesContentPart::InputImage { image_url, .. }
+                if image_url == "data:image/png;base64,AAA="
+        ));
+    }
+
+    #[test]
+    fn unknown_fields_round_trip_via_extra() {
+        let raw = r#"{
+            "model": "m",
+            "input": "hi",
+            "tools": [{"type": "web_search"}],
+            "reasoning": {"effort": "medium"}
+        }"#;
+        let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
+        assert!(req.extra.get("tools").is_some());
+        assert!(req.extra.get("reasoning").is_some());
+    }
+
+    #[test]
+    fn response_round_trips_through_serde() {
+        let r = ResponsesResponse {
+            id: "resp_1".into(),
+            object: "response".into(),
+            created_at: 1700,
+            status: "completed".into(),
+            model: "m".into(),
+            output: vec![ResponsesOutputItem::Message {
+                id: "msg_1".into(),
+                role: "assistant".into(),
+                content: vec![ResponsesOutputContent::OutputText {
+                    text: "hi there".into(),
+                    annotations: vec![],
+                }],
+                status: "completed".into(),
+            }],
+            usage: Some(ResponsesUsage {
+                input_tokens: 5,
+                output_tokens: 3,
+                total_tokens: 8,
+                output_tokens_details: None,
+                input_tokens_details: None,
+            }),
+        };
+        let json = serde_json::to_string(&r).unwrap();
+        let parsed: ResponsesResponse = serde_json::from_str(&json).unwrap();
+        assert_eq!(parsed.id, "resp_1");
+        assert_eq!(parsed.output.len(), 1);
+    }
+}
--- a/crates/cortex-core/src/source.rs
+++ b/crates/cortex-core/src/source.rs
@@ -0,0 +1,267 @@
+//! Scheme-qualified model identifiers.
+//!
+//! cortex/neuron historically resolves every model id through hf-hub
+//! against `https://huggingface.co`. Helexa is adding an EU-hosted
+//! registry (`registry.helexa.ai`) alongside HF — both speak the same
+//! HF-compatible wire format, but the bytes, jurisdiction, and trust
+//! root differ. Model ids therefore need a scheme:
+//!
+//!   - `huggingface:Qwen/Qwen3.6-27B`         — HF-hosted bytes
+//!   - `helexa:Qwen/Qwen3.6-27B-Uncensored`  — helexa registry bytes
+//!   - `helexa:SomeOperator/CustomFinetune`  — operator publishing
+//!     under the helexa namespace; same scheme handles all `org/name`
+//!     pairs hosted in that registry.
+//!
+//! Bare `org/name` parses with an empty scheme; the caller (typically
+//! a harness) substitutes its configured default scheme so existing
+//! configs keep working through the transition.
+
+use serde::{Deserialize, Serialize};
+use std::fmt;
+use std::str::FromStr;
+
+/// A model id split into `scheme`, `org` and `name`. Parsing a bare
+/// `org/name` leaves `scheme` empty — resolve it with
+/// `with_default_scheme` (or test `is_scheme_unset`) before use.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct ModelSourceId {
+    pub scheme: String,
+    pub org: String,
+    pub name: String,
+}
+
+/// Reasons `ModelSourceId::from_str` can fail. Every variant except `Empty`
+/// carries the offending input so logs and API errors can echo it back.
+#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
+pub enum ParseError {
+    #[error("empty model id")]
+    Empty,
+    #[error("model id '{0}' is missing the '/' between org and name")]
+    MissingSlash(String),
+    #[error("model id '{0}' has an empty scheme before ':'")]
+    EmptyScheme(String),
+    #[error("model id '{0}' has an empty org")]
+    EmptyOrg(String),
+    #[error("model id '{0}' has an empty name")]
+    EmptyName(String),
+    #[error("model id '{0}' has a scheme containing '/' which is reserved for org/name")]
+    SchemeContainsSlash(String),
+    #[error("model id '{0}' has a name containing ':' which is reserved for the scheme prefix")]
+    NameContainsColon(String),
+}
+
+impl ModelSourceId {
+    /// Assemble an id from parts the caller has already validated —
+    /// nothing is checked here. Untrusted text (operator config, API
+    /// input) should go through `FromStr` instead.
+    pub fn new(scheme: impl Into<String>, org: impl Into<String>, name: impl Into<String>) -> Self {
+        let scheme = scheme.into();
+        let org = org.into();
+        let name = name.into();
+        Self { scheme, org, name }
+    }
+
+    /// Whether the scheme is still empty, as it is after parsing a
+    /// bare `org/name`. Such ids need `with_default_scheme` applied
+    /// before they are resolved against a registry.
+    pub fn is_scheme_unset(&self) -> bool {
+        self.scheme.is_empty()
+    }
+
+    /// Fill in `default` when no scheme is set; an explicit scheme is
+    /// left untouched. Consumes and returns `self` for chaining:
+    /// `id.parse::<ModelSourceId>()?.with_default_scheme("huggingface")`.
+    pub fn with_default_scheme(mut self, default: &str) -> Self {
+        if self.is_scheme_unset() {
+            self.scheme = default.to_owned();
+        }
+        self
+    }
+
+    /// `org/name` with the scheme left off — the repo path an hf-hub
+    /// `Api::model(...)` call expects, whichever scheme/endpoint is
+    /// being used.
+    pub fn repo_path(&self) -> String {
+        [self.org.as_str(), self.name.as_str()].join("/")
+    }
+}
+
+impl fmt::Display for ModelSourceId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Bare ids print without a `scheme:` prefix, so parse → print round-trips.
+        if !self.scheme.is_empty() {
+            write!(f, "{}:", self.scheme)?;
+        }
+        write!(f, "{}/{}", self.org, self.name)
+    }
+}
+
+impl FromStr for ModelSourceId {
+    type Err = ParseError;
+
+    /// Parse `scheme:org/name`, or bare `org/name` (empty scheme).
+    /// Fails with the `ParseError` naming what is malformed.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        if s.is_empty() {
+            return Err(ParseError::Empty);
+        }
+        // Scheme split on the *first* colon only; any later colon falls into
+        // org/name and is rejected below (`:` is reserved for the scheme).
+        let (scheme, rest) = match s.split_once(':') {
+            Some((scheme, rest)) => {
+                if scheme.is_empty() {
+                    return Err(ParseError::EmptyScheme(s.to_string()));
+                }
+                if scheme.contains('/') {
+                    return Err(ParseError::SchemeContainsSlash(s.to_string()));
+                }
+                (scheme.to_string(), rest)
+            }
+            None => (String::new(), s),
+        };
+        let (org, name) = rest
+            .split_once('/')
+            .ok_or_else(|| ParseError::MissingSlash(s.to_string()))?;
+        if org.is_empty() {
+            return Err(ParseError::EmptyOrg(s.to_string()));
+        }
+        if name.is_empty() {
+            return Err(ParseError::EmptyName(s.to_string()));
+        }
+        // A stray colon in the org (`hf:a:b/c`) is as invalid as one in the
+        // name; both report `NameContainsColon` (there is no org variant).
+        if org.contains(':') || name.contains(':') {
+            return Err(ParseError::NameContainsColon(s.to_string()));
+        }
+        let (org, name) = (org.to_string(), name.to_string());
+        Ok(Self { scheme, org, name })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_qualified() {
+        let id: ModelSourceId = "huggingface:Qwen/Qwen3.6-27B".parse().unwrap();
+        assert_eq!(id.scheme, "huggingface");
+        assert_eq!(id.org, "Qwen");
+        assert_eq!(id.name, "Qwen3.6-27B");
+        assert_eq!(id.repo_path(), "Qwen/Qwen3.6-27B");
+        assert!(!id.is_scheme_unset());
+    }
+
+    #[test]
+    fn parses_helexa_scheme() {
+        let id: ModelSourceId = "helexa:SomeOperator/Qwen3.6-27B-Uncensored"
+            .parse()
+            .unwrap();
+        assert_eq!(id.scheme, "helexa");
+        assert_eq!(id.org, "SomeOperator");
+        assert_eq!(id.name, "Qwen3.6-27B-Uncensored");
+    }
+
+    #[test]
+    fn parses_bare_id_with_empty_scheme() {
+        let id: ModelSourceId = "Qwen/Qwen3-30B-A3B-Instruct".parse().unwrap();
+        assert_eq!(id.scheme, "");
+        assert_eq!(id.org, "Qwen");
+        assert_eq!(id.name, "Qwen3-30B-A3B-Instruct");
+        assert!(id.is_scheme_unset());
+    }
+
+    #[test]
+    fn substitutes_default_scheme_only_when_unset() {
+        let id: ModelSourceId = "Qwen/Q3".parse().unwrap();
+        assert_eq!(id.with_default_scheme("huggingface").scheme, "huggingface");
+
+        let id: ModelSourceId = "helexa:Qwen/Q3".parse().unwrap();
+        assert_eq!(
+            id.with_default_scheme("huggingface").scheme,
+            "helexa",
+            "default substitution must not override an explicit scheme"
+        );
+    }
+
+    #[test]
+    fn display_roundtrips_qualified_id() {
+        let s = "helexa:Helexa/Qwen3.6-27B";
+        let id: ModelSourceId = s.parse().unwrap();
+        assert_eq!(id.to_string(), s);
+    }
+
+    #[test]
+    fn display_roundtrips_bare_id() {
+        let s = "Qwen/Q3";
+        let id: ModelSourceId = s.parse().unwrap();
+        assert_eq!(id.to_string(), s);
+    }
+
+    #[test]
+    fn rejects_empty() {
+        assert_eq!("".parse::<ModelSourceId>().unwrap_err(), ParseError::Empty);
+    }
+
+    #[test]
+    fn rejects_missing_slash() {
+        match "Qwen".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::MissingSlash(s) => assert_eq!(s, "Qwen"),
+            other => panic!("expected MissingSlash, got {other:?}"),
+        }
+        match "huggingface:Qwen".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::MissingSlash(s) => assert_eq!(s, "huggingface:Qwen"),
+            other => panic!("expected MissingSlash, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_empty_scheme() {
+        match ":Qwen/Q3".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyScheme(s) => assert_eq!(s, ":Qwen/Q3"),
+            other => panic!("expected EmptyScheme, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_scheme_with_slash() {
+        match "hugg/ingface:Q/N".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::SchemeContainsSlash(s) => assert_eq!(s, "hugg/ingface:Q/N"),
+            other => panic!("expected SchemeContainsSlash, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_empty_org_or_name() {
+        match "huggingface:/N".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyOrg(_) => {}
+            other => panic!("expected EmptyOrg, got {other:?}"),
+        }
+        match "huggingface:Q/".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyName(_) => {}
+            other => panic!("expected EmptyName, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_name_with_colon() {
+        match "huggingface:Q/N:weird"
+            .parse::<ModelSourceId>()
+            .unwrap_err()
+        {
+            ParseError::NameContainsColon(s) => assert_eq!(s, "huggingface:Q/N:weird"),
+            other => panic!("expected NameContainsColon, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn serde_roundtrips_via_struct() {
+        // We serialize as a struct (scheme/org/name fields) so the
+        // shape is self-describing in API payloads. Callers that want
+        // the compact `scheme:org/name` string use `Display`/`FromStr`.
+        let id = ModelSourceId::new("helexa", "Helexa", "Qwen3.6-27B");
+        let json = serde_json::to_string(&id).unwrap();
+        let back: ModelSourceId = serde_json::from_str(&json).unwrap();
+        assert_eq!(back, id);
+    }
+}
--- a/crates/cortex-core/src/translate.rs
+++ b/crates/cortex-core/src/translate.rs
--- a/crates/cortex-gateway/Cargo.toml
+++ b/crates/cortex-gateway/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 cortex-core.workspace = true
+async-trait.workspace = true
 tokio.workspace = true
 axum.workspace = true
 tower.workspace = true
@@ -24,6 +25,7 @@ tokio-stream.workspace = true
 eventsource-stream.workspace = true
 bytes = "1"
 urlencoding = "2"
+url = "2"

 [dev-dependencies]
 tokio = { workspace = true, features = ["test-util"] }
--- a/crates/cortex-gateway/src/anthropic_sse.rs
+++ b/crates/cortex-gateway/src/anthropic_sse.rs
@@ -0,0 +1,235 @@
+//! Streaming Anthropic SSE translation (#24).
+//!
+//! The `/v1/messages` handler translates the request envelope to
+//! OpenAI before proxying (see `cortex_core::translate`); this module
+//! completes the round trip for `stream: true` — the upstream OpenAI
+//! SSE stream is re-framed, event by event, into Anthropic's
+//! `message_start` / `content_block_*` / `message_delta` /
+//! `message_stop` sequence as it arrives. True streaming: each
+//! upstream chunk is translated and forwarded immediately; nothing is
+//! buffered beyond the current SSE event's bytes.
+//!
+//! The translation state machine itself is pure and lives in
+//! [`cortex_core::translate::AnthropicStreamTranslator`]; this module
+//! owns the wire concerns — splitting the upstream byte stream into
+//! SSE events, parsing `data:` payloads, and framing the translated
+//! events as `event: <name>\ndata: <json>\n\n`.
+
+use axum::body::Body;
+use axum::http::StatusCode;
+use axum::response::Response;
+use bytes::Bytes;
+use cortex_core::openai::ChatCompletionChunk;
+use cortex_core::translate::AnthropicStreamTranslator;
+use futures::StreamExt;
+use tokio_stream::wrappers::ReceiverStream;
+
+/// POST `openai_body` to `{endpoint}/v1/chat/completions` and stream the
+/// reply back re-framed as Anthropic SSE. A failed send or a non-2xx upstream
+/// status returns an Anthropic-shaped JSON error instead. `usage_sink`, when
+/// given, is called once as the pump ends with the last prompt/completion
+/// token counts seen on the stream (`(0, 0)` if none were seen).
+pub async fn stream_translated(
+    client: &reqwest::Client,
+    endpoint: &str,
+    openai_body: axum::body::Bytes,
+    model_id: &str,
+    node_name: &str,
+    inbound_headers: &axum::http::HeaderMap,
+    usage_sink: Option<crate::metering::UsageSink>,
+) -> Response {
+    let url = format!("{endpoint}/v1/chat/completions");
+    tracing::info!(
+        handler = "anthropic_messages",
+        model = %model_id,
+        node = %node_name,
+        url = %url,
+        "proxying streaming request (anthropic SSE translation)"
+    );
+
+    let request = crate::auth::forward_principal_headers(
+        client
+            .post(&url)
+            .header("content-type", "application/json")
+            .body(openai_body),
+        inbound_headers,
+    );
+    let upstream = match request.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            tracing::warn!(
+                handler = "anthropic_messages",
+                node = %node_name,
+                url = %url,
+                error = %e,
+                "anthropic stream: upstream request failed"
+            );
+            return anthropic_error(StatusCode::BAD_GATEWAY, "upstream request failed");
+        }
+    };
+
+    let status = upstream.status();
+    if !status.is_success() {
+        tracing::warn!(
+            handler = "anthropic_messages",
+            node = %node_name,
+            url = %url,
+            status = status.as_u16(),
+            "anthropic stream: upstream returned non-2xx"
+        );
+        return anthropic_error(
+            StatusCode::from_u16(status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY),
+            "upstream returned an error",
+        );
+    }
+
+    // Bounded channel: a slow client back-pressures the pump task,
+    // which back-pressures the upstream read — same propagation
+    // discipline as neuron's own projectors.
+    let (tx, rx) = tokio::sync::mpsc::channel::<Result<Bytes, std::convert::Infallible>>(32);
+    let node = node_name.to_string();
+    let model = model_id.to_string();
+    tokio::spawn(async move {
+        let mut upstream = upstream.bytes_stream();
+        let mut translator = AnthropicStreamTranslator::new();
+        let mut buf: Vec<u8> = Vec::new();
+        let mut done = false;
+        // Wire-debug accounting for the stream summary emitted at the
+        // end: did the model emit a structured tool call, what was the
+        // final finish_reason, and how many upstream frames did we see.
+        let mut saw_tool_call = false;
+        let mut last_finish: Option<String> = None;
+        let mut frames = 0u64;
+        // Engine-truth usage for metering (#51), scanned from the upstream
+        // frames (neuron emits a final `usage` object on the stream, #48).
+        let mut usage_prompt = 0u64;
+        let mut usage_completion = 0u64;
+
+        'outer: while let Some(block) = upstream.next().await {
+            let block = match block {
+                Ok(b) => b,
+                Err(e) => {
+                    tracing::warn!(node = %node, error = %e, "anthropic stream: upstream read failed mid-stream");
+                    break;
+                }
+            };
+            // Events end at a blank line; drop CRs so CRLF framing also yields `\n\n`.
+            buf.extend(block.iter().copied().filter(|&b| b != b'\r'));
+            while let Some(pos) = find_event_boundary(&buf) {
+                let event: Vec<u8> = buf.drain(..pos + 2).collect();
+                let text = String::from_utf8_lossy(&event);
+                for line in text.lines() {
+                    let Some(data) = line.strip_prefix("data:") else {
+                        continue;
+                    };
+                    let data = data.trim();
+                    if data == "[DONE]" {
+                        done = true;
+                        if !send_frames(&tx, translator.finish()).await {
+                            break 'outer;
+                        }
+                        continue;
+                    }
+                    tracing::trace!(node = %node, frame = %data, "anthropic stream: upstream frame");
+                    // Capture usage for metering before translation — the
+                    // usage object rides on a late frame (often after the
+                    // last content delta).
+                    if let Some(p) = crate::proxy::last_count_for(data, "prompt_tokens") {
+                        usage_prompt = p;
+                    }
+                    if let Some(c) = crate::proxy::last_count_for(data, "completion_tokens") {
+                        usage_completion = c;
+                    }
+                    let Ok(chunk) = serde_json::from_str::<ChatCompletionChunk>(data) else {
+                        tracing::debug!(node = %node, "anthropic stream: unparsable upstream frame skipped");
+                        continue;
+                    };
+                    frames += 1;
+                    saw_tool_call |= chunk
+                        .choices
+                        .iter()
+                        .any(|c| c.delta.get("tool_calls").is_some());
+                    if let Some(fr) = chunk.choices.iter().find_map(|c| c.finish_reason.clone()) {
+                        last_finish = Some(fr);
+                    }
+                    if !send_frames(&tx, translator.on_chunk(&chunk)).await {
+                        break 'outer;
+                    }
+                }
+            }
+        }
+        // Upstream ended without [DONE] (error or truncation): still
+        // close the Anthropic event sequence so clients aren't left
+        // with an unterminated message.
+        if !done {
+            let _ = send_frames(&tx, translator.finish()).await;
+        }
+        // Stream summary: the streaming counterpart to the non-streaming
+        // handler's "upstream response" line. `upstream_tool_calls =
+        // false` on a tools-bearing request is the fingerprint of the
+        // model improvising an unparsed tool-call format.
+        tracing::debug!(
+            wire = "anthropic",
+            model = %model,
+            node = %node,
+            frames,
+            upstream_tool_calls = saw_tool_call,
+            finish_reason = ?last_finish,
+            terminated = done,
+            "anthropic stream complete"
+        );
+
+        // Settle metering with the observed usage (#51). Runs on every exit
+        // path of the pump — clean end, early break, or upstream error — so
+        // the reservation is always resolved. `(0, 0)` when no usage frame
+        // was seen, which releases without recording spend.
+        if let Some(sink) = usage_sink {
+            sink(usage_prompt, usage_completion);
+        }
+    });
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header("content-type", "text/event-stream")
+        .header("cache-control", "no-cache")
+        .body(Body::from_stream(ReceiverStream::new(rx)))
+        .unwrap_or_else(|_| {
+            anthropic_error(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                "failed to build response",
+            )
+        })
+}
+
+/// Index of the first `\n\n` pair in `buf` — the end of the first complete SSE event — if any.
+fn find_event_boundary(buf: &[u8]) -> Option<usize> {
+    buf.windows(2).position(|pair| pair[0] == b'\n' && pair[1] == b'\n')
+}
+
+/// Serialise each `(event name, payload)` pair as one SSE frame and push
+/// it down `tx` in order. Yields `false` once the receiver has been dropped.
+async fn send_frames(
+    tx: &tokio::sync::mpsc::Sender<Result<Bytes, std::convert::Infallible>>,
+    events: Vec<(String, serde_json::Value)>,
+) -> bool {
+    for (name, payload) in events {
+        let wire = Bytes::from(format!("event: {name}\ndata: {payload}\n\n"));
+        let Ok(()) = tx.send(Ok(wire)).await else {
+            return false;
+        };
+    }
+    true
+}
+
+/// Anthropic-shaped error body (`{"type":"error","error":{...}}`).
+fn anthropic_error(status: StatusCode, message: &str) -> Response {
+    let body = serde_json::json!({
+        "type": "error",
+        "error": { "type": "api_error", "message": message }
+    });
+    Response::builder()
+        .status(status)
+        .header("content-type", "application/json")
+        .body(Body::from(body.to_string()))
+        .expect("static error response must build")
+}
--- a/crates/cortex-gateway/src/auth.rs
+++ b/crates/cortex-gateway/src/auth.rs
@@ -0,0 +1,133 @@
+//! API-key authentication + principal resolution (#49).
+//!
+//! Identity rides standard bearer auth only — `Authorization: Bearer <key>`
+//! — which is what keeps every tier OpenAI-compatible by construction (no
+//! custom required headers or body fields, per #47). The middleware resolves
+//! the key to a [`Principal`] via the [`EntitlementProvider`], carries it in
+//! the request extensions for cortex-side metering/enforcement (#51/#52), and
+//! stamps it as internal headers on the request so it reaches neuron, which
+//! trusts cortex's assertion over WireGuard (#54).
+//!
+//! Anti-spoofing: any client-supplied principal header is **stripped** before
+//! the authoritative value is stamped, so a client can never assert a
+//! principal it didn't authenticate as.
+//!
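+//! Rejection contract (#63): under `require_auth`, a missing key or a present
+//! but unresolvable key yields `401 invalid_api_key` in the #60 envelope.
+//! Without `require_auth` (allow-anonymous, the default) an unresolvable key
+//! is ignored and the request is served anonymously.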
+
+use crate::error::envelope_response;
+use crate::state::CortexState;
+use axum::extract::{Request, State};
+use axum::http::header::AUTHORIZATION;
+use axum::http::{HeaderMap, HeaderValue};
+use axum::middleware::Next;
+use axum::response::Response;
+use cortex_core::entitlements::{HEADER_ACCOUNT_ID, HEADER_KEY_ID};
+use cortex_core::error_envelope::OpenAiError;
+use std::sync::Arc;
+
+/// Whether `path` is exempt from authentication. Only the liveness/readiness
+/// probe paths (`/health` and `/`) qualify; every other path goes through
+/// key resolution.
+fn is_public(path: &str) -> bool {
+    matches!(path, "/health" | "/")
+}
+
+/// Pull the bearer token out of the `Authorization` header.
+///
+/// Yields `None` when the header is absent, is not valid visible ASCII, has
+/// no space separating scheme from credentials, uses a scheme other than
+/// `Bearer` (compared case-insensitively, per RFC 7235), or carries an empty
+/// token once surrounding whitespace is trimmed.
+fn parse_bearer(headers: &HeaderMap) -> Option<String> {
+    let value = headers.get(AUTHORIZATION)?.to_str().ok()?;
+    let (scheme, rest) = value.split_once(' ')?;
+    if !scheme.eq_ignore_ascii_case("bearer") {
+        return None;
+    }
+    match rest.trim() {
+        "" => None,
+        token => Some(token.to_string()),
+    }
+}
+
+/// Axum middleware: authenticate the request by its bearer key, attach the
+/// resolved principal to the request extensions, and stamp the internal
+/// principal headers. Wired in `build_app` via `from_fn_with_state`.
+///
+/// Public paths bypass everything. For all other paths, any client-supplied
+/// principal headers are removed first. A missing or unresolvable key is
+/// rejected with a 401 only when `require_auth` is set; otherwise the
+/// request continues without a principal.
+pub async fn require_principal(
+    State(fleet): State<Arc<CortexState>>,
+    mut req: Request,
+    next: Next,
+) -> Response {
+    if is_public(req.uri().path()) {
+        return next.run(req).await;
+    }
+
+    // Anti-spoof: drop any client-supplied principal headers up front, so
+    // only values stamped below can ever reach downstream code.
+    for name in [HEADER_ACCOUNT_ID, HEADER_KEY_ID] {
+        req.headers_mut().remove(name);
+    }
+
+    // No usable bearer token at all.
+    let Some(key) = parse_bearer(req.headers()) else {
+        return if fleet.require_auth {
+            unauthorized("missing API key; supply 'Authorization: Bearer <key>'")
+        } else {
+            next.run(req).await
+        };
+    };
+
+    let principal = match fleet.entitlements.resolve(&key).await {
+        Ok(principal) => principal,
+        // An unrecognized key only hard-fails when auth is *required*.
+        Err(_) if fleet.require_auth => return unauthorized("invalid API key"),
+        // In allow-anonymous mode (the default) the key is IGNORED and the
+        // request is served unauthenticated — otherwise the placeholder keys
+        // that OpenAI-compatible clients send by default (opencode, Open
+        // WebUI, Agent Zero, litellm) would all break, even though the
+        // operator never opted into auth. Pre-#49 the bearer was never
+        // inspected at all; this preserves that for require_auth=false.
+        Err(_) => {
+            tracing::debug!(
+                "ignoring unrecognized bearer token (require_auth=false): serving anonymously"
+            );
+            return next.run(req).await;
+        }
+    };
+
+    // Stamp the authoritative principal for neuron. Account/key ids come
+    // from operator config, so they should be valid header values; if either
+    // is not, skip stamping both rather than panic.
+    let account = HeaderValue::from_str(&principal.account_id);
+    let key_id = HeaderValue::from_str(&principal.key_id);
+    if let (Ok(account), Ok(key_id)) = (account, key_id) {
+        let headers = req.headers_mut();
+        headers.insert(HEADER_ACCOUNT_ID, account);
+        headers.insert(HEADER_KEY_ID, key_id);
+    }
+
+    // Carry the typed principal for cortex-side metering (#51) and budget
+    // enforcement (#52).
+    req.extensions_mut().insert(principal);
+    next.run(req).await
+}
+
+/// Build the `401 invalid_api_key` rejection carrying `message`, rendered in
+/// the standard error envelope (#63).
+fn unauthorized(message: &str) -> Response {
+    let rejection = OpenAiError::invalid_api_key(message);
+    envelope_response(rejection)
+}
+
+/// Propagate the cortex-stamped principal headers from an inbound
+/// [`HeaderMap`] to an outbound reqwest builder, skipping any that are not
+/// present. Used by the Anthropic proxy paths, which construct their own
+/// upstream requests instead of going through
+/// [`crate::proxy::forward_request`] (which forwards all headers verbatim).
+pub fn forward_principal_headers(
+    builder: reqwest::RequestBuilder,
+    headers: &HeaderMap,
+) -> reqwest::RequestBuilder {
+    [HEADER_ACCOUNT_ID, HEADER_KEY_ID]
+        .into_iter()
+        .fold(builder, |outbound, name| match headers.get(name) {
+            Some(value) => outbound.header(name, value),
+            None => outbound,
+        })
+}
--- a/crates/cortex-gateway/src/entitlements_local.rs
+++ b/crates/cortex-gateway/src/entitlements_local.rs
@@ -0,0 +1,317 @@
+//! The local/static [`EntitlementProvider`] (#50).
+//!
+//! Accounts, keys, and hard caps come from operator config
+//! ([`cortex_core::config::EntitlementsConfig`]); reservations and settled
+//! spend are tracked in-process. This lands auth + per-key caps + the
+//! amplification fix before any upstream clearing house exists; the future
+//! helexa-upstream client (#57) implements the same trait.
+//!
+//! Budget math is serialized under a single [`std::sync::Mutex`] so
+//! reserve/settle/release are atomic — a key's `spent + reserved` can never
+//! exceed its hard cap even under concurrent requests (the #52 guarantee).
+//! The lock is held only for the in-memory arithmetic, never across an
+//! await.
+
+use cortex_core::config::{ApiKeyConfig, EntitlementsConfig};
+use cortex_core::entitlements::{
+    AuthError, BudgetError, BudgetSnapshot, CapWindow, EntitlementProvider, Principal, Reservation,
+};
+use std::collections::HashMap;
+use std::sync::Mutex;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Instant;
+
+/// Per-key budget configuration (resolved from [`ApiKeyConfig`]).
+struct Budget {
+    /// Token ceiling for the key; `None` means the key is uncapped.
+    hard_cap: Option<u64>,
+    /// How the cap is applied: a rolling window whose spend is reset once it
+    /// elapses, or a balance that is never reset by this provider.
+    window: CapWindow,
+}
+
+/// Live, mutable accounting for one key over its current window. Starts
+/// zeroed (via `Default`) the first time the key reserves.
+#[derive(Default)]
+struct Ledger {
+    /// Settled spend in the current window. Zeroed when a rolling window is
+    /// reset.
+    spent: u64,
+    /// Sum of outstanding (un-settled) reservations. Not touched by a
+    /// rolling-window reset.
+    reserved: u64,
+    /// Start of the current rolling window; `None` until the first reserve.
+    /// Unused for [`CapWindow::Balance`].
+    window_start: Option<Instant>,
+}
+
+/// In-process [`EntitlementProvider`]: keys and budgets are fixed at
+/// construction, while per-key ledgers are mutated behind a mutex.
+pub struct LocalEntitlementProvider {
+    /// Bearer token → principal.
+    keys: HashMap<String, Principal>,
+    /// `key_id` → budget config.
+    budgets: HashMap<String, Budget>,
+    /// `key_id` → live ledger.
+    ledgers: Mutex<HashMap<String, Ledger>>,
+    /// Monotonic source of opaque reservation handles.
+    next_id: AtomicU64,
+}
+
+impl LocalEntitlementProvider {
+    /// Construct the provider from the `[entitlements]` config section.
+    ///
+    /// Each configured key maps its bearer secret to a [`Principal`] and its
+    /// `key_id` to a [`Budget`]. When a key has no explicit `key_id`, the
+    /// `account_id` is used in its place, so the key is tracked at account
+    /// granularity (its secret is never used as a label). Ledgers start
+    /// empty and reservation ids start at 1.
+    pub fn from_config(config: &EntitlementsConfig) -> Self {
+        let mut keys = HashMap::new();
+        let mut budgets = HashMap::new();
+
+        for entry in &config.keys {
+            let key_id = entry
+                .key_id
+                .clone()
+                .unwrap_or_else(|| entry.account_id.clone());
+            let principal = Principal {
+                account_id: entry.account_id.clone(),
+                key_id: key_id.clone(),
+            };
+            let budget = Budget {
+                hard_cap: entry.hard_cap,
+                window: entry.window.clone(),
+            };
+            keys.insert(entry.key.clone(), principal);
+            budgets.insert(key_id, budget);
+        }
+
+        Self {
+            keys,
+            budgets,
+            ledgers: Mutex::new(HashMap::new()),
+            next_id: AtomicU64::new(1),
+        }
+    }
+}
+
+/// Headroom left under `cap` after subtracting settled `spent` and
+/// outstanding `reserved` tokens, clamped at zero. An absent cap means
+/// unlimited and yields `None`.
+fn available(cap: Option<u64>, spent: u64, reserved: u64) -> Option<u64> {
+    let limit = cap?;
+    Some(limit.saturating_sub(spent).saturating_sub(reserved))
+}
+
+#[async_trait::async_trait]
+impl EntitlementProvider for LocalEntitlementProvider {
+    /// Look up the principal configured for `api_key`. A key that is not in
+    /// the configured set is [`AuthError::InvalidKey`].
+    async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError> {
+        self.keys.get(api_key).cloned().ok_or(AuthError::InvalidKey)
+    }
+
+    /// Reserve `max_tokens` against the principal's budget.
+    ///
+    /// Fails with [`BudgetError::RateLimited`] (rolling window) or
+    /// [`BudgetError::InsufficientQuota`] (balance) when `max_tokens`
+    /// exceeds what is left under the hard cap. On success the amount is
+    /// added to the key's outstanding reservations and a fresh
+    /// [`Reservation`] handle is returned.
+    async fn reserve(
+        &self,
+        principal: &Principal,
+        max_tokens: u64,
+    ) -> Result<Reservation, BudgetError> {
+        // A principal with no configured budget (or an uncapped one) always
+        // reserves; we still track spend for metrics.
+        let budget = self.budgets.get(&principal.key_id);
+        let (cap, window) = match budget {
+            Some(b) => (b.hard_cap, b.window.clone()),
+            None => (None, CapWindow::Balance),
+        };
+
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        let ledger = ledgers.entry(principal.key_id.clone()).or_default();
+
+        // Lazily reset a rolling window that has elapsed before checking.
+        let mut retry_after_secs = 0;
+        if let CapWindow::Rolling { seconds } = window {
+            let now = Instant::now();
+            let elapsed = ledger
+                .window_start
+                .map(|start| now.duration_since(start).as_secs());
+            match elapsed {
+                Some(elapsed) if elapsed < seconds => {
+                    // Still inside the window: hint how long until it ends.
+                    retry_after_secs = seconds - elapsed;
+                }
+                _ => {
+                    // First reserve, or the window has fully elapsed: reset.
+                    ledger.spent = 0;
+                    ledger.window_start = Some(now);
+                    retry_after_secs = seconds;
+                }
+            }
+        }
+
+        if let Some(avail) = available(cap, ledger.spent, ledger.reserved)
+            && max_tokens > avail
+        {
+            return Err(match window {
+                CapWindow::Rolling { .. } => BudgetError::RateLimited {
+                    requested: max_tokens,
+                    available: avail,
+                    // At least 1s so clients don't hot-loop on a sub-second
+                    // remainder.
+                    retry_after_secs: retry_after_secs.max(1),
+                },
+                CapWindow::Balance => BudgetError::InsufficientQuota {
+                    requested: max_tokens,
+                    available: avail,
+                },
+            });
+        }
+
+        // Saturate rather than overflow: an uncapped key skips the check
+        // above, so concurrent huge reservations could otherwise wrap the
+        // counter (or panic with the mutex held, poisoning it, when overflow
+        // checks are enabled).
+        ledger.reserved = ledger.reserved.saturating_add(max_tokens);
+        Ok(Reservation {
+            id: self.next_id.fetch_add(1, Ordering::Relaxed),
+            principal: principal.clone(),
+            reserved: max_tokens,
+        })
+    }
+
+    /// Resolve a reservation with the tokens actually used: the reserved
+    /// amount is released and `actual_tokens` is added to settled spend. A
+    /// key with no ledger entry is left untouched.
+    async fn settle(&self, reservation: Reservation, actual_tokens: u64) {
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
+            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
+            // Saturating for the same reason as in `reserve`: never panic
+            // while the ledger lock is held.
+            ledger.spent = ledger.spent.saturating_add(actual_tokens);
+        }
+    }
+
+    /// Drop a reservation without recording any spend.
+    async fn release(&self, reservation: Reservation) {
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
+            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
+        }
+    }
+
+    /// Current cap, settled spend and outstanding reservations for the
+    /// principal. Always `Some` for this provider; a key that has never
+    /// reserved reports zero spend and zero reserved.
+    async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot> {
+        let ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        let (spent, reserved) = ledgers
+            .get(&principal.key_id)
+            .map(|l| (l.spent, l.reserved))
+            .unwrap_or((0, 0));
+        let hard_cap = self.budgets.get(&principal.key_id).and_then(|b| b.hard_cap);
+        Some(BudgetSnapshot {
+            hard_cap,
+            spent,
+            reserved,
+        })
+    }
+}
+
+// Unit tests for the local provider: key resolution, the
+// reserve/settle/release ledger arithmetic, and the two refusal shapes.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Fixture with three keys: a 1 000-token balance key, a 500-token key
+    /// on a one-hour rolling window, and an uncapped infra key.
+    fn provider() -> LocalEntitlementProvider {
+        let config = EntitlementsConfig {
+            require_auth: true,
+            keys: vec![
+                ApiKeyConfig {
+                    key: "sk-balance".into(),
+                    account_id: "acct-a".into(),
+                    key_id: Some("key-balance".into()),
+                    hard_cap: Some(1_000),
+                    window: CapWindow::Balance,
+                },
+                ApiKeyConfig {
+                    key: "sk-rolling".into(),
+                    account_id: "acct-b".into(),
+                    key_id: Some("key-rolling".into()),
+                    hard_cap: Some(500),
+                    window: CapWindow::Rolling { seconds: 3_600 },
+                },
+                ApiKeyConfig {
+                    key: "sk-infra".into(),
+                    account_id: "operator".into(),
+                    key_id: Some("key-infra".into()),
+                    hard_cap: None,
+                    window: CapWindow::Balance,
+                },
+            ],
+        };
+        LocalEntitlementProvider::from_config(&config)
+    }
+
+    /// A configured secret resolves to its account and key ids.
+    #[tokio::test]
+    async fn resolves_configured_key_to_principal() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.expect("known key resolves");
+        assert_eq!(principal.account_id, "acct-a");
+        assert_eq!(principal.key_id, "key-balance");
+    }
+
+    /// A secret that is not configured is rejected as an invalid key.
+    #[tokio::test]
+    async fn unknown_key_is_invalid() {
+        let p = provider();
+        assert!(matches!(
+            p.resolve("sk-nope").await,
+            Err(AuthError::InvalidKey)
+        ));
+    }
+
+    /// Walks one reservation through settle and another through release,
+    /// checking the snapshot after each step.
+    #[tokio::test]
+    async fn reserve_settle_release_round_trip() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.unwrap();
+
+        let r = p.reserve(&principal, 400).await.expect("within cap");
+        // Reserved, not yet spent.
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.hard_cap, Some(1_000));
+        assert_eq!(snap.reserved, 400);
+        assert_eq!(snap.spent, 0);
+
+        // Used fewer tokens than reserved → remainder released, spend exact.
+        p.settle(r, 250).await;
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.reserved, 0);
+        assert_eq!(snap.spent, 250);
+
+        // A reservation that is released contributes no spend.
+        let r2 = p.reserve(&principal, 100).await.unwrap();
+        p.release(r2).await;
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.reserved, 0);
+        assert_eq!(snap.spent, 250);
+    }
+
+    /// Outstanding reservations count against a balance cap, and the
+    /// refusal reports both the request and the remaining headroom.
+    #[tokio::test]
+    async fn balance_over_cap_is_insufficient_quota() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.unwrap();
+        // Reserve most of the cap, then ask for more than remains.
+        let _r = p.reserve(&principal, 900).await.unwrap();
+        let err = p.reserve(&principal, 200).await.expect_err("over cap");
+        match err {
+            BudgetError::InsufficientQuota {
+                requested,
+                available,
+            } => {
+                assert_eq!(requested, 200);
+                assert_eq!(available, 100);
+            }
+            other => panic!("expected InsufficientQuota, got {other:?}"),
+        }
+    }
+
+    /// Exceeding a rolling-window cap is a rate limit with a retry hint
+    /// between one second and the window length.
+    #[tokio::test]
+    async fn rolling_over_cap_is_rate_limited_with_retry_after() {
+        let p = provider();
+        let principal = p.resolve("sk-rolling").await.unwrap();
+        let _r = p.reserve(&principal, 500).await.unwrap();
+        let err = p.reserve(&principal, 1).await.expect_err("over cap");
+        match err {
+            BudgetError::RateLimited {
+                retry_after_secs, ..
+            } => {
+                assert!(retry_after_secs >= 1, "must advertise a retry hint");
+                assert!(retry_after_secs <= 3_600);
+            }
+            other => panic!("expected RateLimited, got {other:?}"),
+        }
+    }
+
+    /// A key without a hard cap accepts any reservation, and its spend is
+    /// still tracked.
+    #[tokio::test]
+    async fn uncapped_infra_key_never_refuses() {
+        let p = provider();
+        let principal = p.resolve("sk-infra").await.unwrap();
+        let r = p.reserve(&principal, 10_000_000).await.expect("uncapped");
+        p.settle(r, 10_000_000).await;
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.hard_cap, None);
+        assert_eq!(snap.spent, 10_000_000);
+    }
+}
--- a/crates/cortex-gateway/src/error.rs
+++ b/crates/cortex-gateway/src/error.rs
@@ -0,0 +1,24 @@
+//! Gateway adapter that turns the shared, axum-agnostic
+//! [`cortex_core::error_envelope::OpenAiError`] into an axum [`Response`],
+//! setting the `Retry-After` header when the envelope carries one.
+//!
+//! cortex-core owns the envelope shape and the rejection contract (#60/#63);
+//! this is the only place the gateway crosses from that data into axum.
+
+use axum::http::{HeaderValue, StatusCode, header};
+use axum::response::{IntoResponse, Json, Response};
+use cortex_core::error_envelope::OpenAiError;
+
+/// Convert an [`OpenAiError`] into an axum response: the envelope's status
+/// code (500 if it is not a valid HTTP status), its JSON body, and a
+/// `Retry-After` header when the envelope carries a retry hint.
+pub fn envelope_response(err: OpenAiError) -> Response {
+    // Work out the optional header first; an unencodable value is dropped.
+    let retry_after = err
+        .retry_after_secs
+        .and_then(|secs| HeaderValue::from_str(&secs.to_string()).ok());
+    let status = match StatusCode::from_u16(err.status) {
+        Ok(code) => code,
+        Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
+    };
+
+    let mut response = (status, Json(err.body())).into_response();
+    if let Some(value) = retry_after {
+        response.headers_mut().insert(header::RETRY_AFTER, value);
+    }
+    response
+}
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
--- a/crates/cortex-gateway/src/lib.rs
+++ b/crates/cortex-gateway/src/lib.rs
@@ -1,5 +1,10 @@
+pub mod anthropic_sse;
+pub mod auth;
+pub mod entitlements_local;
+pub mod error;
 pub mod evictor;
 pub mod handlers;
+pub mod metering;
 pub mod metrics;
 pub mod poller;
 pub mod proxy;
@@ -8,15 +13,26 @@ pub mod state;

 use anyhow::Result;
 use axum::Router;
+use axum::middleware::from_fn_with_state;
 use cortex_core::config::GatewayConfig;
 use std::sync::Arc;
 use tower_http::cors::CorsLayer;
 use tower_http::trace::TraceLayer;

 /// Build the Axum application router with all routes wired up.
+///
+/// Layer order (outermost first): trace → CORS → auth → handlers. CORS is
+/// outer to auth so preflight `OPTIONS` short-circuits before resolution;
+/// auth (`require_principal`) resolves the bearer key, attaches the
+/// principal, and stamps the internal principal headers before any handler
+/// runs.
 pub fn build_app(fleet: Arc<state::CortexState>) -> Router {
    Router::new()
        .merge(handlers::api_routes())
+        .layer(from_fn_with_state(
+            Arc::clone(&fleet),
+            auth::require_principal,
+        ))
        .layer(CorsLayer::permissive())
        .layer(TraceLayer::new_for_http())
        .with_state(fleet)
--- a/crates/cortex-gateway/src/metering.rs
+++ b/crates/cortex-gateway/src/metering.rs
@@ -0,0 +1,219 @@
+//! Per-request token metering (#51).
+//!
+//! Captures the real `(prompt, completion)` usage of every request and feeds
+//! it to two places: the [`EntitlementProvider`] spend ledger (via
+//! reserve→settle) and per-principal Prometheus counters. The principal is
+//! reconstructed from the internal headers the auth middleware stamped (#49),
+//! so this works uniformly across every proxy path without threading the
+//! typed principal through each handler.
+//!
+//! The reserve→settle lifecycle is established here but, in this phase,
+//! reserves **zero** tokens — metering only, no enforcement. Budget
+//! enforcement (#52) flips the reserved amount to the real
+//! `prompt + max_output` and handles the [`BudgetError`] rejection; the
+//! settle/release plumbing is identical, so that change is localized.
+//!
+//! [`ReservationGuard`] makes leaks impossible: settling records actual
+//! spend and releases the unused remainder; dropping a guard that was never
+//! settled releases the whole reservation. So an early return, error path,
+//! or dropped stream can't strand a reservation.
+
+use axum::http::HeaderMap;
+use cortex_core::entitlements::{
+    BudgetError, EntitlementProvider, HEADER_ACCOUNT_ID, HEADER_KEY_ID, Principal,
+};
+use cortex_core::error_envelope::OpenAiError;
+use std::sync::Arc;
+
+/// Fallback output-token budget when neither the request nor the model's
+/// advertised limit gives one. Bounds the reservation so a capped key is
+/// still gated even on under-specified requests (#52).
+pub const FALLBACK_MAX_OUTPUT: u64 = 4096;
+
+/// Invoked exactly once at request completion with best-effort
+/// `(prompt_tokens, completion_tokens)`. When no usage could be observed
+/// (e.g. a pre-dispatch failure or a dropped stream) it is dropped unused —
+/// which releases the held reservation via [`ReservationGuard`]'s `Drop`.
+pub type UsageSink = Box<dyn FnOnce(u64, u64) + Send>;
+
+/// Rebuild the [`Principal`] from the internal headers stamped by cortex's
+/// auth middleware, which strips any client copy and stamps the
+/// authoritative value, so these headers are trustworthy within cortex.
+/// Yields `None` when either header is missing or is not readable as a
+/// string — which is the case for anonymous (unauthenticated) requests.
+pub fn principal_from_headers(headers: &HeaderMap) -> Option<Principal> {
+    let account = headers
+        .get(HEADER_ACCOUNT_ID)
+        .and_then(|value| value.to_str().ok())?;
+    let key = headers
+        .get(HEADER_KEY_ID)
+        .and_then(|value| value.to_str().ok())?;
+    Some(Principal {
+        account_id: account.to_owned(),
+        key_id: key.to_owned(),
+    })
+}
+
+/// Bump the per-principal spend counters (#51): the combined total, then
+/// prompt and completion separately. The only labels are account and key,
+/// both operator-bounded, so cardinality stays controlled.
+pub fn record_spend(principal: &Principal, prompt: u64, completion: u64) {
+    let account = principal.account_id.clone();
+    let key = principal.key_id.clone();
+    let labels = [("account", account), ("key", key)];
+
+    metrics::counter!("cortex_spend_tokens_total", &labels).increment(prompt + completion);
+    metrics::counter!("cortex_spend_prompt_tokens_total", &labels).increment(prompt);
+    metrics::counter!("cortex_spend_completion_tokens_total", &labels).increment(completion);
+}
+
+/// Holds a budget reservation for the life of a request. [`settle`] records
+/// actual spend and releases the remainder; an un-settled guard releases the
+/// whole reservation when dropped. Anonymous requests carry an empty guard,
+/// where every operation is a no-op.
+///
+/// [`settle`]: ReservationGuard::settle
+pub struct ReservationGuard {
+    /// Provider the reservation is settled with or released back to.
+    provider: Arc<dyn EntitlementProvider>,
+    /// The outstanding reservation. `None` for an anonymous guard, and once
+    /// `settle` or `Drop` has taken it.
+    reservation: Option<cortex_core::entitlements::Reservation>,
+}
+
+impl ReservationGuard {
+    /// Guard for an anonymous request: it holds no reservation, so settling
+    /// or dropping it does nothing.
+    pub fn anonymous(provider: Arc<dyn EntitlementProvider>) -> Self {
+        let reservation = None;
+        Self {
+            provider,
+            reservation,
+        }
+    }
+
+    /// Guard around a reservation that has already been acquired.
+    fn held(
+        provider: Arc<dyn EntitlementProvider>,
+        reservation: cortex_core::entitlements::Reservation,
+    ) -> Self {
+        let reservation = Some(reservation);
+        Self {
+            provider,
+            reservation,
+        }
+    }
+
+    /// Settle the reservation with the tokens actually consumed. Taking the
+    /// reservation out of the guard disarms the release-on-drop. The settle
+    /// itself runs on a spawned task, so a caller that cannot await — such
+    /// as a sync stream-completion callback — can still use this.
+    pub fn settle(mut self, actual_tokens: u64) {
+        let Some(reservation) = self.reservation.take() else {
+            // Anonymous (or already resolved) guard: nothing to settle.
+            return;
+        };
+        let provider = Arc::clone(&self.provider);
+        tokio::spawn(async move {
+            provider.settle(reservation, actual_tokens).await;
+        });
+    }
+}
+
+impl Drop for ReservationGuard {
+    /// Release a reservation that was never settled, so an early return,
+    /// error path, or dropped stream cannot strand it. A guard whose
+    /// reservation was already taken (settled, or anonymous) does nothing.
+    fn drop(&mut self) {
+        let Some(reservation) = self.reservation.take() else {
+            return;
+        };
+        // `tokio::spawn` panics when no runtime context is active, and a
+        // panic inside `drop` while the thread is already unwinding aborts
+        // the process. Look the runtime up fallibly instead and skip the
+        // release when there is none to run it on.
+        match tokio::runtime::Handle::try_current() {
+            Ok(runtime) => {
+                let provider = Arc::clone(&self.provider);
+                runtime.spawn(async move {
+                    provider.release(reservation).await;
+                });
+            }
+            Err(_) => {
+                tracing::warn!(
+                    "reservation guard dropped outside a tokio runtime; reservation not released"
+                );
+            }
+        }
+    }
+}
+
+/// Create the completion callback for an authenticated request. When called
+/// with the observed `(prompt, completion)` usage it records the spend
+/// counters and settles the reservation with their sum. If it is dropped
+/// without being called, the captured guard releases the reservation.
+pub fn usage_sink(principal: Principal, guard: ReservationGuard) -> UsageSink {
+    let on_complete = move |prompt: u64, completion: u64| {
+        record_spend(&principal, prompt, completion);
+        guard.settle(prompt + completion);
+    };
+    Box::new(on_complete)
+}
+
+/// Try to reserve `max_tokens` — the request's upper-bound cost — for the
+/// principal *before* dispatch (#52).
+///
+/// On success the reservation comes back wrapped in a guard that the caller
+/// settles with actual usage. On refusal the provider's [`BudgetError`] is
+/// translated to the #63 envelope (`rate_limit_exceeded` + `Retry-After`
+/// for a resetting window, `insufficient_quota` for a hard balance — never
+/// `402`).
+pub async fn reserve_or_reject(
+    provider: Arc<dyn EntitlementProvider>,
+    principal: &Principal,
+    max_tokens: u64,
+) -> Result<ReservationGuard, OpenAiError> {
+    let outcome = provider.reserve(principal, max_tokens).await;
+    outcome
+        .map(|reservation| ReservationGuard::held(provider, reservation))
+        .map_err(budget_error_to_envelope)
+}
+
+/// Map a [`BudgetError`] to the #63 envelope. The provider chose the window
+/// semantics; this only translates them to HTTP.
+fn budget_error_to_envelope(err: BudgetError) -> OpenAiError {
+    match err {
+        BudgetError::RateLimited {
+            retry_after_secs, ..
+        } => OpenAiError::rate_limit_exceeded(err.to_string(), retry_after_secs),
+        BudgetError::InsufficientQuota { .. } => OpenAiError::insufficient_quota(err.to_string()),
+    }
+}
+
+/// Token upper bound to reserve for a request (#52): the prompt estimate
+/// plus the maximum output, saturating at `u64::MAX`.
+///
+/// The output bound is the request's own `max_(completion_)tokens` when
+/// given, else `advertised_output` (the model's `limit.output`, #62), else
+/// [`FALLBACK_MAX_OUTPUT`]. Over-reserving is safe — settle corrects spend
+/// to the actual usage.
+pub fn reservation_estimate(body: &[u8], advertised_output: Option<u64>) -> u64 {
+    let max_output = match requested_max_output(body) {
+        Some(requested) => requested,
+        None => advertised_output.unwrap_or(FALLBACK_MAX_OUTPUT),
+    };
+    let prompt = estimate_prompt_tokens(body);
+    prompt.saturating_add(max_output)
+}
+
+/// The client's requested output cap: `max_completion_tokens` if it holds an
+/// unsigned integer, otherwise the legacy `max_tokens`.
+///
+/// A field that is present but not an unsigned integer (for example an
+/// explicit `null`) is treated the same as an absent one, so it cannot mask
+/// a usable value in the other field. Returns `None` when the body is not
+/// valid JSON or neither field holds an unsigned integer.
+fn requested_max_output(body: &[u8]) -> Option<u64> {
+    let v: serde_json::Value = serde_json::from_slice(body).ok()?;
+    // Preference order matters: the newer field wins when both are usable.
+    ["max_completion_tokens", "max_tokens"]
+        .into_iter()
+        .find_map(|field| v.get(field).and_then(serde_json::Value::as_u64))
+}
+
+/// Approximate the prompt's token count as one token per four bytes of the
+/// whole request body, never less than 1. cortex has no tokenizer; the JSON
+/// overhead in the body makes this a conservative over-estimate, neuron
+/// remains the exact context wall (#56/#60), and settle reconciles to the
+/// real usage afterward.
+fn estimate_prompt_tokens(body: &[u8]) -> u64 {
+    let quarter = body.len() as u64 / 4;
+    std::cmp::max(quarter, 1)
+}
+
+// Unit tests for the request-side estimation helpers: which output-cap
+// field wins, and which source bounds the reservation estimate.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// When both fields are present, `max_completion_tokens` is used.
+    #[test]
+    fn requested_max_output_prefers_max_completion_tokens() {
+        let body = br#"{"model":"m","max_completion_tokens":256,"max_tokens":99}"#;
+        assert_eq!(requested_max_output(body), Some(256));
+    }
+
+    /// With only the legacy field present, `max_tokens` is used.
+    #[test]
+    fn requested_max_output_falls_back_to_legacy_max_tokens() {
+        let body = br#"{"model":"m","max_tokens":128}"#;
+        assert_eq!(requested_max_output(body), Some(128));
+    }
+
+    /// A requested cap takes precedence over the advertised model limit.
+    #[test]
+    fn estimate_uses_requested_output_when_present() {
+        // Requested output dominates; prompt estimate is small for a tiny body.
+        let body = br#"{"model":"m","max_tokens":1000}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 1000 && est < 1100, "est was {est}");
+    }
+
+    /// Without a requested cap, the advertised limit bounds the estimate.
+    #[test]
+    fn estimate_uses_advertised_output_when_request_omits_it() {
+        let body = br#"{"model":"m","messages":[]}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 8192, "est was {est}");
+    }
+
+    /// With neither a requested nor an advertised cap, the fallback applies.
+    #[test]
+    fn estimate_falls_back_when_nothing_advertised() {
+        let body = br#"{"model":"m"}"#;
+        let est = reservation_estimate(body, None);
+        assert!(est >= FALLBACK_MAX_OUTPUT, "est was {est}");
+    }
+}
--- a/crates/cortex-gateway/src/metrics.rs
+++ b/crates/cortex-gateway/src/metrics.rs
@@ -46,6 +46,14 @@ fn describe_metrics() {
        "Generation throughput in tokens per second"
    );
    metrics::describe_counter!("cortex_requests_total", "Total number of proxied requests");
+    metrics::describe_counter!(
+        "cortex_prompt_tokens_total",
+        "Total prompt tokens reported by upstream usage objects"
+    );
+    metrics::describe_counter!(
+        "cortex_completion_tokens_total",
+        "Total completion tokens reported by upstream usage objects"
+    );
    metrics::describe_counter!(
        "cortex_request_errors_total",
        "Total number of failed proxy requests"
@@ -55,4 +63,16 @@ fn describe_metrics() {
        "cortex_cold_starts_total",
        "Total number of cold-start model loads"
    );
+    metrics::describe_counter!(
+        "cortex_spend_tokens_total",
+        "Total metered tokens (prompt + completion) per principal, labelled by account/key (#51)"
+    );
+    metrics::describe_counter!(
+        "cortex_spend_prompt_tokens_total",
+        "Metered prompt tokens per principal, labelled by account/key (#51)"
+    );
+    metrics::describe_counter!(
+        "cortex_spend_completion_tokens_total",
+        "Metered completion tokens per principal, labelled by account/key (#51)"
+    );
 }
--- a/crates/cortex-gateway/src/poller.rs
+++ b/crates/cortex-gateway/src/poller.rs
@@ -3,6 +3,7 @@

 use crate::state::CortexState;
 use chrono::Utc;
+use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
 use cortex_core::harness::ModelInfo;
 use cortex_core::node::{ModelEntry, ModelStatus};
 use std::sync::Arc;
@@ -25,7 +26,68 @@ pub async fn poll_once(fleet: &CortexState) {
    }
 }

+/// Fetch `GET /discovery` and cache it on the NodeState — topology is
+/// invariant for a given neuron process, so a successful fetch is kept.
+/// Re-polled only while `max_prompt_tokens` is still unknown (0): on a
+/// rolling deploy cortex can win the race and cache a neuron's discovery
+/// before that neuron reports the field (it deserialises to 0); re-polling self-heals that.
+/// NOTE(review): a neuron that never reports a cap is re-fetched and info-logged every call.
+async fn maybe_poll_discovery(fleet: &CortexState, name: &str, endpoint: &str) {
+    {
+        let nodes = fleet.nodes.read().await;
+        match nodes.get(name) {
+            Some(n)
+                if n.discovery
+                    .as_ref()
+                    .is_some_and(|d| d.max_prompt_tokens > 0) =>
+            {
+                return;
+            }
+            _ => {}
+        }
+    }
+    let url = format!("{endpoint}/discovery");
+    let resp = match fleet
+        .http_client
+        .get(&url)
+        .timeout(Duration::from_secs(5))
+        .send()
+        .await
+    {
+        Ok(r) if r.status().is_success() => r,
+        Ok(r) => {
+            tracing::debug!(node = name, status = %r.status(), "discovery probe non-success");
+            return;
+        }
+        Err(e) => {
+            tracing::debug!(node = name, error = %e, "discovery probe unreachable");
+            return;
+        }
+    };
+    match resp.json::<DiscoveryResponse>().await {
+        Ok(d) => {
+            let mut nodes = fleet.nodes.write().await;
+            if let Some(node) = nodes.get_mut(name) {
+                tracing::info!(
+                    node = name,
+                    hostname = %d.hostname,
+                    devices = d.devices.len(),
+                    "discovery cached"
+                );
+                node.discovery = Some(d);
+            }
+        }
+        Err(e) => {
+            tracing::warn!(node = name, error = %e, "failed to parse /discovery response");
+        }
+    }
+}
+
 async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
+    // Topology first — cheap once cached, and the router needs it to
+    // route requests against catalogue entries that aren't loaded yet.
+    maybe_poll_discovery(fleet, name, endpoint).await;
+
    let url = format!("{endpoint}/models");

    let result = fleet
@@ -54,12 +116,22 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
                            .and_modify(|e| {
                                e.status = status;
                                e.vram_estimate_mb = upstream.vram_used_mb;
+                                e.capabilities = upstream.capabilities.clone();
+                                e.tool_call = upstream.tool_call;
+                                e.reasoning = upstream.reasoning;
+                                // Neuron's self-derived limit (#67) — the
+                                // authoritative source the gateway advertises.
+                                e.limit = upstream.limit.clone();
                            })
                            .or_insert_with(|| ModelEntry {
                                id: upstream.id.clone(),
                                status,
                                last_accessed: None,
                                vram_estimate_mb: upstream.vram_used_mb,
+                                capabilities: upstream.capabilities.clone(),
+                                tool_call: upstream.tool_call,
+                                reasoning: upstream.reasoning,
+                                limit: upstream.limit.clone(),
                            });
                    }

@@ -89,6 +161,54 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
            node.healthy = false;
        }
    }
+
+    // Release the write lock before the next HTTP call.
+    drop(nodes);
+
+    // Poll /health for the activation snapshot. We don't want this to
+    // flip the node to unhealthy on its own — a neuron that's serving
+    // /models fine is still operational even if /health is briefly
+    // unavailable — so failures are debug-level and leave the existing
+    // activation reading in place.
+    poll_health(fleet, name, endpoint).await;
+}
+
+/// Fetch `/health` and stash the activation snapshot on NodeState.
+/// Decoupled from the /models poll so a /health glitch doesn't mark
+/// the neuron unhealthy or evict the model list.
+async fn poll_health(fleet: &CortexState, name: &str, endpoint: &str) {
+    let url = format!("{endpoint}/health");
+    let resp = match fleet
+        .http_client
+        .get(&url)
+        .timeout(Duration::from_secs(5))
+        .send()
+        .await
+    {
+        Ok(r) if r.status().is_success() => r,
+        Ok(r) => {
+            tracing::debug!(node = name, status = %r.status(), "/health probe non-success");
+            return;
+        }
+        Err(e) => {
+            tracing::debug!(node = name, error = %e, "/health probe failed");
+            return;
+        }
+    };
+    match resp.json::<HealthResponse>().await {
+        Ok(h) => {
+            let mut nodes = fleet.nodes.write().await;
+            if let Some(node) = nodes.get_mut(name) {
+                node.activation = Some(h.activation);
+                // Per-model admission load (#53) → keyed by id for the
+                // load-aware router (#55).
+                node.model_load = h.models.into_iter().map(|m| (m.id.clone(), m)).collect();
+            }
+        }
+        Err(e) => {
+            tracing::debug!(node = name, error = %e, "failed to parse /health response");
+        }
+    }
 }

 fn parse_status(s: &str) -> ModelStatus {
@@ -96,6 +216,8 @@ fn parse_status(s: &str) -> ModelStatus {
        "loaded" => ModelStatus::Loaded,
        "unloaded" => ModelStatus::Unloaded,
        "reloading" => ModelStatus::Reloading,
+        "loading" => ModelStatus::Loading,
+        "recovering" => ModelStatus::Recovering,
        _ => ModelStatus::Loaded,
    }
 }
--- a/crates/cortex-gateway/src/proxy.rs
+++ b/crates/cortex-gateway/src/proxy.rs
@@ -1,4 +1,4 @@
-//! Streaming HTTP reverse proxy to mistral.rs backends.
+//! Streaming HTTP reverse proxy to neuron backends.
 //!
 //! For streaming requests, SSE chunks are forwarded as they arrive.
 //! The proxy captures timing information for metrics but does not
@@ -9,16 +9,31 @@ use anyhow::Result;
 use axum::body::Body;
 use axum::http::{HeaderMap, StatusCode};
 use axum::response::{IntoResponse, Response};
+use futures::Stream;
+use futures::stream::BoxStream;
 use reqwest::Client;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use std::time::Instant;

 /// Proxy a request body to the resolved backend node and stream the response.
+///
+/// Logging contract: every call emits one info event when the request
+/// is dispatched, plus a warn event for each failure it encounters.
+/// Network-level failures and non-2xx upstream statuses are warn'd here
+/// (closest to the wire); the user-facing response carries only the
+/// status code and a generic message — implementation detail (body,
+/// error chain) lives in the log, never in the API surface.
 pub async fn forward_request(
    client: &Client,
    route: &RouteDecision,
    path: &str,
    headers: HeaderMap,
    body: bytes::Bytes,
+    model_id: &str,
+    usage_sink: Option<crate::metering::UsageSink>,
 ) -> Result<Response, ProxyError> {
+    let request_start = Instant::now();
    let url = format!("{}{}", route.endpoint, path);
    tracing::info!(
        node = %route.node_name,
@@ -37,13 +52,39 @@ pub async fn forward_request(
        req_builder = req_builder.header(key, value);
    }

-    let upstream_resp = req_builder.send().await.map_err(ProxyError::Upstream)?;
+    let upstream_resp = match req_builder.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            tracing::warn!(
+                node = %route.node_name,
+                url = %url,
+                error = %e,
+                "proxy: upstream request failed (network)"
+            );
+            return Err(ProxyError::Upstream(e));
+        }
+    };

-    let status =
-        StatusCode::from_u16(upstream_resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
+    let upstream_status = upstream_resp.status();
+    if !upstream_status.is_success() {
+        // Streaming body — can't snippet without breaking the stream
+        // pass-through. Log status + URL; the client still gets the
+        // upstream status, just without the leaked body.
+        tracing::warn!(
+            node = %route.node_name,
+            url = %url,
+            status = upstream_status.as_u16(),
+            "proxy: upstream returned non-2xx"
+        );
+    }
+
+    let status = StatusCode::from_u16(upstream_status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);

    let resp_headers = upstream_resp.headers().clone();
-    let stream = upstream_resp.bytes_stream();
+    let stream = TokenMetricsStream::new(
+        Box::pin(upstream_resp.bytes_stream()),
+        TokenMetrics::new(model_id, &route.node_name, request_start, usage_sink),
+    );

    let body = Body::from_stream(stream);

@@ -52,31 +93,284 @@ pub async fn forward_request(
        response = response.header(key, value);
    }

-    response
-        .body(body)
-        .map_err(|e| ProxyError::ResponseBuild(e.to_string()))
+    response.body(body).map_err(|e| {
+        tracing::warn!(
+            node = %route.node_name,
+            url = %url,
+            error = %e,
+            "proxy: failed to build response"
+        );
+        ProxyError::ResponseBuild(e.to_string())
+    })
 }

 #[derive(Debug, thiserror::Error)]
 pub enum ProxyError {
-    #[error("upstream request failed: {0}")]
+    #[error("upstream request failed")]
    Upstream(reqwest::Error),
-    #[error("failed to build response: {0}")]
+    #[error("failed to build response")]
    ResponseBuild(String),
 }

 impl IntoResponse for ProxyError {
    fn into_response(self) -> Response {
-        let status = match &self {
-            ProxyError::Upstream(_) => StatusCode::BAD_GATEWAY,
-            ProxyError::ResponseBuild(_) => StatusCode::INTERNAL_SERVER_ERROR,
+        let (status, code, message) = match &self {
+            ProxyError::Upstream(_) => (
+                StatusCode::BAD_GATEWAY,
+                "upstream_connection_error",
+                "upstream request failed",
+            ),
+            ProxyError::ResponseBuild(_) => (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                "internal_server_error",
+                "failed to build response",
+            ),
        };
-        let body = serde_json::json!({
-            "error": {
-                "message": self.to_string(),
-                "type": "proxy_error",
-            }
-        });
-        (status, axum::Json(body)).into_response()
+        crate::error::envelope_response(cortex_core::error_envelope::OpenAiError::new(
+            status.as_u16(),
+            "api_error",
+            code,
+            message,
+        ))
+    }
+}
+
+// ── Per-request token metrics (#21) ─────────────────────────────────
+//
+// The proxy never buffers or re-serialises the upstream body — chunks
+// are forwarded verbatim. For metrics it observes each chunk's arrival
+// time and keeps a bounded tail of the body text, from which the final
+// OpenAI `usage` object (present on the last SSE chunk and on
+// non-streaming JSON bodies alike) yields engine-truth token counts.
+//
+// Emitted per request, labelled {model, node}:
+//   cortex_time_to_first_token_seconds  (histogram) — first body chunk
+//   cortex_tokens_per_second            (histogram) — completion tokens
+//       over the decode window (first→last chunk); falls back to the
+//       full request duration for single-chunk (non-streaming) bodies
+//   cortex_prompt_tokens_total / cortex_completion_tokens_total (counters)
+
+/// Cap on the retained body tail. The usage object rides on the final
+/// chunk, so a generous tail is plenty; the cap bounds memory on huge
+/// non-streaming bodies.
+const TAIL_CAP_BYTES: usize = 64 * 1024;
+
+/// Return the integer that follows the LAST `"key":` in `tail`, if any.
+/// Occurrences whose value is not a plain unsigned integer (`null`, a
+/// nested object, a literal too large for `u64`) are skipped rather than
+/// clearing an earlier hit. Quoting the key keeps `x` from matching `x_details`.
+pub(crate) fn last_count_for(tail: &str, key: &str) -> Option<u64> {
+    let quoted = format!("\"{key}\"");
+    tail.match_indices(quoted.as_str())
+        .filter_map(|(start, matched)| {
+            // Step past the key, optional whitespace, and the colon;
+            // anything else means this was not a `"key": value` pair.
+            let after_key = tail[start + matched.len()..].trim_start();
+            let value = after_key.strip_prefix(':')?.trim_start();
+            // Take the leading run of ASCII digits. An empty run (or
+            // one that overflows u64) fails to parse and drops the hit.
+            let end = value
+                .find(|c: char| !c.is_ascii_digit())
+                .unwrap_or(value.len());
+            value[..end].parse::<u64>().ok()
+        })
+        // Matches are visited left to right, so the final parsable hit
+        // belongs to the last usage object seen in the body. Earlier
+        // usage-shaped text echoed in content is overridden by it.
+        .last()
+}
+
+struct TokenMetrics {
+    labels: [(&'static str, String); 2],
+    request_start: Instant,
+    first_chunk: Option<Instant>,
+    last_chunk: Option<Instant>,
+    tail: String,
+    finished: bool,
+    /// Per-principal metering hook (#51). Invoked exactly once in `finish`
+    /// with the observed `(prompt, completion)` so the reservation can be
+    /// settled and spend recorded. `None` for anonymous requests.
+    usage_sink: Option<crate::metering::UsageSink>,
+}
+
+impl TokenMetrics {
+    fn new(
+        model_id: &str,
+        node_name: &str,
+        request_start: Instant,
+        usage_sink: Option<crate::metering::UsageSink>,
+    ) -> Self {
+        Self {
+            labels: [
+                ("model", model_id.to_string()),
+                ("node", node_name.to_string()),
+            ],
+            request_start,
+            first_chunk: None,
+            last_chunk: None,
+            tail: String::new(),
+            finished: false,
+            usage_sink,
+        }
+    }
+
+    fn observe(&mut self, chunk: &[u8]) {
+        let now = Instant::now();
+        self.first_chunk.get_or_insert(now);
+        self.last_chunk = Some(now);
+        self.tail.push_str(&String::from_utf8_lossy(chunk));
+        if self.tail.len() > TAIL_CAP_BYTES {
+            // Keep the newest half; the usage object is always at the
+            // very end of the body. Split at a char boundary.
+            let mut cut = self.tail.len() - TAIL_CAP_BYTES / 2;
+            while !self.tail.is_char_boundary(cut) {
+                cut += 1;
+            }
+            self.tail.drain(..cut);
+        }
+    }
+
+    /// Emit the metrics exactly once — called on clean stream end and
+    /// from Drop (client disconnect mid-stream still records what we
+    /// saw).
+    fn finish(&mut self) {
+        if self.finished {
+            return;
+        }
+        self.finished = true;
+
+        let prompt = last_count_for(&self.tail, "prompt_tokens");
+        let completion = last_count_for(&self.tail, "completion_tokens");
+
+        // Per-model metrics — only when body chunks actually arrived.
+        if let Some(first) = self.first_chunk {
+            let ttft = first.duration_since(self.request_start).as_secs_f64();
+            metrics::histogram!("cortex_time_to_first_token_seconds", &self.labels).record(ttft);
+
+            if let Some(prompt) = prompt {
+                metrics::counter!("cortex_prompt_tokens_total", &self.labels).increment(prompt);
+            }
+            if let Some(completion) = completion.filter(|c| *c > 0) {
+                metrics::counter!("cortex_completion_tokens_total", &self.labels)
+                    .increment(completion);
+
+                let last = self.last_chunk.unwrap_or(first);
+                let decode_window = last.duration_since(first).as_secs_f64();
+                // Streaming: rate over the decode window (first→last chunk).
+                // Non-streaming bodies arrive as ~one chunk (window ≈ 0),
+                // where the only honest denominator is the full request
+                // duration.
+                let secs = if decode_window >= 0.1 {
+                    decode_window
+                } else {
+                    last.duration_since(self.request_start).as_secs_f64()
+                };
+                if secs > 0.0 {
+                    metrics::histogram!("cortex_tokens_per_second", &self.labels)
+                        .record(completion as f64 / secs);
+                }
+            }
+        }
+
+        // Per-principal metering + reservation settle (#51). Always runs so
+        // the reservation is resolved even when no usage/body was observed
+        // (sink with (0, 0) → settle 0 → release).
+        if let Some(sink) = self.usage_sink.take() {
+            sink(prompt.unwrap_or(0), completion.unwrap_or(0));
+        }
+    }
+}
+
+/// Pass-through stream wrapper that feeds [`TokenMetrics`]. Emits on
+/// clean end-of-stream; the Drop impl covers client disconnects.
+struct TokenMetricsStream {
+    inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
+    metrics: TokenMetrics,
+}
+
+impl TokenMetricsStream {
+    fn new(
+        inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
+        metrics: TokenMetrics,
+    ) -> Self {
+        Self { inner, metrics }
+    }
+}
+
+impl Stream for TokenMetricsStream {
+    type Item = Result<bytes::Bytes, reqwest::Error>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        let polled = this.inner.as_mut().poll_next(cx);
+        // Peek at the outcome by reference so the item itself is handed
+        // to the caller untouched.
+        match &polled {
+            // A body chunk: record its arrival time and tail text.
+            Poll::Ready(Some(Ok(chunk))) => this.metrics.observe(chunk),
+            // Clean end of stream: emit the metrics now.
+            Poll::Ready(None) => this.metrics.finish(),
+            // Errors and pending polls are passed through unobserved.
+            Poll::Ready(Some(Err(_))) | Poll::Pending => {}
+        }
+        polled
+    }
+}
+
+impl Drop for TokenMetricsStream {
+    fn drop(&mut self) {
+        self.metrics.finish();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::last_count_for;
+
+    #[test]
+    fn extracts_counts_from_final_sse_usage_chunk() {
+        let tail = concat!(
+            "data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n",
+            "data: {\"choices\":[],\"usage\":{\"prompt_tokens\":225,",
+            "\"completion_tokens\":42,\"total_tokens\":267}}\n\n",
+            "data: [DONE]\n\n"
+        );
+        assert_eq!(last_count_for(tail, "prompt_tokens"), Some(225));
+        assert_eq!(last_count_for(tail, "completion_tokens"), Some(42));
+    }
+
+    #[test]
+    fn extracts_counts_from_non_streaming_body() {
+        let tail = "{\"choices\":[{\"message\":{\"content\":\"hi\"}}],\
+                    \"usage\":{\"prompt_tokens\": 12, \"completion_tokens\": 7}}";
+        assert_eq!(last_count_for(tail, "prompt_tokens"), Some(12));
+        assert_eq!(last_count_for(tail, "completion_tokens"), Some(7));
+    }
+
+    #[test]
+    fn ignores_details_variants_and_takes_last_occurrence() {
+        // completion_tokens_details must not shadow completion_tokens,
+        // and the LAST usage object wins (matters when content echoes
+        // a usage-shaped string earlier in the stream).
+        let tail = concat!(
+            "data: {\"usage\":{\"completion_tokens\":1}}\n\n",
+            "data: {\"usage\":{\"completion_tokens\":99,",
+            "\"completion_tokens_details\":{\"reasoning_tokens\":3}}}\n\n"
+        );
+        assert_eq!(last_count_for(tail, "completion_tokens"), Some(99));
+    }
+
+    #[test]
+    fn absent_keys_yield_none() {
+        assert_eq!(
+            last_count_for("data: [DONE]\n\n", "completion_tokens"),
+            None
+        );
+        assert_eq!(last_count_for("", "prompt_tokens"), None);
+        // key present but non-numeric value
+        assert_eq!(
+            last_count_for("\"completion_tokens\": null", "completion_tokens"),
+            None
+        );
    }
 }
--- a/crates/cortex-gateway/src/router.rs
+++ b/crates/cortex-gateway/src/router.rs
@@ -2,13 +2,21 @@
 //!
 //! Given a model ID from an inbound request, determine which node should
 //! handle it. Priority:
-//!   1. Node where the model is currently `Loaded`
-//!   2. Node where the model is `Unloaded` (will lazy-load on request)
-//!   3. Error: model not found on any node
+//!   1. Node where the model is currently `Loaded` → use the least-busy one.
+//!   2. Model is `Recovering` on some node → 503 "retry shortly", no reroute.
+//!   3. Node where the model is `Unloaded` → use it; neuron's lazy-load reloads it first.
+//!   4. Model is in the catalogue → pick a feasible neuron, call
+//!      `POST /models/load`, wait for the load to complete, then
+//!      proxy. First-request cold-load latency is acceptable per the
+//!      unified-endpoint contract.
+//!   5. Not in catalogue, not loaded anywhere → 404.

 use crate::state::CortexState;
+use cortex_core::catalogue::ModelProfile;
+use cortex_core::harness::ModelSpec;
 use cortex_core::node::ModelStatus;
 use std::sync::Arc;
+use std::time::Duration;

 /// The routing decision: which node endpoint to proxy the request to.
 #[derive(Debug, Clone)]
@@ -16,62 +24,400 @@ pub struct RouteDecision {
    pub node_name: String,
    /// The inference endpoint to proxy to (from neuron's /models/{id}/endpoint).
    pub endpoint: String,
-    /// Whether the model will need to load (cold start).
+    /// Whether the model will need to load (cold start). Set to true
+    /// when we proxied to an `Unloaded` node (lazy load on neuron) or
+    /// when we just triggered an explicit cold-load via the catalogue
+    /// path.
    pub cold_start: bool,
+    /// The concrete model id we actually routed to. Equal to the
+    /// caller's requested id unless an alias was resolved (e.g. caller
+    /// asked for `helexa/small`, this carries `Qwen/Qwen3-1.7B`). The
+    /// handler uses this to rewrite the request body's `model` field
+    /// before proxying — neurons reject requests where the body's
+    /// model name doesn't match a loaded model.
+    pub resolved_model_id: String,
 }

 #[derive(Debug, thiserror::Error)]
 pub enum RouteError {
-    #[error("model '{0}' not found on any node")]
+    #[error("model '{0}' not found on any node and not in catalogue")]
    ModelNotFound(String),
    #[error("no healthy nodes available")]
    NoHealthyNodes,
    #[error("failed to resolve inference endpoint for model '{0}' on node '{1}'")]
    EndpointResolveFailed(String, String),
+    #[error(
+        "model '{model_id}' is in the catalogue but no healthy neuron's topology satisfies its constraints"
+    )]
+    NoFeasibleNeuron { model_id: String },
+    #[error("cold-load of '{model_id}' on '{node}' failed: {message}")]
+    ColdLoadFailed {
+        model_id: String,
+        node: String,
+        message: String,
+    },
+    #[error(
+        "model '{model_id}' is recovering on node '{node}' (device context rebuild in progress) — retry shortly"
+    )]
+    ModelRecovering { model_id: String, node: String },
+}
+
+impl RouteError {
+    /// HTTP status the gateway should answer with: 503 for the transient
+    /// `NoHealthyNodes` / `ModelRecovering` (safe to retry), 404 otherwise.
+    /// NOTE(review): some 404 variants still carry `code()` "service_unavailable".
+    pub fn http_status(&self) -> u16 {
+        match self {
+            RouteError::NoHealthyNodes | RouteError::ModelRecovering { .. } => 503,
+            _ => 404,
+        }
+    }
+
+    /// Broad OpenAI error category for the JSON envelope.
+    pub fn broad_type(&self) -> &'static str {
+        match self {
+            RouteError::ModelNotFound(_) => "invalid_request_error",
+            RouteError::NoHealthyNodes
+            | RouteError::EndpointResolveFailed(_, _)
+            | RouteError::NoFeasibleNeuron { .. }
+            | RouteError::ColdLoadFailed { .. }
+            | RouteError::ModelRecovering { .. } => "api_error",
+        }
+    }
+
+    /// Specific machine-readable error code.
+    pub fn code(&self) -> &'static str {
+        match self {
+            RouteError::ModelNotFound(_) => "model_not_found",
+            RouteError::NoHealthyNodes => "service_unavailable",
+            RouteError::EndpointResolveFailed(_, _) => "service_unavailable",
+            RouteError::NoFeasibleNeuron { .. } => "service_unavailable",
+            RouteError::ColdLoadFailed { .. } => "service_unavailable",
+            RouteError::ModelRecovering { .. } => "service_unavailable",
+        }
+    }
+
+    /// Seconds to advertise in `Retry-After` for the transient variants
+    /// (#63). `NoHealthyNodes` may clear once the poller re-marks a node
+    /// healthy; `ModelRecovering` clears once the device context finishes
+    /// rebuilding — both are safe to retry. Everything else is permanent
+    /// for this request (404) and carries no hint.
+    pub fn retry_after_secs(&self) -> Option<u64> {
+        match self {
+            RouteError::ModelRecovering { .. } => Some(2),
+            RouteError::NoHealthyNodes => Some(5),
+            _ => None,
+        }
+    }
 }

 /// Resolve which node should serve a request for the given model.
 /// Asks the neuron for the inference endpoint after selecting a node.
 pub async fn resolve(
    fleet: &Arc<CortexState>,
-    model_id: &str,
+    requested_model_id: &str,
 ) -> Result<RouteDecision, RouteError> {
-    let (node_name, neuron_endpoint, cold_start) = {
+    // Alias resolution first — swap `helexa/small` (etc.) for the
+    // concrete id before any node lookups so the rest of routing,
+    // loading, and metrics deal in concrete ids only. `resolve_alias`
+    // returns the input verbatim when it isn't an alias.
+    let model_id = fleet.catalogue.resolve_alias(requested_model_id);
+    if model_id != requested_model_id {
+        tracing::debug!(
+            requested = requested_model_id,
+            resolved = model_id,
+            "alias resolved"
+        );
+    }
+    // Snapshot loaded / unloaded / recovering state from the poller cache.
+    let (loaded_route, unloaded_route, recovering_node, any_healthy) = {
        let nodes = fleet.nodes.read().await;
-
-        let mut loaded_candidate = None;
-        let mut unloaded_candidate = None;
-
+        // All healthy nodes with the model loaded, each with its current
+        // admission load (#53) so we can pick the least-busy replica (#55).
+        let mut loaded_candidates: Vec<(String, String, usize)> = Vec::new();
+        let mut unloaded_route = None;
+        let mut recovering_node = None;
+        let mut any_healthy = false;
        for node in nodes.values() {
            if !node.healthy {
                continue;
            }
+            any_healthy = true;
            if let Some(entry) = node.models.get(model_id) {
                match entry.status {
                    ModelStatus::Loaded | ModelStatus::Reloading => {
-                        loaded_candidate = Some((node.name.clone(), node.endpoint.clone(), false));
-                        break;
+                        // Least-busy score: in-flight + queued from the
+                        // neuron's last /health (#53). Unknown load (no poll
+                        // yet) scores 0 so the replica stays eligible.
+                        let score = node
+                            .model_load
+                            .get(model_id)
+                            .map(|l| l.in_flight + l.queue_depth)
+                            .unwrap_or(0);
+                        loaded_candidates.push((node.name.clone(), node.endpoint.clone(), score));
                    }
                    ModelStatus::Unloaded => {
-                        if unloaded_candidate.is_none() {
-                            unloaded_candidate =
-                                Some((node.name.clone(), node.endpoint.clone(), true));
+                        if unloaded_route.is_none() {
+                            unloaded_route = Some((node.name.clone(), node.endpoint.clone(), true));
                        }
                    }
+                    // Auto-recovering (#17/#20): the model is rebuilding
+                    // its device context on this node. Hold the route —
+                    // answer "retry shortly" rather than 404, and do NOT
+                    // fall through to the catalogue cold-load, which
+                    // would race a second placement (and a second copy's
+                    // worth of VRAM) against the in-flight recovery.
+                    ModelStatus::Recovering => {
+                        if recovering_node.is_none() {
+                            recovering_node = Some(node.name.clone());
+                        }
+                    }
+                    // Loading is gateway-synthesised from neuron's
+                    // activation snapshot; it never appears on the
+                    // wire from neuron's `/models`. Skip — the model
+                    // isn't actually servable yet. The pre-existing
+                    // race (catalogue cold_load fires a parallel
+                    // /models/load against the in-flight load) is no
+                    // worse than before; fixing it needs neuron-side
+                    // in-flight tracking on /models/load itself.
+                    ModelStatus::Loading => {}
+                }
+            }
+        }
+        // Pick the least-busy loaded replica; ties break by node name for
+        // deterministic routing. `false` = not a cold start.
+        let loaded_route = loaded_candidates
+            .into_iter()
+            .min_by(|a, b| a.2.cmp(&b.2).then_with(|| a.0.cmp(&b.0)))
+            .map(|(name, endpoint, _score)| (name, endpoint, false));
+        (loaded_route, unloaded_route, recovering_node, any_healthy)
+    };
+
+    if !any_healthy {
+        return Err(RouteError::NoHealthyNodes);
+    }
+
+    // Priority 1: already loaded.
+    if let Some((node_name, neuron_endpoint, cold_start)) = loaded_route {
+        return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
+    }
+
+    // Priority 2: recovering somewhere — transient hold, not a reroute.
+    if let Some(node) = recovering_node {
+        return Err(RouteError::ModelRecovering {
+            model_id: model_id.to_string(),
+            node,
+        });
+    }
+
+    // Priority 3: known to neuron but unloaded (neuron's lazy load).
+    if let Some((node_name, neuron_endpoint, cold_start)) = unloaded_route {
+        return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
+    }
+
+    // Priority 4: catalogue × topology cold-load.
+    if let Some(profile) = fleet.catalogue.get(model_id) {
+        let (node_name, neuron_endpoint) = pick_feasible_neuron(fleet, profile).await?;
+        cold_load(fleet, &node_name, &neuron_endpoint, profile).await?;
+        return finish(fleet, &node_name, &neuron_endpoint, model_id, true).await;
+    }
+
+    Err(RouteError::ModelNotFound(model_id.to_string()))
+}
+
+/// Choose the neuron that should receive a cold-load of `profile`.
+/// Only healthy nodes with a cached discovery whose devices pass
+/// `profile.is_feasible_on` qualify. Nodes named in `profile.pinned_on`
+/// are preferred; remaining ties go to the smallest node name.
+async fn pick_feasible_neuron(
+    fleet: &Arc<CortexState>,
+    profile: &ModelProfile,
+) -> Result<(String, String), RouteError> {
+    let guard = fleet.nodes.read().await;
+    let best = guard
+        .values()
+        .filter(|node| node.healthy)
+        .filter(|node| {
+            // Without a cached discovery there is no device list to
+            // test the profile against, so the node is left out.
+            node.discovery
+                .as_ref()
+                .is_some_and(|disc| profile.is_feasible_on(&node.name, &disc.devices))
+        })
+        .map(|node| {
+            let is_pinned = profile.pinned_on.iter().any(|n| n == &node.name);
+            (is_pinned, node)
+        })
+        // `true > false`, so comparing b against a ranks pinned nodes
+        // ahead of unpinned ones; the node name settles what is left.
+        .min_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.name.cmp(&b.1.name)));
+    match best {
+        Some((_, node)) => Ok((node.name.clone(), node.endpoint.clone())),
+        None => Err(RouteError::NoFeasibleNeuron {
+            model_id: profile.id.clone(),
+        }),
+    }
+}
+
+/// Issue `POST {endpoint}/models/load` for this profile on this neuron,
+/// blocking until the load completes (neuron's load endpoint is
+/// synchronous — it returns 200 once VRAM is materialised).
+///
+/// On success a `Loaded` entry is also inserted into the local
+/// NodeState cache so the caller's subsequent endpoint lookup sees the
+/// new model without waiting for the next poll cycle.
+///
+/// # Errors
+///
+/// `RouteError::ColdLoadFailed` when the request cannot be sent, or
+/// when neuron answers non-2xx with a body that does not contain
+/// "already loaded".
+async fn cold_load(
+    fleet: &Arc<CortexState>,
+    node_name: &str,
+    neuron_endpoint: &str,
+    profile: &ModelProfile,
+) -> Result<(), RouteError> {
+    let spec = profile_to_spec(fleet, node_name, profile).await;
+    let url = format!("{neuron_endpoint}/models/load");
+    tracing::info!(model = %profile.id, node = node_name, "cold-loading via /models/load");
+
+    // Both failure paths build the same error variant; only the
+    // message text differs.
+    let failed = |message: String| RouteError::ColdLoadFailed {
+        model_id: profile.id.clone(),
+        node: node_name.to_string(),
+        message,
+    };
+
+    // Generous timeout: a fresh download + safetensors mmap + device
+    // copy for a 30B-class dense model can comfortably exceed 5 min on
+    // a slow link. The HTTP client's own default already covers most
+    // of this; pin a longer per-request bound just here.
+    let resp = fleet
+        .http_client
+        .post(&url)
+        .timeout(Duration::from_secs(1800))
+        .json(&spec)
+        .send()
+        .await
+        .map_err(|e| failed(format!("HTTP request failed: {e}")))?;
+
+    // Any 2xx status counts as a completed load.
+    let status = resp.status();
+    if status.is_success() {
+        tracing::info!(model = %profile.id, node = node_name, "cold-load returned 200");
+    } else {
+        let body = resp.text().await.unwrap_or_default();
+        // Neuron returns 400 "already loaded" when two concurrent
+        // requests race the same model. Treat that as success — both
+        // requests effectively achieved the same end state. Any other
+        // non-2xx reply is reported as a failure.
+        if !body.contains("already loaded") {
+            return Err(failed(format!("HTTP {status}: {body}")));
+        }
+        tracing::info!(
+            model = %profile.id,
+            node = node_name,
+            "cold-load saw 'already loaded' — treating as success"
+        );
+    }
+
+    // Warm the cache: insert a Loaded ModelEntry so the next
+    // resolve() finds the model without waiting for the poll loop.
+    // A node name missing from the map is silently skipped; the write
+    // guard is a temporary of the `if let` and is released with it.
+    if let Some(node) = fleet.nodes.write().await.get_mut(node_name) {
+        let entry = cortex_core::node::ModelEntry {
+            id: profile.id.clone(),
+            status: ModelStatus::Loaded,
+            last_accessed: Some(chrono::Utc::now()),
+            vram_estimate_mb: profile.vram_mb,
+            capabilities: Vec::new(),
+            tool_call: false,
+            reasoning: false,
+            limit: None,
+        };
+        node.models.insert(profile.id.clone(), entry);
+    }
+    Ok(())
+}
+
+/// Translate a `ModelProfile` to a `ModelSpec` neuron's /models/load
+/// accepts. Devices are picked from the neuron's discovered topology —
+/// the first `min_devices` indices that meet `min_device_vram_mb`.
+async fn profile_to_spec(
+    fleet: &Arc<CortexState>,
+    node_name: &str,
+    profile: &ModelProfile,
+) -> ModelSpec {
+    let devices = {
+        let nodes = fleet.nodes.read().await;
+        let mut picked: Vec<u32> = Vec::new();
+        if let Some(node) = nodes.get(node_name)
+            && let Some(disc) = &node.discovery
+        {
+            let min_vram = profile.min_device_vram_mb.unwrap_or(0); // no floor set → compare to 0
+            for d in &disc.devices {
+                if d.vram_total_mb >= min_vram {
+                    picked.push(d.index);
+                    if picked.len() as u32 >= profile.min_devices {
+                        break; // enough devices collected
+                    }
                 }
             }
         }
-
-        loaded_candidate.or(unloaded_candidate).ok_or_else(|| {
-            if nodes.values().any(|n| n.healthy) {
-                RouteError::ModelNotFound(model_id.to_string())
-            } else {
-                RouteError::NoHealthyNodes
-            }
-        })?
+        if picked.is_empty() {
+            // Fall back to a 0..min_devices default; pick_feasible_neuron
+            // already verified the topology satisfies the constraints,
+            // so this only fires if discovery raced or was lost.
+            (0..profile.min_devices).collect()
+        } else {
+            picked // NOTE(review): may hold fewer than min_devices entries
+        }
     };

-    // Ask the neuron for the inference endpoint for this model.
+    let tensor_parallel = if profile.min_devices > 1 {
+        Some(profile.min_devices) // one shard per requested device
+    } else {
+        None
+    };
+
+    ModelSpec {
+        model_id: qualified_model_id(profile),
+        harness: profile.harness.clone(),
+        quant: profile.quant.clone(),
+        tensor_parallel,
+        devices: Some(devices),
+    }
+}
+
+/// Build the model id sent to neuron's load endpoint. When the profile
+/// declares a non-empty `source`, the result is `"{source}:{id}"` so
+/// neuron resolves the load against that registry; without the prefix,
+/// a profile pointing at the helexa registry would resolve via
+/// neuron's `default_source` (typically `huggingface`) and fetch
+/// bytes from the wrong place. A missing or empty `source` passes the
+/// bare id through, preserving the pre-Phase-3 contract.
+///
+/// Kept at module scope (not nested in `profile_to_spec`) so the unit
+/// tests can exercise it without spinning up CortexState topology.
+fn qualified_model_id(profile: &ModelProfile) -> String {
+    let id = &profile.id;
+    let scheme = profile.source.as_deref().filter(|s| !s.is_empty());
+    scheme.map_or_else(|| id.clone(), |s| format!("{s}:{id}"))
+}
+
+/// Resolve neuron's `/models/{id}/endpoint` to its inference URL and
+/// build the final `RouteDecision`. Shared by all three priority
+/// branches above.
+async fn finish(
+    fleet: &Arc<CortexState>,
+    node_name: &str,
+    neuron_endpoint: &str,
+    model_id: &str,
+    cold_start: bool,
+) -> Result<RouteDecision, RouteError> {
    let endpoint_url = format!(
        "{}/models/{}/endpoint",
        neuron_endpoint,
@@ -89,13 +435,122 @@ pub async fn resolve(
        _ => None,
    };

-    let endpoint = inference_endpoint.ok_or_else(|| {
-        RouteError::EndpointResolveFailed(model_id.to_string(), node_name.clone())
+    let raw = inference_endpoint.ok_or_else(|| {
+        RouteError::EndpointResolveFailed(model_id.to_string(), node_name.to_string())
    })?;

+    // Rewrite loopback inference URLs to use the configured neuron host.
+    // Neuron's default bind_url is `http://localhost:13131` (it can't
+    // reliably know its own externally-resolvable name). Cortex sees a
+    // URL that's only meaningful from the neuron host's own perspective;
+    // proxying directly to localhost from a different cortex host would
+    // hit nothing. Keep neuron's port and path (a future harness could
+    // serve inference on a different port than the management API), but
+    // swap the host for the one in cortex.toml.
+    let endpoint = rewrite_loopback_host(&raw, neuron_endpoint).unwrap_or(raw);
+
    Ok(RouteDecision {
-        node_name,
+        node_name: node_name.to_string(),
        endpoint,
        cold_start,
+        resolved_model_id: model_id.to_string(),
    })
 }
+
+/// If `inference_url`'s host is a loopback name (localhost / 127.0.0.1 /
+/// 0.0.0.0 / ::1), return a copy with the host replaced by
+/// `neuron_endpoint`'s host. Otherwise — or if either URL fails to
+/// parse — return None and the caller keeps the inference URL as-is.
+fn rewrite_loopback_host(inference_url: &str, neuron_endpoint: &str) -> Option<String> {
+    let inf = url::Url::parse(inference_url).ok()?;
+    // `host_str()` serialises IPv6 hosts with their brackets, so the
+    // IPv6 loopback arrives here as "[::1]", never as a bare "::1".
+    let inf_host = inf.host_str()?;
+    let is_loopback = matches!(inf_host, "localhost" | "127.0.0.1" | "0.0.0.0" | "[::1]");
+    if !is_loopback {
+        return None;
+    }
+    let neuron = url::Url::parse(neuron_endpoint).ok()?;
+    let new_host = neuron.host_str()?;
+    let mut out = inf.clone();
+    out.set_host(Some(new_host)).ok()?;
+    // `Url` serialises an empty path as "/"; callers append paths with
+    // format!("{endpoint}/v1/...") and would produce a double slash,
+    // so hand back a base string without the trailing slash.
+    let s = out.to_string();
+    Some(s.trim_end_matches('/').to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{ModelProfile, qualified_model_id, rewrite_loopback_host};
+
+    fn bare_profile(id: &str, source: Option<&str>) -> ModelProfile { // only id/source vary
+        ModelProfile {
+            id: id.into(),
+            harness: "candle".into(),
+            quant: None,
+            vram_mb: None,
+            min_devices: 1,
+            min_device_vram_mb: None,
+            pinned_on: vec![],
+            source: source.map(String::from),
+            limit: None,
+            cost: None,
+            capabilities: vec![],
+        }
+    }
+
+    #[test]
+    fn qualified_id_passes_through_when_source_absent() {
+        let p = bare_profile("Qwen/Qwen3-30B", None);
+        assert_eq!(qualified_model_id(&p), "Qwen/Qwen3-30B");
+    }
+
+    #[test]
+    fn qualified_id_prefixes_when_source_set() {
+        let p = bare_profile("Helexa/Qwen3.6-27B-Uncensored", Some("helexa"));
+        assert_eq!(
+            qualified_model_id(&p),
+            "helexa:Helexa/Qwen3.6-27B-Uncensored" // "{source}:{id}"
+        );
+    }
+
+    #[test]
+    fn qualified_id_passes_through_when_source_is_empty_string() {
+        // An empty scheme is treated as absent — neuron's default_source
+        // substitution kicks in.
+        let p = bare_profile("Qwen/Qwen3-30B", Some(""));
+        assert_eq!(qualified_model_id(&p), "Qwen/Qwen3-30B");
+    }
+
+    #[test]
+    fn rewrites_localhost_keeps_port_and_path() {
+        let out = rewrite_loopback_host(
+            "http://localhost:13131",
+            "http://beast.hanzalova.internal:13131",
+        );
+        assert_eq!(
+            out.as_deref(),
+            Some("http://beast.hanzalova.internal:13131") // trailing '/' is trimmed
+        );
+    }
+
+    #[test]
+    fn rewrites_loopback_with_distinct_inference_port() {
+        let out = rewrite_loopback_host("http://127.0.0.1:8080", "http://beast.lan:13131");
+        assert_eq!(out.as_deref(), Some("http://beast.lan:8080")); // neuron host, inference port
+    }
+
+    #[test]
+    fn leaves_non_loopback_alone() {
+        let out = rewrite_loopback_host("http://other.host:1234", "http://beast.lan:13131");
+        assert_eq!(out, None); // None tells the caller to keep the raw URL
+    }
+
+    #[test]
+    fn malformed_inference_url_returns_none() {
+        let out = rewrite_loopback_host("not a url", "http://beast.lan:13131");
+        assert_eq!(out, None); // parse failure is folded into None
+    }
+}
--- a/crates/cortex-gateway/src/state.rs
+++ b/crates/cortex-gateway/src/state.rs
@@ -1,7 +1,10 @@
+use crate::entitlements_local::LocalEntitlementProvider;
 use cortex_core::catalogue::ModelCatalogue;
 use cortex_core::config::{EvictionSettings, GatewayConfig, NeuronEndpoint};
+use cortex_core::entitlements::EntitlementProvider;
 use cortex_core::node::NodeState;
 use std::collections::HashMap;
+use std::sync::Arc;
 use tokio::sync::RwLock;

 /// Shared fleet state, protected by a RwLock for concurrent reader access.
@@ -11,6 +14,12 @@ pub struct CortexState {
    pub eviction: EvictionSettings,
    pub catalogue: ModelCatalogue,
    pub http_client: reqwest::Client,
+    /// Resolves bearer keys to principals and enforces token budgets (#47).
+    /// A local/static provider today (#50); the upstream client later (#57).
+    pub entitlements: Arc<dyn EntitlementProvider>,
+    /// Whether to reject unauthenticated requests (#49). Read by the auth
+    /// middleware once it lands.
+    pub require_auth: bool,
 }

 impl CortexState {
@@ -26,12 +35,18 @@ impl CortexState {
                    models: HashMap::new(),
                    lifecycle_cycles: 0,
                    last_poll: None,
+                    discovery: None,
+                    activation: None,
+                    model_load: HashMap::new(),
                },
            );
        }

        let catalogue = ModelCatalogue::load(&config.models_config);

+        let entitlements: Arc<dyn EntitlementProvider> =
+            Arc::new(LocalEntitlementProvider::from_config(&config.entitlements));
+
        Self {
            nodes: RwLock::new(nodes),
            neuron_configs: config.neurons.clone(),
@@ -41,6 +56,8 @@ impl CortexState {
                .timeout(std::time::Duration::from_secs(300))
                .build()
                .expect("failed to build HTTP client"),
+            entitlements,
+            require_auth: config.entitlements.require_auth,
        }
    }
 }
--- a/crates/cortex-gateway/tests/aliases.rs
+++ b/crates/cortex-gateway/tests/aliases.rs
@@ -0,0 +1,280 @@
+//! Alias resolution: a client request with `model: "helexa/small"`
+//! routes to the concrete model id (e.g. `Qwen/Qwen3-1.7B`), with the
+//! proxied request body rewritten so the upstream neuron sees a model
+//! name that matches its loaded handle.
+
+mod common;
+
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::json;
+use std::path::PathBuf;
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// Write a `models.toml` with one alias to a unique temp path. Returns
+/// the path; nothing here deletes the file, so it outlives the test.
+/// The directory is whatever `std::env::temp_dir()` returns (on Unix
+/// `$TMPDIR`, else `/tmp`); pid + nanosecond stamp avoid pulling in tempfile.
+fn write_models_toml(alias: &str, target: &str) -> PathBuf {
+    let contents = format!(
+        r#"
+[aliases]
+"{alias}" = "{target}"
+"#
+    );
+    let mut path = std::env::temp_dir();
+    let pid = std::process::id();
+    let now = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_nanos();
+    path.push(format!("cortex-test-models-{pid}-{now}.toml"));
+    std::fs::write(&path, contents).expect("write temp models.toml");
+    path
+}
+
+#[tokio::test]
+async fn test_alias_resolves_in_chat_completions() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let models_path = write_models_toml("helexa/small", "test-model");
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+
+    // Seed the node as healthy with the concrete model loaded under
+    // the target id. The poller doesn't run in this test; we just
+    // populate state manually.
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node must exist");
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    // Sanity: the catalogue actually picked up the alias.
+    assert_eq!(
+        fleet.catalogue.resolve_alias("helexa/small"),
+        "test-model",
+        "alias should resolve to target id"
+    );
+
+    // Spawn the gateway against this fleet.
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); // port 0: OS picks a free port
+    let gateway_addr = listener.local_addr().unwrap();
+    tokio::spawn(async move { // detached server task; never joined
+        axum::serve(listener, app).await.unwrap();
+    });
+    let gateway_url = format!("http://{gateway_addr}");
+
+    // Send a chat completion against the alias. The mock backend
+    // echoes back the `model` field it received — so a body whose
+    // model wasn't rewritten would come back as "helexa/small", and a
+    // properly-rewritten one as "test-model".
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .json(&json!({
+            "model": "helexa/small",
+            "messages": [{"role": "user", "content": "hi"}],
+        }))
+        .send()
+        .await
+        .expect("gateway should respond");
+
+    assert!(resp.status().is_success(), "gateway returned non-2xx");
+    let body: serde_json::Value = resp.json().await.expect("response is JSON");
+    assert_eq!(
+        body.get("model").and_then(|m| m.as_str()),
+        Some("test-model"),
+        "mock backend should have seen the resolved model id, not the alias"
+    );
+}
+
+#[tokio::test]
+async fn test_aliases_surface_in_v1_models() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let models_path = write_models_toml("helexa/small", "test-model");
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+
+    // Seed the target as loaded so the alias's mirrored entry shows
+    // loaded=true.
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node must exist");
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(2000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); // port 0: OS picks a free port
+    let gateway_addr = listener.local_addr().unwrap();
+    tokio::spawn(async move { // detached server task; never joined
+        axum::serve(listener, app).await.unwrap();
+    });
+    let gateway_url = format!("http://{gateway_addr}");
+
+    let resp = reqwest::get(format!("{gateway_url}/v1/models"))
+        .await
+        .expect("gateway should respond");
+    let body: serde_json::Value = resp.json().await.unwrap();
+    let entries = body
+        .get("data")
+        .and_then(|d| d.as_array())
+        .expect("data array");
+
+    // Both the alias and the target should be present.
+    let ids: Vec<&str> = entries
+        .iter()
+        .filter_map(|e| e.get("id").and_then(|v| v.as_str()))
+        .collect();
+    assert!(ids.contains(&"test-model"), "target should be listed");
+    assert!(ids.contains(&"helexa/small"), "alias should be listed");
+
+    // The alias's `loaded` flag and locations should mirror the target.
+    let alias_entry = entries
+        .iter()
+        .find(|e| e.get("id").and_then(|v| v.as_str()) == Some("helexa/small"))
+        .expect("alias entry");
+    assert_eq!(alias_entry.get("loaded"), Some(&json!(true)));
+    let locations = alias_entry
+        .get("locations")
+        .and_then(|l| l.as_array())
+        .expect("locations array");
+    assert_eq!(locations.len(), 1); // only "mock-node" was seeded with the target
+    assert_eq!(
+        locations[0].get("node").and_then(|n| n.as_str()),
+        Some("mock-node")
+    );
+}
+
+#[tokio::test]
+async fn test_alias_falls_through_for_unmapped_model() {
+    // Catalogue has an alias for some-other-thing but the request
+    // model "test-model" isn't an alias; resolution should be a no-op.
+    let mock_url = common::spawn_mock_neuron().await;
+    let models_path = write_models_toml("helexa/large", "definitely-not-loaded");
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    { // seed the node by hand: healthy, with "test-model" already loaded
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node must exist");
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); // port 0: OS picks a free port
+    let gateway_addr = listener.local_addr().unwrap();
+    tokio::spawn(async move { // detached server task; never joined
+        axum::serve(listener, app).await.unwrap();
+    });
+    let gateway_url = format!("http://{gateway_addr}");
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .json(&json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "hi"}],
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert!(resp.status().is_success());
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(
+        body.get("model").and_then(|m| m.as_str()),
+        Some("test-model") // unchanged: the request model is not an alias
+    );
+}
--- a/crates/cortex-gateway/tests/anthropic.rs
+++ b/crates/cortex-gateway/tests/anthropic.rs
@@ -123,3 +123,212 @@ async fn test_anthropic_invalid_request() {

    assert_eq!(resp.status(), 400);
 }
+
+/// Tool round-trip: an Anthropic `/v1/messages` request carrying tools
+/// (the Claude Code shape: `{name, description, input_schema}`) must
+/// reach the upstream neuron reshaped into OpenAI function-tool form,
+/// and tool history (`tool_use` / `tool_result` blocks) must become
+/// `tool_calls` / `role:"tool"` messages. This is the fix for the
+/// failure where the model received malformed tool defs and improvised
+/// an unparseable `<tool_use_name>` format.
+#[tokio::test]
+async fn test_anthropic_tools_reshaped_for_upstream() {
+    let (mock_url, captured) = common::spawn_capturing_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "max_tokens": 100,
+            "tools": [{
+                "name": "Read",
+                "description": "Read a file from disk",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {"path": {"type": "string"}},
+                    "required": ["path"]
+                }
+            }],
+            "tool_choice": {"type": "auto"},
+            "messages": [
+                {"role": "user", "content": "read /etc/hosts"},
+                {"role": "assistant", "content": [
+                    {"type": "text", "text": "Reading it."},
+                    {"type": "tool_use", "id": "toolu_42", "name": "Read",
+                     "input": {"path": "/etc/hosts"}}
+                ]},
+                {"role": "user", "content": [
+                    {"type": "tool_result", "tool_use_id": "toolu_42",
+                     "content": "127.0.0.1 localhost"}
+                ]}
+            ]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+    assert_eq!(resp.status(), 200);
+
+    let forwarded = { // last request body captured by the mock; lock released at block end
+        let guard = captured.lock().unwrap();
+        guard.last().cloned().expect("upstream received a request")
+    };
+
+    // Tool definitions reshaped to OpenAI function form.
+    let tools = forwarded["tools"].as_array().expect("tools array");
+    assert_eq!(tools[0]["type"], "function");
+    assert_eq!(tools[0]["function"]["name"], "Read");
+    assert_eq!(
+        tools[0]["function"]["parameters"]["properties"]["path"]["type"],
+        "string"
+    );
+    assert!(tools[0]["function"].get("input_schema").is_none()); // Anthropic key must not leak
+
+    // tool_choice mapped.
+    assert_eq!(forwarded["tool_choice"], "auto");
+
+    // Message history: user, assistant(+tool_calls), tool, user.
+    let msgs = forwarded["messages"].as_array().expect("messages array");
+    let assistant = msgs
+        .iter()
+        .find(|m| m["role"] == "assistant")
+        .expect("assistant turn");
+    assert_eq!(assistant["tool_calls"][0]["id"], "toolu_42");
+    assert_eq!(assistant["tool_calls"][0]["function"]["name"], "Read");
+    // arguments is the parsed object, not a JSON string — the Qwen3.6
+    // chat template iterates `tool_call.arguments | items`.
+    assert_eq!(
+        assistant["tool_calls"][0]["function"]["arguments"],
+        json!({"path": "/etc/hosts"})
+    );
+
+    let tool_msg = msgs
+        .iter()
+        .find(|m| m["role"] == "tool")
+        .expect("tool result turn");
+    assert_eq!(tool_msg["tool_call_id"], "toolu_42"); // same id as the tool_use block
+    assert_eq!(tool_msg["content"], "127.0.0.1 localhost");
+}
+
+/// #24: a streaming Anthropic request gets a translated Anthropic SSE
+/// stream — not raw OpenAI frames. Verifies the full event sequence,
+/// text reassembly, and the content type.
+#[tokio::test]
+async fn test_anthropic_streaming_sse_translation() {
+    let mock_url = // presumably (token count, inter-token delay) — see tests/common
+        common::spawn_streaming_mock_neuron(4, std::time::Duration::from_millis(20)).await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "max_tokens": 64,
+            "stream": true,
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+    assert!(
+        resp.headers()
+            .get("content-type")
+            .and_then(|v| v.to_str().ok())
+            .unwrap_or("")
+            .starts_with("text/event-stream"),
+        "anthropic stream must be SSE"
+    );
+
+    let body = resp.text().await.expect("stream should complete"); // buffers the whole stream
+    assert!(
+        !body.contains("chat.completion.chunk"),
+        "raw OpenAI frames must not leak through:\n{body}"
+    );
+
+    let event_names: Vec<&str> = body
+        .lines()
+        .filter_map(|l| l.strip_prefix("event: "))
+        .collect();
+    assert_eq!(
+        event_names,
+        vec![
+            "message_start",
+            "content_block_start",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_stop",
+            "message_delta",
+            "message_stop",
+        ],
+        "unexpected event sequence:\n{body}"
+    );
+
+    // Reassemble the text deltas: the mock emits token0..token3.
+    let text: String = body
+        .lines()
+        .filter_map(|l| l.strip_prefix("data: "))
+        .filter_map(|d| serde_json::from_str::<serde_json::Value>(d).ok())
+        .filter(|v| v["type"] == "content_block_delta")
+        .filter_map(|v| v["delta"]["text"].as_str().map(String::from))
+        .collect();
+    assert_eq!(text, "token0token1token2token3");
+
+    // The mock sends no finish_reason — stop_reason defaults to
+    // end_turn, and output_tokens falls back to the delta count.
+    let message_delta = body
+        .lines()
+        .filter_map(|l| l.strip_prefix("data: "))
+        .filter_map(|d| serde_json::from_str::<serde_json::Value>(d).ok())
+        .find(|v| v["type"] == "message_delta")
+        .expect("message_delta event present");
+    assert_eq!(message_delta["delta"]["stop_reason"], "end_turn");
+    assert_eq!(message_delta["usage"]["output_tokens"], 4);
+}
+
+/// #24: an upstream usage frame (stream_options include_usage shape)
+/// rides into message_delta as input/output token counts.
+#[tokio::test]
+async fn test_anthropic_streaming_usage_propagation() {
+    let mock_url = common::spawn_streaming_mock_neuron_with_usage(
+        3,
+        std::time::Duration::from_millis(10),
+        225, // asserted below as usage.input_tokens
+        42, // asserted below as usage.output_tokens
+    )
+    .await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let body = client
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "max_tokens": 64,
+            "stream": true,
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed")
+        .text()
+        .await
+        .expect("stream should complete");
+
+    let message_delta = body
+        .lines()
+        .filter_map(|l| l.strip_prefix("data: "))
+        .filter_map(|d| serde_json::from_str::<serde_json::Value>(d).ok())
+        .find(|v| v["type"] == "message_delta")
+        .expect("message_delta event present");
+    assert_eq!(message_delta["usage"]["output_tokens"], 42);
+    assert_eq!(message_delta["usage"]["input_tokens"], 225);
+}
--- a/crates/cortex-gateway/tests/auth.rs
+++ b/crates/cortex-gateway/tests/auth.rs
@@ -0,0 +1,272 @@
+//! Integration tests for API-key auth + principal resolution (#49).
+//!
+//! Verifies the #63 rejection contract (401 invalid_api_key via the #60
+//! envelope) when auth is required, that allow-anonymous mode ignores an
+//! unrecognized key, and that a valid key reaches neuron with the internal
+//! principal headers while a client-supplied one is stripped (anti-spoofing).
+
+use axum::Json;
+use axum::extract::Path;
+use axum::http::HeaderMap;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, HEADER_ACCOUNT_ID, HEADER_KEY_ID};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::{Arc, Mutex};
+use tokio::net::TcpListener;
+
+/// What the mock neuron observed on the inbound `/v1/chat/completions`
+/// request: the principal headers cortex stamped (or didn't).
+#[derive(Default)]
+struct Seen {
+    account_id: Option<String>,
+    key_id: Option<String>,
+}
+
+/// Spawn a mock neuron that records the principal headers it receives and
+/// returns a trivial chat completion. Returns (base_url, observed).
+async fn spawn_capturing_neuron() -> (String, Arc<Mutex<Seen>>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let seen: Arc<Mutex<Seen>> = Arc::new(Mutex::new(Seen::default()));
+    let sink = Arc::clone(&seen);
+
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |headers: HeaderMap, Json(body): Json<Value>| {
+                let sink = Arc::clone(&sink);
+                async move {
+                    {
+                        let mut s = sink.lock().unwrap();
+                        s.account_id = headers
+                            .get(HEADER_ACCOUNT_ID)
+                            .and_then(|v| v.to_str().ok())
+                            .map(str::to_string);
+                        s.key_id = headers
+                            .get(HEADER_KEY_ID)
+                            .and_then(|v| v.to_str().ok())
+                            .map(str::to_string);
+                    }
+                    let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
+                    Json(json!({
+                        "id": "chatcmpl-auth-001",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "ok"},
+                            "finish_reason": "stop"
+                        }],
+                        "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}
+                    }))
+                }
+            }),
+        )
+        .with_state(());
+
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    (base_url, seen)
+}
+
+/// Spawn a gateway with the given entitlements config, a single neuron, and
+/// `test-model` seeded as loaded (build_app spawns no poller).
+async fn spawn_gateway(neuron_url: &str, entitlements: EntitlementsConfig) -> String {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron_url.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements,
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    format!("http://{addr}")
+}
+
+fn one_key_config(require_auth: bool) -> EntitlementsConfig {
+    EntitlementsConfig {
+        require_auth,
+        keys: vec![ApiKeyConfig {
+            key: "sk-good".into(),
+            account_id: "acct-1".into(),
+            key_id: Some("key-1".into()),
+            hard_cap: None,
+            window: CapWindow::Balance,
+        }],
+    }
+}
+
+fn chat_body() -> Value {
+    json!({
+        "model": "test-model",
+        "messages": [{"role": "user", "content": "hi"}]
+    })
+}
+
+#[tokio::test]
+async fn missing_key_when_required_is_401_invalid_api_key() {
+    let (neuron, _seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "invalid_api_key");
+    assert_eq!(body["error"]["type"], "invalid_request_error");
+}
+
+#[tokio::test]
+async fn unrecognized_key_is_ignored_when_auth_not_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    // allow-anonymous mode: a placeholder/unknown bearer (as opencode,
+    // Open WebUI, Agent Zero, litellm all send by default) must NOT be
+    // rejected — it's ignored and the request is served anonymously.
+    let gateway = spawn_gateway(&neuron, one_key_config(false)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-dummy-placeholder")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    // Served, but anonymous — no principal stamped from the bogus key.
+    assert!(seen.lock().unwrap().account_id.is_none());
+}
+
+#[tokio::test]
+async fn invalid_key_is_401_when_auth_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    // With auth required, a present-but-wrong credential is rejected.
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-wrong")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "invalid_api_key");
+    // No principal was stamped. (Weak proxy for "never dispatched": Seen records headers only.)
+    assert!(seen.lock().unwrap().account_id.is_none());
+}
+
+#[tokio::test]
+async fn valid_key_reaches_neuron_with_principal_headers() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-good")
+        // A spoofed principal header must be stripped, not forwarded.
+        .header(HEADER_ACCOUNT_ID, "attacker")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let s = seen.lock().unwrap();
+    assert_eq!(s.account_id.as_deref(), Some("acct-1"));
+    assert_eq!(s.key_id.as_deref(), Some("key-1"));
+}
+
+#[tokio::test]
+async fn anonymous_allowed_when_auth_not_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, EntitlementsConfig::default()).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    // No principal resolved → no principal headers stamped.
+    let s = seen.lock().unwrap();
+    assert!(s.account_id.is_none());
+    assert!(s.key_id.is_none());
+}
+
+#[tokio::test]
+async fn health_is_public_even_when_auth_required() {
+    let (neuron, _seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+
+    let resp = reqwest::Client::new()
+        .get(format!("{gateway}/health"))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+}
--- a/crates/cortex-gateway/tests/budget_enforcement.rs
+++ b/crates/cortex-gateway/tests/budget_enforcement.rs
@@ -0,0 +1,253 @@
+//! Integration tests for budget enforcement (#52) — the A0 seatbelt.
+//!
+//! A reservation over the key's hard cap is refused *before* neuron is hit,
+//! with the #63 code matching the cap-window semantics (rate_limit_exceeded
+//! + Retry-After for a resetting window, insufficient_quota for a hard
+//! balance). Spend never exceeds the cap. No 402, ever.
+
+use axum::Json;
+use axum::extract::Path;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, Principal};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use tokio::net::TcpListener;
+
+/// Mock neuron with a hit counter on the inference path, so a test can prove
+/// a request was (or wasn't) dispatched.
+async fn spawn_counting_neuron() -> (String, Arc<AtomicU64>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let hits = Arc::new(AtomicU64::new(0));
+    let sink = Arc::clone(&hits);
+
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| {
+                let sink = Arc::clone(&sink);
+                async move {
+                    sink.fetch_add(1, Ordering::SeqCst);
+                    let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
+                    Json(json!({
+                        "id": "chatcmpl-budget",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}],
+                        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+                    }))
+                }
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (base_url, hits)
+}
+
+async fn spawn_gateway(neuron_url: &str, key: ApiKeyConfig) -> (Arc<CortexState>, String) {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron_url.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig {
+            require_auth: true,
+            keys: vec![key],
+        },
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (fleet, format!("http://{addr}"))
+}
+
+fn key(window: CapWindow, hard_cap: u64) -> ApiKeyConfig {
+    ApiKeyConfig {
+        key: "sk-cap".into(),
+        account_id: "acct-cap".into(),
+        key_id: Some("key-cap".into()),
+        hard_cap: Some(hard_cap),
+        window,
+    }
+}
+
+fn chat(max_tokens: u64) -> Value {
+    json!({
+        "model": "test-model",
+        "max_tokens": max_tokens,
+        "messages": [{"role": "user", "content": "hi"}]
+    })
+}
+
+#[tokio::test]
+async fn balance_over_cap_is_429_insufficient_quota_before_dispatch() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    // Cap far below a single request's reservation (max_tokens 1000).
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 10)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+    // Hard balance → no Retry-After.
+    assert!(resp.headers().get(reqwest::header::RETRY_AFTER).is_none());
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "insufficient_quota");
+    // Refused before dispatch — neuron never saw it.
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn rolling_over_cap_is_429_rate_limited_with_retry_after() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (_fleet, gateway) =
+        spawn_gateway(&neuron, key(CapWindow::Rolling { seconds: 3600 }, 10)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+    let retry = resp
+        .headers()
+        .get(reqwest::header::RETRY_AFTER)
+        .expect("rolling-window rejection must carry Retry-After");
+    assert!(retry.to_str().unwrap().parse::<u64>().unwrap() >= 1);
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "rate_limit_exceeded");
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn within_cap_is_served() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 1_000_000)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(50))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    assert_eq!(hits.load(Ordering::SeqCst), 1);
+}
+
+#[tokio::test]
+async fn a0_seatbelt_caps_a_runaway_fan_out() {
+    // An Agent-Zero-style key with a modest cap: a burst of requests drains
+    // it, then further requests are refused — the account stops draining and
+    // spend never exceeds the cap.
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 100)).await;
+    let client = reqwest::Client::new();
+
+    let mut ok = 0;
+    let mut refused = 0;
+    for _ in 0..20 {
+        let resp = client
+            .post(format!("{gateway}/v1/chat/completions"))
+            .bearer_auth("sk-cap")
+            .json(&chat(20))
+            .send()
+            .await
+            .unwrap();
+        match resp.status() {
+            reqwest::StatusCode::OK => {
+                ok += 1;
+                let _ = resp.bytes().await.unwrap();
+            }
+            reqwest::StatusCode::TOO_MANY_REQUESTS => {
+                refused += 1;
+                let body: Value = resp.json().await.unwrap();
+                assert_eq!(body["error"]["code"], "insufficient_quota");
+            }
+            other => panic!("unexpected status {other}"),
+        }
+    }
+
+    assert!(ok >= 1, "some requests should be served");
+    assert!(refused >= 1, "the cap must eventually refuse the fan-out");
+    assert_eq!(
+        hits.load(Ordering::SeqCst),
+        ok,
+        "refused requests never dispatched"
+    );
+
+    // Spend never exceeded the hard cap (reservation prevents overshoot).
+    // Poll briefly for in-flight settles to land.
+    let principal = Principal {
+        account_id: "acct-cap".into(),
+        key_id: "key-cap".into(),
+    };
+    for _ in 0..50 {
+        let snap = fleet.entitlements.snapshot(&principal).await.unwrap();
+        if snap.reserved == 0 {
+            break;
+        }
+        tokio::time::sleep(std::time::Duration::from_millis(20)).await;
+    }
+    let snap = fleet.entitlements.snapshot(&principal).await.unwrap();
+    assert!(snap.spent <= 100, "spent {} exceeded cap", snap.spent);
+}
--- a/crates/cortex-gateway/tests/common/mod.rs
+++ b/crates/cortex-gateway/tests/common/mod.rs
@@ -22,6 +22,7 @@ use tokio::net::TcpListener;
 /// - GET /models/:id/endpoint (returns the inference URL)
 /// - POST /models/unload (accepts unload requests)
 /// - GET /v1/chat/completions + POST /v1/chat/completions (inference)
+///
 /// Returns the neuron base URL.
 pub async fn spawn_mock_neuron() -> String {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
@@ -43,6 +44,7 @@ pub async fn spawn_mock_neuron() -> String {
            post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
        )
        .route("/v1/chat/completions", post(mock_chat_completions))
+        .route("/v1/responses", post(mock_responses))
        .route("/v1/models", get(mock_v1_models));

    tokio::spawn(async move {
@@ -52,9 +54,64 @@ pub async fn spawn_mock_neuron() -> String {
    base_url
 }

+/// Like [`spawn_mock_neuron`] but captures the JSON body of every
+/// `POST /v1/chat/completions` it receives into the returned handle, so
+/// a test can assert what the gateway *actually forwarded upstream*
+/// (e.g. that Anthropic-shaped tools were reshaped to OpenAI form).
+pub async fn spawn_capturing_mock_neuron() -> (String, Arc<std::sync::Mutex<Vec<Value>>>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let captured: Arc<std::sync::Mutex<Vec<Value>>> = Arc::new(std::sync::Mutex::new(Vec::new()));
+    let sink = captured.clone();
+
+    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| {
+                let sink = sink.clone();
+                async move {
+                    let model = body
+                        .get("model")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("unknown");
+                    let resp = json!({
+                        "id": "chatcmpl-capture-001",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "Hello from mock backend"},
+                            "finish_reason": "stop"
+                        }],
+                        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+                    });
+                    sink.lock().unwrap().push(body);
+                    Json(resp)
+                }
+            }),
+        );
+
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    (base_url, captured)
+}
+
 async fn mock_neuron_list_models() -> Json<Value> {
    Json(json!([
-        {"id": "test-model", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
+        {"id": "test-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000, "capabilities": ["text"], "tool_call": false, "reasoning": false}
    ]))
 }

@@ -92,6 +149,39 @@ async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
    }))
 }

+async fn mock_responses(Json(body): Json<Value>) -> Json<Value> {
+    let model = body
+        .get("model")
+        .and_then(|v| v.as_str())
+        .unwrap_or("unknown");
+    // Echo the model field back and synthesise a tiny ResponsesResponse.
+    // Mirrors the shape neuron's /v1/responses handler emits so the
+    // gateway test only needs to assert the proxy round-tripped it.
+    Json(json!({
+        "id": "resp-test-001",
+        "object": "response",
+        "created_at": 1700000000_u64,
+        "status": "completed",
+        "model": model,
+        "output": [{
+            "type": "message",
+            "id": "msg-test-001",
+            "role": "assistant",
+            "content": [{
+                "type": "output_text",
+                "text": "Hello from mock backend",
+                "annotations": []
+            }],
+            "status": "completed"
+        }],
+        "usage": {
+            "input_tokens": 5,
+            "output_tokens": 5,
+            "total_tokens": 10
+        }
+    }))
+}
+
 /// Spawns a mock neuron that returns SSE streaming responses for chat completions.
 pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
@@ -161,8 +251,120 @@ pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Durati
    base_url
 }

+/// Like `spawn_streaming_mock_neuron`, but the stream ends with an
+/// OpenAI `stream_options.include_usage`-style final chunk (empty
+/// choices + usage object) before `[DONE]` — the shape the gateway's
+/// token metrics (#21) extract counts from.
+pub async fn spawn_streaming_mock_neuron_with_usage(
+    chunk_count: usize,
+    chunk_delay: Duration,
+    prompt_tokens: u64,
+    completion_tokens: u64,
+) -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+
+    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| async move {
+                let model = body
+                    .get("model")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("unknown")
+                    .to_string();
+
+                let mut chunks: Vec<String> = (0..chunk_count)
+                    .map(|i| {
+                        let chunk = json!({
+                            "id": "chatcmpl-stream-002",
+                            "object": "chat.completion.chunk",
+                            "created": 1700000000_u64,
+                            "model": model,
+                            "choices": [{
+                                "index": 0,
+                                "delta": { "content": format!("token{i}") },
+                                "finish_reason": null
+                            }]
+                        });
+                        format!("data: {chunk}\n\n")
+                    })
+                    .collect();
+                let usage_chunk = json!({
+                    "id": "chatcmpl-stream-002",
+                    "object": "chat.completion.chunk",
+                    "created": 1700000000_u64,
+                    "model": model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens,
+                        "total_tokens": prompt_tokens + completion_tokens
+                    }
+                });
+                chunks.push(format!("data: {usage_chunk}\n\n"));
+                chunks.push("data: [DONE]\n\n".to_string());
+
+                let delay = chunk_delay;
+                let stream = stream::iter(chunks).then(move |chunk| async move {
+                    tokio::time::sleep(delay).await;
+                    Ok::<_, std::convert::Infallible>(chunk)
+                });
+
+                Response::builder()
+                    .header(header::CONTENT_TYPE, "text/event-stream")
+                    .header(header::CACHE_CONTROL, "no-cache")
+                    .body(Body::from_stream(stream))
+                    .unwrap()
+            }),
+        );
+
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    base_url
+}
+
 /// Spawns a mock neuron with a custom models list.
 pub async fn spawn_mock_neuron_with_models(models_response: Value) -> String {
+    spawn_mock_neuron_with_models_and_health(models_response, default_health_response()).await
+}
+
+/// Default `/health` response used by mocks that don't care about the
+/// activation field — empty devices, no in-flight pre-warm, state=ready.
+pub fn default_health_response() -> Value {
+    json!({
+        "uptime_secs": 0,
+        "devices": [],
+        "activation": {
+            "state": "ready",
+            "pending": [],
+            "in_progress": null,
+            "completed": [],
+            "failed": []
+        }
+    })
+}
+
+/// Variant of `spawn_mock_neuron_with_models` that also serves a
+/// `/health` body. Used by tests that drive the gateway's activation
+/// surface (poller reading /health, /v1/models synthesising Loading
+/// locations from in_progress / pending).
+pub async fn spawn_mock_neuron_with_models_and_health(
+    models_response: Value,
+    health_response: Value,
+) -> String {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    let base_url = format!("http://{addr}");
@@ -176,6 +378,13 @@ pub async fn spawn_mock_neuron_with_models(models_response: Value) -> String {
                async move { Json(resp) }
            }),
        )
+        .route(
+            "/health",
+            get(move || {
+                let resp = health_response.clone();
+                async move { Json(resp) }
+            }),
+        )
        .route(
            "/models/{model_id}/endpoint",
            get(move |Path(_model_id): Path<String>| {
@@ -220,6 +429,7 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
            endpoint: mock_url.to_string(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -236,6 +446,10 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
--- a/crates/cortex-gateway/tests/error_envelope.rs
+++ b/crates/cortex-gateway/tests/error_envelope.rs
@@ -0,0 +1,140 @@
+mod common;
+
+use serde_json::json;
+
+#[tokio::test]
+async fn error_response_model_not_found() {
+    let neuron_url = common::spawn_mock_neuron().await;
+    let gateway_url = common::spawn_gateway(&neuron_url).await;
+
+    let client = reqwest::Client::new();
+
+    // Request a model that isn't loaded on the mock neuron.
+    let resp = client
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "model": "nonexistent-model",
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), axum::http::StatusCode::NOT_FOUND);
+
+    let body: serde_json::Value = resp.json().await.expect("valid json");
+    let err = body.get("error").expect("response has error object");
+
+    // Broad type categorization
+    assert_eq!(err.get("type").unwrap(), "invalid_request_error");
+    // Specific machine-readable code
+    assert_eq!(
+        err.get("code").unwrap().as_str().unwrap(),
+        "model_not_found"
+    );
+    // param is always null
+    assert!(err.get("param").unwrap().is_null());
+}
+
+#[tokio::test]
+async fn error_response_missing_model_field() {
+    let neuron_url = common::spawn_mock_neuron().await;
+    let gateway_url = common::spawn_gateway(&neuron_url).await;
+
+    let client = reqwest::Client::new();
+
+    // Request without the required `model` field.
+    let resp = client
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), axum::http::StatusCode::BAD_REQUEST);
+
+    let body: serde_json::Value = resp.json().await.expect("valid json");
+    let err = body.get("error").expect("response has error object");
+
+    assert_eq!(err.get("type").unwrap(), "invalid_request_error");
+    assert_eq!(
+        err.get("code").unwrap().as_str().unwrap(),
+        "missing_model_field"
+    );
+    assert!(err.get("param").unwrap().is_null());
+}
+
+#[tokio::test]
+async fn error_response_no_healthy_nodes() {
+    use cortex_core::config::{EvictionSettings, GatewayConfig, GatewaySettings, NeuronEndpoint};
+    use std::sync::Arc;
+
+    // Create a gateway config with a neuron pointing at an unreachable port so no node is ever healthy.
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: cortex_core::config::EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "dead-node".into(),
+            endpoint: "http://127.0.0.1:1".into(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(cortex_gateway::state::CortexState::from_config(&config));
+
+    let app = cortex_gateway::build_app(fleet);
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    // Allow the poller a moment to mark the node unhealthy. NOTE(review): model_limits.rs says build_app spawns no poller — confirm.
+    tokio::time::sleep(std::time::Duration::from_millis(200)).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "model": "any-model",
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), axum::http::StatusCode::SERVICE_UNAVAILABLE);
+
+    // Transient 503 — the gateway advertises Retry-After so OpenAI-compatible
+    // clients back off and retry rather than surfacing an opaque error (#63).
+    let retry_after = resp
+        .headers()
+        .get(reqwest::header::RETRY_AFTER)
+        .expect("transient 503 must carry Retry-After")
+        .to_str()
+        .unwrap()
+        .to_string();
+    assert_eq!(retry_after, "5");
+
+    let body: serde_json::Value = resp.json().await.expect("valid json");
+    let err = body.get("error").expect("response has error object");
+
+    assert_eq!(err.get("type").unwrap(), "api_error");
+    assert_eq!(
+        err.get("code").unwrap().as_str().unwrap(),
+        "service_unavailable"
+    );
+    assert!(err.get("param").unwrap().is_null());
+}
--- a/crates/cortex-gateway/tests/eviction.rs
+++ b/crates/cortex-gateway/tests/eviction.rs
@@ -71,6 +71,7 @@ fn make_fleet(endpoint: &str, defrag_after: u32) -> Arc<CortexState> {
            endpoint: endpoint.to_string(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };
    Arc::new(CortexState::from_config(&config))
 }
@@ -91,6 +92,10 @@ async fn test_evict_lru_model() {
                status: ModelStatus::Loaded,
                last_accessed: Some(Utc::now() - chrono::Duration::hours(2)),
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
        node.models.insert(
@@ -100,6 +105,10 @@ async fn test_evict_lru_model() {
                status: ModelStatus::Loaded,
                last_accessed: Some(Utc::now()),
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -163,6 +172,10 @@ async fn test_eviction_increments_lifecycle_cycles() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
--- a/crates/cortex-gateway/tests/load_routing.rs
+++ b/crates/cortex-gateway/tests/load_routing.rs
@@ -0,0 +1,189 @@
+//! Load-aware routing across replicas (#55).
+//!
+//! When a model is loaded on more than one healthy neuron, the router picks
+//! the least-busy replica using the per-model admission load each neuron
+//! reports on `GET /health` (#53), rather than always taking the first.
+
+mod common;
+
+use axum::Json;
+use axum::extract::Path;
+use axum::http::{StatusCode, header};
+use axum::response::IntoResponse;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::discovery::ModelLoad;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// Seed a node as healthy with `test-model` loaded and a given admission load.
+async fn seed_loaded(fleet: &CortexState, node: &str, in_flight: usize, queue_depth: usize) {
+    let mut nodes = fleet.nodes.write().await;
+    let n = nodes.get_mut(node).expect("node exists");
+    n.healthy = true;
+    n.models.insert(
+        "test-model".into(),
+        ModelEntry {
+            id: "test-model".into(),
+            status: ModelStatus::Loaded,
+            last_accessed: None,
+            vram_estimate_mb: Some(8000),
+            capabilities: Vec::new(),
+            tool_call: false,
+            reasoning: false,
+            limit: None,
+        },
+    );
+    n.model_load.insert(
+        "test-model".into(),
+        ModelLoad {
+            id: "test-model".into(),
+            in_flight,
+            queue_depth,
+        },
+    );
+}
+
+/// Build a gateway state over two mock neurons (no poller; we seed state).
+async fn two_neuron_fleet(endpoint_a: &str, endpoint_b: &str) -> Arc<CortexState> {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![
+            NeuronEndpoint {
+                name: "node-a".into(),
+                endpoint: endpoint_a.to_string(),
+            },
+            NeuronEndpoint {
+                name: "node-b".into(),
+                endpoint: endpoint_b.to_string(),
+            },
+        ],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+    Arc::new(CortexState::from_config(&config))
+}
+
+#[tokio::test]
+async fn routes_to_least_busy_replica() {
+    let neuron_a = common::spawn_mock_neuron().await;
+    let neuron_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&neuron_a, &neuron_b).await;
+
+    // A is busy (1 running + 3 queued), B is idle.
+    seed_loaded(&fleet, "node-a", 1, 3).await;
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("model is loaded on both nodes");
+    assert_eq!(route.node_name, "node-b", "should pick the idle replica");
+
+    // Flip the load: now B is the busy one.
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    seed_loaded(&fleet, "node-b", 1, 5).await;
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("still loaded");
+    assert_eq!(route.node_name, "node-a", "should follow the lighter load");
+}
+
+/// Mock neuron whose inference endpoint always returns a #63 backpressure
+/// envelope (503 + Retry-After) — simulating a saturated neuron.
+async fn spawn_busy_neuron() -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(|| async {
+                let body = json!({"error": {
+                    "message": "model is busy (admission queue full); retry shortly",
+                    "type": "rate_limit_error",
+                    "code": "rate_limit_exceeded",
+                    "param": null
+                }});
+                (
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    [(header::RETRY_AFTER, "6")],
+                    Json(body),
+                )
+                    .into_response()
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    base_url
+}
+
+#[tokio::test]
+async fn neuron_backpressure_is_propagated_intact() {
+    // A saturated neuron's 503 + Retry-After + envelope must reach the client
+    // verbatim — not unwrapped, remapped, or stripped (#55 / #63).
+    let neuron = spawn_busy_neuron().await;
+    let fleet = two_neuron_fleet(&neuron, &neuron).await;
+    seed_loaded(&fleet, "node-a", 1, 8).await;
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let resp = reqwest::Client::new()
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::SERVICE_UNAVAILABLE);
+    assert_eq!(
+        resp.headers()
+            .get(reqwest::header::RETRY_AFTER)
+            .and_then(|v| v.to_str().ok()),
+        Some("6"),
+        "Retry-After must survive the proxy"
+    );
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "rate_limit_exceeded");
+}
+
+#[tokio::test]
+async fn ties_break_deterministically_by_name() {
+    let neuron_a = common::spawn_mock_neuron().await;
+    let neuron_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&neuron_a, &neuron_b).await;
+
+    // Equal load on both → stable pick (lowest node name).
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("loaded");
+    assert_eq!(route.node_name, "node-a", "ties break by name");
+}
--- a/crates/cortex-gateway/tests/metering.rs
+++ b/crates/cortex-gateway/tests/metering.rs
@@ -0,0 +1,207 @@
+//! Integration tests for per-request token metering (#51).
+//!
+//! Drives authenticated requests through the gateway to a mock neuron that
+//! reports a fixed `usage` object, then asserts the EntitlementProvider's
+//! spend ledger reflects cumulative per-key spend and that reservations
+//! settle to actual (no outstanding reserved tokens once requests complete).
+
+mod common;
+
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, Principal};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::json;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::net::TcpListener;
+
+const ACCOUNT: &str = "acct-meter";
+const KEY_ID: &str = "key-meter";
+const BEARER: &str = "sk-meter";
+
+/// The mock neuron (common::spawn_mock_neuron) reports this fixed usage on
+/// every chat completion.
+const PROMPT_PER_REQ: u64 = 10;
+const COMPLETION_PER_REQ: u64 = 5;
+
+async fn spawn_metered_gateway(neuron_url: &str) -> (Arc<CortexState>, String) {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron_url.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig {
+            require_auth: true,
+            keys: vec![ApiKeyConfig {
+                key: BEARER.into(),
+                account_id: ACCOUNT.into(),
+                key_id: Some(KEY_ID.into()),
+                hard_cap: Some(1_000_000),
+                window: CapWindow::Balance,
+            }],
+        },
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (fleet, format!("http://{addr}"))
+}
+
+fn principal() -> Principal {
+    Principal {
+        account_id: ACCOUNT.into(),
+        key_id: KEY_ID.into(),
+    }
+}
+
+/// Poll the ledger until settled spend reaches `expected` or ~2 s pass (settle runs
+/// in a task spawned after the response stream ends); returns the spend last seen.
+async fn await_spent(fleet: &CortexState, expected: u64) -> u64 {
+    let principal = principal();
+    for _ in 0..100 {
+        let snap = fleet.entitlements.snapshot(&principal).await.unwrap();
+        if snap.spent >= expected {
+            return snap.spent;
+        }
+        tokio::time::sleep(Duration::from_millis(20)).await;
+    }
+    fleet.entitlements.snapshot(&principal).await.unwrap().spent
+}
+
+#[tokio::test]
+async fn cumulative_spend_is_metered_per_key() {
+    let neuron = common::spawn_mock_neuron().await;
+    let (fleet, gateway) = spawn_metered_gateway(&neuron).await;
+    let client = reqwest::Client::new();
+
+    const N: u64 = 3;
+    for _ in 0..N {
+        let resp = client
+            .post(format!("{gateway}/v1/chat/completions"))
+            .bearer_auth(BEARER)
+            .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+            .send()
+            .await
+            .unwrap();
+        assert_eq!(resp.status(), reqwest::StatusCode::OK);
+        // Drain the body so the response stream finishes and metering settles.
+        let _ = resp.bytes().await.unwrap();
+    }
+
+    let expected = N * (PROMPT_PER_REQ + COMPLETION_PER_REQ);
+    let spent = await_spent(&fleet, expected).await;
+    assert_eq!(
+        spent, expected,
+        "ledger must reflect cumulative per-key spend"
+    );
+
+    // Reservations settled to actual — nothing left outstanding.
+    let snap = fleet.entitlements.snapshot(&principal()).await.unwrap();
+    assert_eq!(snap.reserved, 0, "all reservations must settle/release");
+    assert_eq!(snap.hard_cap, Some(1_000_000));
+}
+
+#[tokio::test]
+async fn anonymous_request_records_no_spend() {
+    // require_auth=false so the unauthenticated request is served, but with
+    // no principal it must not touch any ledger.
+    let neuron = common::spawn_mock_neuron().await;
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron.clone(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let resp = reqwest::Client::new()
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+
+    // An unconfigured principal has a zeroed snapshot — nothing was metered.
+    let snap = fleet
+        .entitlements
+        .snapshot(&Principal {
+            account_id: "nobody".into(),
+            key_id: "nobody".into(),
+        })
+        .await
+        .unwrap();
+    assert_eq!(snap.spent, 0);
+}
--- a/crates/cortex-gateway/tests/metrics.rs
+++ b/crates/cortex-gateway/tests/metrics.rs
@@ -1,20 +1,26 @@
 mod common;

 use serde_json::json;
+use std::sync::OnceLock;
+
+/// The metrics recorder is a process-wide global; both tests in this
+/// binary run against one shared install. Assertions must therefore be
+/// order-independent (presence of names / monotonic counters, not
+/// "empty before").
+fn recorder() -> &'static metrics_exporter_prometheus::PrometheusHandle {
+    static HANDLE: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
+    HANDLE.get_or_init(|| {
+        cortex_gateway::metrics::install_test_recorder().expect("recorder should install")
+    })
+}

 #[tokio::test]
 async fn test_metrics_emitted_after_proxy() {
-    let handle = cortex_gateway::metrics::install_test_recorder().expect("recorder should install");
+    let handle = recorder();

    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

-    let before = handle.render();
-    assert!(
-        !before.contains("cortex_requests_total"),
-        "no request metrics before any requests"
-    );
-
    let client = reqwest::Client::new();
    let resp = client
        .post(format!("{gw_url}/v1/chat/completions"))
@@ -44,3 +50,72 @@ async fn test_metrics_emitted_after_proxy() {
        "no errors expected for a successful request"
    );
 }
+
+#[tokio::test]
+async fn test_token_metrics_emitted_for_streamed_request() {
+    // #21: a streamed chat completion with a final usage chunk must
+    // produce TTFT + tok/s histograms and prompt/completion token
+    // counters, labelled with model and node. The recorder is global
+    // per-process, and cargo builds one binary per integration-test
+    // file, so this test shares its recorder with
+    // test_metrics_emitted_after_proxy: whichever test runs first
+    // installs it (see `recorder()`), and both render from the same
+    // handle. Assert on metric names and lower bounds, never on
+    // exact totals.
+    let handle = recorder();
+
+    let mock_url = common::spawn_streaming_mock_neuron_with_usage(
+        5,
+        std::time::Duration::from_millis(40),
+        225,
+        42,
+    )
+    .await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/chat/completions"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "Hi"}],
+            "stream": true
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+    assert_eq!(resp.status(), 200);
+    let body = resp.text().await.expect("stream should complete");
+    assert!(body.contains("[DONE]"));
+
+    let rendered = handle.render();
+    for needle in [
+        "cortex_time_to_first_token_seconds",
+        "cortex_tokens_per_second",
+    ] {
+        assert!(
+            rendered.contains(needle),
+            "{needle} should be present.\nMetrics:\n{rendered}"
+        );
+    }
+    // The recorder is shared with the sibling test (same model/node
+    // labels), so counters are lower bounds, not exact values: this
+    // request contributed prompt=225 / completion=42.
+    let counter_value = |name: &str| -> u64 {
+        rendered
+            .lines()
+            .find(|l| l.starts_with(name) && l.contains(r#"model="test-model""#))
+            .and_then(|l| l.rsplit(' ').next())
+            .and_then(|v| v.parse().ok())
+            .unwrap_or_else(|| panic!("{name} should be present.\nMetrics:\n{rendered}"))
+    };
+    assert!(
+        counter_value("cortex_prompt_tokens_total") >= 225,
+        "prompt token counter should include this request's 225.\nMetrics:\n{rendered}"
+    );
+    assert!(
+        counter_value("cortex_completion_tokens_total") >= 42,
+        "completion token counter should include this request's 42.\nMetrics:\n{rendered}"
+    );
+}
--- a/crates/cortex-gateway/tests/model_limits.rs
+++ b/crates/cortex-gateway/tests/model_limits.rs
@@ -0,0 +1,132 @@
+//! Issue #62 / #67: `GET /v1/models` advertises a per-model serving budget so
+//! an OpenAI-compatible client (opencode's helexa provider) can size and
+//! compact its context without hand-configuration.
+//!
+//! Asserts the composition sources land on the response:
+//!   - `limit` from the neuron's self-derived value (#67) — NOT the catalogue;
+//!     an operator-declared catalogue `limit` is deliberately ignored.
+//!   - `cost` from the catalogue profile (operator-set pricing).
+//!   - `tool_call` / `reasoning` from the neuron's runtime detection (OR-ed in)
+//!
+//! Also a regression guard for the removal of `max_model_len` — the misnamed,
+//! unconsumed vLLM-ism that this contract replaces.
+
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::harness::ModelLimit;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+#[tokio::test]
+async fn v1_models_surfaces_limit_cost_and_capability_flags() {
+    // Catalogue declares pricing + an operator `limit` that must be IGNORED
+    // (#67): the neuron's self-derived limit is authoritative.
+    let models_toml = r#"
+[[models]]
+id = "test-model"
+harness = "candle"
+limit.context = 999999
+limit.input = 999999
+limit.output = 999999
+cost.input = 0.0
+cost.output = 0.0
+capabilities = ["text"]
+"#;
+    let cat_path = std::env::temp_dir().join("cortex_test_issue62_models.toml");
+    std::fs::write(&cat_path, models_toml).unwrap();
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            // Never contacted: build_app does not spawn the poller, so the
+            // seeded state below is authoritative for /v1/models.
+            endpoint: "http://127.0.0.1:1".into(),
+        }],
+        models_config: cat_path.to_string_lossy().into_owned(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+
+    // Seed the model as loaded on the node with runtime-detected flags set —
+    // these must OR into the catalogue entry, not be lost.
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node exists");
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: vec!["text".into()],
+                tool_call: true,
+                reasoning: true,
+                // Neuron's self-derived limit (#67) — the authoritative
+                // source. Distinct from the catalogue's (ignored) values.
+                limit: Some(ModelLimit {
+                    context: 49152,
+                    input: Some(40960),
+                    output: 8192,
+                }),
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let body: serde_json::Value = reqwest::Client::new()
+        .get(format!("http://{addr}/v1/models"))
+        .send()
+        .await
+        .unwrap()
+        .json()
+        .await
+        .unwrap();
+
+    let entry = body["data"]
+        .as_array()
+        .expect("data is an array")
+        .iter()
+        .find(|m| m["id"] == "test-model")
+        .expect("test-model present in /v1/models");
+
+    // `limit` is the neuron's self-derived value (#67), NOT the catalogue's
+    // (which declared 999999 and must be ignored). `cost` still flows from
+    // the catalogue.
+    assert_eq!(entry["limit"]["context"], 49152);
+    assert_eq!(entry["limit"]["input"], 40960);
+    assert_eq!(entry["limit"]["output"], 8192);
+    assert_eq!(entry["cost"]["input"], 0.0);
+    assert_eq!(entry["cost"]["output"], 0.0);
+
+    // Runtime-detected capability flags OR-ed in from the neuron's ModelEntry.
+    assert_eq!(entry["tool_call"], true);
+    assert_eq!(entry["reasoning"], true);
+
+    // Regression guard: the removed, unconsumed vLLM-ism must not reappear.
+    assert!(
+        entry.get("max_model_len").is_none(),
+        "max_model_len was removed; /v1/models must not advertise it"
+    );
+
+    let _ = std::fs::remove_file(&cat_path);
+}
--- a/crates/cortex-gateway/tests/poller.rs
+++ b/crates/cortex-gateway/tests/poller.rs
@@ -12,8 +12,8 @@ use std::sync::Arc;
 async fn test_poller_discovers_models() {
    // Mock neuron reports 2 models via /models endpoint (neuron format).
    let mock_url = common::spawn_mock_neuron_with_models(json!([
-        {"id": "model-a", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
-        {"id": "model-b", "harness": "mistralrs", "status": "unloaded", "devices": [], "vram_used_mb": null}
+        {"id": "model-a", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
+        {"id": "model-b", "harness": "candle", "status": "unloaded", "devices": [], "vram_used_mb": null}
    ]))
    .await;

@@ -31,6 +31,7 @@ async fn test_poller_discovers_models() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -63,8 +64,8 @@ async fn test_poller_discovers_models() {
 #[tokio::test]
 async fn test_poller_updates_gateway_models_endpoint() {
    let mock_url = common::spawn_mock_neuron_with_models(json!([
-        {"id": "model-x", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
-        {"id": "model-y", "harness": "mistralrs", "status": "loaded", "devices": [1], "vram_used_mb": null}
+        {"id": "model-x", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
+        {"id": "model-y", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null}
    ]))
    .await;

@@ -82,6 +83,7 @@ async fn test_poller_updates_gateway_models_endpoint() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -118,6 +120,88 @@ async fn test_poller_updates_gateway_models_endpoint() {
    }
 }

+#[tokio::test]
+async fn test_models_endpoint_unions_capabilities_across_nodes() {
+    // C3: two neurons each have the same model loaded but advertise
+    // different capability sets. The gateway's /v1/models must report
+    // the union — a model loaded text-only on one node and
+    // text+vision on another is vision-capable to the fleet.
+    let node_a = common::spawn_mock_neuron_with_models(json!([
+        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null, "capabilities": ["text"]}
+    ]))
+    .await;
+    let node_b = common::spawn_mock_neuron_with_models(json!([
+        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null, "capabilities": ["text", "vision"]}
+    ]))
+    .await;
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![
+            NeuronEndpoint {
+                name: "node-a".into(),
+                endpoint: node_a,
+            },
+            NeuronEndpoint {
+                name: "node-b".into(),
+                endpoint: node_b,
+            },
+        ],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    cortex_gateway::poller::poll_once(&fleet).await;
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let client = reqwest::Client::new();
+    let body: serde_json::Value = client
+        .get(format!("http://{addr}/v1/models"))
+        .send()
+        .await
+        .expect("request should succeed")
+        .json()
+        .await
+        .unwrap();
+
+    let model = body["data"]
+        .as_array()
+        .expect("data array")
+        .iter()
+        .find(|m| m["id"] == "shared-model")
+        .expect("shared-model should be present");
+
+    let caps: Vec<&str> = model["capabilities"]
+        .as_array()
+        .expect("capabilities array")
+        .iter()
+        .filter_map(|c| c.as_str())
+        .collect();
+    assert!(caps.contains(&"text"), "union must include text: {caps:?}");
+    assert!(
+        caps.contains(&"vision"),
+        "union must include vision: {caps:?}"
+    );
+    assert_eq!(caps.len(), 2, "union must not duplicate text: {caps:?}");
+
+    // Both nodes hold the model, so two locations regardless of caps.
+    assert_eq!(model["locations"].as_array().unwrap().len(), 2);
+}
+
 #[tokio::test]
 async fn test_poller_marks_unreachable_node_unhealthy() {
    let config = GatewayConfig {
@@ -134,6 +218,7 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
            endpoint: "http://127.0.0.1:1".into(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -152,8 +237,8 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
 #[tokio::test]
 async fn test_poller_removes_stale_models() {
    let mock_url = common::spawn_mock_neuron_with_models(json!([
-        {"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
-        {"id": "drop-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
+        {"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
+        {"id": "drop-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
    ]))
    .await;

@@ -171,6 +256,7 @@ async fn test_poller_removes_stale_models() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -183,7 +269,7 @@ async fn test_poller_removes_stale_models() {

    // New mock with only one model.
    let new_mock_url = common::spawn_mock_neuron_with_models(json!([
-        {"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
+        {"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
    ]))
    .await;

@@ -201,6 +287,7 @@ async fn test_poller_removes_stale_models() {
            endpoint: new_mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet2 = Arc::new(CortexState::from_config(&config2));
@@ -216,6 +303,10 @@ async fn test_poller_removes_stale_models() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
        node.models.insert(
@@ -225,6 +316,10 @@ async fn test_poller_removes_stale_models() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -237,3 +332,96 @@ async fn test_poller_removes_stale_models() {
    assert!(node.models.contains_key("keep-me"));
    assert!(!node.models.contains_key("drop-me"));
 }
+
+#[tokio::test]
+async fn test_poller_captures_activation_from_health() {
+    // Mock neuron is mid-prewarm: /models reports nothing (the loading
+    // model hasn't been inserted into the harness map yet), but
+    // /health's activation says model-x is in_progress and model-y is
+    // queued behind it.
+    let mock_url = common::spawn_mock_neuron_with_models_and_health(
+        json!([]),
+        json!({
+            "uptime_secs": 30,
+            "devices": [],
+            "activation": {
+                "state": "pre_warming",
+                "pending": ["Qwen/model-y"],
+                "in_progress": "Qwen/model-x",
+                "completed": [],
+                "failed": []
+            }
+        }),
+    )
+    .await;
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "prewarm-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    cortex_gateway::poller::poll_once(&fleet).await;
+
+    let nodes = fleet.nodes.read().await;
+    let node = nodes.get("prewarm-node").unwrap();
+    assert!(node.healthy);
+    // /models was empty — no entries in the per-node model map.
+    assert!(node.models.is_empty());
+    // But /health's activation should be captured.
+    let activation = node
+        .activation
+        .as_ref()
+        .expect("activation should be populated after /health poll");
+    assert_eq!(activation.in_progress.as_deref(), Some("Qwen/model-x"));
+    assert_eq!(activation.pending, vec!["Qwen/model-y".to_string()]);
+}
+
+#[tokio::test]
+async fn test_poller_parses_recovering_status() {
+    // #20: a model auto-recovering on a neuron (poisoned → unload →
+    // reload, #17) is reported with status "recovering" and must land
+    // in gateway state as the dedicated Recovering status — not fall
+    // through the parser's catch-all to Loaded.
+    let mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "model-r", "harness": "candle", "status": "recovering", "devices": [0, 1], "vram_used_mb": null}
+    ]))
+    .await;
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "test-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    cortex_gateway::poller::poll_once(&fleet).await;
+
+    let nodes = fleet.nodes.read().await;
+    let node = nodes.get("test-node").unwrap();
+    let model_r = node.models.get("model-r").expect("model-r should exist");
+    assert_eq!(model_r.status, ModelStatus::Recovering);
+}
--- a/crates/cortex-gateway/tests/prompt_prevalidation.rs
+++ b/crates/cortex-gateway/tests/prompt_prevalidation.rs
@@ -0,0 +1,174 @@
+//! Fail-fast prompt pre-validation + advisory client hints (#56).
+//!
+//! cortex refuses a prompt that already exceeds the model's advertised
+//! context window before dispatching to neuron — the same #60
+//! `context_length_exceeded` envelope neuron would emit, just earlier — and
+//! attaches an advisory `X-Helexa-Advice` header for fingerprinted clients.
+
+use axum::Json;
+use axum::extract::Path;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::harness::ModelLimit;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use tokio::net::TcpListener;
+
+/// Mock neuron with a hit counter, so a test can prove a request was (or
+/// wasn't) dispatched past the gateway's pre-validation.
+async fn spawn_counting_neuron() -> (String, Arc<AtomicU64>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let hits = Arc::new(AtomicU64::new(0));
+    let sink = Arc::clone(&hits);
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move || {
+                let sink = Arc::clone(&sink);
+                async move {
+                    sink.fetch_add(1, Ordering::SeqCst);
+                    Json(json!({
+                        "id": "c", "object": "chat.completion", "created": 1_700_000_000_u64,
+                        "model": "test-model",
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}],
+                        "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}
+                    }))
+                }
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (base_url, hits)
+}
+
+/// Gateway over one neuron with `test-model` loaded and an advertised
+/// context window of `context` tokens (pass a tiny value to force overflow).
+async fn spawn_gateway(neuron: &str, context: usize) -> String {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let n = nodes.get_mut("mock-node").unwrap();
+        n.healthy = true;
+        n.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: Some(ModelLimit {
+                    context,
+                    input: None,
+                    output: 16,
+                }),
+            },
+        );
+    }
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    format!("http://{addr}")
+}
+
+#[tokio::test]
+async fn over_long_prompt_is_rejected_before_dispatch() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 50).await; // tiny 50-token window
+
+    // ~1200 chars → ~300 est tokens, well over 50.
+    let big = "word ".repeat(240);
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .header("user-agent", "litellm/1.0")
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": big}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::BAD_REQUEST);
+    // Advisory hint for the fingerprinted client (header only, never body).
+    assert!(
+        resp.headers().get("x-helexa-advice").is_some(),
+        "litellm should get advice"
+    );
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "context_length_exceeded");
+    assert_eq!(body["error"]["max"], 50);
+    // Refused at the edge — neuron never saw it.
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn within_context_passes_through() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 4096).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    assert_eq!(hits.load(Ordering::SeqCst), 1, "served by neuron");
+}
+
+#[tokio::test]
+async fn unknown_client_gets_no_advice_header() {
+    let (neuron, _hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 50).await;
+
+    let big = "word ".repeat(240);
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        // no/unknown User-Agent → no advice, but still a clean 400
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": big}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::BAD_REQUEST);
+    assert!(resp.headers().get("x-helexa-advice").is_none());
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "context_length_exceeded");
+}
--- a/crates/cortex-gateway/tests/proxy_basic.rs
+++ b/crates/cortex-gateway/tests/proxy_basic.rs
@@ -117,6 +117,7 @@ async fn test_no_healthy_nodes() {
            endpoint: "http://127.0.0.1:1".into(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };
    let fleet = std::sync::Arc::new(cortex_gateway::state::CortexState::from_config(&config));

@@ -139,7 +140,7 @@ async fn test_no_healthy_nodes() {
        .await
        .expect("request should succeed");

-    assert_eq!(resp.status(), 404);
+    assert_eq!(resp.status(), 503);

    let body: serde_json::Value = resp.json().await.unwrap();
    assert!(
@@ -171,3 +172,67 @@ async fn test_missing_model_field() {
    let body: serde_json::Value = resp.json().await.unwrap();
    assert!(body["error"]["message"].as_str().unwrap().contains("model"));
 }
+
+#[tokio::test]
+async fn test_recovering_model_returns_503_and_stays_listed() {
+    // #20: while a model auto-recovers on a neuron, the gateway must
+    // hold the route — transient 503 ("retry shortly"), not the 404
+    // "not found on any node" that makes a recovering model look
+    // evicted — and keep listing it on /v1/models.
+    let mock_url = common::spawn_mock_neuron().await;
+    let (fleet, gw_url) = common::spawn_gateway_with_state(&mock_url).await;
+
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node must exist");
+        node.models.insert(
+            "recovering-model".into(),
+            cortex_core::node::ModelEntry {
+                id: "recovering-model".into(),
+                status: cortex_core::node::ModelStatus::Recovering,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/chat/completions"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "recovering-model",
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 503);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    let message = body["error"]["message"].as_str().unwrap();
+    assert!(
+        message.contains("recovering") && message.contains("retry"),
+        "503 body must say recovering/retry, got: {message}"
+    );
+
+    // The model must still be visible on the unified models endpoint.
+    let models: serde_json::Value = client
+        .get(format!("{gw_url}/v1/models"))
+        .send()
+        .await
+        .expect("models request should succeed")
+        .json()
+        .await
+        .unwrap();
+    let listed = models["data"]
+        .as_array()
+        .unwrap()
+        .iter()
+        .any(|m| m["id"] == "recovering-model");
+    assert!(listed, "recovering model must stay listed on /v1/models");
+}
--- a/crates/cortex-gateway/tests/responses.rs
+++ b/crates/cortex-gateway/tests/responses.rs
@@ -0,0 +1,91 @@
+//! Integration tests for the `/v1/responses` proxy route.
+//!
+//! The gateway forwards the request body to whichever neuron has the
+//! model loaded. These tests exercise the routing decision (200 on a
+//! known model, 404 on an unknown model, 400 on a missing model
+//! field) and confirm the response body round-trips verbatim.
+
+mod common;
+
+use serde_json::json;
+
+/// Happy path: gateway routes a `/v1/responses` request to the neuron
+/// that has the model loaded, and the neuron's response body
+/// arrives at the client unchanged.
+#[tokio::test]
+async fn test_responses_proxy() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.expect("valid JSON response");
+    assert_eq!(body["id"], "resp-test-001");
+    assert_eq!(body["object"], "response");
+    assert_eq!(body["model"], "test-model");
+    assert_eq!(body["status"], "completed");
+    assert_eq!(
+        body["output"][0]["content"][0]["text"],
+        "Hello from mock backend"
+    );
+    // The usage shape is the Responses-specific one (input/output_tokens),
+    // not the chat-completions one (prompt/completion_tokens); this checks
+    // that the proxy didn't accidentally route through the wrong handler.
+    assert_eq!(body["usage"]["total_tokens"], 10);
+    assert!(body["usage"].get("input_tokens").is_some());
+}
+
+/// A request that targets a model not present in the catalogue gets
+/// 404 from the router. This matches the chat-completions handler's
+/// behaviour — same error path, same status code, so a client can
+/// share retry logic across the two routes.
+#[tokio::test]
+async fn test_responses_model_not_found() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .json(&json!({
+            "model": "not-in-catalogue",
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 404);
+}
+
+/// A request body without a `model` field can't be routed; the
+/// gateway returns 400 before reaching a backend. Same as the
+/// chat-completions handler — the model is extracted via the same
+/// `extract_model` helper.
+#[tokio::test]
+async fn test_responses_missing_model_field() {
+    let mock_url = common::spawn_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/responses"))
+        .json(&json!({
+            "input": "Hi"
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 400);
+}
--- a/crates/cortex-gateway/tests/streaming.rs
+++ b/crates/cortex-gateway/tests/streaming.rs
@@ -51,18 +51,18 @@ async fn test_streaming_sse_passthrough() {
    }

    assert!(
-        chunks.len() >= chunk_count + 1,
-        "expected at least {} chunks (got {}): {:?}",
-        chunk_count + 1,
+        chunks.len() > chunk_count,
+        "expected more than {} chunks (got {}): {:?}",
+        chunk_count,
        chunks.len(),
        chunks,
    );

    assert_eq!(chunks.last().unwrap(), "[DONE]");

-    for i in 0..chunk_count {
+    for (i, chunk) in chunks.iter().enumerate().take(chunk_count) {
        let chunk_json: serde_json::Value =
-            serde_json::from_str(&chunks[i]).expect("chunk should be valid JSON");
+            serde_json::from_str(chunk).expect("chunk should be valid JSON");
        assert_eq!(
            chunk_json["choices"][0]["delta"]["content"],
            format!("token{i}")
--- a/crates/helexa-acp/Cargo.toml
+++ b/crates/helexa-acp/Cargo.toml
@@ -0,0 +1,48 @@
+[package]
+name = "helexa-acp"
+version = "0.1.16"
+edition = "2024"
+license = "Apache-2.0"
+repository = "https://git.lair.cafe/helexa/helexa"
+description = """
+Agent Client Protocol bridge for the helexa self-hosted LLM stack.
+Speaks ACP to ACP-compatible editor clients (Zed, etc.) and forwards
+the conversation to any OpenAI-compatible HTTP endpoint — defaulting
+to cortex (helexa's reverse-proxy / fleet gateway).
+"""
+
+# This crate is intentionally self-contained — no dependencies on other
+# workspace crates (cortex-core, cortex-gateway, neuron). The goal is
+# a painless migration to a dedicated GitHub repo in the future if the
+# project grows beyond helexa's needs. All deps are crates.io.
+[dependencies]
+# `unstable_session_model` flips on the SessionModelState type and the
+# session/set_model RPC the model-picker dropdown in Zed needs. The
+# feature is upstream-marked unstable; we accept that risk because the
+# model picker is core UX and the alternative (rolling our own
+# extension method) drifts further from spec each time it moves.
+agent-client-protocol = { version = "0.12", features = ["unstable_session_model"] }
+tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "io-util", "process", "signal"] }
+reqwest = { version = "0.12", features = ["json", "stream", "rustls-tls"], default-features = false }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+toml = "0.8"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+anyhow = "1"
+thiserror = "2"
+async-trait = "0.1"
+futures = "0.3"
+tokio-stream = "0.1"
+tokio-util = { version = "0.7", features = ["rt"] }
+eventsource-stream = "0.2"
+async-stream = "0.3"
+url = { version = "2", features = ["serde"] }
+# Already transitively pulled via the ACP SDK; declared directly so we
+# can format ISO 8601 timestamps for `SessionInfo.updated_at` in the
+# session/list response.
+chrono = { version = "0.4", default-features = false, features = ["std"] }
+
+[[bin]]
+name = "helexa-acp"
+path = "src/main.rs"
--- a/crates/helexa-acp/README.md
+++ b/crates/helexa-acp/README.md
@@ -0,0 +1,546 @@
+# helexa-acp
+
+ACP (Agent Client Protocol) bridge for editors like
+[Zed](https://zed.dev). Lets you point your editor's agent panel at
+**any combination** of OpenAI-compatible, OpenAI Responses, and
+Anthropic Messages endpoints — public APIs, private LAN deployments,
+local Ollama / LM Studio — and switch between them per session via a
+model dropdown.
+
+The "missing ACP binary" for users who don't want to be locked into
+one vendor's agent client.
+
+```
+       ┌───────────────────────────────────┐
+       │  Zed (or any ACP editor client)   │
+       └────────────┬──────────────────────┘
+                    │  stdio JSON-RPC (ACP)
+                    ▼
+            ┌─────────────────┐
+            │   helexa-acp    │  ← one binary, multi-endpoint
+            └─────┬───────────┘
+                  │  HTTP / SSE
+         ┌────────┼─────────────┬──────────────┬──────────────┐
+         ▼        ▼             ▼              ▼              ▼
+    cortex/    OpenAI       Anthropic      OpenRouter    LM Studio
+    neuron    Responses    Messages
+   (self-     (gpt-5,…)    (Claude)
+    hosted)
+```
+
+## What it does
+
+- **Speaks ACP** over stdio to editor clients (Zed today; any future
+  ACP client tomorrow).
+- **Multi-endpoint** — one config file lists every LLM endpoint
+  you want available; pick one per session via the model dropdown
+  (`endpoint:model` selector).
+- **Three wire formats**: `openai-chat` (the broadly compatible
+  default), `openai-responses` (newer OpenAI surface), and
+  `anthropic-messages` (Claude). Each is a separate provider impl
+  in `src/provider/`; adding a fourth (Gemini, Ollama native, …) is
+  one file plus a `WireApi` enum variant.
+- **Built-in tools**: `read_file`, `write_file`, `edit_file`,
+  `list_dir`, `bash`. Permission-gated by default; the editor user
+  approves writes/shell per-call.
+- **Three session modes**: Default (gated), Bypass Permissions
+  (auto-allow), and Plan (writes only to the plan dir, no shell).
+- **Vision** — drag-drop images into the agent panel against any
+  vision-capable model.
+- **Session resume** — multi-day conversations survive editor
+  restarts via on-disk transcript persistence.
+- **Context compaction** — rolling history stays inside the model's
+  context window automatically so long sessions on small-context
+  local models don't fall over.
+
+## Install
+
+### From source
+
+```sh
+git clone https://git.lair.cafe/helexa/helexa.git
+cd helexa
+cargo install --path crates/helexa-acp
+# Binary lands at ~/.cargo/bin/helexa-acp
+```
+
+### Pre-built RPM (Fedora 43)
+
+```sh
+dnf copr enable helexa/helexa
+dnf install helexa-acp
+```
+
+The COPR project bundles helexa-acp alongside the cortex gateway
+and helexa-neuron flavours; install only the package(s) you need.
+
+## Quick start
+
+The fastest path: env-var single-endpoint config.
+
+```sh
+export HELEXA_ACP_BASE_URL=http://hanzalova.internal:31313/v1
+export HELEXA_ACP_MODEL=Qwen/Qwen3.6-27B
+helexa-acp  # speaks ACP over stdin/stdout; not interactive
+```
+
+Then in Zed (`~/.config/zed/settings.json`):
+
+```jsonc
+{
+  "agent_servers": {
+    "helexa": {
+      "command": "helexa-acp",
+      "args": []
+    }
+  }
+}
+```
+
+Restart Zed → open the agent panel → pick "helexa" → start
+chatting. Tool calls (file reads, writes, bash) prompt for
+permission per-call in Default mode.
+
+That's the minimum. The full config story below is what unlocks
+the multi-endpoint dropdown.
+
+## Multi-endpoint config
+
+Copy `helexa-acp.example.toml` from this repo to
+`$XDG_CONFIG_HOME/helexa-acp/config.toml` (typically
+`~/.config/helexa-acp/config.toml`) and edit:
+
+```toml
+default_endpoint = "helexa"
+
+[[endpoints]]
+name = "helexa"
+base_url = "http://hanzalova.internal:31313/v1"
+wire_api = "openai-chat"
+default_model = "Qwen/Qwen3.6-27B"
+max_tokens = 8192
+context_window = 32768
+
+[[endpoints]]
+name = "openrouter"
+base_url = "https://openrouter.ai/api/v1"
+wire_api = "openai-chat"
+api_key_env = "OPENROUTER_API_KEY"
+default_model = "anthropic/claude-opus-4"
+
+[[endpoints]]
+name = "anthropic"
+base_url = "https://api.anthropic.com/v1"
+wire_api = "anthropic-messages"
+api_key_env = "ANTHROPIC_API_KEY"
+default_model = "claude-opus-4"
+```
+
+Restart Zed. The model dropdown lists every model from every
+configured endpoint with the `endpoint:model` selector
+(`helexa:Qwen/Qwen3.6-27B`, `openrouter:anthropic/claude-opus-4`,
+…). Switch mid-session; the next prompt routes to the new endpoint.
+
+When only one endpoint is configured the prefix is dropped (model
+ids appear bare).
+
+### Selector syntax
+
+The `model` field on every internal request is parsed as
+`<endpoint>:<model>`:
+
+- `openrouter:gpt-4o` → routes to the `openrouter` endpoint,
+  model `gpt-4o`.
+- `helexa/large` → no colon → falls through to whichever endpoint
+  is named in `default_endpoint`, model `helexa/large`.
+- `:gpt-5` → leading colon → also falls through to default.
+
+## Endpoint cookbook
+
+Copy-pasteable blocks. Mix and match.
+
+### cortex / neuron (self-hosted)
+
+```toml
+[[endpoints]]
+name = "helexa"
+base_url = "http://hanzalova.internal:31313/v1"
+wire_api = "openai-chat"
+default_model = "Qwen/Qwen3.6-27B"
+max_tokens = 8192
+context_window = 32768
+```
+
+Use `openai-responses` instead of `openai-chat` once cortex 0.1.16+
+is deployed and you want the Responses API surface (vision item
+shape, structured reasoning items, etc.).
+
+### OpenAI directly
+
+```toml
+[[endpoints]]
+name = "openai"
+base_url = "https://api.openai.com/v1"
+wire_api = "openai-responses"
+api_key_env = "OPENAI_API_KEY"
+default_model = "gpt-5"
+```
+
+`openai-responses` is the right choice for current OpenAI models;
+`openai-chat` works against legacy GPT-3.5/4 deployments and
+anything labelled "chat completions".
+
+### Anthropic directly
+
+```toml
+[[endpoints]]
+name = "anthropic"
+base_url = "https://api.anthropic.com/v1"
+wire_api = "anthropic-messages"
+api_key_env = "ANTHROPIC_API_KEY"
+default_model = "claude-opus-4"
+```
+
+helexa-acp sends `x-api-key` + `anthropic-version: 2023-06-01`
+automatically. The `api_key_env` indirection keeps your key out of
+the config file.
+
+### OpenRouter (multi-vendor proxy)
+
+```toml
+[[endpoints]]
+name = "openrouter"
+base_url = "https://openrouter.ai/api/v1"
+wire_api = "openai-chat"
+api_key_env = "OPENROUTER_API_KEY"
+default_model = "anthropic/claude-opus-4"
+```
+
+OpenRouter speaks OpenAI-compat for every model it fronts, so
+`openai-chat` is the right wire format regardless of the
+underlying vendor.
+
+### LM Studio (local)
+
+```toml
+[[endpoints]]
+name = "lmstudio"
+base_url = "http://localhost:1234/v1"
+wire_api = "openai-chat"
+default_model = "auto"
+```
+
+LM Studio's "auto" model id picks whatever's loaded. Same shape
+works for Ollama in compat mode (`http://localhost:11434/v1`) and
+vLLM.
+
+### Multiple cortex deployments
+
+```toml
+[[endpoints]]
+name = "lan"
+base_url = "http://hanzalova.internal:31313/v1"
+wire_api = "openai-chat"
+default_model = "Qwen/Qwen3.6-27B"
+
+[[endpoints]]
+name = "cloud"
+base_url = "https://cortex.example.com/v1"
+wire_api = "openai-chat"
+api_key_env = "CLOUD_CORTEX_KEY"
+default_model = "Qwen/Qwen3-VL-8B"
+```
+
+Use the `endpoint:model` selector to switch between them mid-session.
+
+## Zed setup
+
+`~/.config/zed/settings.json`:
+
+```jsonc
+{
+  "agent_servers": {
+    "helexa": {
+      "command": "helexa-acp"
+    }
+  }
+}
+```
+
+Optional environment overrides for the binary:
+
+```jsonc
+{
+  "agent_servers": {
+    "helexa": {
+      "command": "helexa-acp",
+      "env": {
+        "HELEXA_ACP_LOG_FILE": "/tmp/helexa-acp.log",
+        "RUST_LOG": "helexa_acp=debug"
+      }
+    }
+  }
+}
+```
+
+`HELEXA_ACP_LOG_FILE` is the one you actually want — Zed doesn't
+surface the agent's stderr, so without that env var debug output is
+invisible. Point it at a file you can `tail -f`.
+
+After restarting Zed: ⌘+? (or wherever your "Open Agent Panel"
+binding is) → select "helexa" → the model dropdown populates from
+your config → start prompting.
+
+## Modes
+
+Three session modes ship; the user picks via Zed's mode dropdown
+on the agent panel.
+
+| Mode | Reads | Writes | Bash | Permission prompts |
+|------|-------|--------|------|--------------------|
+| **Default** | ✓ | with prompt | with prompt | per call |
+| **Bypass Permissions** | ✓ | ✓ | ✓ | never |
+| **Plan** | ✓ | only into plan dir | disabled | never (plan-dir writes auto-allow) |
+
+### Default
+
+Reads are always allowed (`read_file`, `list_dir` are
+unrestricted). Writes and shell commands prompt the user before
+running. The intended baseline for any session where the agent
+might do something you'd rather review first.
+
+### Bypass Permissions
+
+Auto-allow every tool call. Use for agentic loops you trust — bulk
+edits across many files, scripted workflows, prepared session
+templates. Never for code the agent hasn't seen before.
+
+### Plan
+
+The "draft an implementation plan before you write code" mode.
+Available tools:
+
+- `read_file`, `list_dir`: unrestricted (read the codebase).
+- `write_file`, `edit_file`: allowed *only* under
+  `$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`. Any path
+  outside that returns "plan mode: writes are restricted to …"
+  back to the model so it self-corrects.
+- `bash`: disabled outright. Returns "plan mode: shell execution
+  is disabled" if attempted.
+
+When the plan is complete, the model presents a 3-option menu:
+
+1. **Bypass Permissions** — implement the plan now, no prompts.
+2. **Default** — implement now with per-tool prompts.
+3. **Plan** (stay here) — refine the plan with more guidance.
+
+Switch the mode dropdown to your preference and reply to proceed.
+
+## Tools
+
+Five tools, defined in `src/tools.rs`:
+
+| Tool | Args | Gated in Default? |
+|------|------|-------------------|
+| `read_file` | `path`, `line?`, `limit?` | no |
+| `list_dir` | `path` | no |
+| `write_file` | `path`, `content` | yes |
+| `edit_file` | `path`, `old_text`, `new_text` | yes |
+| `bash` | `command`, `cwd?` | yes |
+
+### Path handling
+
+`~`, `~/`, `$HOME`, and `$HOME/` are expanded server-side before
+the path reaches ACP or local fs. Lets the model emit
+`~/git/repo/file.rs` and have it Just Work.
+
+`read_file` first tries the editor's filesystem (ACP's
+`fs/read_text_file` — respects open buffers, workspace overlays,
+etc.). If that fails — typically because the path is outside Zed's
+workspace boundary — it falls back to `std::fs::read_to_string`.
+This lets the agent pull in shared material like
+`~/git/architecture/generic.md` from a different project's
+session.
+
+The fallback is logged at warn level so you can see when it kicks
+in.
+
+### Tool dispatch
+
+Tool descriptions reach the model through a Qwen3 Hermes-format
+`# Tools` block injected into the system prompt. cortex/neuron
+pass the OpenAI `tools` request field through to the encoder
+unread, so instead the model is prompted to emit
+`<tool_call>{json}</tool_call>` markers, which helexa-acp then
+parses out of the content stream. This applies to the helexa wire
+format; OpenAI / Anthropic endpoints with native tool support
+would use their own paths once they're wired in.
+
+The parser is tolerant: malformed JSON (trailing braces, missing
+`name`, name nested in `arguments`) gets a repair pass; if that
+fails the call surfaces as a "Malformed tool call" card in Zed and
+the model gets a synthetic error result so it can self-correct.
+
+## Session resume
+
+helexa-acp persists every session to
+`$XDG_DATA_HOME/helexa-acp/sessions/<id>.json`. Zed's `session/list`
+RPC asks helexa-acp to enumerate them on workspace open;
+`session/load` rehydrates and replays the transcript as
+`session/update` notifications so the agent panel renders the
+prior conversation.
+
+Behaviour:
+
+- Persisted per-round, so a mid-turn agent stall (long bash, wedged
+  ACP roundtrip) doesn't lose earlier rounds.
+- Survives editor restart and the helexa-acp binary upgrading
+  between versions.
+- Project-scoped: only sessions whose `cwd` matches the workspace
+  are listed.
+
+To wipe history: `rm -rf $XDG_DATA_HOME/helexa-acp/sessions/`.
+
+## Context compaction
+
+When an endpoint sets `context_window`, helexa-acp projects the
+rolling history into a token budget before each request — old
+`ToolResult` content (read_file payloads are the worst offenders)
+gets elided to one-line markers, preserving `tool_call_id` pairing
+so the wire schema stays valid.
+
+System prompts, user turns, and the most recent ~4 messages are
+never elided. The full history stays on disk; compaction is a
+per-request projection, not a destructive edit.
+
+Set `context_window = 32768` for a 32 K Qwen3, `131072` for a
+modern Claude, etc. With `max_tokens` also set, the budget is
+`context_window - max_tokens - 512_safety`.
+
+## Troubleshooting
+
+### "default endpoint 'helexa' has no usable provider — check config"
+
+The named default endpoint failed to construct. Usually:
+
+- `api_key_env` references a variable that isn't set in the env
+  Zed launched helexa-acp with.
+- The TOML's `wire_api` is misspelled (only `openai-chat`,
+  `openai-responses`, `anthropic-messages` are accepted).
+
+Test by running `helexa-acp` directly from a shell — startup
+errors land on stderr.
+
+### Model dropdown is empty
+
+Each provider's `list_models` failed at startup. Look at
+`HELEXA_ACP_LOG_FILE` for "list_models failed; this endpoint's
+models won't appear in the picker". Likely the endpoint URL is
+wrong, the API key is invalid, or the upstream `/v1/models`
+endpoint isn't responding.
+
+The agent still works against `default_model` even when the
+dropdown is empty — list-models is for picking, not routing.
+
+### "prompt_too_long" / agent stalls mid-conversation
+
+You hit the model's context window. Set `context_window` on the
+endpoint and helexa-acp will compact before sending. The log line
+`context compaction applied` confirms it's running; if it fires
+but the upstream still rejects, the compaction heuristic
+under-counted and the budget needs tuning down.
+
+### Reading files outside the workspace returns "not found"
+
+Zed's `fs/read_text_file` is workspace-scoped. helexa-acp falls
+back to local `std::fs` automatically when that fails — look for
+`fs/read_text_file failed; falling back to local std::fs` in the
+log. If even local read fails, the file genuinely doesn't exist
+or the user process lacks permissions.
+
+### Tool calls render as text instead of structured cards
+
+The model is emitting `<tool_call>` markers that the parser can't
+decode. Two common causes:
+
+1. The system prompt isn't reaching the model (cortex/neuron's
+   tool-block injection didn't fire). Confirm with
+   `RUST_LOG=helexa_acp=debug` and look at the outgoing
+   `POST /chat/completions` body.
+2. The model itself is too small / undertrained to follow the
+   Hermes format reliably. helexa-acp has shape-based name
+   inference and JSON repair, but there's a floor below which
+   nothing helps.
+
+### Plan-mode writes refused even inside the plan dir
+
+The path comparison is byte-for-byte. `~` / `$HOME` expansion runs
+*before* the comparison, so a `~`-prefixed path from the model
+still matches the expanded plan dir — but a path that reaches the
+plan dir through a symlink (or vice versa) can still fail to
+match. The error message names the attempted path and the
+expected prefix so you can compare directly.
+
+## Architecture
+
+Source layout under `crates/helexa-acp/src/`:
+
+| File | Responsibility |
+|------|----------------|
+| `main.rs` | tokio + Stdio transport. Builds providers, hands off to `agent::Agent` |
+| `config.rs` | TOML + env-fallback config, endpoint resolver |
+| `agent.rs` | ACP handlers (initialize, session/new, session/prompt, session/cancel, session/set_mode, session/set_model, session/load, session/list), prompt loop with tool-call recursion |
+| `session.rs` | Per-session state map (Arc<RwLock<HashMap<…>>>) |
+| `store.rs` | On-disk session persistence, plan-dir resolution |
+| `prompt.rs` | System-prompt assembly, plan-mode addendum |
+| `tools.rs` | Tool schemas + shape-based name inference |
+| `tool_runner.rs` | Dispatch a single tool call through ACP client RPCs; permission gate |
+| `qwen3.rs` | Qwen3 Hermes tool-format parser (`<tool_call>` / `<think>` markers) |
+| `compaction.rs` | Token-budget compaction for the rolling history |
+| `path_util.rs` | `~` / `$HOME` expansion shared across every path-taking tool |
+| `provider/openai_chat.rs` | OpenAI chat completions provider |
+| `provider/openai_responses.rs` | OpenAI Responses API provider |
+| `provider/anthropic_messages.rs` | Anthropic Messages API provider |
+
+### Adding a new wire format
+
+1. New file under `src/provider/` implementing the `Provider`
+   trait (encoder + SSE decoder).
+2. Add a `WireApi` variant in `config.rs`.
+3. Wire it into `build_provider` in `main.rs`.
+4. Done — every other module is wire-format-agnostic.
+
+### Concurrency
+
+- `Arc<RwLock<HashMap<SessionId, Arc<Mutex<SessionState>>>>>` —
+  per-session mutex so concurrent requests across sessions don't
+  contend; the map's RwLock is read-mostly.
+- Every tool call is dispatched serially within a session
+  (parallel dispatch would require Zed to handle interleaved
+  permission prompts).
+- Provider streams are back-pressured by the consumer (bounded
+  mpsc channels).
+
+### Self-contained
+
+The crate has no workspace-internal dependencies (no
+`cortex-core`, no `cortex-gateway`). Migration to a dedicated
+GitHub repo for cross-platform CI / cargo-dist binaries is
+Cargo.toml-only.
+
+## Status
+
+- Stages 1–6 shipped: scaffold, agent loop, tools, modes, session
+  resume, image input, model picker, three wire formats.
+- Stage 8 (RPM + multi-platform CI) tracked in the canonical plan;
+  Linux x86_64 RPM ships today via the cortex monorepo's Gitea
+  Actions.
+
+## Contributing
+
+Repository: https://git.lair.cafe/helexa/helexa (`crates/helexa-acp/`).
+Issues / PRs welcome. The canonical staged plan is in
+`~/.claude/plans/plan-the-per-device-worker-abstract-micali.md` on
+the maintainer's machine; the substages 3a–3e and 6a/6b that the
+canonical plan didn't anticipate are documented in commit messages.
+
+CI: `cargo fmt --check --all`, `cargo clippy --workspace -- -D
+warnings`, `cargo test --workspace` must all pass before merge.
--- a/crates/helexa-acp/src/agent.rs
+++ b/crates/helexa-acp/src/agent.rs
--- a/crates/helexa-acp/src/compaction.rs
+++ b/crates/helexa-acp/src/compaction.rs
@@ -0,0 +1,425 @@
+//! Rolling-conversation compaction for small-context local models.
+//!
+//! The tool-call loop in [`crate::agent`] grows the message vec it
+//! sends upstream every round. On a frontier model that's fine; on a
+//! 32 K Qwen3 the first few `read_file` results can push the prompt
+//! past the model's context window, at which point cortex/neuron
+//! refuses with `prompt_too_long` and the whole turn dies. Long-form
+//! local agents are unusable without something here.
+//!
+//! Strategy (intentionally simple — no LLM-summarization round-trip,
+//! no tokenizer dependency):
+//!
+//! 1. **Protect** the things the model cannot reason without:
+//!    - The system prompt (idx 0).
+//!    - Every `Role::User` turn (the user's intent — irreplaceable).
+//!    - The last [`KEEP_TAIL`] messages (most recent rounds stay
+//!      verbatim so the model can keep working on what it just
+//!      observed).
+//! 2. **Elide** older `Role::Assistant` prose and older `Role::Tool`
+//!    result content. The structure stays — `tool_call_id`s, tool
+//!    names, and argument JSON survive intact — so OpenAI's strict
+//!    `tool_calls` ↔ `tool` pairing schema remains satisfied. Only
+//!    the *payload* shrinks to a one-line marker.
+//! 3. Walk oldest→newest, recomputing the budget after each elision.
+//!    Stop as soon as we fit; we don't compact more than necessary.
+//! 4. If we still exceed budget after eliding everything we're
+//!    allowed to, return what we have. The upstream will surface a
+//!    `prompt_too_long` error and the user can intervene; that's
+//!    better than silently dropping content the model needs.
+//!
+//! Token estimation uses a `chars / 3.5` heuristic — conservative
+//! (over-estimates tokens slightly) so we compact a touch early
+//! rather than a touch late.
+
+use crate::provider::{Message, MessageContent, MessagePart, Role};
+
+/// Number of most-recent messages that are never elided. Roughly
+/// "the current tool round in flight" — the assistant turn that
+/// called the tools + each tool result + a bit of slack.
+const KEEP_TAIL: usize = 4;
+
+/// Payloads shorter than this (measured with `len()`, i.e. in
+/// bytes) are left alone — the savings don't outweigh the loss of
+/// detail. Roughly 60–80 tokens.
+const ELIDE_MIN_CHARS: usize = 256;
+
+/// Approximate characters per token for mixed English + code. The
+/// actual per-tokenizer ratio varies (GPT-4o ≈ 4 chars/token on
+/// English prose, ≈ 3 chars/token on code-heavy text). We pick a
+/// value on the conservative end so the budget check fires *before*
+/// the upstream tokenizer says no.
+const CHARS_PER_TOKEN: f32 = 3.5;
+
+/// Per-message envelope overhead in tokens (role + JSON framing).
+/// Only a few tokens per message, but it adds up across long
+/// histories.
+const ENVELOPE_TOKENS: usize = 8;
+
+/// Rough per-image token cost used by the budget estimator. Real
+/// vision tokenizers vary widely (256–1024 tokens for typical
+/// resolutions on Qwen3-VL, OpenAI's `low`/`high` detail toggles
+/// pick between ~85 and ~1000+). 512 is a defensible middle that
+/// keeps compaction from treating images as free.
+const IMAGE_TOKENS_APPROX: usize = 512;
+
+/// Summary of one [`compact_to_budget`] run, returned alongside
+/// the projected messages so the caller can log what happened.
+///
+/// All token figures are estimates (see [`estimate_tokens`]), so
+/// don't compare them to upstream-reported token counts as if they
+/// were exact.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct CompactionStats {
+    /// Estimated tokens in the input messages, before any elision.
+    pub original_tokens: usize,
+    /// Estimated tokens after compaction. Equal to `original_tokens`
+    /// when no compaction was needed. May still exceed the requested
+    /// budget when nothing further could be elided.
+    pub final_tokens: usize,
+    /// Number of messages whose content was elided. Zero is the
+    /// hot path (nothing to do).
+    pub elided_messages: usize,
+}
+
+impl CompactionStats {
+    /// Stats for the no-op case: the history already fit, so the
+    /// before and after estimates are both `tokens` and nothing was
+    /// elided.
+    fn unchanged(tokens: usize) -> Self {
+        Self {
+            original_tokens: tokens,
+            final_tokens: tokens,
+            // `elided_messages` takes its derived default of zero.
+            ..Self::default()
+        }
+    }
+}
+
+/// Approximate token count for one message.
+///
+/// Sums the payload size, divides by [`CHARS_PER_TOKEN`]
+/// (truncating), and adds [`ENVELOPE_TOKENS`] for the per-message
+/// framing. Sizes are taken with `len()`, i.e. UTF-8 *bytes*, so
+/// non-ASCII text is counted somewhat higher than its character
+/// count would suggest. Cheap (no allocation) so safe to call once
+/// per message per round.
+///
+/// What counts as payload, per content variant:
+/// - `Text`: the text.
+/// - `MultiPart`: every text part, plus a flat
+///   [`IMAGE_TOKENS_APPROX`] tokens for each image part.
+/// - `ToolCalls`: the optional prose plus each call's name,
+///   arguments, and id.
+/// - `ToolResult`: the `tool_call_id` plus the result content.
+pub fn estimate_tokens(msg: &Message) -> usize {
+    // Character-equivalent of one image, chosen so that the division
+    // at the bottom yields exactly IMAGE_TOKENS_APPROX tokens. The
+    // multiplication must happen in f32: casting CHARS_PER_TOKEN to
+    // usize first truncates 3.5 to 3 and under-counts every image
+    // (≈438 tokens instead of 512).
+    let image_chars = (IMAGE_TOKENS_APPROX as f32 * CHARS_PER_TOKEN) as usize;
+
+    let chars = match &msg.content {
+        MessageContent::Text { text } => text.len(),
+        MessageContent::MultiPart { parts } => parts
+            .iter()
+            .map(|p| match p {
+                MessagePart::Text { text } => text.len(),
+                // The upstream tokenizer decides the real cost of an
+                // image (and it varies wildly by model); charge a
+                // middle estimate so the budget tracker doesn't
+                // pretend images are free.
+                MessagePart::Image(_) => image_chars,
+            })
+            .sum(),
+        MessageContent::ToolCalls { text, calls } => {
+            let txt = text.as_deref().map_or(0, str::len);
+            let calls_size: usize = calls
+                .iter()
+                .map(|c| c.name.len() + c.arguments.len() + c.id.len())
+                .sum();
+            txt + calls_size
+        }
+        MessageContent::ToolResult {
+            tool_call_id,
+            content,
+        } => tool_call_id.len() + content.len(),
+    };
+    ((chars as f32 / CHARS_PER_TOKEN) as usize) + ENVELOPE_TOKENS
+}
+
+/// Estimated token count for a whole history: the per-message
+/// [`estimate_tokens`] values added together.
+pub fn total_tokens(messages: &[Message]) -> usize {
+    let mut running = 0usize;
+    for message in messages {
+        running += estimate_tokens(message);
+    }
+    running
+}
+
+/// Project `messages` into a vec whose estimated token count fits in
+/// `budget` tokens. Returns the projection plus stats about what
+/// was done. When the input already fits, the projection is a clone
+/// of the input and stats report zero elisions.
+///
+/// The input slice is never modified; elision happens on the
+/// returned copy. Index 0, every `Role::User` message, and the last
+/// [`KEEP_TAIL`] messages are never touched. If the budget still
+/// cannot be met after every eligible message has been elided, the
+/// best-effort projection is returned and `final_tokens` stays above
+/// `budget`.
+///
+/// See module docs for the strategy and protected set.
+pub fn compact_to_budget(messages: &[Message], budget: usize) -> (Vec<Message>, CompactionStats) {
+    let original = total_tokens(messages);
+    if original <= budget {
+        return (messages.to_vec(), CompactionStats::unchanged(original));
+    }
+
+    let mut out = messages.to_vec();
+    let tail_start = out.len().saturating_sub(KEEP_TAIL);
+    let mut elided = 0usize;
+    // Running estimate for `out`. Adjusted by the delta of each
+    // elided message instead of re-summing the whole history after
+    // every elision; the total is a plain sum of per-message
+    // estimates, so the result is identical.
+    let mut current = original;
+
+    // Two passes. First pass: ToolResult contents (largest savings
+    // per elision — read_file payloads land here). Second pass: long
+    // Assistant prose. We don't interleave because eliding a long
+    // assistant turn before a really old read_file would do less
+    // good per elision; oldest-first ordering is enforced *within*
+    // each pass instead.
+    'passes: for pass in 0..2 {
+        // Skip index 0 and stop before the protected tail.
+        for msg in out.iter_mut().take(tail_start).skip(1) {
+            if matches!(msg.role, Role::User) {
+                continue;
+            }
+            let in_pass = match &msg.content {
+                MessageContent::ToolResult { .. } => pass == 0,
+                MessageContent::Text { .. } | MessageContent::ToolCalls { .. } => pass == 1,
+                MessageContent::MultiPart { .. } => false,
+            };
+            if !in_pass {
+                continue;
+            }
+            let before = estimate_tokens(msg);
+            if !elide_in_place(msg) {
+                continue;
+            }
+            elided += 1;
+            // `before` is part of `current`, so the subtraction
+            // cannot underflow.
+            current = current - before + estimate_tokens(msg);
+            if current <= budget {
+                // Stop as soon as we fit; don't compact more than
+                // necessary.
+                break 'passes;
+            }
+        }
+    }
+
+    (
+        out,
+        CompactionStats {
+            original_tokens: original,
+            final_tokens: current,
+            elided_messages: elided,
+        },
+    )
+}
+
+/// Replace one message's bulky payload with a short marker, leaving
+/// the message's structure alone (ids, tool names, and call
+/// arguments are not touched, so tool_call_id pairing survives).
+/// Returns `true` when the message was changed.
+///
+/// - `ToolResult.content` → `(elided: N bytes of tool result)`
+/// - `ToolCalls.text`     → `(elided: N bytes of assistant prose)`
+/// - `Text`               → `(elided: N bytes of assistant prose)`
+///
+/// Payloads shorter than [`ELIDE_MIN_CHARS`] bytes are left as they
+/// are — eliding a 50-byte string would *grow* it once the marker
+/// is in place. `ToolCalls` without prose and `MultiPart` messages
+/// are never changed.
+fn elide_in_place(msg: &mut Message) -> bool {
+    match &mut msg.content {
+        MessageContent::ToolResult { content, .. } if content.len() >= ELIDE_MIN_CHARS => {
+            let marker = format!("(elided: {} bytes of tool result)", content.len());
+            *content = marker;
+            true
+        }
+        MessageContent::ToolCalls {
+            text: Some(prose), ..
+        }
+        | MessageContent::Text { text: prose }
+            if prose.len() >= ELIDE_MIN_CHARS =>
+        {
+            let marker = format!("(elided: {} bytes of assistant prose)", prose.len());
+            *prose = marker;
+            true
+        }
+        // Too small to be worth eliding, no prose to elide, or a
+        // MultiPart payload. MultiPart messages today only exist as
+        // User turns, which `compact_to_budget` never passes in;
+        // leaving them alone keeps that path benign if a future
+        // stage starts emitting MultiPart on other roles.
+        MessageContent::ToolResult { .. }
+        | MessageContent::ToolCalls { .. }
+        | MessageContent::Text { .. }
+        | MessageContent::MultiPart { .. } => false,
+    }
+}
+
+/// Unit tests for the compaction pass: the no-op path, elision of
+/// old tool results, the protected set, tool_call_id pairing, and
+/// the best-effort return when the budget cannot be met.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::provider::ToolCall;
+
+    /// System message with plain text content.
+    fn sys(text: &str) -> Message {
+        Message {
+            role: Role::System,
+            content: MessageContent::Text { text: text.into() },
+        }
+    }
+    /// User message with plain text content.
+    fn user(text: &str) -> Message {
+        Message {
+            role: Role::User,
+            content: MessageContent::Text { text: text.into() },
+        }
+    }
+    /// Assistant message with plain text content (no tool calls).
+    fn assistant_text(text: &str) -> Message {
+        Message {
+            role: Role::Assistant,
+            content: MessageContent::Text { text: text.into() },
+        }
+    }
+    /// Assistant message carrying exactly one tool call, with
+    /// optional accompanying prose.
+    fn assistant_calls(text: Option<&str>, name: &str, args: &str, id: &str) -> Message {
+        Message {
+            role: Role::Assistant,
+            content: MessageContent::ToolCalls {
+                text: text.map(|s| s.to_string()),
+                calls: vec![ToolCall {
+                    id: id.into(),
+                    name: name.into(),
+                    arguments: args.into(),
+                }],
+            },
+        }
+    }
+    /// Tool-role message holding the result for call `id`.
+    fn tool_result(id: &str, body: &str) -> Message {
+        Message {
+            role: Role::Tool,
+            content: MessageContent::ToolResult {
+                tool_call_id: id.into(),
+                content: body.into(),
+            },
+        }
+    }
+
+    /// A history that already fits the budget comes back unchanged
+    /// with zero elisions.
+    #[test]
+    fn under_budget_is_a_no_op_clone() {
+        let msgs = vec![sys("you are an agent"), user("hi"), assistant_text("hello")];
+        let (out, stats) = compact_to_budget(&msgs, 10_000);
+        assert_eq!(stats.elided_messages, 0);
+        assert_eq!(stats.original_tokens, stats.final_tokens);
+        assert_eq!(out.len(), msgs.len());
+        // Spot-check that message text is unchanged.
+        match &out[2].content {
+            MessageContent::Text { text } => assert_eq!(text, "hello"),
+            other => panic!("expected Text, got {other:?}"),
+        }
+    }
+
+    /// The big, old tool result is elided when the budget forces
+    /// compaction.
+    #[test]
+    fn elides_old_tool_result_before_old_assistant_prose() {
+        // History: sys, user, assistant_calls, big_tool_result,
+        //          assistant_with_big_text, user, assistant_calls,
+        //          small_tool_result.
+        // KEEP_TAIL=4 protects the last four (indices 4..=7); the
+        // big tool result at index 3 sits in the prunable range.
+        // NOTE(review): the big assistant prose at index 4 is inside
+        // the protected tail, so this test confirms the tool result
+        // is elided but does not by itself exercise the pass-0 vs
+        // pass-1 ordering the name suggests.
+        let big_result = "X".repeat(4096);
+        let big_prose = "Y".repeat(2048);
+        let msgs = vec![
+            sys("preamble"),
+            user("first ask"),
+            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "c0"),
+            tool_result("c0", &big_result),
+            assistant_text(&big_prose),
+            user("follow up"),
+            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "c1"),
+            tool_result("c1", "short result body"),
+        ];
+        let before = total_tokens(&msgs);
+        // Force compaction by setting budget well below current.
+        let budget = before / 2;
+        let (out, stats) = compact_to_budget(&msgs, budget);
+
+        assert!(
+            stats.elided_messages >= 1,
+            "expected at least one elision, got {stats:?}"
+        );
+        // The big tool result must be elided (oldest fat target).
+        match &out[3].content {
+            MessageContent::ToolResult { content, .. } => {
+                assert!(
+                    content.starts_with("(elided:"),
+                    "tool result not elided: {content:?}"
+                );
+            }
+            other => panic!("expected ToolResult, got {other:?}"),
+        }
+        // Spot-check the protected tail: the final message must be
+        // untouched (only the last of the four is asserted here).
+        assert!(matches!(
+            &out[out.len() - 1].content,
+            MessageContent::ToolResult { content, .. } if content == "short result body"
+        ));
+    }
+
+    /// System and user messages survive verbatim even under an
+    /// impossible budget.
+    #[test]
+    fn never_elides_system_or_user_turns() {
+        let big_user = "U".repeat(8192);
+        let msgs = vec![sys("preamble"), user(&big_user), assistant_text("ok")];
+        // Way below the estimate, so the compaction path is taken.
+        // With only three messages everything also falls inside the
+        // KEEP_TAIL window, so no message is eligible at all.
+        let budget = 10;
+        let (out, _stats) = compact_to_budget(&msgs, budget);
+        // System unchanged.
+        match &out[0].content {
+            MessageContent::Text { text } => assert_eq!(text, "preamble"),
+            other => panic!("expected Text, got {other:?}"),
+        }
+        // User unchanged even though it's huge.
+        match &out[1].content {
+            MessageContent::Text { text } => assert_eq!(text.len(), big_user.len()),
+            other => panic!("expected Text, got {other:?}"),
+        }
+    }
+
+    /// Eliding a tool result must leave its `tool_call_id` matching
+    /// the assistant call that produced it.
+    #[test]
+    fn preserves_tool_call_id_pairing_after_elision() {
+        // OpenAI strict mode rejects a tool-result whose tool_call_id
+        // doesn't match a preceding assistant tool_call. Elision
+        // must not break that linkage.
+        let big = "Z".repeat(4096);
+        let msgs = vec![
+            sys("preamble"),
+            user("first"),
+            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "call_42"),
+            tool_result("call_42", &big),
+            // Tail messages.
+            user("next"),
+            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "call_43"),
+            tool_result("call_43", "ok"),
+            assistant_text("done"),
+        ];
+        let budget = total_tokens(&msgs) / 3;
+        let (out, _stats) = compact_to_budget(&msgs, budget);
+        // The assistant call and its result both carry call_42.
+        let call_id = match &out[2].content {
+            MessageContent::ToolCalls { calls, .. } => calls[0].id.clone(),
+            other => panic!("expected ToolCalls, got {other:?}"),
+        };
+        match &out[3].content {
+            MessageContent::ToolResult { tool_call_id, .. } => {
+                assert_eq!(tool_call_id, &call_id, "pairing broken");
+            }
+            other => panic!("expected ToolResult, got {other:?}"),
+        }
+    }
+
+    /// The estimate scales with payload size rather than being
+    /// dominated by the fixed envelope constant.
+    #[test]
+    fn estimate_tokens_grows_with_content() {
+        let small = sys("hi");
+        let large = sys(&"x".repeat(10_000));
+        assert!(estimate_tokens(&large) > estimate_tokens(&small) * 100);
+    }
+
+    /// Payloads under the size threshold are left alone and the
+    /// function reports no change.
+    #[test]
+    fn elide_in_place_skips_short_content() {
+        let mut m = tool_result("c0", "tiny");
+        assert!(!elide_in_place(&mut m));
+        match m.content {
+            MessageContent::ToolResult { content, .. } => assert_eq!(content, "tiny"),
+            other => panic!("expected ToolResult, got {other:?}"),
+        }
+    }
+
+    /// An unmeetable budget yields the best-effort projection
+    /// instead of an error or dropped messages.
+    #[test]
+    fn returns_best_effort_when_budget_unmeetable() {
+        // Single huge user message that cannot be elided. Budget 10.
+        // We don't error — we return what we have and let upstream
+        // refuse the prompt with its own error.
+        let big_user = "U".repeat(100_000);
+        let msgs = vec![sys("preamble"), user(&big_user)];
+        let (out, stats) = compact_to_budget(&msgs, 10);
+        assert_eq!(out.len(), msgs.len());
+        assert!(stats.final_tokens > 10, "still over budget by design");
+    }
+}
--- a/crates/helexa-acp/src/config.rs
+++ b/crates/helexa-acp/src/config.rs
@@ -0,0 +1,424 @@
+//! Configuration for the helexa-acp bridge.
+//!
+//! Loaded from `$HELEXA_ACP_CONFIG_PATH` when that is set, otherwise
+//! from `$XDG_CONFIG_HOME/helexa-acp/config.toml` (falling back to
+//! `~/.config/helexa-acp/config.toml`). If no config file exists, a
+//! single endpoint named `default` is built from env vars — that keeps
+//! "just point at one cortex" frictionless without a file on disk.
+//!
+//! The design goal is "the missing ACP binary for users with multiple
+//! API endpoints (possibly on a private LAN, possibly mixing wire
+//! types)". Hence: every endpoint is named, has its own wire API, and
+//! has its own default model. The agent's selected model id can be
+//! prefixed `endpoint:model` to route across endpoints; a bare
+//! `model` falls through to the configured `default_endpoint`.
+//!
+//! ### Example TOML
+//!
+//! ```toml
+//! default_endpoint = "helexa"
+//!
+//! [[endpoints]]
+//! name = "helexa"
+//! base_url = "http://hanzalova.internal:31313/v1"
+//! wire_api = "openai-chat"
+//! default_model = "helexa/large"
+//!
+//! [[endpoints]]
+//! name = "openrouter"
+//! base_url = "https://openrouter.ai/api/v1"
+//! wire_api = "openai-chat"
+//! api_key_env = "OPENROUTER_API_KEY"
+//! default_model = "anthropic/claude-opus-4"
+//!
+//! [[endpoints]]
+//! name = "lmstudio"
+//! base_url = "http://localhost:1234/v1"
+//! wire_api = "openai-chat"
+//! default_model = "auto"
+//! ```
+
+use anyhow::{Context, anyhow};
+use serde::{Deserialize, Serialize};
+use std::path::{Path, PathBuf};
+use url::Url;
+
+const DEFAULT_BASE_URL: &str = "http://hanzalova.internal:31313/v1";
+const DEFAULT_MODEL: &str = "helexa/large";
+const DEFAULT_ENDPOINT_NAME: &str = "default";
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Config {
+    /// Name of the endpoint used when a request doesn't pick one
+    /// explicitly. Must reference an entry in `endpoints`; `validate`
+    /// fills in the first declared endpoint when this is unset.
+    #[serde(default)]
+    pub default_endpoint: Option<String>,
+    /// Per-endpoint configuration. `validate` rejects an empty list.
+    #[serde(default)]
+    pub endpoints: Vec<EndpointConfig>,
+    /// Optional path to a system-prompt file. When unset, the built-in
+    /// default prompt from `prompt.rs` is used.
+    #[serde(default)]
+    pub system_prompt_path: Option<PathBuf>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EndpointConfig {
+    /// Short identifier used in `endpoint:model` routing and in logs.
+    pub name: String,
+    /// Base URL of the OpenAI-compatible API. Must include the `/v1`
+    /// (or equivalent) suffix — paths like `chat/completions` and
+    /// `models` are joined onto this.
+    pub base_url: Url,
+    /// Wire protocol the endpoint speaks; defaults to
+    /// [`WireApi::OpenAiChat`] when omitted. `build_provider` in
+    /// `main.rs` constructs a provider for every [`WireApi`] variant.
+    #[serde(default)]
+    pub wire_api: WireApi,
+    /// Model to use when the client hasn't picked one via
+    /// `session/set_model`.
+    #[serde(default)]
+    pub default_model: Option<String>,
+    /// Static API key to send as `Authorization: Bearer …`; takes
+    /// precedence over `api_key_env` when both are set. Prefer
+    /// `api_key_env` for anything sensitive — plain-TOML keys leak.
+    #[serde(default)]
+    pub api_key: Option<String>,
+    /// Env var name to read for the API key. Resolved at startup so a
+    /// missing env var yields a clear error rather than silent
+    /// unauthenticated calls.
+    #[serde(default)]
+    pub api_key_env: Option<String>,
+    /// Cap on the model's output tokens per turn. `None` lets the
+    /// upstream pick its own default (cortex/neuron's default is
+    /// often small enough to trip Zed's "Output Limit Reached" on
+    /// long responses). Set to e.g. `32768` to let the model
+    /// produce longer turns. Goes into the OpenAI `max_tokens`
+    /// request field.
+    #[serde(default)]
+    pub max_tokens: Option<u64>,
+    /// Model context window in tokens (prompt + response). When set,
+    /// the agent compacts conversation history before each completion
+    /// so the prompt fits within `context_window - max_tokens - safety`
+    /// tokens — long sessions on small-context local models (Qwen3 at
+    /// 32 K) survive past the first few tool-call rounds rather than
+    /// dying with `prompt_too_long`. `None` disables compaction.
+    #[serde(default)]
+    pub context_window: Option<usize>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
+pub enum WireApi {
+    /// `POST {base}/chat/completions` returning OpenAI-format SSE.
+    /// Compatible with cortex, LM Studio, Ollama (compat mode),
+    /// OpenRouter, OpenAI itself.
+    #[default]
+    #[serde(rename = "openai-chat")]
+    OpenAiChat,
+    /// `POST {base}/responses` — OpenAI's newer Responses API. Mapped
+    /// to `OpenAIResponsesProvider` by `build_provider` in `main.rs`.
+    /// NOTE(review): confirm how complete that provider is.
+    #[serde(rename = "openai-responses")]
+    OpenAiResponses,
+    /// `POST {base}/messages` — Anthropic format (`AnthropicMessagesProvider`).
+    #[serde(rename = "anthropic-messages")]
+    AnthropicMessages,
+}
+
+impl EndpointConfig {
+    /// Resolve the API key for this endpoint. A literal `api_key`
+    /// wins; otherwise `api_key_env` is looked up in the environment.
+    /// `Ok(None)` when neither is set; `Err` when the lookup fails.
+    pub fn resolve_api_key(&self) -> anyhow::Result<Option<String>> {
+        match (&self.api_key, &self.api_key_env) {
+            (Some(literal), _) => Ok(Some(literal.clone())),
+            (None, Some(var)) => {
+                // A failed lookup becomes an error naming endpoint and variable.
+                let value = std::env::var(var).with_context(|| {
+                    let name = &self.name;
+                    format!("endpoint '{name}' references missing env var {var}")
+                })?;
+                Ok(Some(value))
+            }
+            (None, None) => Ok(None),
+        }
+    }
+
+    /// URL of the chat-completions route: `{base_url}/chat/completions`.
+    pub fn chat_completions_url(&self) -> Url {
+        join_segments(&self.base_url, &["chat", "completions"])
+    }
+
+    /// URL of the OpenAI Responses API route: `{base_url}/responses`.
+    pub fn responses_url(&self) -> Url {
+        join_segments(&self.base_url, &["responses"])
+    }
+
+    /// `{base_url}/models`. Called from `Provider::list_models`, which
+    /// Stage 4 wires into the model-picker dropdown; until then the
+    /// dead-code lint is silenced because nothing in-tree calls it.
+    #[allow(dead_code)]
+    pub fn models_url(&self) -> Url {
+        join_segments(&self.base_url, &["models"])
+    }
+}
+
+impl Config {
+    /// Load the configuration.
+    ///
+    /// Reads the TOML file at the path chosen by `config_path` when
+    /// that file exists; otherwise builds the config from env vars,
+    /// which yields a single endpoint named `"default"`.
+    pub fn load() -> anyhow::Result<Self> {
+        match config_path() {
+            // A missing file falls through to the env-var fallback.
+            Some(path) if path.exists() => Self::from_file(&path),
+            _ => Self::from_env(),
+        }
+    }
+
+    /// Single-endpoint config constructed from `HELEXA_ACP_BASE_URL`,
+    /// `HELEXA_ACP_MODEL`, `HELEXA_ACP_API_KEY`,
+    /// `HELEXA_ACP_SYSTEM_PROMPT_PATH`, `HELEXA_ACP_MAX_TOKENS` and
+    /// `HELEXA_ACP_CONTEXT_WINDOW`. URL and model fall back to the
+    /// built-in defaults when unset; the rest are optional, and an
+    /// empty value counts as unset.
+    pub fn from_env() -> anyhow::Result<Self> {
+        /// Read `name`, treating an unset or empty variable as absent.
+        fn non_empty(name: &str) -> Option<String> {
+            std::env::var(name).ok().filter(|v| !v.is_empty())
+        }
+        /// Parse the optional numeric variable `name`.
+        fn numeric<T>(name: &str) -> anyhow::Result<Option<T>>
+        where
+            T: std::str::FromStr,
+            T::Err: std::error::Error + Send + Sync + 'static,
+        {
+            non_empty(name)
+                .map(|s| {
+                    s.parse::<T>()
+                        .with_context(|| format!("{name} is not a positive integer ({s})"))
+                })
+                .transpose()
+        }
+
+        // URL and model use whatever is set (even an empty string)
+        // and only fall back when the variable is absent.
+        let raw_url =
+            std::env::var("HELEXA_ACP_BASE_URL").unwrap_or_else(|_| DEFAULT_BASE_URL.into());
+        let base_url = Url::parse(&raw_url)
+            .with_context(|| format!("HELEXA_ACP_BASE_URL is not a valid URL ({raw_url})"))?;
+        let default_model =
+            std::env::var("HELEXA_ACP_MODEL").unwrap_or_else(|_| DEFAULT_MODEL.into());
+        let max_tokens = numeric::<u64>("HELEXA_ACP_MAX_TOKENS")?;
+        let context_window = numeric::<usize>("HELEXA_ACP_CONTEXT_WINDOW")?;
+        // The env fallback always speaks the OpenAI chat wire format.
+        let endpoint = EndpointConfig {
+            name: DEFAULT_ENDPOINT_NAME.into(),
+            base_url,
+            wire_api: WireApi::OpenAiChat,
+            default_model: Some(default_model),
+            api_key: non_empty("HELEXA_ACP_API_KEY"),
+            api_key_env: None,
+            max_tokens,
+            context_window,
+        };
+        Ok(Self {
+            default_endpoint: Some(DEFAULT_ENDPOINT_NAME.into()),
+            endpoints: vec![endpoint],
+            system_prompt_path: non_empty("HELEXA_ACP_SYSTEM_PROMPT_PATH").map(PathBuf::from),
+        })
+    }
+
+    /// Read, parse and validate the TOML config at `path`.
+    pub fn from_file(path: &Path) -> anyhow::Result<Self> {
+        let shown = path.display();
+        let raw = std::fs::read_to_string(path).with_context(|| format!("read config {shown}"))?;
+        let mut cfg: Self = toml::from_str(&raw).with_context(|| format!("parse config {shown}"))?;
+        cfg.validate()?;
+        Ok(cfg)
+    }
+
+    /// Enforce the invariants lookups rely on and fill in the default.
+    fn validate(&mut self) -> anyhow::Result<()> {
+        let Some(first) = self.endpoints.first() else {
+            return Err(anyhow!("config has no [[endpoints]] entries"));
+        };
+        for (i, ep) in self.endpoints.iter().enumerate() {
+            if ep.name.is_empty() {
+                return Err(anyhow!("endpoints[{i}] has empty name"));
+            }
+            if ep.name.contains(':') {
+                return Err(anyhow!(
+                    "endpoints[{i}].name '{}' contains ':' which would clash \
+                     with the endpoint:model selector syntax",
+                    ep.name
+                ));
+            }
+            // `endpoint()` returns the first match: a repeat is unreachable.
+            if self.endpoints[..i].iter().any(|prev| prev.name == ep.name) {
+                return Err(anyhow!("endpoints[{i}].name '{}' is declared twice", ep.name));
+            }
+        }
+        let default_name = self.default_endpoint.get_or_insert_with(|| first.name.clone());
+        anyhow::ensure!(
+            self.endpoints.iter().any(|e| e.name == *default_name),
+            "default_endpoint '{default_name}' is not declared in [[endpoints]]"
+        );
+        Ok(())
+    }
+
+    /// Find the endpoint called `name`; `None` when none is configured.
+    pub fn endpoint(&self, name: &str) -> Option<&EndpointConfig> {
+        self.endpoints.iter().find(|candidate| candidate.name == name)
+    }
+
+    /// The default endpoint. Panics when `default_endpoint` is unset
+    /// or names no configured endpoint; `validate` rules out both, and
+    /// `from_env` names its only endpoint.
+    pub fn default_endpoint(&self) -> &EndpointConfig {
+        let name = self.default_endpoint.as_deref();
+        let name = name.expect("default_endpoint set by validate");
+        self.endpoint(name)
+            .expect("default_endpoint resolves after validate")
+    }
+}
+
+/// Parse an ACP-side `model` field into (endpoint name, raw model id).
+///
+/// `helexa:helexa/large` → (`Some("helexa")`, `"helexa/large"`).
+/// `helexa/large` → (`None`, `"helexa/large"`).
+///
+/// The split happens at the FIRST colon, and only when both sides are
+/// non-empty; otherwise the whole input is returned as the model id.
+/// A model id that itself contains `:` (e.g. `llama3:8b`) must
+/// therefore be written with an explicit endpoint prefix, as in
+/// `lmstudio:llama3:8b` → (`Some("lmstudio")`, `"llama3:8b"`).
+pub fn parse_model_selector(input: &str) -> (Option<&str>, &str) {
+    input
+        .split_once(':')
+        .filter(|(endpoint, model)| !endpoint.is_empty() && !model.is_empty())
+        .map_or((None, input), |(endpoint, model)| (Some(endpoint), model))
+}
+
+/// Locate the config file: `HELEXA_ACP_CONFIG_PATH` verbatim when set,
+/// else `helexa-acp/config.toml` under a non-empty `XDG_CONFIG_HOME`,
+/// else under `$HOME/.config`. `None` when neither base is available.
+fn config_path() -> Option<PathBuf> {
+    if let Ok(explicit) = std::env::var("HELEXA_ACP_CONFIG_PATH") {
+        return Some(explicit.into());
+    }
+    let config_dir = match std::env::var("XDG_CONFIG_HOME") {
+        Ok(dir) if !dir.is_empty() => PathBuf::from(dir),
+        _ => PathBuf::from(std::env::var("HOME").ok()?).join(".config"),
+    };
+    let file = config_dir.join("helexa-acp").join("config.toml");
+    Some(file)
+}
+
+fn join_segments(base: &Url, segments: &[&str]) -> Url {
+    let mut out = base.clone();
+    if let Ok(mut path) = out.path_segments_mut() {
+        path.pop_if_empty().extend(segments.iter().copied());
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn url_join_handles_trailing_slash() {
+        // A base written with or without a trailing slash must join
+        // to the same URLs (no doubled `//` before the suffix).
+        for base in ["http://h.internal:31313/v1", "http://h.internal:31313/v1/"] {
+            let ep = EndpointConfig {
+                name: "x".into(),
+                base_url: Url::parse(base).unwrap(),
+                wire_api: WireApi::OpenAiChat,
+                default_model: None,
+                api_key: None,
+                api_key_env: None,
+                max_tokens: None,
+                context_window: None,
+            };
+            let chat = ep.chat_completions_url();
+            assert_eq!(chat.as_str(), "http://h.internal:31313/v1/chat/completions");
+            let models = ep.models_url();
+            assert_eq!(models.as_str(), "http://h.internal:31313/v1/models");
+        }
+    }
+
+    #[test]
+    fn parses_model_selector() {
+        assert_eq!(
+            parse_model_selector("helexa:helexa/large"),
+            (Some("helexa"), "helexa/large")
+        );
+        assert_eq!(parse_model_selector("helexa/large"), (None, "helexa/large"));
+        assert_eq!(parse_model_selector("gpt-5"), (None, "gpt-5"));
+        // Edge case: a leading colon → no endpoint.
+        assert_eq!(parse_model_selector(":gpt-5"), (None, ":gpt-5"));
+    }
+
+    #[test]
+    fn env_fallback_builds_single_endpoint() {
+        // Clear the variables `from_env` parses so the built-in
+        // defaults apply; a stray HELEXA_ACP_MAX_TOKENS=abc in the
+        // developer's shell would otherwise fail the `unwrap` below.
+        for suffix in ["BASE_URL", "MODEL", "API_KEY", "MAX_TOKENS", "CONTEXT_WINDOW"] {
+            // SAFETY: NOTE(review) — assumes no other thread touches
+            // these variables while this test runs; confirm.
+            unsafe { std::env::remove_var(format!("HELEXA_ACP_{suffix}")) };
+        }
+        let cfg = Config::from_env().unwrap();
+        assert_eq!(cfg.endpoints.len(), 1);
+        let endpoint = &cfg.endpoints[0];
+        assert_eq!(endpoint.name, "default");
+        assert_eq!(endpoint.base_url.as_str(), DEFAULT_BASE_URL);
+        assert_eq!(endpoint.default_model.as_deref(), Some(DEFAULT_MODEL));
+        assert_eq!(endpoint.max_tokens, None);
+    }
+
+    #[test]
+    fn toml_parses_multi_endpoint() {
+        let toml_text = r#"
+            default_endpoint = "helexa"
+
+            [[endpoints]]
+            name = "helexa"
+            base_url = "http://hanzalova.internal:31313/v1"
+            default_model = "helexa/large"
+
+            [[endpoints]]
+            name = "openrouter"
+            base_url = "https://openrouter.ai/api/v1"
+            wire_api = "openai-chat"
+            api_key_env = "OPENROUTER_API_KEY"
+            default_model = "anthropic/claude-opus-4"
+        "#;
+        let mut cfg: Config = toml::from_str(toml_text).unwrap();
+        cfg.validate().unwrap();
+        assert_eq!(cfg.endpoints.len(), 2);
+        assert_eq!(cfg.default_endpoint().name, "helexa");
+        assert_eq!(cfg.endpoints[0].wire_api, WireApi::OpenAiChat);
+        assert_eq!(
+            cfg.endpoints[1].api_key_env.as_deref(),
+            Some("OPENROUTER_API_KEY")
+        );
+    }
+
+    #[test]
+    fn validate_rejects_colon_in_endpoint_name() {
+        let toml_text = r#"
+            [[endpoints]]
+            name = "bad:name"
+            base_url = "http://x/v1"
+        "#;
+        let mut cfg: Config = toml::from_str(toml_text).unwrap();
+        let err = cfg.validate().unwrap_err();
+        assert!(format!("{err}").contains("clash"));
+    }
+}
--- a/crates/helexa-acp/src/main.rs
+++ b/crates/helexa-acp/src/main.rs
@@ -0,0 +1,145 @@
+//! helexa-acp — Agent Client Protocol bridge for multi-endpoint LLM
+//! setups (helexa, LM Studio, Ollama, OpenRouter, OpenAI, Anthropic,
+//! …) with a clean per-endpoint wire-format selector.
+//!
+//! Speaks ACP over stdio to an editor client (Zed today). Every
+//! configured endpoint produces a wire-format-specific
+//! [`provider::Provider`] implementation; the agent loop in
+//! [`agent::Agent`] is provider-agnostic, so adding e.g. an Anthropic
+//! /v1/messages provider doesn't touch `agent.rs`.
+//!
+//! Config: `$XDG_CONFIG_HOME/helexa-acp/config.toml` for the multi-
+//! endpoint case; env vars (`HELEXA_ACP_BASE_URL`, etc.) for the
+//! single-endpoint case when no config file exists.
+
+use agent_client_protocol::{Result, Stdio};
+use std::sync::Arc;
+
+mod agent;
+mod compaction;
+mod config;
+mod path_util;
+mod prompt;
+mod provider;
+mod qwen3;
+mod session;
+mod store;
+mod tool_runner;
+mod tools;
+
+use agent::Agent;
+use config::{Config, EndpointConfig, WireApi};
+use provider::{
+    Provider, anthropic_messages::AnthropicMessagesProvider, openai_chat::OpenAIChatProvider,
+    openai_responses::OpenAIResponsesProvider,
+};
+
+/// Install the global tracing subscriber.
+///
+/// Logs go to stderr by default — stdout is reserved for the
+/// JSON-RPC stream. When `HELEXA_ACP_LOG_FILE` is set and non-empty,
+/// logs are appended to that file instead (created if missing), which
+/// is the practical way to capture debug output when the agent runs
+/// under an editor (Zed, etc.) that doesn't surface stderr. File
+/// output has ANSI colours disabled. `RUST_LOG` still controls levels
+/// (e.g. `helexa_acp=debug`); without it the filter is `info`.
+fn init_tracing() {
+    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
+        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
+
+    // Try to open the requested log file up-front, keeping the path
+    // around so a failure can be reported once logging is live.
+    let requested = std::env::var("HELEXA_ACP_LOG_FILE")
+        .ok()
+        .filter(|s| !s.is_empty())
+        .map(|path| {
+            let opened = std::fs::OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(&path);
+            (path, opened)
+        });
+
+    match requested {
+        Some((_, Ok(file))) => {
+            tracing_subscriber::fmt()
+                .with_writer(std::sync::Mutex::new(file))
+                .with_env_filter(env_filter)
+                .with_ansi(false)
+                .init();
+        }
+        // No file requested, or it could not be opened: log to stderr.
+        fallback => {
+            tracing_subscriber::fmt()
+                .with_writer(std::io::stderr)
+                .with_env_filter(env_filter)
+                .init();
+            // Shout about an unopenable path — a typo'd log path must
+            // not silence the agent entirely.
+            if let Some((path, Err(e))) = fallback {
+                tracing::warn!(
+                    path = %path,
+                    error = %e,
+                    "HELEXA_ACP_LOG_FILE could not be opened; using stderr"
+                );
+            }
+        }
+    }
+}
+
+/// Build the provider matching `endpoint`'s declared `wire_api`.
+/// The match is exhaustive, so a new [`WireApi`] variant must be
+/// given a provider here before the crate compiles again.
+fn build_provider(endpoint: EndpointConfig) -> anyhow::Result<Arc<dyn Provider>> {
+    let provider: Arc<dyn Provider> = match endpoint.wire_api {
+        WireApi::OpenAiChat => Arc::new(OpenAIChatProvider::new(endpoint)?),
+        WireApi::OpenAiResponses => Arc::new(OpenAIResponsesProvider::new(endpoint)?),
+        WireApi::AnthropicMessages => Arc::new(AnthropicMessagesProvider::new(endpoint)?),
+    };
+    Ok(provider)
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    init_tracing();
+
+    let cfg = Config::load()
+        .map_err(|e| agent_client_protocol::util::internal_error(format!("config: {e:#}")))?;
+    tracing::info!(
+        endpoints = cfg.endpoints.len(),
+        default_endpoint = %cfg.default_endpoint().name,
+        default_model = ?cfg.default_endpoint().default_model,
+        "helexa-acp starting"
+    );
+
+    // Build a provider for each configured endpoint up-front, so
+    // config mistakes (e.g. a missing API-key env var) show up in the
+    // log before the editor sends an initialize request. A failing
+    // endpoint is logged and skipped rather than aborting startup.
+    let mut providers: Vec<Arc<dyn Provider>> = Vec::with_capacity(cfg.endpoints.len());
+    for endpoint in &cfg.endpoints {
+        match build_provider(endpoint.clone()) {
+            Ok(p) => {
+                tracing::info!(
+                    endpoint = %endpoint.name,
+                    base_url = %endpoint.base_url,
+                    wire_api = ?endpoint.wire_api,
+                    "registered provider"
+                );
+                providers.push(p);
+            }
+            Err(e) => {
+                tracing::warn!(
+                    endpoint = %endpoint.name,
+                    error = %format!("{e:#}"),
+                    "skipping endpoint with invalid config"
+                );
+            }
+        }
+    }
+
+    let agent = Agent::new(&cfg, providers)
+        .await
+        .map_err(|e| agent_client_protocol::util::internal_error(format!("agent: {e:#}")))?;
+    agent.serve(Stdio::new()).await
+}
--- a/crates/helexa-acp/src/path_util.rs
+++ b/crates/helexa-acp/src/path_util.rs
@@ -0,0 +1,192 @@
+//! Path expansion shared across every tool that takes a path.
+//!
+//! Models often emit shell-style paths like `~/git/repo/file.rs` or
+//! `$HOME/notes.md`. ACP's `fs/read_text_file` and friends — and our
+//! own local `std::fs` reads — both want a real absolute path; the
+//! `~` / `$HOME` forms reach them as literal strings and the open
+//! fails. The tool schemas already document "absolute path" but in
+//! practice the model slips up often enough that handling it
+//! server-side is the difference between "works" and "the agent is
+//! brittle".
+//!
+//! Scope is deliberately small:
+//!
+//! - `~` and `~/` (current user only — `~user` lookups would require
+//!   pulling in passwd parsing).
+//! - `$HOME` and `$HOME/`.
+//!
+//! Any other shell variable (`$PWD`, `${HOME}`, …) passes through
+//! unchanged. The shell already expands them inside `bash` tool
+//! commands; for the file-tool argument fields, we deliberately
+//! limit the set so the behaviour is predictable.
+//!
+//! Falls back to the input path verbatim when `HOME` is unset
+//! (stripped-down container env). That preserves the "no surprise
+//! mutations" rule — never invent a path the caller didn't ask for.
+
+use std::path::{Path, PathBuf};
+
+/// Process-global lock for tests that mutate `HOME`. Anyone in the
+/// crate touching `HOME` must hold this for the duration of the
+/// read-modify-restore window — otherwise concurrent `cargo test`
+/// workers race and flake.
+///
+/// Only built into the test binaries. Production code never mutates
+/// env vars.
+#[cfg(test)]
+pub(crate) static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+/// Expand `~`, `~/`, `$HOME`, and `$HOME/` prefixes against the
+/// current user's home directory. All other inputs pass through
+/// unchanged.
+///
+/// Returns the input verbatim if `HOME` is unset or empty — joining
+/// onto an empty home would silently yield a relative path.
+pub fn expand_path(input: &Path) -> PathBuf {
+    let Some(s) = input.to_str() else {
+        return input.to_path_buf();
+    };
+    // `var_os` so a non-UTF-8 home directory is still honoured.
+    let home = match std::env::var_os("HOME") {
+        Some(h) if !h.is_empty() => PathBuf::from(h),
+        _ => return input.to_path_buf(),
+    };
+    if s == "~" || s == "$HOME" {
+        return home;
+    }
+    // Only these two prefixes expand; `~user`, `${HOME}` pass through.
+    match s.strip_prefix("~/").or_else(|| s.strip_prefix("$HOME/")) {
+        Some(rest) => home.join(rest),
+        None => input.to_path_buf(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Run `body` with `HOME` set to `home`, restoring the prior value
+    /// even if `body` panics. Holds the crate-wide [`ENV_LOCK`] because
+    /// env mutation is process-global and test workers run in parallel.
+    fn with_home<F: FnOnce()>(home: &str, body: F) {
+        // The lock guards `()`, so a poisoned guard is safe to recover.
+        let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        let prior = std::env::var_os("HOME");
+        // SAFETY: ENV_LOCK serialises every HOME mutation in this crate.
+        unsafe { std::env::set_var("HOME", home) };
+        let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(body));
+        // SAFETY: still holding ENV_LOCK.
+        unsafe {
+            match prior {
+                Some(p) => std::env::set_var("HOME", p),
+                None => std::env::remove_var("HOME"),
+            }
+        }
+        if let Err(panic) = outcome {
+            std::panic::resume_unwind(panic);
+        }
+    }
+
+    #[test]
+    fn expands_tilde_slash() {
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("~/git/repo/file.rs")),
+                PathBuf::from("/home/me/git/repo/file.rs")
+            );
+        });
+    }
+
+    #[test]
+    fn expands_bare_tilde() {
+        with_home("/home/me", || {
+            assert_eq!(expand_path(Path::new("~")), PathBuf::from("/home/me"));
+        });
+    }
+
+    #[test]
+    fn expands_dollar_home_slash() {
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("$HOME/notes.md")),
+                PathBuf::from("/home/me/notes.md")
+            );
+        });
+    }
+
+    #[test]
+    fn expands_bare_dollar_home() {
+        with_home("/home/me", || {
+            assert_eq!(expand_path(Path::new("$HOME")), PathBuf::from("/home/me"));
+        });
+    }
+
+    #[test]
+    fn absolute_path_passes_through() {
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("/etc/hostname")),
+                PathBuf::from("/etc/hostname")
+            );
+        });
+    }
+
+    #[test]
+    fn relative_path_passes_through() {
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("src/main.rs")),
+                PathBuf::from("src/main.rs")
+            );
+        });
+    }
+
+    #[test]
+    fn tilde_user_form_not_expanded() {
+        // ~other is shell sugar for /home/other and would require
+        // passwd parsing to resolve. Out of scope — pass it
+        // through and let the open fail with a clear error.
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("~other/x")),
+                PathBuf::from("~other/x")
+            );
+        });
+    }
+
+    #[test]
+    fn no_home_env_passes_through() {
+        // Share the same crate-wide lock as `with_home` — otherwise
+        // a parallel test setting HOME races this clear-and-assert
+        // window. The lock guards `()`, so a guard poisoned by some
+        // other failing test is safe to recover.
+        let _g = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        let prior = std::env::var_os("HOME");
+        // SAFETY: serialised by ENV_LOCK above.
+        unsafe {
+            std::env::remove_var("HOME");
+        }
+        let expanded = expand_path(Path::new("~/git/repo"));
+        // Restore HOME before asserting so a failure cannot leak the
+        // cleared variable into later tests.
+        if let Some(p) = prior {
+            // SAFETY: still holding ENV_LOCK.
+            unsafe { std::env::set_var("HOME", p) };
+        }
+        assert_eq!(expanded, PathBuf::from("~/git/repo"));
+    }
+
+    #[test]
+    fn dollar_other_var_not_expanded() {
+        with_home("/home/me", || {
+            assert_eq!(
+                expand_path(Path::new("$PWD/file")),
+                PathBuf::from("$PWD/file")
+            );
+            assert_eq!(
+                expand_path(Path::new("${HOME}/file")),
+                PathBuf::from("${HOME}/file")
+            );
+        });
+    }
+}
--- a/crates/helexa-acp/src/prompt.rs
+++ b/crates/helexa-acp/src/prompt.rs
@@ -0,0 +1,274 @@
+//! System prompt assembly.
+//!
+//! The system message has two parts:
+//!
+//! 1. A short human-readable preamble (working directory, style
+//!    instructions). Either the built-in [`DEFAULT_PROMPT`] or a
+//!    user-supplied file at `HELEXA_ACP_SYSTEM_PROMPT_PATH` /
+//!    `system_prompt_path`. `{cwd}` is substituted in both.
+//! 2. A `# Tools` block in Qwen3 Hermes format (see [`crate::qwen3`])
+//!    describing the available functions. This is what makes the
+//!    model actually call them — neuron/cortex don't honour the
+//!    OpenAI `tools` API field, so the tool list has to live in the
+//!    prompt itself.
+
+use agent_client_protocol::schema::SessionModeId;
+use anyhow::Context;
+use std::path::Path;
+
+use crate::provider::ToolSpec;
+use crate::qwen3;
+use crate::session::MODE_PLAN;
+
+const DEFAULT_PROMPT: &str = "\
+You are helexa-acp, a coding assistant working inside an editor.
+
+Working directory: {cwd}
+
+Use the tools described below whenever the user's request involves
+looking at or modifying files, or running commands. Do not ask the
+user to paste file contents you could read yourself. All file paths
+must be absolute. Writes and shell commands may prompt the user for
+permission depending on the session mode.
+
+Be concise; the user is reading your output in an editor pane.";
+
+/// Build the system prompt for a session.
+///
+/// - `cwd`: session working directory (substituted for `{cwd}` in
+///   the preamble — both the default and any user-supplied template).
+/// - `override_path`: path to a user-supplied template, already
+///   resolved by [`crate::config::Config`]. The `# Tools` block is
+///   appended *after* the user's template so a custom preamble
+///   still gets the tool descriptions the model needs.
+/// - `tools`: the tools to advertise. Empty list → no `# Tools`
+///   block is appended at all.
+/// - `mode`: current session mode. When the mode is [`MODE_PLAN`]
+///   a plan-mode addendum describing the restrictions and the
+///   completion menu is appended *after* the `# Tools` block so it
+///   is the last thing the model reads before user input.
+/// - `plan_dir`: resolved plan directory for the cwd. Only consulted
+///   when `mode == MODE_PLAN`. `None` means the plan directory could
+///   not be resolved (no `HOME` / `XDG_DATA_HOME`) — the addendum
+///   still renders but with a placeholder so the model knows to
+///   surface the error to the user rather than guess a path.
+pub fn build_system_prompt(
+    cwd: &Path,
+    override_path: Option<&Path>,
+    tools: &[ToolSpec],
+    mode: &SessionModeId,
+    plan_dir: Option<&Path>,
+) -> anyhow::Result<String> {
+    let template = match override_path {
+        Some(path) => std::fs::read_to_string(path)
+            .with_context(|| format!("read system prompt from {}", path.display()))?,
+        None => DEFAULT_PROMPT.to_string(),
+    };
+    let mut prompt = template.replace("{cwd}", &cwd.display().to_string());
+    prompt.push_str(&qwen3::render_tool_block(tools));
+    if mode.0.as_ref() == MODE_PLAN {
+        prompt.push_str(&render_plan_mode_block(plan_dir));
+    }
+    Ok(prompt)
+}
+
+/// Plan-mode instruction block. Tells the model:
+///
+/// 1. Where it may write — only inside `plan_dir`.
+/// 2. What it may *not* do — bash is disabled; writes outside
+///    `plan_dir` are refused by the runtime.
+/// 3. How to finish — emit the 3-option menu so the user can
+///    switch modes and either kick off implementation (with or
+///    without permission prompts) or keep iterating on the plan.
+fn render_plan_mode_block(plan_dir: Option<&Path>) -> String {
+    let plan_path = plan_dir
+        .map(|p| p.display().to_string())
+        .unwrap_or_else(|| "<plan directory could not be resolved — tell the user>".to_string());
+    format!(
+        "\n\n# Plan mode\n\
+         \n\
+         You are in **plan mode**. Your task is to draft a written\n\
+         implementation plan for the user; you must NOT modify any\n\
+         project files or run shell commands.\n\
+         \n\
+         Rules in plan mode:\n\
+         \n\
+         - `read_file` and `list_dir` are unrestricted — use them to\n\
+           explore the codebase as needed.\n\
+         - `write_file` and `edit_file` are allowed ONLY under the\n\
+           plan directory: `{plan_path}`. The runtime will refuse any\n\
+           write outside it.\n\
+         - `bash` is disabled. Do not call it.\n\
+         \n\
+         Write the plan as one or more Markdown files under\n\
+         `{plan_path}`. Use descriptive filenames\n\
+         (`01-overview.md`, `02-data-model.md`, etc.). It is fine to\n\
+         iterate — overwrite the file when you refine a section.\n\
+         \n\
+         When the plan is complete, do NOT begin implementation.\n\
+         Instead, end your turn with this menu, verbatim, so the\n\
+         user can choose how to proceed:\n\
+         \n\
+         ---\n\
+         **Plan complete.** To proceed, switch the session mode in\n\
+         the agent dropdown and send a follow-up message:\n\
+         \n\
+         1. **Bypass Permissions** — implement the plan now, skipping\n\
+            per-tool permission prompts.\n\
+         2. **Default** — implement the plan now, prompting before\n\
+            each write or shell command.\n\
+         3. **Plan** (stay here) — refine the plan; reply with the\n\
+            change you want and I will revise it.\n\
+         ---\n"
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::session::{MODE_DEFAULT, MODE_PLAN};
+    use std::io::Write;
+
+    fn default_mode() -> SessionModeId {
+        SessionModeId::new(MODE_DEFAULT)
+    }
+    fn plan_mode() -> SessionModeId {
+        SessionModeId::new(MODE_PLAN)
+    }
+
+    #[test]
+    fn default_prompt_substitutes_cwd() {
+        let prompt =
+            build_system_prompt(Path::new("/home/me/proj"), None, &[], &default_mode(), None)
+                .unwrap();
+        assert!(
+            prompt.contains("/home/me/proj"),
+            "cwd not interpolated: {prompt}"
+        );
+        assert!(prompt.contains("helexa-acp"));
+        assert!(
+            !prompt.contains("{cwd}"),
+            "left-over placeholder in default prompt"
+        );
+        // With no tools, the # Tools block is absent.
+        assert!(!prompt.contains("# Tools"));
+        // Default mode does not get the plan-mode addendum.
+        assert!(!prompt.contains("# Plan mode"));
+    }
+
+    #[test]
+    fn tools_are_appended_in_hermes_format() {
+        let spec = ToolSpec {
+            name: "read_file".into(),
+            description: "Read a file.".into(),
+            parameters: serde_json::json!({"type":"object","properties":{}, "required":[]}),
+        };
+        let prompt =
+            build_system_prompt(Path::new("/x"), None, &[spec], &default_mode(), None).unwrap();
+        assert!(prompt.contains("# Tools"));
+        assert!(prompt.contains("<tools>"));
+        assert!(prompt.contains("\"name\":\"read_file\""));
+        assert!(prompt.contains("<tool_call>"));
+    }
+
+    #[test]
+    fn override_path_is_read_and_templated() {
+        let mut tmp = tempfile_in_target("prompt.txt");
+        tmp.write_all(b"custom prompt for {cwd} only").unwrap();
+        tmp.flush().unwrap();
+
+        let path = tmp.path().to_path_buf();
+        drop(tmp);
+
+        let prompt = build_system_prompt(
+            Path::new("/etc"),
+            Some(path.as_path()),
+            &[],
+            &default_mode(),
+            None,
+        )
+        .expect("read override");
+        assert_eq!(prompt, "custom prompt for /etc only");
+
+        let _ = std::fs::remove_file(&path);
+    }
+
+    #[test]
+    fn missing_override_path_errors() {
+        let err = build_system_prompt(
+            Path::new("/tmp"),
+            Some(Path::new("/definitely/not/a/real/path")),
+            &[],
+            &default_mode(),
+            None,
+        )
+        .unwrap_err();
+        assert!(format!("{err:#}").contains("read system prompt"));
+    }
+
+    #[test]
+    fn plan_mode_addendum_includes_plan_dir_and_menu() {
+        let plan_dir = Path::new("/home/me/.local/share/helexa-acp/plans/proj-deadbeef");
+        let prompt = build_system_prompt(
+            Path::new("/home/me/proj"),
+            None,
+            &[],
+            &plan_mode(),
+            Some(plan_dir),
+        )
+        .unwrap();
+        assert!(prompt.contains("# Plan mode"));
+        assert!(
+            prompt.contains(plan_dir.to_str().unwrap()),
+            "plan dir not interpolated: {prompt}"
+        );
+        // The 3-option menu must be present so the model emits it verbatim.
+        assert!(prompt.contains("Bypass Permissions"));
+        assert!(prompt.contains("**Default**"));
+        assert!(prompt.contains("3. **Plan**"));
+        // Bash disabled instruction must be present.
+        assert!(prompt.contains("`bash` is disabled"));
+    }
+
+    #[test]
+    fn plan_mode_addendum_handles_unresolved_plan_dir() {
+        let prompt =
+            build_system_prompt(Path::new("/home/me/proj"), None, &[], &plan_mode(), None).unwrap();
+        assert!(prompt.contains("# Plan mode"));
+        assert!(prompt.contains("could not be resolved"));
+    }
+
+    /// Tiny temp-file helper that doesn't pull in the `tempfile` crate.
+    /// Prefers `CARGO_TARGET_TMPDIR`; falls back to the system temp dir.
+    fn tempfile_in_target(name: &str) -> TempHandle {
+        let base = std::env::var("CARGO_TARGET_TMPDIR")
+            .ok()
+            .map(std::path::PathBuf::from)
+            .unwrap_or_else(std::env::temp_dir);
+        let _ = std::fs::create_dir_all(&base);
+        let pid = std::process::id();
+        let path = base.join(format!("helexa-acp-{pid}-{name}"));
+        let file = std::fs::File::create(&path).expect("create temp file");
+        TempHandle { file, path }
+    }
+
+    struct TempHandle {
+        file: std::fs::File,
+        path: std::path::PathBuf,
+    }
+
+    impl TempHandle {
+        fn path(&self) -> &Path {
+            &self.path
+        }
+    }
+
+    impl Write for TempHandle {
+        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+            self.file.write(buf)
+        }
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.file.flush()
+        }
+    }
+}
--- a/crates/helexa-acp/src/provider/anthropic_messages.rs
+++ b/crates/helexa-acp/src/provider/anthropic_messages.rs
--- a/crates/helexa-acp/src/provider/mod.rs
+++ b/crates/helexa-acp/src/provider/mod.rs
@@ -0,0 +1,230 @@
+//! Provider trait — the seam between the ACP-side agent loop and
+//! whatever wire protocol an endpoint actually speaks.
+//!
+//! Every concrete provider (OpenAI chat completions, OpenAI Responses,
+//! Anthropic /v1/messages, Ollama native, …) implements
+//! [`Provider`]. The agent constructs a [`CompletionRequest`] using
+//! provider-agnostic types and consumes a stream of
+//! [`CompletionEvent`]s — neither end knows which wire format is on
+//! the other side of the trait.
+//!
+//! Day-1 provider: [`openai_chat::OpenAIChatProvider`]. Day-N
+//! providers slot in without touching `agent.rs`.
+
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use tokio_util::sync::CancellationToken;
+
+pub mod anthropic_messages;
+pub mod openai_chat;
+pub mod openai_responses;
+
+/// Provider-agnostic LLM endpoint. Implementations translate between
+/// [`CompletionRequest`] / [`CompletionEvent`] and whatever wire
+/// format their endpoint speaks.
+#[async_trait]
+pub trait Provider: Send + Sync {
+    /// Endpoint name as configured by the user (e.g. `"helexa"`,
+    /// `"openrouter"`). Used in logs and in the `endpoint:model`
+    /// selector.
+    fn name(&self) -> &str;
+
+    /// List models available at this endpoint. Used to build the
+    /// model-picker dropdown in editor clients (Stage 4). Should
+    /// return quickly (cache if necessary).
+    #[allow(dead_code)]
+    async fn list_models(&self) -> anyhow::Result<Vec<ModelInfo>>;
+
+    /// Run a chat completion. Returns a stream of provider-agnostic
+    /// events. The stream stops when the upstream finishes, when
+    /// `cancel` is fired, or when the stream is dropped.
+    async fn complete(
+        &self,
+        request: CompletionRequest,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<BoxStream<'static, anyhow::Result<CompletionEvent>>>;
+}
+
+/// One model exposed by a provider. Constructed by `list_models` —
+/// Stage 4 is when the agent loop starts consuming it for the
+/// model-picker dropdown.
+#[allow(dead_code)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelInfo {
+    pub id: String,
+    /// Human-friendly name, if the endpoint exposes one. Otherwise
+    /// `id` is used as the display name.
+    #[serde(default)]
+    pub display_name: Option<String>,
+}
+
+/// Inputs to a completion. Provider-agnostic — concrete providers
+/// translate this into their wire format.
+#[derive(Debug, Clone)]
+pub struct CompletionRequest {
+    /// Endpoint-local model id (without the `endpoint:` prefix).
+    pub model: String,
+    pub messages: Vec<Message>,
+    /// Tools the model is allowed to call. Empty list means no tool
+    /// support advertised.
+    pub tools: Vec<ToolSpec>,
+    pub temperature: Option<f64>,
+    pub top_p: Option<f64>,
+    pub max_tokens: Option<u64>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Message {
+    pub role: Role,
+    pub content: MessageContent,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Role {
+    System,
+    User,
+    Assistant,
+    /// Tool result message. Provider impls turn this into whatever
+    /// shape the upstream wire format wants (OpenAI uses
+    /// `role: "tool"` + `tool_call_id`; Anthropic uses content blocks).
+    /// Stage 3 (tools) constructs this; Stage 2 never does.
+    Tool,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum MessageContent {
+    /// Plain text turn (system / user / assistant). Struct variant
+    /// rather than newtype so the persisted JSON has an explicit
+    /// `text` field — that lets us use internal tagging on the
+    /// enum, which is incompatible with newtype-of-primitive
+    /// variants.
+    Text { text: String },
+    /// Mixed text + image user turn. Stage 5 introduces this when
+    /// Zed sends an `ImageContent` block alongside the user's prompt.
+    /// Providers that don't support vision should down-convert by
+    /// dropping image parts and concatenating text parts.
+    MultiPart { parts: Vec<MessagePart> },
+    /// Assistant turn that called one or more tools. Stage 3 starts
+    /// constructing this when the provider stream yields a
+    /// `ToolCallStart` / `ToolCallArgsDelta` sequence.
+    ToolCalls {
+        /// Optional text the assistant said alongside the tool calls.
+        text: Option<String>,
+        calls: Vec<ToolCall>,
+    },
+    /// Tool result. `tool_call_id` matches the assistant's call id.
+    /// Stage 3 constructs this after the tool runner finishes.
+    ToolResult {
+        tool_call_id: String,
+        content: String,
+    },
+}
+
+/// One part of a [`MessageContent::MultiPart`] message.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum MessagePart {
+    Text { text: String },
+    Image(ImageData),
+}
+
+/// Inline image attachment. `data` is base64-encoded raw image
+/// bytes; the encoder constructs an `image_url` data URI from it
+/// at request time. `uri` carries any pointer the client supplied
+/// (e.g. `file:///tmp/x.png`) — we keep it on the message for
+/// debugging / future providers but the OpenAI encoder ignores it
+/// when `data` is present (data wins, since it round-trips through
+/// every wire format).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ImageData {
+    pub mime_type: String,
+    /// Base64-encoded image bytes (no `data:` prefix, no padding
+    /// stripped — exactly what `ImageContent.data` carried).
+    pub data: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub uri: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ToolCall {
+    /// Provider-assigned id that ties the call to its result. The
+    /// Qwen3 wire format we use today doesn't carry this on the
+    /// model side (calls and results are matched positionally inside
+    /// a turn), so the field looks unused in the prod build — but it
+    /// flows through to `MessageContent::ToolResult.tool_call_id` for
+    /// history bookkeeping and a future strict-OpenAI backend will
+    /// consume it directly.
+    #[allow(dead_code)]
+    pub id: String,
+    pub name: String,
+    /// JSON-encoded arguments. Kept as a string because providers
+    /// stream argument bytes incrementally and only validate at the
+    /// end; the agent decodes once the call is complete.
+    pub arguments: String,
+}
+
+#[derive(Debug, Clone)]
+pub struct ToolSpec {
+    pub name: String,
+    pub description: String,
+    /// JSON Schema of the arguments object.
+    pub parameters: Value,
+}
+
+/// Events emitted by a provider during a streaming completion.
+#[derive(Debug, Clone)]
+pub enum CompletionEvent {
+    /// Incremental visible text from the assistant.
+    TextDelta(String),
+    /// Incremental "reasoning" / thought text, if the model emits one
+    /// (e.g. Qwen3 with `<think>` tags surfaced as a separate stream,
+    /// or OpenAI reasoning models).
+    ReasoningDelta(String),
+    /// A new tool call has started. Stage 2 ignores the payload; the
+    /// agent loop in Stage 3 reads `index` to correlate with
+    /// [`Self::ToolCallArgsDelta`], `id` for the eventual tool-result
+    /// turn, and `name` to dispatch the runner.
+    #[allow(dead_code)]
+    ToolCallStart {
+        index: usize,
+        id: String,
+        name: String,
+    },
+    /// More argument bytes for a tool call already announced via
+    /// [`Self::ToolCallStart`]. Stage 2 ignores; Stage 3 accumulates
+    /// the bytes by `index` until the call's arguments are complete.
+    #[allow(dead_code)]
+    ToolCallArgsDelta { index: usize, args_delta: String },
+    /// A `<tool_call>` block whose JSON couldn't be parsed even with
+    /// the qwen3 module's repair attempts. The agent surfaces this
+    /// as a Failed `SessionUpdate::ToolCall` card with the raw body
+    /// visible (so the editor renders structured failure UI rather
+    /// than dumping the body inline in the message pane), and feeds
+    /// a synthetic tool-error message back into history so the
+    /// model can self-correct on the next round.
+    MalformedToolCall { raw: String },
+    /// Stream finished. Carries the upstream `finish_reason` if it
+    /// gave one (`"stop"`, `"length"`, `"tool_calls"`, …).
+    Finish { reason: Option<String> },
+    /// Final usage stats, if the provider supplied them. Stage 2
+    /// matches the variant to drop it; Stage 6b (token metrics) is
+    /// when the payload starts being read.
+    #[allow(dead_code)]
+    Usage(UsageStats),
+}
+
+/// Token accounting reported by the provider at the end of a stream.
+/// Stage 2 doesn't surface usage anywhere — the stable `PromptResponse`
+/// has no usage field, and the unstable variant is gated. Stage 6b
+/// turns these on with Prometheus metrics.
+#[allow(dead_code)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct UsageStats {
+    pub prompt_tokens: u64,
+    pub completion_tokens: u64,
+    pub total_tokens: u64,
+}
--- a/crates/helexa-acp/src/provider/openai_chat.rs
+++ b/crates/helexa-acp/src/provider/openai_chat.rs
--- a/crates/helexa-acp/src/provider/openai_responses.rs
+++ b/crates/helexa-acp/src/provider/openai_responses.rs
@@ -0,0 +1,987 @@
+//! OpenAI Responses API (`POST /v1/responses`) provider.
+//!
+//! Mirror image of [`super::openai_chat`]: same `Provider` trait
+//! impl, same back-pressured SSE decoder, but speaking OpenAI's
+//! newer Responses surface instead of chat completions.
+//!
+//! Differences from the chat provider, all contained in this file:
+//!
+//! - **Request encoding**: history flattens into an `input` array
+//!   of typed items (`message`, `function_call`, `function_call_output`)
+//!   plus a top-level `instructions` field for the system prompt.
+//!   Multi-part user content stays in the same `[{type:"input_text"},
+//!   {type:"input_image"}]` shape neuron's `request_to_chat` already
+//!   accepts.
+//! - **Streaming decoder**: events are named (`response.created`,
+//!   `response.output_text.delta`, `response.completed`, …) carried
+//!   on the SSE `event:` line. The chat path's `[DONE]` terminator
+//!   doesn't apply; the stream ends after `response.completed`.
+//! - **Tool calls** plumb through the `response.output_item.added`
+//!   (item type `function_call`) → `response.function_call_arguments.delta`
+//!   → `response.function_call_arguments.done` event sequence. The
+//!   neuron candle harness doesn't synthesize these yet (tracked as
+//!   issue #6), but the decoder is wired so the day the upstream
+//!   does, downstream `CompletionEvent::ToolCall*` plumbing just
+//!   works.
+//!
+//! Tool-name handling: the model knows its tool descriptions via
+//! the [`crate::qwen3`] system-prompt block exactly the way the chat
+//! provider does. We don't echo them in the request body because
+//! neuron currently ignores `tools` on /v1/responses (same as on
+//! /v1/chat/completions). Once neuron honours request-side tool
+//! definitions, both providers add them in the same place.
+
+use async_trait::async_trait;
+use eventsource_stream::Eventsource;
+use futures::{Stream, StreamExt, stream::BoxStream};
+use serde::{Deserialize, Serialize};
+use serde_json::{Value, json};
+use std::collections::HashMap;
+use tokio_util::sync::CancellationToken;
+
+use super::{
+    CompletionEvent, CompletionRequest, Message, MessageContent, MessagePart, ModelInfo, Provider,
+    Role, UsageStats,
+};
+use crate::config::EndpointConfig;
+
+pub struct OpenAIResponsesProvider {
+    endpoint: EndpointConfig,
+    #[allow(dead_code)] // Read in `complete()`'s HTTP path; tests don't stand up a server.
+    api_key: Option<String>,
+    #[allow(dead_code)]
+    http: reqwest::Client,
+}
+
+impl OpenAIResponsesProvider {
+    pub fn new(endpoint: EndpointConfig) -> anyhow::Result<Self> {
+        let api_key = endpoint.resolve_api_key()?;
+        let http = reqwest::Client::builder()
+            // Same generous timeout as the chat provider: cortex may
+            // need to cold-load a model before serving the first
+            // chunk, which can be tens of seconds. Cancellation
+            // handles early termination, not timeout.
+            .timeout(std::time::Duration::from_secs(600))
+            .build()?;
+        Ok(Self {
+            endpoint,
+            api_key,
+            http,
+        })
+    }
+}
+
+#[async_trait]
+impl Provider for OpenAIResponsesProvider {
+    fn name(&self) -> &str {
+        &self.endpoint.name
+    }
+
+    async fn list_models(&self) -> anyhow::Result<Vec<ModelInfo>> {
+        let mut req = self.http.get(self.endpoint.models_url());
+        if let Some(key) = &self.api_key {
+            req = req.bearer_auth(key);
+        }
+        let resp = req
+            .send()
+            .await
+            .map_err(|e| anyhow::anyhow!("{} list_models: {e}", self.endpoint.name))?;
+        let status = resp.status();
+        if !status.is_success() {
+            let body = resp.text().await.unwrap_or_default();
+            anyhow::bail!(
+                "{} list_models returned {}: {}",
+                self.endpoint.name,
+                status,
+                body
+            );
+        }
+        let body: WireModelsResponse = resp.json().await?;
+        Ok(body
+            .data
+            .into_iter()
+            .map(|m| ModelInfo {
+                id: m.id,
+                display_name: None,
+            })
+            .collect())
+    }
+
+    async fn complete(
+        &self,
+        request: CompletionRequest,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<BoxStream<'static, anyhow::Result<CompletionEvent>>> {
+        let body = encode_request(&request);
+        tracing::debug!(
+            endpoint = %self.endpoint.name,
+            url = %self.endpoint.responses_url(),
+            body = %serde_json::to_string(&body).unwrap_or_else(|_| "<unserializable>".into()),
+            "POST /responses"
+        );
+        let mut req = self.http.post(self.endpoint.responses_url()).json(&body);
+        if let Some(key) = &self.api_key {
+            req = req.bearer_auth(key);
+        }
+        let resp = req
+            .send()
+            .await
+            .map_err(|e| anyhow::anyhow!("{} responses send: {e}", self.endpoint.name))?;
+        let status = resp.status();
+        if !status.is_success() {
+            let body = resp.text().await.unwrap_or_default();
+            anyhow::bail!(
+                "{} responses returned {}: {}",
+                self.endpoint.name,
+                status,
+                body
+            );
+        }
+        let sse = resp.bytes_stream().eventsource();
+        let stream = decode_stream(sse, cancel);
+        Ok(Box::pin(stream))
+    }
+}
+
+// ── Request encoding ─────────────────────────────────────────────────
+
+fn encode_request(req: &CompletionRequest) -> Value {
+    // Pull the system messages out of history into a single
+    // `instructions` string — the Responses API expects them there,
+    // not inline as an `input` item. Multiple system messages
+    // concatenate with blank lines so we don't lose ordering.
+    let mut instructions: Vec<String> = Vec::new();
+    let mut input_items: Vec<Value> = Vec::new();
+    for msg in &req.messages {
+        match (msg.role, &msg.content) {
+            (Role::System, MessageContent::Text { text }) => instructions.push(text.clone()),
+            // A tool-calling turn expands to several items; flatten it here
+            // so every later `function_call_output` has a matching call.
+            (Role::Assistant, MessageContent::ToolCalls { text, calls }) => {
+                input_items.extend(encode_tool_call_turn(text.as_deref(), calls));
+            }
+            _ => input_items.extend(encode_message_as_input_item(msg)),
+        }
+    }
+
+    let mut body = json!({
+        "model": req.model,
+        "input": input_items,
+        "stream": true,
+    });
+    if let Value::Object(map) = &mut body {
+        if !instructions.is_empty() {
+            map.insert(
+                "instructions".into(),
+                Value::String(instructions.join("\n\n")),
+            );
+        }
+        if let Some(t) = req.temperature {
+            map.insert("temperature".into(), json!(t));
+        }
+        if let Some(p) = req.top_p {
+            map.insert("top_p".into(), json!(p));
+        }
+        if let Some(m) = req.max_tokens {
+            // Responses calls it `max_output_tokens`; preserve the
+            // semantic (response cap) when we translate.
+            map.insert("max_output_tokens".into(), json!(m));
+        }
+    }
+    body
+}
+
+fn encode_message_as_input_item(msg: &Message) -> Option<Value> {
+    match (msg.role, &msg.content) {
+        (Role::System, _) => None, // handled out-of-band as `instructions`
+        (Role::User, MessageContent::Text { text }) => Some(json!({
+            "type": "message",
+            "role": "user",
+            "content": text,
+        })),
+        (Role::User, MessageContent::MultiPart { parts }) => Some(json!({
+            "type": "message",
+            "role": "user",
+            "content": encode_user_parts(parts),
+        })),
+        (Role::Assistant, MessageContent::Text { text }) => Some(encode_assistant_text(text)),
+        // One item cannot represent a whole tool-calling turn, so this
+        // single-item view keeps its historical shape: the first call
+        // only. `encode_request` flattens the full turn instead.
+        (Role::Assistant, MessageContent::ToolCalls { calls, .. }) => {
+            calls.first().map(encode_function_call)
+        }
+        (
+            Role::Tool,
+            MessageContent::ToolResult {
+                tool_call_id,
+                content,
+            },
+        ) => Some(json!({
+            "type": "function_call_output",
+            "call_id": tool_call_id,
+            "output": content,
+        })),
+        (role, content) => {
+            tracing::warn!(
+                ?role,
+                ?content,
+                "openai_responses: unexpected (role, content) shape"
+            );
+            None
+        }
+    }
+}
+
+/// Assistant prose as a Responses `message` item.
+fn encode_assistant_text(text: &str) -> Value {
+    json!({
+        "type": "message",
+        "role": "assistant",
+        "content": [{
+            "type": "output_text",
+            "text": text,
+            "annotations": [],
+        }],
+    })
+}
+
+/// One tool invocation as a Responses `function_call` item.
+fn encode_function_call(call: &super::ToolCall) -> Value {
+    json!({
+        "type": "function_call",
+        "call_id": call.id,
+        "name": call.name,
+        "arguments": call.arguments,
+    })
+}
+
+/// Expand an assistant tool-calling turn into input items: an optional
+/// `message` for any non-blank prose, then one `function_call` per call.
+fn encode_tool_call_turn(text: Option<&str>, calls: &[super::ToolCall]) -> Vec<Value> {
+    let prose = text.filter(|t| !t.trim().is_empty()).map(encode_assistant_text);
+    let calls = calls.iter().map(encode_function_call);
+    prose.into_iter().chain(calls).collect()
+}
+
+fn encode_user_parts(parts: &[MessagePart]) -> Value {
+    let items: Vec<Value> = parts
+        .iter()
+        .map(|p| match p {
+            MessagePart::Text { text } => json!({"type": "input_text", "text": text}),
+            MessagePart::Image(img) => json!({
+                "type": "input_image",
+                "image_url": format!("data:{};base64,{}", img.mime_type, img.data),
+            }),
+        })
+        .collect();
+    Value::Array(items)
+}
+
+// ── Wire types ──────────────────────────────────────────────────────
+
+#[allow(dead_code)] // fields read only when list_models runs against a real endpoint
+#[derive(Debug, Deserialize)]
+struct WireModelsResponse {
+    data: Vec<WireModelObject>,
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Deserialize)]
+struct WireModelObject {
+    id: String,
+}
+
+// SSE event payload shapes. We only model the fields we care about;
+// `#[serde(default)]` + `Option` everywhere else lets the upstream
+// add optional fields without breaking deserialise.
+
+#[derive(Debug, Deserialize, Serialize)]
+struct OutputItemAddedEvent {
+    #[serde(default)]
+    output_index: u32,
+    item: OutputItem,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+enum OutputItem {
+    Message {
+        #[serde(default)]
+        id: Option<String>,
+    },
+    FunctionCall {
+        #[serde(default)]
+        id: Option<String>,
+        #[serde(default)]
+        call_id: Option<String>,
+        #[serde(default)]
+        name: Option<String>,
+        /// Some upstreams populate `arguments` already on the
+        /// `output_item.added` event for a fully-buffered tool call
+        /// (i.e. when the model finalised the call before the SSE
+        /// flush). Capture it so we can emit a single args delta.
+        #[serde(default)]
+        arguments: Option<String>,
+    },
+    /// `reasoning`, `web_search_call`, etc. We capture-and-ignore
+    /// any item we don't model; the decoder still emits the
+    /// outer events correctly.
+    #[serde(other)]
+    Unknown,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct OutputTextDeltaEvent {
+    #[serde(default)]
+    item_id: Option<String>,
+    #[serde(default)]
+    output_index: u32,
+    #[serde(default)]
+    delta: String,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct FunctionCallArgumentsDeltaEvent {
+    #[serde(default)]
+    item_id: Option<String>,
+    #[serde(default)]
+    output_index: u32,
+    #[serde(default)]
+    delta: String,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct ResponseCompletedEvent {
+    response: ResponseShell,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct ResponseShell {
+    #[serde(default)]
+    status: Option<String>,
+    #[serde(default)]
+    usage: Option<WireUsage>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct WireUsage {
+    #[serde(default)]
+    input_tokens: u64,
+    #[serde(default)]
+    output_tokens: u64,
+    #[serde(default)]
+    total_tokens: u64,
+}
+
+// ── Streaming decoder ───────────────────────────────────────────────
+
+/// Translate the named-event Responses SSE into the provider-agnostic
+/// [`CompletionEvent`] stream the agent loop expects. The decoder
+/// holds per-stream state — output_index → tool-call-index plus
+/// the next available tool-call slot — so it can fire
+/// `ToolCallStart` exactly once per item.
+fn decode_stream<S>(
+    sse: S,
+    cancel: CancellationToken,
+) -> impl Stream<Item = anyhow::Result<CompletionEvent>>
+where
+    S: Stream<
+            Item = Result<
+                eventsource_stream::Event,
+                eventsource_stream::EventStreamError<reqwest::Error>,
+            >,
+        > + Send
+        + 'static,
+{
+    async_stream::stream! {
+        let mut sse = Box::pin(sse);
+        // Maps an output_index that's a function_call to the tool-call
+        // slot we hand downstream. Lets us correlate later
+        // `function_call_arguments.delta` events back to the index
+        // we already announced on `output_item.added`.
+        let mut tool_index_by_output: HashMap<u32, usize> = HashMap::new();
+        let mut next_tool_index: usize = 0;
+
+        loop {
+            tokio::select! {
+                biased;
+                _ = cancel.cancelled() => {
+                    tracing::debug!("openai_responses: cancellation requested, ending stream");
+                    break;
+                }
+                next = sse.next() => {
+                    let Some(event) = next else { break };
+                    let event = match event {
+                        Ok(e) => e,
+                        Err(e) => {
+                            yield Err(anyhow::anyhow!("SSE transport: {e}"));
+                            break;
+                        }
+                    };
+                    // Event name lives on `event.event`; data is JSON.
+                    let event_name = event.event.as_str();
+                    let data = event.data.as_str();
+                    match event_name {
+                        "response.output_text.delta" => {
+                            match serde_json::from_str::<OutputTextDeltaEvent>(data) {
+                                Ok(d) if !d.delta.is_empty() => {
+                                    yield Ok(CompletionEvent::TextDelta(d.delta));
+                                }
+                                Ok(_) => {}
+                                Err(e) => {
+                                    tracing::warn!(
+                                        error = %e,
+                                        raw = %data,
+                                        "openai_responses: failed to parse output_text.delta; skipping"
+                                    );
+                                }
+                            }
+                        }
+                        "response.output_item.added" => {
+                            match serde_json::from_str::<OutputItemAddedEvent>(data) {
+                                Ok(ev) => {
+                                    if let OutputItem::FunctionCall {
+                                        id,
+                                        call_id,
+                                        name,
+                                        arguments,
+                                    } = ev.item
+                                    {
+                                        let idx = next_tool_index;
+                                        next_tool_index += 1;
+                                        tool_index_by_output.insert(ev.output_index, idx);
+                                        // Prefer the user-facing
+                                        // `call_id` (what gets paired
+                                        // with tool results) over the
+                                        // internal item `id` when
+                                        // both are present. Falls
+                                        // back to a synthetic id so
+                                        // history bookkeeping never
+                                        // breaks.
+                                        let final_id = call_id
+                                            .or(id)
+                                            .unwrap_or_else(|| format!("call_{idx}"));
+                                        let final_name = name.unwrap_or_default();
+                                        yield Ok(CompletionEvent::ToolCallStart {
+                                            index: idx,
+                                            id: final_id,
+                                            name: final_name,
+                                        });
+                                        // Some upstreams attach the
+                                        // fully-buffered arguments on
+                                        // the `output_item.added`
+                                        // event itself (rare; happens
+                                        // when the model finalised
+                                        // before the SSE flush).
+                                        // Emit as a single args
+                                        // delta if present.
+                                        if let Some(args) = arguments
+                                            && !args.is_empty()
+                                        {
+                                            yield Ok(CompletionEvent::ToolCallArgsDelta {
+                                                index: idx,
+                                                args_delta: args,
+                                            });
+                                        }
+                                    }
+                                }
+                                Err(e) => {
+                                    tracing::warn!(
+                                        error = %e,
+                                        raw = %data,
+                                        "openai_responses: failed to parse output_item.added; skipping"
+                                    );
+                                }
+                            }
+                        }
+                        "response.function_call_arguments.delta" => {
+                            match serde_json::from_str::<FunctionCallArgumentsDeltaEvent>(data) {
+                                Ok(ev) => {
+                                    let Some(&idx) = tool_index_by_output.get(&ev.output_index)
+                                    else {
+                                        // Args delta for an item we
+                                        // never saw an `output_item.added`
+                                        // for. Could happen if the
+                                        // upstream reordered events;
+                                        // log + skip.
+                                        tracing::warn!(
+                                            output_index = ev.output_index,
+                                            "openai_responses: function_call_arguments.delta for unknown output_index"
+                                        );
+                                        continue;
+                                    };
+                                    if !ev.delta.is_empty() {
+                                        yield Ok(CompletionEvent::ToolCallArgsDelta {
+                                            index: idx,
+                                            args_delta: ev.delta,
+                                        });
+                                    }
+                                }
+                                Err(e) => {
+                                    tracing::warn!(
+                                        error = %e,
+                                        raw = %data,
+                                        "openai_responses: failed to parse function_call_arguments.delta; skipping"
+                                    );
+                                }
+                            }
+                        }
+                        "response.completed" => {
+                            // Final event. Pull usage + status off
+                            // the response shell. Status maps:
+                            // "incomplete" → reason "length"; anything
+                            // else, "completed" included → reason
+                            // "stop" (caller treats as EndTurn).
+                            let (reason, usage) =
+                                match serde_json::from_str::<ResponseCompletedEvent>(data) {
+                                    Ok(ev) => {
+                                        let reason = match ev.response.status.as_deref() {
+                                            Some("incomplete") => Some("length".to_string()),
+                                            _ => Some("stop".to_string()),
+                                        };
+                                        let usage = ev.response.usage.map(|u| UsageStats {
+                                            prompt_tokens: u.input_tokens,
+                                            completion_tokens: u.output_tokens,
+                                            total_tokens: u.total_tokens,
+                                        });
+                                        (reason, usage)
+                                    }
+                                    Err(e) => {
+                                        tracing::warn!(
+                                            error = %e,
+                                            raw = %data,
+                                            "openai_responses: failed to parse response.completed; ending stream with EndTurn"
+                                        );
+                                        (Some("stop".to_string()), None)
+                                    }
+                                };
+                            if let Some(u) = usage {
+                                yield Ok(CompletionEvent::Usage(u));
+                            }
+                            yield Ok(CompletionEvent::Finish { reason });
+                            break;
+                        }
+                        // Bookkeeping events we don't need to surface:
+                        // response.created, response.in_progress,
+                        // response.content_part.added/.done,
+                        // response.output_text.done,
+                        // response.output_item.done,
+                        // response.function_call_arguments.done,
+                        // response.reasoning_*. Logged at trace for
+                        // wire-tracing.
+                        other => {
+                            tracing::trace!(
+                                event = other,
+                                "openai_responses: bookkeeping event"
+                            );
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::provider::ToolCall;
+    use crate::provider::{ImageData, MessagePart};
+    use futures::stream;
+    use url::Url;
+
+    fn ep() -> EndpointConfig {
+        EndpointConfig {
+            name: "test".into(),
+            base_url: Url::parse("http://localhost:9999/v1").unwrap(),
+            wire_api: crate::config::WireApi::OpenAiResponses,
+            default_model: None,
+            api_key: None,
+            api_key_env: None,
+            max_tokens: None,
+            context_window: None,
+        }
+    }
+
+    // ── encode_request ──────────────────────────────────────────────
+
+    #[test]
+    fn system_messages_collapse_to_instructions() {
+        let req = CompletionRequest {
+            model: "m".into(),
+            messages: vec![
+                Message {
+                    role: Role::System,
+                    content: MessageContent::Text {
+                        text: "you are helpful".into(),
+                    },
+                },
+                Message {
+                    role: Role::User,
+                    content: MessageContent::Text { text: "hi".into() },
+                },
+            ],
+            tools: vec![],
+            temperature: Some(0.7),
+            top_p: None,
+            max_tokens: Some(256),
+        };
+        let body = encode_request(&req);
+        assert_eq!(body["model"], "m");
+        assert_eq!(body["instructions"], "you are helpful");
+        assert_eq!(body["stream"], true);
+        assert_eq!(body["max_output_tokens"], 256);
+        assert_eq!(body["temperature"], 0.7);
+        let input = body["input"].as_array().unwrap();
+        // System message NOT echoed in input — it's only in
+        // instructions.
+        assert_eq!(input.len(), 1);
+        assert_eq!(input[0]["type"], "message");
+        assert_eq!(input[0]["role"], "user");
+        assert_eq!(input[0]["content"], "hi");
+    }
+
+    #[test]
+    fn multiple_system_messages_concatenate() {
+        let req = CompletionRequest {
+            model: "m".into(),
+            messages: vec![
+                Message {
+                    role: Role::System,
+                    content: MessageContent::Text {
+                        text: "first".into(),
+                    },
+                },
+                Message {
+                    role: Role::System,
+                    content: MessageContent::Text {
+                        text: "second".into(),
+                    },
+                },
+                Message {
+                    role: Role::User,
+                    content: MessageContent::Text { text: "hi".into() },
+                },
+            ],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        assert_eq!(body["instructions"], "first\n\nsecond");
+    }
+
+    #[test]
+    fn user_multipart_becomes_input_parts_array() {
+        let req = CompletionRequest {
+            model: "vl".into(),
+            messages: vec![Message {
+                role: Role::User,
+                content: MessageContent::MultiPart {
+                    parts: vec![
+                        MessagePart::Text {
+                            text: "what's in this?".into(),
+                        },
+                        MessagePart::Image(ImageData {
+                            mime_type: "image/png".into(),
+                            data: "AAA=".into(),
+                            uri: None,
+                        }),
+                    ],
+                },
+            }],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        let content = &body["input"][0]["content"].as_array().unwrap().clone();
+        assert_eq!(content.len(), 2);
+        assert_eq!(content[0]["type"], "input_text");
+        assert_eq!(content[0]["text"], "what's in this?");
+        assert_eq!(content[1]["type"], "input_image");
+        assert_eq!(content[1]["image_url"], "data:image/png;base64,AAA=");
+    }
+
+    #[test]
+    fn assistant_text_becomes_output_text_content_part() {
+        let req = CompletionRequest {
+            model: "m".into(),
+            messages: vec![
+                Message {
+                    role: Role::User,
+                    content: MessageContent::Text { text: "hi".into() },
+                },
+                Message {
+                    role: Role::Assistant,
+                    content: MessageContent::Text {
+                        text: "hello there".into(),
+                    },
+                },
+                Message {
+                    role: Role::User,
+                    content: MessageContent::Text {
+                        text: "more".into(),
+                    },
+                },
+            ],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        let input = body["input"].as_array().unwrap();
+        assert_eq!(input.len(), 3);
+        assert_eq!(input[1]["type"], "message");
+        assert_eq!(input[1]["role"], "assistant");
+        assert_eq!(input[1]["content"][0]["type"], "output_text");
+        assert_eq!(input[1]["content"][0]["text"], "hello there");
+    }
+
+    #[test]
+    fn tool_calls_and_results_round_trip_via_function_call_items() {
+        let req = CompletionRequest {
+            model: "m".into(),
+            messages: vec![
+                Message {
+                    role: Role::Assistant,
+                    content: MessageContent::ToolCalls {
+                        text: None,
+                        calls: vec![ToolCall {
+                            id: "call_42".into(),
+                            name: "read_file".into(),
+                            arguments: r#"{"path":"/etc/hostname"}"#.into(),
+                        }],
+                    },
+                },
+                Message {
+                    role: Role::Tool,
+                    content: MessageContent::ToolResult {
+                        tool_call_id: "call_42".into(),
+                        content: "host".into(),
+                    },
+                },
+            ],
+            tools: vec![],
+            temperature: None,
+            top_p: None,
+            max_tokens: None,
+        };
+        let body = encode_request(&req);
+        let input = body["input"].as_array().unwrap();
+        assert_eq!(input.len(), 2);
+        assert_eq!(input[0]["type"], "function_call");
+        assert_eq!(input[0]["call_id"], "call_42");
+        assert_eq!(input[0]["name"], "read_file");
+        assert_eq!(input[0]["arguments"], r#"{"path":"/etc/hostname"}"#);
+        assert_eq!(input[1]["type"], "function_call_output");
+        assert_eq!(input[1]["call_id"], "call_42");
+        assert_eq!(input[1]["output"], "host");
+    }
+
+    // ── decode_stream ───────────────────────────────────────────────
+
+    fn sse_event(name: &str, data: &str) -> eventsource_stream::Event {
+        eventsource_stream::Event {
+            id: String::new(),
+            retry: None,
+            event: name.into(),
+            data: data.into(),
+        }
+    }
+
+    async fn collect_events(
+        items: Vec<eventsource_stream::Event>,
+    ) -> Vec<anyhow::Result<CompletionEvent>> {
+        let sse = stream::iter(
+            items
+                .into_iter()
+                .map(Ok::<_, eventsource_stream::EventStreamError<reqwest::Error>>),
+        );
+        let decoded = decode_stream(sse, CancellationToken::new());
+        decoded.collect().await
+    }
+
+    #[tokio::test]
+    async fn decodes_text_then_finish() {
+        let events = collect_events(vec![
+            sse_event("response.created", "{}"),
+            sse_event(
+                "response.output_text.delta",
+                r#"{"item_id":"msg_1","output_index":0,"delta":"hel"}"#,
+            ),
+            sse_event(
+                "response.output_text.delta",
+                r#"{"item_id":"msg_1","output_index":0,"delta":"lo"}"#,
+            ),
+            sse_event(
+                "response.completed",
+                r#"{"response":{"status":"completed","usage":{"input_tokens":3,"output_tokens":2,"total_tokens":5}}}"#,
+            ),
+        ])
+        .await;
+        let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
+        let mut iter = events.into_iter();
+        assert!(matches!(iter.next(), Some(CompletionEvent::TextDelta(t)) if t == "hel"));
+        assert!(matches!(iter.next(), Some(CompletionEvent::TextDelta(t)) if t == "lo"));
+        assert!(matches!(iter.next(), Some(CompletionEvent::Usage(u)) if u.total_tokens == 5));
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::Finish { reason: Some(r) }) if r == "stop"
+        ));
+        assert!(iter.next().is_none());
+    }
+
+    #[tokio::test]
+    async fn empty_delta_is_dropped() {
+        let events = collect_events(vec![
+            sse_event(
+                "response.output_text.delta",
+                r#"{"item_id":"m","output_index":0,"delta":""}"#,
+            ),
+            sse_event(
+                "response.completed",
+                r#"{"response":{"status":"completed"}}"#,
+            ),
+        ])
+        .await;
+        let mut completion_events = events.into_iter().map(|r| r.unwrap());
+        // First event MUST be the Finish — the empty delta dropped.
+        assert!(matches!(
+            completion_events.next(),
+            Some(CompletionEvent::Finish { .. })
+        ));
+    }
+
+    #[tokio::test]
+    async fn incomplete_status_maps_to_length_finish_reason() {
+        let events = collect_events(vec![sse_event(
+            "response.completed",
+            r#"{"response":{"status":"incomplete"}}"#,
+        )])
+        .await;
+        let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
+        assert!(matches!(
+            events.last(),
+            Some(CompletionEvent::Finish { reason: Some(r) }) if r == "length"
+        ));
+    }
+
+    #[tokio::test]
+    async fn function_call_items_emit_toolcall_events() {
+        let events = collect_events(vec![
+            sse_event(
+                "response.output_item.added",
+                r#"{"output_index":0,"item":{"type":"function_call","id":"item_1","call_id":"call_xyz","name":"read_file"}}"#,
+            ),
+            sse_event(
+                "response.function_call_arguments.delta",
+                r#"{"item_id":"item_1","output_index":0,"delta":"{\"path"}"#,
+            ),
+            sse_event(
+                "response.function_call_arguments.delta",
+                r#"{"item_id":"item_1","output_index":0,"delta":"\":\"/etc/hostname\"}"}"#,
+            ),
+            sse_event("response.completed", r#"{"response":{"status":"completed"}}"#),
+        ])
+        .await;
+        let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
+        let mut iter = events.into_iter();
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::ToolCallStart { index: 0, ref id, ref name })
+                if id == "call_xyz" && name == "read_file"
+        ));
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
+                if args_delta == r#"{"path"#
+        ));
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
+                if args_delta == r#"":"/etc/hostname"}"#
+        ));
+        assert!(matches!(iter.next(), Some(CompletionEvent::Finish { .. })));
+    }
+
+    #[tokio::test]
+    async fn function_call_added_with_inline_arguments_emits_single_args_delta() {
+        // Some upstreams (rare) include the fully-buffered arguments
+        // on the `output_item.added` event when the model finalised
+        // the call before SSE flush. Verify both ToolCallStart and a
+        // single args delta fire.
+        let events = collect_events(vec![
+            sse_event(
+                "response.output_item.added",
+                r#"{"output_index":0,"item":{"type":"function_call","call_id":"call_a","name":"f","arguments":"{\"x\":1}"}}"#,
+            ),
+            sse_event("response.completed", r#"{"response":{"status":"completed"}}"#),
+        ])
+        .await;
+        let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
+        let mut iter = events.into_iter();
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::ToolCallStart { .. })
+        ));
+        assert!(matches!(
+            iter.next(),
+            Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
+                if args_delta == r#"{"x":1}"#
+        ));
+        assert!(matches!(iter.next(), Some(CompletionEvent::Finish { .. })));
+    }
+
+    #[tokio::test]
+    async fn cancellation_ends_stream_promptly() {
+        // Hand the decoder an empty stream + a triggered cancellation
+        // token; it should terminate without yielding anything.
+        let sse = stream::iter(Vec::<
+            Result<eventsource_stream::Event, eventsource_stream::EventStreamError<reqwest::Error>>,
+        >::new());
+        let cancel = CancellationToken::new();
+        cancel.cancel();
+        let decoded = decode_stream(sse, cancel);
+        let events: Vec<_> = decoded.collect().await;
+        assert!(events.is_empty());
+    }
+
+    #[tokio::test]
+    async fn malformed_event_payload_is_skipped() {
+        let events = collect_events(vec![
+            sse_event("response.output_text.delta", "{not valid json"),
+            sse_event(
+                "response.output_text.delta",
+                r#"{"item_id":"m","output_index":0,"delta":"ok"}"#,
+            ),
+            sse_event(
+                "response.completed",
+                r#"{"response":{"status":"completed"}}"#,
+            ),
+        ])
+        .await;
+        let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
+        // First text delta dropped; second one fires.
+        assert!(
+            events
+                .iter()
+                .any(|e| matches!(e, CompletionEvent::TextDelta(t) if t == "ok"))
+        );
+        // Parse failures are warn-and-skip: no Err reached the unwrap above, and Finish still has a reason.
+        assert!(
+            events
+                .iter()
+                .all(|e| !matches!(e, CompletionEvent::Finish { reason: None }))
+        );
+    }
+
+    #[test]
+    fn provider_construction_is_cheap() {
+        let _ = OpenAIResponsesProvider::new(ep()).unwrap();
+    }
+}
--- a/crates/helexa-acp/src/qwen3.rs
+++ b/crates/helexa-acp/src/qwen3.rs
--- a/crates/helexa-acp/src/session.rs
+++ b/crates/helexa-acp/src/session.rs
@@ -0,0 +1,188 @@
+//! Per-session state for the ACP agent loop.
+//!
+//! Concurrency:
+//!
+//! - [`SessionStore`] is an `Arc<RwLock<HashMap<SessionId, …>>>`. The map
+//!   itself is read-mostly: it changes only on `session/new` and never
+//!   shrinks during Stage 2, so an `RwLock` keeps concurrent reads
+//!   contention-free.
+//! - Each session is wrapped in its own `Arc<Mutex<SessionState>>`. Holding
+//!   one session's lock doesn't block requests against any other session,
+//!   which matters once a client opens multiple sessions in parallel.
+//!
+//! All operations hold a lock only long enough to copy out (or mutate) the
+//! state they need — never across an `await` that drives the upstream
+//! provider stream.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use agent_client_protocol::schema::{SessionId, SessionModeId};
+use tokio::sync::{Mutex, RwLock};
+use tokio_util::sync::CancellationToken;
+
+use crate::provider::Message;
+
+/// Mode id advertised as the gated default. File writes and shell
+/// (bash) commands prompt the user for permission via
+/// `session/request_permission`.
+pub const MODE_DEFAULT: &str = "default";
+
+/// Mode id advertised as "auto-allow everything". Matches the
+/// name (`bypassPermissions`) that Zed clients tend to reference.
+pub const MODE_BYPASS: &str = "bypassPermissions";
+
+/// Mode id for read-and-plan-only operation. The model may read files
+/// and list directories freely, may write *only* into the per-project
+/// plan directory under `$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`,
+/// and cannot run shell commands. Designed for "draft the
+/// implementation plan, then I'll review and let you execute" flows.
+pub const MODE_PLAN: &str = "plan";
+
+/// State carried for a single ACP session.
+///
+/// Mutated under `Mutex<SessionState>`; never share a clone across
+/// tasks expecting to see the same `cancel` token — clone the token
+/// explicitly when handing it to the streaming task.
+#[derive(Debug)]
+pub struct SessionState {
+    /// Conversation history in chronological order (user / assistant
+    /// turns). The system prompt is *not* stored here — it's built
+    /// fresh per request so any cwd / config changes take effect.
+    /// Starts out empty (see [`SessionState::new`]).
+    pub history: Vec<Message>,
+    /// Working directory the client opened the session against. Used
+    /// by [`crate::prompt::build_system_prompt`] and (Stage 3) by
+    /// filesystem tools.
+    pub cwd: PathBuf,
+    /// Currently-selected model id. Format is either a bare model id
+    /// (resolved against the default endpoint) or `endpoint:model`.
+    /// Mutated by `session/set_model` in Stage 4; Stage 2 sets it
+    /// once at session creation and never changes it.
+    pub model_id: String,
+    /// Cancellation handle for the in-flight prompt, if any. A fresh
+    /// token is installed at the start of every `session/prompt`
+    /// request; `session/cancel` fires this one. Between prompts the
+    /// token is "spent" — firing it does nothing — which is fine,
+    /// `session/cancel` is a no-op when there's nothing to cancel.
+    pub cancel: CancellationToken,
+    /// Permission gating mode. Stage 3 advertises two ids in
+    /// `NewSessionResponse.modes`: [`MODE_DEFAULT`] (writes / bash
+    /// prompt the user) and [`MODE_BYPASS`] (auto-allow). Mutated by
+    /// `session/set_mode`. New sessions start in [`MODE_DEFAULT`]
+    /// (see [`SessionState::new`]).
+    ///
+    /// NOTE(review): [`MODE_PLAN`] is also defined in this module;
+    /// whether it is advertised alongside the two ids above is not
+    /// visible from here — confirm and update this list.
+    pub mode_id: SessionModeId,
+}
+
+impl SessionState {
+    /// Build the state for a brand-new session rooted at `cwd` and
+    /// bound to `model_id`.
+    ///
+    /// The session starts with an empty history, a fresh (unfired)
+    /// cancellation token, and the [`MODE_DEFAULT`] permission mode.
+    pub fn new(cwd: PathBuf, model_id: String) -> Self {
+        let mode_id = SessionModeId::new(MODE_DEFAULT);
+        let cancel = CancellationToken::new();
+        let history = Vec::new();
+        Self {
+            history,
+            cwd,
+            model_id,
+            cancel,
+            mode_id,
+        }
+    }
+}
+
+/// Concurrent map of live sessions.
+///
+/// The outer `RwLock` guards the id → session map itself; every
+/// session sits behind its own `Arc<Mutex<…>>`, so locking one
+/// session's state does not require holding the map lock.
+///
+/// Cloning is cheap (`Arc` bump). Pass clones into every handler that
+/// needs session access; never hold a clone across an `.await` that
+/// could outlive the request.
+pub type SessionStore = Arc<RwLock<HashMap<SessionId, Arc<Mutex<SessionState>>>>>;
+
+/// Create a session store that has no sessions registered yet.
+pub fn new_store() -> SessionStore {
+    let sessions = HashMap::new();
+    Arc::new(RwLock::new(sessions))
+}
+
+/// Fetch the shared handle for session `id`, or `None` when no session
+/// with that id has been registered. Only the map's read lock is
+/// taken, and it is released before returning.
+pub async fn get(store: &SessionStore, id: &SessionId) -> Option<Arc<Mutex<SessionState>>> {
+    let sessions = store.read().await;
+    sessions.get(id).map(Arc::clone)
+}
+
+/// Register `state` under `id`, wrapping it in its own `Arc<Mutex<…>>`.
+/// An existing entry with the same id is replaced (that should never
+/// happen — ids are uniquely generated by the agent).
+pub async fn insert(store: &SessionStore, id: SessionId, state: SessionState) {
+    let entry = Arc::new(Mutex::new(state));
+    let mut sessions = store.write().await;
+    sessions.insert(id, entry);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::provider::{MessageContent, Role};
+
+    /// Shorthand for building a [`SessionId`] from a literal.
+    fn id(s: &str) -> SessionId {
+        SessionId::new(s)
+    }
+
+    /// Number of history entries recorded for `session`. Panics when
+    /// the session is not registered in `store`.
+    async fn history_len(store: &SessionStore, session: &str) -> usize {
+        let handle = get(store, &id(session)).await.unwrap();
+        let state = handle.lock().await;
+        state.history.len()
+    }
+
+    #[tokio::test]
+    async fn insert_then_get_round_trip() {
+        let store = new_store();
+        insert(
+            &store,
+            id("s1"),
+            SessionState::new(PathBuf::from("/tmp"), "m".into()),
+        )
+        .await;
+
+        let handle = get(&store, &id("s1")).await.expect("session present");
+        let state = handle.lock().await;
+        assert_eq!(state.cwd, PathBuf::from("/tmp"));
+        assert_eq!(state.model_id, "m");
+        assert!(state.history.is_empty());
+    }
+
+    #[tokio::test]
+    async fn missing_session_is_none() {
+        let store = new_store();
+        let lookup = get(&store, &id("nope")).await;
+        assert!(lookup.is_none());
+    }
+
+    #[tokio::test]
+    async fn history_is_per_session() {
+        let store = new_store();
+        for (name, dir) in [("a", "/a"), ("b", "/b")] {
+            insert(
+                &store,
+                id(name),
+                SessionState::new(PathBuf::from(dir), "m".into()),
+            )
+            .await;
+        }
+
+        // Record one user turn against session "a" only.
+        let greeting = Message {
+            role: Role::User,
+            content: MessageContent::Text {
+                text: "hello".into(),
+            },
+        };
+        {
+            let handle = get(&store, &id("a")).await.unwrap();
+            handle.lock().await.history.push(greeting);
+        }
+
+        // "a" sees the new turn; "b" is untouched.
+        assert_eq!(history_len(&store, "a").await, 1);
+        assert_eq!(history_len(&store, "b").await, 0);
+    }
+}
--- a/crates/helexa-acp/src/store.rs
+++ b/crates/helexa-acp/src/store.rs
@@ -0,0 +1,462 @@
+//! On-disk session persistence for `session/load` support.
+//!
+//! Storage layout:
+//!
+//! ```text
+//! $XDG_DATA_HOME/helexa-acp/sessions/{session_id}.json
+//! ```
+//!
+//! (Fallback to `~/.local/share/helexa-acp/sessions/` when
+//! `$XDG_DATA_HOME` is unset.) One JSON file per session. Writes
+//! happen at the end of every `session/prompt` round through
+//! [`save`], using tempfile-plus-rename so a crash mid-write can't
+//! corrupt the store. Reads happen on `session/load` via [`load`].
+//!
+//! No compaction, no rotation: files accumulate until the user
+//! cleans them up. That's deliberate — disk is cheap, and the
+//! resume-on-restart workflow matters more than tidiness. The
+//! [`SESSIONS_DIRNAME`] subdirectory is created lazily on first
+//! save so an unprivileged install path never errors at startup.
+
+use std::path::PathBuf;
+use std::time::SystemTime;
+
+use agent_client_protocol::schema::SessionId;
+use serde::{Deserialize, Serialize};
+
+use crate::provider::Message;
+
+/// Application directory created under the XDG data base directory.
+const APP_DIRNAME: &str = "helexa-acp";
+/// Subdirectory of [`APP_DIRNAME`] that holds one JSON file per session.
+const SESSIONS_DIRNAME: &str = "sessions";
+/// Subdirectory of [`APP_DIRNAME`] that holds plan-mode artefacts; a
+/// sibling of [`SESSIONS_DIRNAME`].
+const PLANS_DIRNAME: &str = "plans";
+
+/// The shape persisted to disk for one session. Only what we can't
+/// rebuild from the running config goes in here: the conversation
+/// history, the mode toggle, the model id, and the cwd-at-creation.
+///
+/// `created_at` / `updated_at` are seconds-since-epoch — cheap to
+/// compare, no third-party time crate, and stable across runs.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PersistedSession {
+    /// Session id as a plain string. Its sanitised form is the file
+    /// stem of the on-disk JSON file.
+    pub session_id: String,
+    /// Working directory of the session; the list functions compare it
+    /// for exact equality when filtering by cwd.
+    pub cwd: PathBuf,
+    /// Model id selected for the session.
+    pub model_id: String,
+    /// Permission mode id, stored as a plain string.
+    pub mode_id: String,
+    /// Conversation history.
+    pub history: Vec<Message>,
+    /// Creation time, seconds since the Unix epoch.
+    pub created_at: u64,
+    /// Last-update time, seconds since the Unix epoch. Listings are
+    /// sorted on this field, most recent first.
+    pub updated_at: u64,
+}
+
+/// Resolve the directory that holds session JSON files. Honors
+/// `$XDG_DATA_HOME`; falls back to `~/.local/share/helexa-acp/sessions/`.
+///
+/// `$XDG_DATA_HOME` is used only when it is an absolute path: the XDG
+/// Base Directory spec says relative (and therefore also empty) values
+/// are invalid and must be ignored. An empty `$HOME` is likewise
+/// treated as unset, so the store can never silently end up relative
+/// to the process's current directory. Both variables are read as OS
+/// strings, so paths that are not valid UTF-8 still work.
+///
+/// Returns `None` if neither is resolvable (no `HOME` set — possible
+/// in stripped-down container environments).
+pub fn sessions_dir() -> Option<PathBuf> {
+    let xdg_data_home = std::env::var_os("XDG_DATA_HOME")
+        .map(PathBuf::from)
+        .filter(|path| path.is_absolute());
+    let base = xdg_data_home.or_else(|| {
+        std::env::var_os("HOME")
+            .filter(|home| !home.is_empty())
+            .map(|home| PathBuf::from(home).join(".local").join("share"))
+    })?;
+    Some(base.join(APP_DIRNAME).join(SESSIONS_DIRNAME))
+}
+
+/// Atomically persist `session` into the default sessions directory
+/// (see [`sessions_dir`]). Errors when that directory cannot be
+/// resolved or when [`save_to_dir`] fails.
+pub fn save(session: &PersistedSession) -> anyhow::Result<()> {
+    match sessions_dir() {
+        Some(dir) => save_to_dir(&dir, session),
+        None => Err(anyhow::anyhow!(
+            "can't resolve XDG_DATA_HOME or HOME for session store"
+        )),
+    }
+}
+
+/// Read the persisted session `session_id` from the default sessions
+/// directory (see [`sessions_dir`]). Errors when that directory cannot
+/// be resolved or when [`load_from_dir`] fails.
+pub fn load(session_id: &SessionId) -> anyhow::Result<PersistedSession> {
+    let Some(dir) = sessions_dir() else {
+        anyhow::bail!("can't resolve XDG_DATA_HOME or HOME for session store");
+    };
+    load_from_dir(&dir, session_id)
+}
+
+/// Atomic save into an explicit directory. Writes to
+/// `{id}.json.tmp` then renames over `{id}.json`. Creates the
+/// target directory if it doesn't exist. Split from [`save`] so
+/// unit tests can target a per-test scratch dir without mutating
+/// process-global env vars.
+///
+/// `{id}` is the sanitised session id (see [`sanitize_id`]).
+///
+/// # Errors
+///
+/// Fails when the directory can't be created, the session can't be
+/// serialised, or the temp file can't be written or renamed. If the
+/// write or the rename fails, the temp file is removed (best effort)
+/// so failed saves don't leave `*.json.tmp` litter behind; the
+/// previously saved `{id}.json`, if any, is left untouched.
+pub fn save_to_dir(dir: &std::path::Path, session: &PersistedSession) -> anyhow::Result<()> {
+    std::fs::create_dir_all(dir).map_err(|e| anyhow::anyhow!("create {}: {e}", dir.display()))?;
+    let safe = sanitize_id(&session.session_id);
+    let final_path = dir.join(format!("{safe}.json"));
+    let tmp_path = dir.join(format!("{safe}.json.tmp"));
+    let json = serde_json::to_string_pretty(session)?;
+
+    let published = std::fs::write(&tmp_path, json)
+        .map_err(|e| anyhow::anyhow!("write {}: {e}", tmp_path.display()))
+        .and_then(|()| {
+            std::fs::rename(&tmp_path, &final_path)
+                .map_err(|e| anyhow::anyhow!("rename → {}: {e}", final_path.display()))
+        });
+    if published.is_err() {
+        // Best-effort cleanup: the original error is what the caller
+        // needs to see, so a failure to remove the temp file (it may
+        // never have been created) is deliberately ignored.
+        let _ = std::fs::remove_file(&tmp_path);
+    }
+    published
+}
+
+/// Read and parse `{id}.json` from `dir`, where `{id}` is the
+/// sanitised form of `session_id`.
+///
+/// A missing file produces a dedicated "no persisted session" message
+/// so the caller can map it to a clean ACP error response; any other
+/// read failure and any JSON parse failure are reported with the
+/// offending path.
+pub fn load_from_dir(
+    dir: &std::path::Path,
+    session_id: &SessionId,
+) -> anyhow::Result<PersistedSession> {
+    let file_name = format!("{}.json", sanitize_id(session_id.0.as_ref()));
+    let path = dir.join(file_name);
+
+    let bytes = match std::fs::read(&path) {
+        Ok(bytes) => bytes,
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+            return Err(anyhow::anyhow!(
+                "no persisted session at {}",
+                path.display()
+            ));
+        }
+        Err(e) => return Err(anyhow::anyhow!("read {}: {e}", path.display())),
+    };
+
+    serde_json::from_slice(&bytes).map_err(|e| anyhow::anyhow!("parse {}: {e}", path.display()))
+}
+
+/// Enumerate the sessions persisted in the default sessions directory,
+/// optionally restricted to one working directory. Used by the
+/// `session/list` handler so a client (Zed) can find the session that
+/// belongs to the workspace it's reopening.
+///
+/// With `filter_cwd = None` every session on disk is returned; with
+/// `Some(path)` only those whose persisted `cwd` equals `path` exactly.
+///
+/// A file that fails to parse is skipped with a warning instead of
+/// failing the whole call — one corrupt session shouldn't make the
+/// resume picker unusable.
+pub fn list(filter_cwd: Option<&std::path::Path>) -> anyhow::Result<Vec<PersistedSession>> {
+    match sessions_dir() {
+        Some(dir) => list_in_dir(&dir, filter_cwd),
+        None => Err(anyhow::anyhow!(
+            "can't resolve XDG_DATA_HOME or HOME for session store"
+        )),
+    }
+}
+
+/// Same as [`list`] but against an explicit directory, mirroring
+/// [`save_to_dir`] / [`load_from_dir`] so tests can use a scratch dir.
+///
+/// A missing directory yields an empty list. Only entries with a
+/// `json` extension are considered; unreadable or unparseable files
+/// are logged and skipped. The result is ordered by `updated_at`,
+/// most recent first.
+pub fn list_in_dir(
+    dir: &std::path::Path,
+    filter_cwd: Option<&std::path::Path>,
+) -> anyhow::Result<Vec<PersistedSession>> {
+    let entries = match std::fs::read_dir(dir) {
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
+        Err(e) => return Err(anyhow::anyhow!("read_dir {}: {e}", dir.display())),
+        Ok(entries) => entries,
+    };
+
+    let mut sessions: Vec<PersistedSession> = Vec::new();
+    // `flatten` drops directory entries that could not be read.
+    for path in entries.flatten().map(|entry| entry.path()) {
+        let is_json = path.extension().and_then(|ext| ext.to_str()) == Some("json");
+        if !is_json {
+            continue;
+        }
+
+        let parsed = std::fs::read(&path).and_then(|raw| {
+            serde_json::from_slice::<PersistedSession>(&raw).map_err(std::io::Error::other)
+        });
+        match parsed {
+            Err(e) => {
+                tracing::warn!(
+                    path = %path.display(),
+                    error = %e,
+                    "store: skipping unparseable session file"
+                );
+            }
+            Ok(session) => match filter_cwd {
+                // Belongs to a different working directory: leave it out.
+                Some(want) if session.cwd != want => {}
+                _ => sessions.push(session),
+            },
+        }
+    }
+
+    // Descending `updated_at`; the sort is stable, so ties keep their
+    // directory-iteration order.
+    sessions.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));
+    Ok(sessions)
+}
+
+/// Current wall-clock time as whole seconds since the Unix epoch.
+/// Yields 0 when the system clock reports a time before the epoch
+/// (shouldn't happen, but the underlying read is fallible).
+pub fn now_secs() -> u64 {
+    match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
+        Ok(elapsed) => elapsed.as_secs(),
+        Err(_) => 0,
+    }
+}
+
+/// Root directory for plan-mode artefacts: the `plans` directory next
+/// to the sessions directory returned by [`sessions_dir`], so plans
+/// and conversation transcripts are siblings, not nested. `None`
+/// whenever [`sessions_dir`] is `None`.
+pub fn plans_root() -> Option<PathBuf> {
+    let sessions = sessions_dir()?;
+    let app_root = sessions.parent()?;
+    Some(app_root.join(PLANS_DIRNAME))
+}
+
+/// Per-project plan directory:
+/// `$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`.
+///
+/// The project id is derived from the session's cwd (see
+/// [`project_id_for`]), so every session opened against the same path
+/// shares one plan directory. Two different paths to the same
+/// checkout — e.g. `/home/foo/git/bar` and a symlinked
+/// `/srv/checkout/bar` — get different ids; that is an accepted
+/// won't-fix corner case.
+pub fn plan_dir_for(cwd: &std::path::Path) -> Option<PathBuf> {
+    let root = plans_root()?;
+    Some(root.join(project_id_for(cwd)))
+}
+
+/// Deterministic, human-readable project identifier. Format:
+/// `<basename>-<8-hex>` where the 8-hex suffix is FNV-1a of the
+/// full path. Basename keeps the path skim-readable when poking
+/// around `$XDG_DATA_HOME` by hand; the hash suffix disambiguates
+/// repos that share a final path component (e.g. multiple
+/// `/.../checkout/beat` checkouts).
+///
+/// The basename goes through [`sanitize_id`] — the same filename
+/// sanitiser used for session files — so both kinds of on-disk name
+/// follow a single rule. When `cwd` has no final component, or that
+/// component is not valid UTF-8, the basename part is `unknown`.
+///
+/// FNV-1a rather than `std::collections::hash::DefaultHasher`
+/// because the latter (SipHash) reseeds per process, so it'd give
+/// us a different project_id on every run.
+pub fn project_id_for(cwd: &std::path::Path) -> String {
+    let basename = cwd
+        .file_name()
+        .and_then(|s| s.to_str())
+        .unwrap_or("unknown");
+    let sanitised = sanitize_id(basename);
+    // Hash the whole path (lossily decoded), not just the basename,
+    // so same-named checkouts in different places stay distinct.
+    let hash = fnv1a_32(cwd.to_string_lossy().as_bytes());
+    format!("{sanitised}-{hash:08x}")
+}
+
+/// 32-bit FNV-1a hash of `bytes`: starting from the offset basis, each
+/// byte is XORed in and the state is multiplied by the FNV prime with
+/// wrapping arithmetic. Deterministic and dependency-free; used for
+/// project ids only — not cryptographic.
+fn fnv1a_32(bytes: &[u8]) -> u32 {
+    const OFFSET_BASIS: u32 = 0x811c_9dc5;
+    const PRIME: u32 = 0x0100_0193;
+    bytes.iter().fold(OFFSET_BASIS, |state, &byte| {
+        (state ^ u32::from(byte)).wrapping_mul(PRIME)
+    })
+}
+
+/// Format seconds-since-epoch as an ISO 8601 / RFC 3339 string
+/// (`YYYY-MM-DDTHH:MM:SSZ`) for `SessionInfo.updated_at`. Returns
+/// `None` for values outside the representable range, in which
+/// case the caller should omit the field.
+///
+/// That includes inputs above `i64::MAX`: the conversion to the
+/// signed timestamp chrono expects is checked, because a plain `as`
+/// cast would wrap such values to negative seconds and render a
+/// bogus pre-1970 date instead of reporting "not representable".
+pub fn unix_to_iso8601(secs: u64) -> Option<String> {
+    use chrono::TimeZone;
+    let secs = i64::try_from(secs).ok()?;
+    let dt = chrono::Utc.timestamp_opt(secs, 0).single()?;
+    Some(dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true))
+}
+
+/// Turn an arbitrary id into a safe file-name stem: ASCII letters,
+/// ASCII digits, `-` and `_` are kept, every other character becomes
+/// `_`. This keeps a mischievous (or just unconventional) session id
+/// from escaping the sessions directory.
+fn sanitize_id(id: &str) -> String {
+    let mut safe = String::with_capacity(id.len());
+    for ch in id.chars() {
+        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
+        safe.push(if keep { ch } else { '_' });
+    }
+    safe
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::provider::{MessageContent, Role};
+
+    /// Unique scratch dir per test invocation. We use this dir
+    /// directly with the `*_to_dir` / `*_from_dir` functions so
+    /// the tests never mutate `$XDG_DATA_HOME` — that env var
+    /// would race across the parallel test harness.
+    ///
+    /// The name combines the pid, the clock's sub-second nanos and a
+    /// process-wide counter. The counter is what guarantees
+    /// uniqueness: tests run in parallel inside one process, and on
+    /// a coarse clock two of them can read the same nanos value —
+    /// they would then share a directory and one test's cleanup
+    /// would delete the other's files.
+    fn unique_dir() -> PathBuf {
+        static NEXT_DIR: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
+
+        let base = std::env::var("CARGO_TARGET_TMPDIR")
+            .ok()
+            .map(PathBuf::from)
+            .unwrap_or_else(std::env::temp_dir);
+        let pid = std::process::id();
+        let nanos = SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map(|d| d.subsec_nanos())
+            .unwrap_or(0);
+        let seq = NEXT_DIR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let dir = base.join(format!("helexa-acp-store-test-{pid}-{nanos}-{seq}"));
+        std::fs::create_dir_all(&dir).expect("create test dir");
+        dir
+    }
+
+    /// A two-turn session fixture with fixed timestamps.
+    fn sample(id: &str) -> PersistedSession {
+        PersistedSession {
+            session_id: id.into(),
+            cwd: PathBuf::from("/home/me/proj"),
+            model_id: "Qwen/Qwen3.6-27B".into(),
+            mode_id: "default".into(),
+            history: vec![
+                Message {
+                    role: Role::User,
+                    content: MessageContent::Text {
+                        text: "hello".into(),
+                    },
+                },
+                Message {
+                    role: Role::Assistant,
+                    content: MessageContent::Text { text: "hi".into() },
+                },
+            ],
+            created_at: 1_700_000_000,
+            updated_at: 1_700_000_001,
+        }
+    }
+
+    #[test]
+    fn round_trip_save_then_load() {
+        let dir = unique_dir();
+        save_to_dir(&dir, &sample("hxa-1")).expect("save");
+        let loaded = load_from_dir(&dir, &SessionId::new("hxa-1")).expect("load");
+        assert_eq!(loaded.session_id, "hxa-1");
+        assert_eq!(loaded.cwd, PathBuf::from("/home/me/proj"));
+        assert_eq!(loaded.history.len(), 2);
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn load_missing_session_errors_with_not_found_message() {
+        let dir = unique_dir();
+        let err = load_from_dir(&dir, &SessionId::new("nope")).unwrap_err();
+        let msg = format!("{err}");
+        assert!(
+            msg.contains("no persisted session"),
+            "want NotFound, got: {msg}"
+        );
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn save_overwrites_existing_atomically() {
+        let dir = unique_dir();
+        save_to_dir(&dir, &sample("hxa-1")).expect("save");
+        let mut updated = sample("hxa-1");
+        updated.history.push(Message {
+            role: Role::User,
+            content: MessageContent::Text {
+                text: "third turn".into(),
+            },
+        });
+        updated.updated_at = 1_700_000_500;
+        save_to_dir(&dir, &updated).expect("re-save");
+        let loaded = load_from_dir(&dir, &SessionId::new("hxa-1")).expect("load");
+        assert_eq!(loaded.history.len(), 3);
+        assert_eq!(loaded.updated_at, 1_700_000_500);
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn save_then_load_preserves_tool_calls_and_results() {
+        use crate::provider::ToolCall;
+        let dir = unique_dir();
+        let mut session = sample("hxa-2");
+        session.history.push(Message {
+            role: Role::Assistant,
+            content: MessageContent::ToolCalls {
+                text: Some("calling".into()),
+                calls: vec![ToolCall {
+                    id: "call_0".into(),
+                    name: "read_file".into(),
+                    arguments: r#"{"path":"/etc/hostname"}"#.into(),
+                }],
+            },
+        });
+        session.history.push(Message {
+            role: Role::Tool,
+            content: MessageContent::ToolResult {
+                tool_call_id: "call_0".into(),
+                content: "host".into(),
+            },
+        });
+        save_to_dir(&dir, &session).expect("save");
+        let loaded = load_from_dir(&dir, &SessionId::new("hxa-2")).expect("load");
+        assert_eq!(loaded.history.len(), 4);
+        match &loaded.history[2].content {
+            MessageContent::ToolCalls { calls, .. } => {
+                assert_eq!(calls[0].name, "read_file");
+            }
+            other => panic!("expected ToolCalls, got {other:?}"),
+        }
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn list_filters_by_cwd_and_sorts_recent_first() {
+        let dir = unique_dir();
+        let mut a = sample("a");
+        a.cwd = PathBuf::from("/home/me/proj-x");
+        a.updated_at = 1_700_000_010;
+        let mut b = sample("b");
+        b.cwd = PathBuf::from("/home/me/proj-x");
+        b.updated_at = 1_700_000_020;
+        let mut c = sample("c");
+        c.cwd = PathBuf::from("/home/me/elsewhere");
+        c.updated_at = 1_700_000_030;
+        save_to_dir(&dir, &a).unwrap();
+        save_to_dir(&dir, &b).unwrap();
+        save_to_dir(&dir, &c).unwrap();
+
+        let proj_x = PathBuf::from("/home/me/proj-x");
+        let list = list_in_dir(&dir, Some(&proj_x)).unwrap();
+        let ids: Vec<&str> = list.iter().map(|s| s.session_id.as_str()).collect();
+        // Filtered to proj-x; b before a because b is more recent.
+        assert_eq!(ids, vec!["b", "a"]);
+
+        let all = list_in_dir(&dir, None).unwrap();
+        assert_eq!(all.len(), 3);
+        // Global list still sorted recent-first across all cwds.
+        assert_eq!(all[0].session_id, "c");
+
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn list_returns_empty_for_missing_dir() {
+        let parent = unique_dir();
+        let missing = parent.join("does-not-exist");
+        let list = list_in_dir(&missing, None).unwrap();
+        assert!(list.is_empty());
+        // `unique_dir` created `parent` on disk; don't leave it behind.
+        let _ = std::fs::remove_dir_all(&parent);
+    }
+
+    #[test]
+    fn list_skips_unparseable_files() {
+        let dir = unique_dir();
+        save_to_dir(&dir, &sample("good")).unwrap();
+        std::fs::write(dir.join("garbage.json"), b"{not valid json").unwrap();
+        let list = list_in_dir(&dir, None).unwrap();
+        // Garbage skipped; good survives.
+        assert_eq!(list.len(), 1);
+        assert_eq!(list[0].session_id, "good");
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+
+    #[test]
+    fn iso8601_formats_unix_seconds() {
+        // 2024-01-01T00:00:00Z is 1704067200 unix seconds.
+        assert_eq!(
+            unix_to_iso8601(1_704_067_200),
+            Some("2024-01-01T00:00:00Z".into())
+        );
+        assert_eq!(unix_to_iso8601(0), Some("1970-01-01T00:00:00Z".into()));
+    }
+
+    #[test]
+    fn sanitize_id_rejects_path_traversal() {
+        // `../../etc/passwd`: the six characters in front of `etc`
+        // (`.`, `.`, `/`, `.`, `.`, `/`) and the single `/` between
+        // `etc` and `passwd` are all disallowed, and each one is
+        // replaced by `_`. Nothing is removed, so the length stays
+        // the same.
+        assert_eq!(sanitize_id("../../etc/passwd"), "______etc_passwd");
+        assert_eq!(sanitize_id("ok-name_42"), "ok-name_42");
+    }
+}
--- a/crates/helexa-acp/src/tool_runner.rs
+++ b/crates/helexa-acp/src/tool_runner.rs
--- a/Show More
+++ b/Show More