feat(#47 #55 phase 2d): cortex load-aware routing across replicas

Stage 2 completes: when a model is loaded on more than one healthy neuron, the router picks the least-busy replica instead of always taking the first, and neuron backpressure propagates to the client intact. - NodeState.model_load: per-model admission load (in_flight + queue_depth), stashed by the poller from neuron's /health (#53/#2b). - router::resolve collects all loaded replicas and picks the one with the lowest in_flight+queue_depth (ties break by node name for determinism), replacing the previous first-match-wins. - Backpressure passthrough: the existing streaming proxy already forwards the upstream status + all headers verbatim, so a neuron 503/429 + Retry-After + #60 envelope reaches the client unmodified — now covered by a regression test so a future change can't silently unwrap it. Tests (tests/load_routing.rs): routes to the idle replica and follows the lighter load when it flips; ties break by name; a saturated neuron's 503 + Retry-After + envelope propagates through the gateway intact. All cortex-side (no CUDA); local fmt/clippy/test green. Retry-route-to-another-replica-on-backpressure (the issue's stretch goal) is deferred — least-busy spread + honest passthrough is the substantive win. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
feat(#47 #53 phase 2b): expose per-model admission load in GET /health
2026-06-17 20:45:50 +03:00 · 2026-06-17 20:13:07 +03:00 · 2026-06-17 20:03:07 +03:00 · 2026-06-17 19:35:04 +03:00 · 2026-06-17 19:29:51 +03:00 · 2026-06-17 19:07:10 +03:00
169 changed files with 27970 additions and 1124 deletions
--- a/.gitea/workflows/build-prerelease.yml
+++ b/.gitea/workflows/build-prerelease.yml
@@ -1,11 +1,20 @@
 name: build-prerelease

-# Manually-dispatched workflow that builds CUDA-flavoured neuron binaries
-# (and a single cortex binary), packages each as a Fedora RPM, signs
-# them, and publishes to the `unstable` channel at rpm.lair.cafe.
+# Builds CUDA-flavoured neuron binaries (and a single cortex binary),
+# packages each as a Fedora RPM, signs them, and publishes to the
+# `unstable` channel at rpm.lair.cafe.
 #
-# Trigger from the Gitea UI: Actions → build-prerelease → Run workflow.
-# Optionally provide a `ref` to build from a non-default branch.
+# Change-aware: the `prepare` job diffs HEAD against the git sha
+# embedded in the most recently *published* unstable RPM (per package)
+# and skips builds whose inputs didn't change. Docs-only commits build
+# nothing; gateway-only commits skip the 3 CUDA builds (and, via
+# deploy.yml's own check-update gate, the neuron restarts + model
+# cold-loads). Diffing against the published sha — not the previous
+# push — means a failed run can never cause a change to be missed.
+#
+# Lint (fmt+clippy) and test run here as parallel jobs and gate
+# `publish`; ci.yml no longer runs on pushes to main (see its trigger
+# comment), so the two workflows stop competing for the same runners.
 #
 # The published packages are versioned as e.g.
 #   helexa-neuron-blackwell-0.1.16-0.1.20260518T140530.gitabcdef0.fc43.x86_64
@@ -22,6 +31,7 @@ on:
  push:
    branches: [main]
  # Manual dispatch still available to build from a non-main ref.
+  # Dispatched runs skip change detection and build everything.
  workflow_dispatch:
    inputs:
      ref:
@@ -29,15 +39,15 @@ on:
        required: false
        default: ""

+# Coalesce same-ref pushes: a newer push cancels the older in-flight
+# run — the newest commit is the one we want on the fleet. The publish
+# job keeps its own `rpm-publish` group (cancel=false) so an in-flight
+# repo update is never interrupted. Runners are ephemeral (one VM per
+# job) so concurrent runs no longer race on a shared workspace; the
+# old shared `cortex-runner-pool` group with ci.yml is gone.
 concurrency:
-  # Share the group with ci.yml so the two workflows can't run
-  # concurrently on the same `rust` runner (act reuses the workspace
-  # cache and races destroy each other's build files mid-compile).
-  # cancel-in-progress=false → workflows queue; if a newer push lands,
-  # the older run is still picked up by ci.yml's own ref-keyed
-  # concurrency (same group, queued).
-  group: cortex-runner-pool-${{ github.ref }}
-  cancel-in-progress: false
+  group: build-prerelease-${{ github.ref }}
+  cancel-in-progress: true

 env:
  CARGO_INCREMENTAL: "0"
@@ -45,13 +55,18 @@ env:

 jobs:
  prepare:
-    name: Resolve version stamps
+    name: Resolve version stamps + change detection
+    timeout-minutes: 10
    runs-on: rust
    outputs:
      version: ${{ steps.info.outputs.version }}
      release: ${{ steps.info.outputs.release }}
      short_sha: ${{ steps.info.outputs.short_sha }}
      commit_timestamp: ${{ steps.info.outputs.commit_timestamp }}
+      build_cortex: ${{ steps.changes.outputs.build_cortex }}
+      build_neuron: ${{ steps.changes.outputs.build_neuron }}
+      build_bench: ${{ steps.changes.outputs.build_bench }}
+      check_rust: ${{ steps.changes.outputs.check_rust }}
    steps:
      - uses: actions/checkout@v4
        with:
@@ -78,19 +93,164 @@ jobs:
          echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
          echo "commit_timestamp=${COMMIT_TIMESTAMP}" >> "$GITHUB_OUTPUT"

+      - id: changes
+        run: |
+          set -ux
+          # Default: build everything. Detection only ever narrows
+          # this, and any failure along the way (manifest unreachable,
+          # unparsable, sha not in history after a force-push) leaves
+          # the full build in place. Manual dispatches always build
+          # everything — predictable when building odd refs.
+          BUILD_CORTEX=true
+          BUILD_NEURON=true
+          BUILD_BENCH=true
+          CHECK_RUST=true
+
+          if [ "${GITHUB_EVENT_NAME}" = "push" ]; then
+            MANIFEST_URL="https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json"
+            if curl -fsS --max-time 20 -o /tmp/packages.json "$MANIFEST_URL"; then
+              # Latest published sha per package, by buildTime.
+              base_for() {
+                python3 - "$1" <<'PY'
+          import json, re, sys
+          name = sys.argv[1]
+          try:
+              with open("/tmp/packages.json") as f:
+                  pkgs = json.load(f)["packages"]
+              cands = [p for p in pkgs if p.get("name") == name]
+              if cands:
+                  latest = max(cands, key=lambda p: p.get("buildTime", 0))
+                  m = re.search(r"git\.?([0-9a-f]{7,40})", latest.get("release", ""))
+                  if m:
+                      print(m.group(1))
+          except Exception:
+              pass
+          PY
+              }
+
+              # true if no usable base, else true iff the diff since
+              # the published sha touches the given path pattern.
+              decide() {
+                local base="$1" pattern="$2"
+                if [ -z "$base" ] \
+                   || ! git cat-file -e "${base}^{commit}" 2>/dev/null \
+                   || ! git merge-base --is-ancestor "$base" HEAD 2>/dev/null; then
+                  echo true; return
+                fi
+                if git diff --name-only "${base}..HEAD" | grep -qE "$pattern"; then
+                  echo true
+                else
+                  echo false
+                fi
+              }
+
+              # cortex-core is shared by all three binaries; Cargo.{toml,lock}
+              # affect all three; this workflow file affects all three.
+              NEURON_RE='^crates/neuron/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-neuron-prerelease\.spec$|^data/neuron|^neuron\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              CORTEX_RE='^crates/cortex-gateway/|^crates/cortex-cli/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/cortex-prerelease\.spec$|^data/cortex|^cortex\.example\.toml$|^models\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              BENCH_RE='^crates/helexa-bench/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-bench-prerelease\.spec$|^data/helexa-bench|^helexa-bench\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
+              # Any Rust change (incl. crates not packaged here, e.g.
+              # helexa-acp) still needs lint+test on main.
+              RUST_RE='\.rs$|^crates/|Cargo\.toml$|^Cargo\.lock$'
+
+              CORTEX_BASE=$(base_for cortex)
+              NEURON_BASE=$(base_for helexa-neuron-blackwell)
+              BENCH_BASE=$(base_for helexa-bench)
+              BUILD_CORTEX=$(decide "$CORTEX_BASE" "$CORTEX_RE")
+              BUILD_NEURON=$(decide "$NEURON_BASE" "$NEURON_RE")
+              BUILD_BENCH=$(decide "$BENCH_BASE" "$BENCH_RE")
+              if [ "$BUILD_CORTEX" = "true" ] || [ "$BUILD_NEURON" = "true" ] || [ "$BUILD_BENCH" = "true" ]; then
+                CHECK_RUST=true
+              else
+                CHECK_RUST=$(decide "$CORTEX_BASE" "$RUST_RE")
+              fi
+            fi
+          fi
+
+          echo "build_cortex=${BUILD_CORTEX}" >> "$GITHUB_OUTPUT"
+          echo "build_neuron=${BUILD_NEURON}" >> "$GITHUB_OUTPUT"
+          echo "build_bench=${BUILD_BENCH}" >> "$GITHUB_OUTPUT"
+          echo "check_rust=${CHECK_RUST}" >> "$GITHUB_OUTPUT"
+          echo "### change detection: build_cortex=${BUILD_CORTEX} build_neuron=${BUILD_NEURON} build_bench=${BUILD_BENCH} check_rust=${CHECK_RUST}"
+
+  # fmt + clippy + test moved here from ci.yml for main pushes so the
+  # two workflows stop queueing against each other (ci.yml's checks
+  # used to delay build-cortex by ~12 minutes on the shared runner
+  # pool). They run in parallel with the builds and gate `publish`,
+  # not the builds themselves — a clippy warning still can't reach the
+  # fleet, but it also doesn't serialize the pipeline.
+  lint:
+    name: Lint (fmt + clippy)
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.check_rust == 'true'
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+      - run: cargo fmt --check --all
+      # Failure-aware sccache escalation lives in the shared script: a
+      # signal death (rustc SIGSEGV / OOM-kill) keeps the cache and fails
+      # fast instead of triggering a slower uncached rebuild; only a real
+      # sccache fault drops the cache. See script/ci-cargo-escalate.sh.
+      - name: Clippy (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings
+
+  test:
+    name: Test
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.check_rust == 'true'
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Test (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo test --workspace
+
  build-cortex:
    name: Build cortex binary
+    timeout-minutes: 25
    needs: prepare
+    if: needs.prepare.outputs.build_cortex == 'true'
    # runner-rust image already provides rust/cargo/clippy/rustfmt via
    # dnf — no rustup install step needed.
    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.ref }}

-      - name: Build cortex (release)
-        run: cargo build --release -p cortex-cli
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Build cortex (release, sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo build --release -p cortex-cli

      - name: Stage binary
        run: |
@@ -104,9 +264,50 @@ jobs:
          path: artifacts/cortex
          retention-days: 1

+  build-bench:
+    name: Build helexa-bench binary
+    timeout-minutes: 25
+    needs: prepare
+    if: needs.prepare.outputs.build_bench == 'true'
+    # Pure-Rust, non-CUDA binary — same runner as cortex.
+    runs-on: rust
+    env:
+      RUSTC_WRAPPER: sccache
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - name: Build helexa-bench (release, sccache escalation)
+        run: |
+          # Stamp the SHA helexa-bench records as bench_sha against every
+          # run (option_env! in sweep.rs reads it at compile time).
+          export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
+          script/ci-cargo-escalate.sh cargo build --release -p helexa-bench
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/helexa-bench artifacts/helexa-bench
+          ./artifacts/helexa-bench --version || true
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: bench-fc43
+          path: artifacts/helexa-bench
+          retention-days: 1
+
  build-neuron:
    name: Build neuron-${{ matrix.flavour }}
+    timeout-minutes: 35
    needs: prepare
+    if: needs.prepare.outputs.build_neuron == 'true'
    strategy:
      fail-fast: false
      matrix:
@@ -117,34 +318,53 @@ jobs:
            cuda_home: /usr/local/cuda-13.0
            build_jobs: 8
            nvcc_threads: 4
-            cargo_features: "cuda cudnn flash-attn"
+            cargo_features: "cuda cudnn"
          - flavour: ada
            compute_cap: "89"
            runner: cuda-13.0
            cuda_home: /usr/local/cuda-13.0
            build_jobs: 8
            nvcc_threads: 4
-            cargo_features: "cuda cudnn flash-attn"
+            cargo_features: "cuda cudnn"
          - flavour: blackwell
            compute_cap: "120"
            runner: cuda-13.0
            cuda_home: /usr/local/cuda-13.0
            build_jobs: 8
            nvcc_threads: 4
-            cargo_features: "cuda cudnn flash-attn"
+            cargo_features: "cuda cudnn"
    runs-on: ${{ matrix.runner }}
+    env:
+      SCCACHE_BUCKET: sccache
+      SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+      SCCACHE_REGION: auto
+      SCCACHE_S3_USE_SSL: "false"
+      AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.ref }}

+      # sccache handling + failure classification lives in
+      # script/ci-cargo-escalate.sh: it probes for sccache (the CUDA
+      # image may not ship it — a missing binary degrades to an uncached
+      # build rather than failing at `sccache rustc -vV`), and a rustc
+      # SIGSEGV / OOM-kill keeps the cache and fails fast instead of
+      # escalating to a slower uncached rebuild. The cache covers the
+      # ~600-crate host-side dep tree (the bulk of the 10-14 min build),
+      # shared across all three flavours, so even one run seeds the next.
      - name: Build neuron with CUDA (${{ matrix.flavour }})
        run: |
-          set -eux
          export PATH="${{ matrix.cuda_home }}/bin:${PATH}"
          export LD_LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LD_LIBRARY_PATH:-}"
          export LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LIBRARY_PATH:-}"
-          cargo build --release -p neuron --features "${{ matrix.cargo_features }}"
+          # Pin the build SHA neuron reports from GET /version. The git
+          # fallback in build.rs would also work on a full checkout, but
+          # injecting the exact checked-out commit is unambiguous under
+          # shallow/detached states and makes the artifact self-describing.
+          export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
+          script/ci-cargo-escalate.sh cargo build --release -p neuron --features "${{ matrix.cargo_features }}"
        env:
          CUDA_COMPUTE_CAP: ${{ matrix.compute_cap }}
          CARGO_BUILD_JOBS: ${{ matrix.build_jobs }}
@@ -164,6 +384,7 @@ jobs:

  package-cortex:
    name: Package cortex RPM
+    timeout-minutes: 20
    needs: [prepare, build-cortex]
    runs-on: rpm
    steps:
@@ -200,8 +421,47 @@ jobs:
          path: ~/rpmbuild/RPMS/x86_64/*.rpm
          retention-days: 7

+  package-bench:
+    name: Package helexa-bench RPM
+    timeout-minutes: 20
+    needs: [prepare, build-bench]
+    runs-on: rpm
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: bench-fc43
+          path: artifacts/
+
+      - name: Build RPM
+        run: |
+          set -eux
+          rm -f ~/.rpmmacros
+          rpmdev-setuptree
+          cp artifacts/helexa-bench ~/rpmbuild/SOURCES/
+          cp data/helexa-bench.service ~/rpmbuild/SOURCES/
+          cp data/helexa-bench-sysusers.conf ~/rpmbuild/SOURCES/
+          cp data/helexa-bench-firewalld.xml ~/rpmbuild/SOURCES/
+          cp helexa-bench.example.toml ~/rpmbuild/SOURCES/
+          cp LICENSE ~/rpmbuild/SOURCES/
+          rpmbuild -bb rpm/helexa-bench-prerelease.spec \
+            --define "bench_version ${{ needs.prepare.outputs.version }}" \
+            --define "bench_prerelease ${{ needs.prepare.outputs.release }}" \
+            --undefine dist \
+            --define "dist .fc43"
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: rpm-bench-fc43
+          path: ~/rpmbuild/RPMS/x86_64/*.rpm
+          retention-days: 7
+
  package-neuron:
    name: Package helexa-neuron-${{ matrix.flavour }} RPM
+    timeout-minutes: 20
    needs: [prepare, build-neuron]
    runs-on: rpm
    strategy:
@@ -247,7 +507,22 @@ jobs:

  publish:
    name: Publish to rpm.lair.cafe (unstable)
-    needs: [package-cortex, package-neuron]
+    timeout-minutes: 25
+    needs: [lint, test, package-cortex, package-neuron, package-bench]
+    # Runs when at least one package was built and nothing failed.
+    # lint/test may be skipped (docs-only refs never get here because
+    # no packages build), but a real failure in any of these jobs
+    # blocks the fleet from receiving the RPMs.
+    if: >-
+      ${{
+        !cancelled()
+        && (needs.lint.result == 'success' || needs.lint.result == 'skipped')
+        && (needs.test.result == 'success' || needs.test.result == 'skipped')
+        && (needs.package-cortex.result == 'success' || needs.package-neuron.result == 'success' || needs.package-bench.result == 'success')
+        && needs.package-cortex.result != 'failure'
+        && needs.package-neuron.result != 'failure'
+        && needs.package-bench.result != 'failure'
+      }}
    runs-on: rpm
    concurrency:
      group: rpm-publish
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -1,21 +1,25 @@
 name: CI

+# Pushes to main are deliberately excluded: build-prerelease.yml runs
+# its own lint/test jobs there (gating publish), and running both
+# workflows on the same push made them queue against each other on the
+# same runner labels — ~12 minutes of added latency per deploy. Feature
+# branches, PRs to main, and release tags keep the full gate here.
 on:
  push:
-    branches: ["**"]
+    branches-ignore: [main]
    tags: ["v*"]
  pull_request:
    branches: [main]

-# Share a concurrency group with build-prerelease.yml so the two
-# workflows don't race on the same `rust` runner workspace (act's
-# /root/.cache/act/<hash>/hostexecutor/ is shared across concurrent
-# jobs and one job's checkout step nukes another's in-flight build
-# files). cancel-in-progress=false → they queue; same-ref pushes
-# coalesce per workflow via cancel-in-progress on each.
+# Coalesce same-ref pushes; a newer push supersedes the in-flight run.
+# (The old shared `cortex-runner-pool` group with build-prerelease.yml
+# is gone — the workflows no longer trigger on the same refs, and
+# ephemeral one-VM-per-job runners removed the shared-workspace race
+# that group existed to serialize.)
 concurrency:
-  group: cortex-runner-pool-${{ github.ref }}
-  cancel-in-progress: false
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true

 env:
  CARGO_INCREMENTAL: "0"
@@ -37,6 +41,7 @@ env:
 jobs:
  fmt:
    name: Format
+    timeout-minutes: 15
    runs-on: rust
    steps:
      - uses: actions/checkout@v4
@@ -44,53 +49,26 @@ jobs:

  clippy:
    name: Clippy
+    timeout-minutes: 25
    runs-on: rust
    steps:
      - uses: actions/checkout@v4
-      # sccache occasionally fails with spurious race-condition errors;
-      # retrying the same invocation succeeds without code changes.
-      # Allow up to 3 attempts before declaring real failure.
-      - name: Clippy (with retry)
-        run: |
-          for attempt in 1 2 3; do
-            echo "::group::clippy attempt ${attempt}"
-            if cargo clippy --workspace -- -D warnings; then
-              echo "::endgroup::"
-              exit 0
-            fi
-            echo "::endgroup::"
-            echo "clippy failed on attempt ${attempt}"
-            if [ "${attempt}" -lt 3 ]; then
-              sleep 5
-            fi
-          done
-          echo "clippy failed after 3 attempts"
-          exit 1
-      - run: sccache --show-stats
+      # Failure-aware sccache escalation lives in the shared script (kept
+      # in sync with build-prerelease.yml): a signal death (rustc SIGSEGV
+      # / OOM-kill) keeps the cache and fails fast instead of an uncached
+      # rebuild; only a real sccache fault drops the cache.
+      - name: Clippy (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings

  test:
    name: Test
+    timeout-minutes: 25
    runs-on: rust
    steps:
      - uses: actions/checkout@v4
-      # See the clippy job for why this is retried.
-      - name: Test (with retry)
-        run: |
-          for attempt in 1 2 3; do
-            echo "::group::test attempt ${attempt}"
-            if cargo test --workspace; then
-              echo "::endgroup::"
-              exit 0
-            fi
-            echo "::endgroup::"
-            echo "test failed on attempt ${attempt}"
-            if [ "${attempt}" -lt 3 ]; then
-              sleep 5
-            fi
-          done
-          echo "test failed after 3 attempts"
-          exit 1
-      - run: sccache --show-stats
+      # See script/ci-cargo-escalate.sh for the escalation rationale.
+      - name: Test (sccache escalation)
+        run: script/ci-cargo-escalate.sh cargo test --workspace

  # Type-check the CUDA-only code path. Borrow-check-only — we
  # never run the tests here (the runner has no GPU). This catches
@@ -104,54 +82,44 @@ jobs:
  # see commit history).
  cuda-check:
    name: CUDA type-check
+    timeout-minutes: 35
    runs-on: cuda-13.0
-    # The workflow-level env sets `RUSTC_WRAPPER: sccache` for the
-    # `rust` runner (where fmt/clippy/test live and sccache is
-    # installed). The `cuda-13.0` runner doesn't have sccache on
-    # PATH, so inheriting the wrapper makes cargo bail with
-    # `could not execute process `sccache rustc -vV` (never executed)`
-    # before borrow-check even starts. Clear it locally. Also clear
-    # SCCACHE_* so cargo doesn't try to contact the cache (the
-    # remote auth headers come from secrets that aren't present on
-    # this runner either). Lose the cache, keep the gate.
+    # The workflow-level env sets `RUSTC_WRAPPER: sccache`
+    # unconditionally, which hard-fails cargo if the CUDA image
+    # doesn't ship sccache. Clear it at job level; the shared
+    # script/ci-cargo-escalate.sh opts back in only after probing
+    # for the binary. SCCACHE_*/AWS creds stay set — harmless when
+    # the wrapper is off, required when it's on.
    env:
      RUSTC_WRAPPER: ""
-      SCCACHE_BUCKET: ""
-      SCCACHE_ENDPOINT: ""
-      SCCACHE_REGION: ""
-      SCCACHE_S3_USE_SSL: ""
-      AWS_ACCESS_KEY_ID: ""
-      AWS_SECRET_ACCESS_KEY: ""
+      # candle-kernels' build script falls back to `nvidia-smi` for
+      # compute-cap detection when this is unset — and the GPU-less
+      # builder image doesn't ship nvidia-smi. Any valid cap works for
+      # a borrow-check; the real per-flavour caps live in
+      # build-prerelease.yml's matrix.
+      CUDA_COMPUTE_CAP: "86"
    steps:
      - uses: actions/checkout@v4
-      - name: cargo check --features cuda (with retry)
+      # sccache probing + failure classification lives in the shared
+      # script (see build-prerelease.yml's neuron build for the same
+      # pattern). It probes for sccache and, on a rustc SIGSEGV / OOM,
+      # keeps the cache and fails fast rather than rebuilding uncached.
+      - name: cargo check --features cuda (sccache escalation)
        run: |
          # act launches the step shell without /etc/profile, so the
          # gitea_runner user's inherited PATH lacks /usr/local/cuda-13.0/bin.
-          # cudarc's build.rs:157 shells out to `nvcc --version` (because
-          # the neuron crate enables cuda-version-from-build-system) and
-          # panics with ENOENT if nvcc isn't resolvable. build-prerelease.yml
-          # does the same export — keep them in sync.
+          # cudarc's build.rs shells out to `nvcc --version` (the neuron
+          # crate enables cuda-version-from-build-system) and panics with
+          # ENOENT if nvcc isn't resolvable — keep this export in sync
+          # with build-prerelease.yml.
          export PATH="/usr/local/cuda-13.0/bin:${PATH}"
          export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
          export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
-          for attempt in 1 2 3; do
-            echo "::group::cuda-check attempt ${attempt}"
-            if cargo check -p neuron --features cuda --all-targets; then
-              echo "::endgroup::"
-              exit 0
-            fi
-            echo "::endgroup::"
-            echo "cuda-check failed on attempt ${attempt}"
-            if [ "${attempt}" -lt 3 ]; then
-              sleep 5
-            fi
-          done
-          echo "cuda-check failed after 3 attempts"
-          exit 1
+          script/ci-cargo-escalate.sh cargo check -p neuron --features cuda --all-targets

  srpm-cortex:
    name: Build cortex SRPM
+    timeout-minutes: 25
    runs-on: rpm
    needs: [fmt, clippy, test, cuda-check]
    if: startsWith(github.ref, 'refs/tags/v')
@@ -212,6 +180,7 @@ jobs:

  srpm-neuron:
    name: Build neuron SRPM
+    timeout-minutes: 25
    runs-on: rpm
    needs: [fmt, clippy, test, cuda-check]
    if: startsWith(github.ref, 'refs/tags/v')
@@ -272,6 +241,7 @@ jobs:

  copr-cortex:
    name: Publish cortex to COPR
+    timeout-minutes: 60
    runs-on: fedora-43
    needs: srpm-cortex
    steps:
@@ -289,6 +259,7 @@ jobs:

  copr-neuron:
    name: Publish neuron to COPR
+    timeout-minutes: 60
    runs-on: fedora-43
    needs: srpm-neuron
    steps:
@@ -306,6 +277,7 @@ jobs:

  bump-version:
    name: Bump version in source
+    timeout-minutes: 15
    runs-on: rust
    needs: [copr-cortex, copr-neuron]
    steps:
@@ -349,6 +321,6 @@ jobs:
            echo "Nothing to commit for ${VERSION}"
          else
            git commit -m "chore: bump version to ${VERSION}"
-            git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
+            git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/${{ github.repository }}.git"
            git push origin HEAD:main
          fi
--- a/.gitea/workflows/deploy-dev.yml
+++ b/.gitea/workflows/deploy-dev.yml
@@ -0,0 +1,136 @@
+name: deploy-dev
+
+# Fast-path iteration deploy for a SINGLE neuron host: build one CUDA
+# flavour, copy the raw binary to the host, restart neuron.service.
+# Skips the other two flavours, all RPM packaging, signing, repo
+# publish, and dnf — push-to-testable drops from ~20 min to roughly
+# one CUDA build plus a service restart.
+#
+# This is a DEV convenience, not a release path:
+#   - the binary lands at /usr/bin/neuron *outside* RPM ownership;
+#     the next regular deploy.yml run that finds a newer published RPM
+#     reconciles the host back to the packaged binary (dnf upgrades it;
+#     deploy.yml leaves a host alone when nothing newer exists). `rpm -V
+#     helexa-neuron-<flavour>` flagging a modified /usr/bin/neuron in
+#     the interim is expected.
+#   - nothing is published; other hosts are untouched.
+#   - requires the `install` sudoers rule from
+#     asset/sudoers.d/neuron-host.conf (re-run script/infra-setup.sh
+#     after updating it).
+#
+# Trigger from the Gitea UI: Actions → deploy-dev → Run workflow,
+# pick the target host. Defaults to the ref you dispatch from, so it
+# works from feature branches without touching main.
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: "neuron host to deploy to"
+        required: true
+        type: choice
+        options: [beast, benjy, quadbrat]
+        default: beast
+
+# One dev deploy per host at a time; a newer dispatch for the same host wins.
+concurrency:
+  group: deploy-dev-${{ inputs.target }}
+  cancel-in-progress: true
+
+env:
+  CARGO_INCREMENTAL: "0"
+  CARGO_TERM_COLOR: "always"
+
+jobs:
+  build:
+    name: Build neuron (${{ inputs.target }})
+    runs-on: cuda-13.0
+    outputs:
+      flavour: ${{ steps.map.outputs.flavour }}
+    steps:
+      - uses: actions/checkout@v4
+
+      # host → flavour → compute cap. Keep in sync with the
+      # build-neuron matrix in build-prerelease.yml and the
+      # deploy-neurons matrix in deploy.yml.
+      - id: map
+        run: |
+          case "${{ inputs.target }}" in
+            beast)    flavour=blackwell cap=120 ;;
+            benjy)    flavour=ada       cap=89  ;;
+            quadbrat) flavour=ampere    cap=86  ;;
+            *) echo "unknown target ${{ inputs.target }}"; exit 1 ;;
+          esac
+          echo "flavour=${flavour}" >> "$GITHUB_OUTPUT"
+          echo "cap=${cap}" >> "$GITHUB_OUTPUT"
+
+      - name: Build neuron with CUDA
+        run: |
+          set -eux
+          export PATH="/usr/local/cuda-13.0/bin:${PATH}"
+          export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
+          export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
+          cargo build --release -p neuron --features "cuda cudnn"
+        env:
+          CUDA_COMPUTE_CAP: ${{ steps.map.outputs.cap }}
+          CARGO_BUILD_JOBS: "8"
+          NVCC_THREADS: "4"
+
+      - name: Stage binary
+        run: |
+          mkdir --parents artifacts
+          cp target/release/neuron artifacts/neuron-dev
+          file artifacts/neuron-dev
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: neuron-dev-${{ inputs.target }}
+          path: artifacts/neuron-dev
+          retention-days: 1
+
+  deploy:
+    name: Deploy to ${{ inputs.target }}
+    needs: build
+    runs-on: fedora-43
+    env:
+      DEPLOY_KEY: |
+        ${{ secrets.RSYNC_SSH_KEY }}
+      TARGET_HOST: ${{ inputs.target }}.hanzalova.internal
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              "gitea_ci@${TARGET_HOST}" 'hostname -f'
+
+      - uses: actions/download-artifact@v3
+        with:
+          name: neuron-dev-${{ inputs.target }}
+          path: artifacts/
+
+      - name: Copy binary to host
+        run: |
+          scp artifacts/neuron-dev "gitea_ci@${TARGET_HOST}:/var/lib/gitea_ci/neuron-dev"
+
+      - name: Install binary and restart neuron.service
+        run: |
+          ssh "gitea_ci@${TARGET_HOST}" '
+            set -eu
+            if systemctl is-active --quiet neuron.service; then
+              sudo /usr/bin/systemctl stop neuron.service
+            fi
+            # Exact command form required by the sudoers rule in
+            # asset/sudoers.d/neuron-host.conf — change both together.
+            sudo /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
+            # enable --now so a dev deploy also leaves the unit enabled
+            # for boot, consistent with deploy.yml.
+            sudo /usr/bin/systemctl enable --now neuron.service
+            rm -f /var/lib/gitea_ci/neuron-dev'
+
+      - name: Capture neuron.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh "gitea_ci@${TARGET_HOST}" \
+              'journalctl --unit neuron.service -I --no-pager'
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -0,0 +1,448 @@
+name: deploy
+
+# Roll the freshly-published unstable RPMs onto the helexa fleet:
+# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
+# helexa-bench on bob (the bench host), plus the bench UI static build.
+#
+# Triggered automatically after `build-prerelease` succeeds (by which
+# point the new RPMs are live on rpm.lair.cafe/unstable), and also
+# re-runnable manually from the Gitea UI.
+#
+# Each host self-gates: if the installed package already matches the
+# latest one in the publish manifest, the service is left alone — no
+# stop, no restart, no model cold-load. Combined with build-prerelease's
+# change detection this means a docs- or gateway-only push never restarts
+# the neurons (a restart costs ~5 min of 27B cold-load, see issue #1).
+#
+# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
+# sudoers drop-in) lives in script/infra-setup.sh — run that once per
+# host before this workflow can succeed.
+
+on:
+  workflow_run:
+    workflows: [build-prerelease]
+    types: [completed]
+  workflow_dispatch:
+
+# Serialize deploys. Overlapping runs would race on dnf metadata
+# refresh and service-restart timing; queueing keeps the fleet
+# predictable. Don't cancel an in-flight deploy — a half-applied dnf
+# transaction is worse than a slightly stale deploy.
+concurrency:
+  group: deploy
+  cancel-in-progress: false
+
+env:
+  DEPLOY_KEY: |
+    ${{ secrets.RSYNC_SSH_KEY }}
+
+jobs:
+  deploy-cortex:
+    runs-on: fedora-43
+    # Two trigger paths: manual dispatch always runs; workflow_run
+    # only runs if the upstream `build-prerelease` actually succeeded.
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@hanzalova.internal 'hostname -f'
+
+      # Gating compares `rpm -q` against the packages.json manifest the
+      # publish job maintains — NOT unprivileged `dnf check-update`,
+      # which proved unreliable as the gitea_ci user (hung on metadata
+      # locks on one host, silently reported "no updates" on others).
+      # An unreadable/unparsable manifest fails open: deploy proceeds.
+      - name: Deploy cortex (skips when already current)
+        run: |
+          ssh gitea_ci@hanzalova.internal 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=cortex
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            echo "${pkg}-${installed} already current — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} — deploying"
+          if systemctl is-active --quiet cortex.service; then
+            sudo /usr/bin/systemctl stop cortex.service
+          fi
+          if rpm -q "${pkg}" >/dev/null 2>&1; then
+            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+          else
+            sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
+          fi
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # fleet self-heals after a host reboot.
+          sudo /usr/bin/systemctl enable --now cortex.service
+          DEPLOY
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture cortex.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@hanzalova.internal \
+              'journalctl --unit cortex.service -I --no-pager'
+
+  deploy-neurons:
+    needs: [deploy-cortex]
+    runs-on: fedora-43
+    strategy:
+      # One neuron failing must not cancel the others. Cortex is up
+      # already; a partial neuron deploy is strictly better than
+      # rolling back to zero.
+      fail-fast: false
+      matrix:
+        include:
+          # load_timeout: how long to wait for default_models to finish
+          # loading after a restart. beast cold-loads Qwen3.6-27B Q6K
+          # TP=2 (~5-6 min typical, see #1); benjy/quadbrat load small
+          # single-GPU models in well under a minute.
+          #
+          # max_prompt_tokens: per-model context cap, written to the
+          # neuron.service.d/model.conf drop-in (NEURON_MAX_PROMPT_TOKENS).
+          # A change here restarts the neuron even with no new RPM. Values
+          # are VRAM-safe ceilings derived per model — see
+          # doc/context-limits.md. beast (Qwen3.6-27B, hybrid linear, 2x
+          # 32GB) has ample KV headroom; benjy (Qwen3-8B dense, ~6GB free)
+          # is VRAM-bound and stays at the default; quadbrat (Qwen3-1.7B)
+          # likewise conservative.
+          - host: beast.hanzalova.internal
+            flavour: blackwell
+            load_timeout: 900
+            max_prompt_tokens: 131072
+          - host: benjy.hanzalova.internal
+            flavour: ada
+            load_timeout: 300
+            max_prompt_tokens: 16384
+          - host: quadbrat.hanzalova.internal
+            flavour: ampere
+            load_timeout: 300
+            max_prompt_tokens: 16384
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@${{ matrix.host }} 'hostname -f'
+
+      # See deploy-cortex for why gating uses the publish manifest and
+      # not unprivileged `dnf check-update`.
+      - name: Deploy helexa-neuron-${{ matrix.flavour }} (skips when already current)
+        run: |
+          ssh gitea_ci@${{ matrix.host }} 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=helexa-neuron-${{ matrix.flavour }}
+          max_prompt_tokens="${{ matrix.max_prompt_tokens }}"
+
+          # ── Desired per-model systemd drop-in ─────────────────────────
+          # model.conf carries NEURON_MAX_PROMPT_TOKENS so the context cap
+          # is deterministic per host and rolled out (with a restart) by
+          # this workflow, not hand-edited. It sorts after local.conf, so a
+          # deploy-managed value wins over any manual local override of the
+          # same variable. See doc/context-limits.md.
+          conf=/etc/systemd/system/neuron.service.d/model.conf
+          config_changed=0
+          if [ -n "${max_prompt_tokens}" ]; then
+            desired=$(printf '%s\n%s\n%s\n%s' \
+              "# Managed by .gitea/workflows/deploy.yml - do not edit by hand." \
+              "# Per-model context cap; see doc/context-limits.md." \
+              "[Service]" \
+              "Environment=NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}")
+            [ "${desired}" = "$(cat "${conf}" 2>/dev/null || true)" ] || config_changed=1
+          fi
+
+          # ── Package version gate (manifest rationale: see deploy-cortex) ──
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          pkg_changed=1
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            pkg_changed=0
+          fi
+
+          # Skip only when BOTH the package and the drop-in are unchanged —
+          # a context-cap change must restart the neuron even with no new RPM.
+          if [ "${pkg_changed}" -eq 0 ] && [ "${config_changed}" -eq 0 ]; then
+            echo "${pkg}-${installed} current; NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens:-<unset>} unchanged — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} pkg_changed=${pkg_changed} config_changed=${config_changed} — deploying"
+
+          # Write the drop-in (staged in gitea_ci's dir, installed root-owned).
+          if [ "${config_changed}" -eq 1 ]; then
+            printf '%s\n' "${desired}" > /var/lib/gitea_ci/model.conf
+            sudo /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf "${conf}"
+            rm -f /var/lib/gitea_ci/model.conf
+            echo "applied ${conf}: NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}"
+          fi
+
+          if systemctl is-active --quiet neuron.service; then
+            sudo /usr/bin/systemctl stop neuron.service
+          fi
+          if [ "${pkg_changed}" -eq 1 ]; then
+            if rpm -q "${pkg}" >/dev/null 2>&1; then
+              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
+            else
+              sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
+            fi
+          fi
+          # daemon-reload picks up both a new unit (dnf) and the drop-in.
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # fleet self-heals after a host reboot.
+          sudo /usr/bin/systemctl enable --now neuron.service
+
+          # ── Post-deploy validation ────────────────────────────────
+          # A deploy only goes green if the neuron (a) finishes loading
+          # its default models and (b) answers a trivial prompt like an
+          # LLM should. Catches the class of bug where the binary
+          # starts fine but model load or inference is broken — which
+          # previously surfaced only when a human noticed. The wait
+          # polls /health activation (the structured source of the
+          # "loaded default model" journal line, plus per-model failure
+          # detail); the journal-capture step below still runs for
+          # forensics either way.
+          load_timeout=${{ matrix.load_timeout }}
+          echo "waiting for default models (timeout ${load_timeout}s)"
+          deadline=$(( $(date +%s) + load_timeout ))
+          health=""
+          while :; do
+            health=$(curl -fsS --max-time 5 http://localhost:13131/health 2>/dev/null || true)
+            state=$(printf %s "${health}" | python3 -c '
+          import json, sys
+          try:
+              print(json.load(sys.stdin).get("activation", {}).get("state", ""))
+          except Exception:
+              print("")
+          ')
+            if [ "${state}" = "ready" ]; then
+              break
+            fi
+            if [ "$(date +%s)" -ge "${deadline}" ]; then
+              echo "FAIL: activation not ready within ${load_timeout}s (last state: ${state:-unreachable})"
+              exit 1
+            fi
+            sleep 10
+          done
+
+          model=$(printf %s "${health}" | python3 -c '
+          import json, sys
+          a = json.load(sys.stdin).get("activation", {})
+          failed = a.get("failed", [])
+          if failed:
+              for f in failed:
+                  msg = "FAILED " + str(f.get("model_id")) + ": " + str(f.get("error", ""))[:400]
+                  sys.stderr.write(msg + chr(10))
+              sys.exit(1)
+          completed = a.get("completed", [])
+          print(completed[0] if completed else "")
+          ')
+          if [ -z "${model}" ]; then
+            echo "no default models configured — skipping LLM probe"
+            exit 0
+          fi
+
+          echo "LLM probe against ${model}"
+          probe_body=$(printf '{"model":"%s","messages":[{"role":"user","content":"Reply with exactly one word: pineapple"}],"max_tokens":512,"temperature":0}' "${model}")
+          resp=$(curl -fsS --max-time 180 -H "content-type: application/json" \
+            -d "${probe_body}" http://localhost:13131/v1/chat/completions) || {
+            echo "FAIL: probe request errored"
+            exit 1
+          }
+          if printf %s "${resp}" | grep -qi pineapple; then
+            echo "LLM probe passed"
+          else
+            echo "FAIL: probe response missing expected token"
+            printf %s "${resp}" | head -c 2000
+            echo
+            exit 1
+          fi
+          DEPLOY
+
+      - name: Ensure firewalld allows helexa-neuron
+        run: |
+          ssh gitea_ci@${{ matrix.host }} '
+            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
+              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
+              sudo /usr/bin/firewall-cmd --reload
+            fi'
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture neuron.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@${{ matrix.host }} \
+              'journalctl --unit neuron.service -I --no-pager'
+
+  # helexa-bench is a separate package on a separate host (bob), and it
+  # only consumes the fleet's HTTP APIs — it has no deploy-ordering
+  # dependency on cortex or the neurons (the sweep loop is version-aware
+  # and picks up whatever each neuron reports, whenever that changes).
+  # So it runs alongside the cortex→neurons chain rather than after it.
+  deploy-bench:
+    runs-on: fedora-43
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@bob.hanzalova.internal 'hostname -f'
+
+      # See deploy-cortex for why gating uses the publish manifest and
+      # not unprivileged `dnf check-update`.
+      - name: Deploy helexa-bench (skips when already current)
+        run: |
+          ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
+          set -eu
+          pkg=helexa-bench
+          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
+          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
+            | python3 -c '
+          import json, sys
+          name = sys.argv[1]
+          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
+          if cands:
+              p = max(cands, key=lambda p: p.get("buildTime", 0))
+              print(p["version"] + "-" + p["release"])
+          ' "${pkg}" 2>/dev/null || true)
+          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
+            echo "${pkg}-${installed} already current — leaving service untouched"
+            exit 0
+          fi
+          echo "installed=${installed} published=${latest:-unknown} — deploying"
+          if systemctl is-active --quiet helexa-bench.service; then
+            sudo /usr/bin/systemctl stop helexa-bench.service
+          fi
+          if rpm -q "${pkg}" >/dev/null 2>&1; then
+            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
+          else
+            sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
+          fi
+          sudo /usr/bin/systemctl daemon-reload
+          # enable --now: start the service AND enable it for boot so the
+          # bench resumes collecting after a host reboot.
+          sudo /usr/bin/systemctl enable --now helexa-bench.service
+
+          # ── Post-deploy validation ────────────────────────────────
+          # The bench serves a read-only API on :13132 alongside the
+          # outbound sweep loop. Probe the API over localhost (bypasses
+          # firewalld) — catches a crash-on-start or a bad bind. Bail
+          # early if the unit drops out of active (Restart backoff).
+          echo "waiting for bench API on :13132"
+          deadline=$(( $(date +%s) + 30 ))
+          while :; do
+            if curl -fsS --max-time 5 http://localhost:13132/api/health >/dev/null 2>&1; then
+              echo "bench API healthy"
+              break
+            fi
+            if ! systemctl is-active --quiet helexa-bench.service; then
+              echo "FAIL: helexa-bench.service is not active"
+              systemctl --no-pager status helexa-bench.service | head -20 || true
+              exit 1
+            fi
+            if [ "$(date +%s)" -ge "${deadline}" ]; then
+              echo "FAIL: bench API not healthy within 30s"
+              exit 1
+            fi
+            sleep 3
+          done
+          DEPLOY
+
+      - name: Ensure firewalld allows helexa-bench
+        run: |
+          ssh gitea_ci@bob.hanzalova.internal '
+            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-bench --quiet 2>/dev/null; then
+              sudo /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
+              sudo /usr/bin/firewall-cmd --reload
+            fi'
+
+      # Wait for the service to either come up or wedge, then capture
+      # the latest-invocation journal. Runs even on prior failure so a
+      # failed start step still leaves a usable record in the deploy log.
+      - name: Capture helexa-bench.service startup journal
+        if: always()
+        run: |
+          sleep 10
+          ssh gitea_ci@bob.hanzalova.internal \
+              'journalctl --unit helexa-bench.service -I --no-pager'
+
+  # Build the bench UI and publish it to the public nginx vhost on the
+  # gateway (https://bench.helexa.ai). The vhost + Let's Encrypt cert are
+  # one-time host setup (script/infra-setup.sh); this job just refreshes
+  # the static assets. nginx reverse-proxies /api to the bob API, so the
+  # SPA is built same-origin (no VITE_API_BASE). Independent of the other
+  # deploy jobs.
+  deploy-bench-ui:
+    runs-on: fedora-43
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Build UI
+        run: |
+          cd bench
+          npm ci
+          npm run build
+
+      - name: SSH init
+        run: |
+          mkdir -p ~/.ssh
+          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+              gitea_ci@hanzalova.internal 'hostname -f'
+
+      - name: Rsync built UI to gateway webroot
+        run: |
+          rsync --archive --compress --delete \
+            --rsync-path 'sudo rsync' \
+            bench/dist/ \
+            gitea_ci@hanzalova.internal:/var/www/bench.helexa.ai/
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 /target
+/bench/node_modules
+/bench/dist
 *.swp
 *.swo
 .idea/
@@ -7,3 +9,4 @@ cortex.toml
 models.toml
 doc/plan/*
 /target-cuda/
+.claude/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,268 @@
+# AGENTS.md — helexa/cortex
+
+## Project Overview
+
+helexa is a self-hosted LLM serving stack for multi-node GPU inference clusters. It has two components:
+
+- **cortex** — the per-operator control plane and LLM proxy. A Rust reverse-proxy that sits in front of the fleet and presents a unified OpenAI + Anthropic compatible API surface. It handles model routing, lifecycle management (load/unload/evict), request translation, and metrics collection.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU host, serving candle-based in-process inference and managing local hardware discovery and model lifecycle.
+
+## Repository Layout
+
+```
+cortex/
+├── Cargo.toml              # workspace root (Rust 2024 edition, GPL-3.0)
+├── cortex.example.toml     # example gateway config
+├── models.example.toml     # example model catalogue
+├── neuron.example.toml     # example neuron config
+├── README.md               # public-facing documentation
+├── CLAUDE.md               # detailed design rationale and implementation history
+├── AGENTS.md               # ← you are here
+├── cortex.spec             # RPM spec for cortex
+├── helexa-neuron.spec      # RPM spec for neuron (renamed to avoid Fedora collision)
+├── rpm/                    # prerelease RPM specs
+│   ├── cortex-prerelease.spec
+│   ├── helexa-neuron-prerelease.spec
+│   └── helexa-bench-prerelease.spec
+├── data/                   # systemd units and example configs for packaging
+│   ├── cortex.service
+│   ├── neuron.service
+│   ├── cortex.example.toml
+│   ├── neuron.example.toml
+│   └── models.example.toml
+└── crates/
+    ├── cortex-core/            # shared types, config, envelopes
+    │   └── src/
+    │       ├── lib.rs
+    │       ├── build_info.rs   # BuildInfo type for /version endpoint
+    │       ├── config.rs       # figment-based config structs
+    │       ├── catalogue.rs    # ModelProfile, placement matching
+    │       ├── discovery.rs    # DeviceInfo, DiscoveryResponse
+    │       ├── harness.rs      # Harness trait, HarnessConfig, HarnessHealth
+    │       ├── node.rs         # NodeState, ModelStatus
+    │       ├── openai.rs       # OpenAI request/response types
+    │       ├── anthropic.rs    # Anthropic request/response types
+    │       ├── translate.rs    # OpenAI <-> Anthropic translation
+    │       └── metrics.rs      # RequestMetrics, histogram helpers
+    ├── cortex-gateway/         # the HTTP proxy server
+    │   └── src/
+    │       ├── lib.rs
+    │       ├── state.rs        # CortexState: Arc<RwLock<...>>
+    │       ├── router.rs       # model -> node routing logic
+    │       ├── proxy.rs        # streaming HTTP proxy to backends
+    │       ├── evictor.rs      # LRU/priority eviction logic
+    │       ├── poller.rs       # background task polling neuron status
+    │       ├── handlers.rs     # axum handlers (chat, completions, models, etc.)
+    │       └── metrics.rs      # prometheus exporter endpoint
+    ├── cortex-cli/             # CLI entrypoint
+    │   └── src/main.rs         # binary: `cortex`
+    ├── neuron/                 # per-host LLM daemon (replaces cortex-agent)
+    │   ├── Cargo.toml          # features: cuda, cudnn, flash-attn, cuda-integration
+    │   ├── build.rs            # compiles CUDA kernels, emits build metadata
+    │   └── src/
+    │       ├── main.rs         # binary: `neuron`
+    │       ├── discovery.rs    # nvidia-smi parsing, device enumeration
+    │       ├── health.rs       # runtime GPU polling
+    │       ├── api.rs          # HTTP handlers for /discovery, /models, etc.
+    │       ├── version.rs      # GET /version endpoint with BuildInfo
+    │       ├── models.rs       # local model lifecycle orchestration
+    │       └── harness/        # in-process candle inference
+    │           ├── device_worker/  # per-device CUDA worker threads
+    │           │   ├── mod.rs      # canonical narrative for worker architecture
+    │           │   ├── jobs.rs     # Job enum, dispatch handlers
+    │           │   └── dispatch.rs # DeviceWorkerState struct
+    │           ├── candle.rs       # candle model implementation
+    │           └── tp/             # tensor parallelism
+    │               └── worker.rs   # TP worker subprocesses
+    ├── helexa-acp/             # Agent Client Protocol bridge (Apache-2.0)
+    │   └── src/main.rs         # binary: `helexa-acp`, self-contained (no workspace deps)
+    └── helexa-bench/           # benchmark harness
+        └── src/main.rs         # binary: `helexa-bench`, SQLite-backed, version-aware
+```
+
+## Key Design Decisions
+
+### Architecture
+- **cortex** is the control plane. It exposes the unified API, routes requests, manages model lifecycle across the fleet, and collects metrics.
+- **neuron** is the node plane. One instance runs on every GPU host. It discovers local hardware, manages in-process candle inference, handles NCCL tensor parallelism, and reports runtime state.
+- cortex never shells out to `nvidia-smi`, never touches systemd units, and never talks directly to a harness. It talks only to neurons via HTTP API on port 13131.
+
+### Per-device worker thread (neuron)
+Every CUDA device gets one dedicated OS thread that owns its `CudaContext` for the daemon's lifetime. All CUDA operations route through this thread via a `std::sync::mpsc` job channel. Tensors never escape the worker thread alive. Inference replies carry `Vec<f32>` CPU-side logits; sampled tokens come back as `u32`. The opaque `ArchHandle(u64)` and `TpHandle(u64)` are indices into the worker's state slab, not pointers.
+
+CPU loads (`Device::Cpu` fallback) keep the legacy `tokio::task::spawn_blocking + Arc<Mutex<ModelArch>>` path — there's no context to own and the channel hop would only add latency. Four `spawn_blocking` references in `harness/candle.rs` are deliberate CPU fallback.
+
+### candle-native (not mistral.rs)
+neuron builds directly on [candle](https://github.com/huggingface/candle). Every model architecture it serves is implemented in this repository, ported against the HuggingFace reference. No external inference server to babysit. The Harness trait remains as an internal seam for adding future engines (vision/audio/diffusion) but its only implementation is in-process candle.
+
+### Streaming proxy
+Chat completions are proxied as SSE streams. The gateway must:
+1. Parse the inbound request to extract the model name
+2. Route to the correct backend neuron
+3. Stream the response back, capturing token timing for metrics
+4. NOT buffer the full response — true streaming passthrough
+
+### Anthropic translation
+When a request arrives at `/v1/messages` (Anthropic format), the gateway translates it to OpenAI format before proxying to neuron, then translates the response back. This is a stateless envelope transformation. The non-streaming round-trip is implemented; streaming SSE translation is deferred.
+
+### Eviction
+The evictor runs as a background task. Before loading a model on a node where VRAM is tight:
+1. Check if the model is already loaded elsewhere → route there instead
+2. Find the LRU model on the target node (excluding pinned models)
+3. Call `POST {neuron}/models/unload` on that model
+4. The incoming request's lazy-load triggers the new model load
+
+### Metrics
+Per-request: model, node, prompt_tokens, completion_tokens, total_tokens, tok_per_sec, time_to_first_token_ms, total_latency_ms. Exposed as Prometheus histograms/counters on a separate port (31314).
+
+## Tech Stack
+
+- **Rust 2024 edition** — workspace with 6 crates
+- **Axum 0.8** — HTTP framework
+- **reqwest** — HTTP client for proxying to backends
+- **figment** — config loading (TOML + env vars)
+- **tokio** — async runtime
+- **metrics + metrics-exporter-prometheus** — observability
+- **tracing** — structured logging
+- **candle** — in-process inference engine (neuron only, with CUDA support)
+- **cudarc** — patched for neuron's needs (see workspace `[patch]`)
+- **clap** — CLI parsing
+- **rusqlite** (bundled) — helexa-bench SQLite system-of-record
+
+## Build Commands
+
+```sh
+cargo build --release           # build all crates
+cargo run -p cortex-cli -- serve    # run the gateway
+cargo test                      # run all tests
+cargo clippy --workspace        # lint
+```
+
+### neuron Features
+- `cuda`: Enables CUDA acceleration in candle and cudarc/nccl bindings. Without it, falls back to CPU.
+- `cudnn`: Use cuDNN for convolution/attention kernels (requires `cuda`).
+- `flash-attn`: FlashAttention kernels (requires `cuda`).
+- `cuda-integration`: Reserved for GPU-only integration tests (requires multiple CUDA devices + libnccl).
+
+### Build Scripts
+- `neuron/build.rs`: Compiles CUDA kernels (`src/cuda/*.cu`) using `cudaforge::KernelBuilder` when `cuda` feature is enabled. Handles compute capability checks (sm_<80 disables bf16 intrinsics). Also captures build metadata: git SHA, dirty flag, timestamp, rustc version, profile, features, candle-core version.
+
+## CI
+
+Gitea Actions runs on every push to any branch. All three checks must pass before merging:
+
+```sh
+cargo fmt --check --all                    # formatting
+cargo clippy --workspace -- -D warnings   # lint (warnings are errors)
+cargo test --workspace                     # tests
+```
+
+Run these locally before pushing. `cargo fmt --all` fixes formatting automatically. Clippy warnings must be resolved, not suppressed with `#[allow(...)]` unless there is a clear rationale.
+
+Tagged releases (`v*`) build SRPMs for `cortex`, `helexa-neuron`, and `helexa-bench` and publish to COPR (`helexa/helexa`). Build metadata SHA injection: CI sets `HELEXA_BUILD_SHA=$(git rev-parse HEAD)`.
+
+## Environment
+
+- Targets Fedora 43 (systemd, SELinux enforcing)
+- Nodes communicate over a private network (e.g. WireGuard mesh)
+- cortex listens on port 31313 (API) and 31314 (metrics)
+- neuron listens on port 13131 on each GPU host
+- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
+
+## Conventions
+
+- Error handling: `anyhow` for binaries, `thiserror` for library crates
+- No `unwrap()` in library code; `expect()` only with clear rationale
+- All public types derive `Debug, Clone, Serialize, Deserialize` where sensible
+- Config structs use `figment` with TOML as primary source, env vars as override
+- Prefer `Arc<RwLock<...>>` for shared fleet state; minimize lock duration
+- SSE streaming uses `tokio_stream` + `eventsource-stream` for parsing
+- Log at `info` for request routing, `debug` for proxy details, `warn` for eviction and node health, `error` for proxy failures
+
+## Testing
+
+### Gateway tests
+Use mock neurons spawned via axum in `crates/cortex-gateway/tests/common/mod.rs`. Helpers: `spawn_mock_backend()`, `spawn_gateway()`.
+
+### neuron integration tests
+- Numerical reference tests (`numerical_reference.rs`) require `NEURON_REF_MODEL_PATH` env var pointing to a HF snapshot directory. Fixtures are f32-based for precision validation against HuggingFace transformers.
+- CUDA integration tests (`tp_worker_lifecycle_cuda.rs`) gated behind `cuda-integration` feature; requires 2+ CUDA devices (e.g., 2x RTX 5090).
+
+### Metrics testing
+Use `install_test_recorder()` in test code to capture metrics without the HTTP listener.
+
+## helexa-bench
+
+A continuous, version-aware benchmark harness. Hits each neuron directly on `:13131`, exercises each warm model with a Scenario suite (chat-latency family), and records results into SQLite stamped with the neuron's full `BuildInfo`. The loop is version-aware: skips any (target, build SHA, model, scenario) cell already at `samples_per_version`.
+
+Packaged as `helexa-bench` RPM (prebuilt-binary spec). One systemd unit, typically on the metrics host.
+
+## helexa-acp
+
+Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default. Intentionally self-contained: no workspace crate dependencies. Uses `agent-client-protocol` with `unstable_session_model` feature for Zed model picker support. Licensed Apache-2.0 (workspace is GPL-3.0).
+
+## RPM Packaging
+
+- `cortex.spec` — installs the `cortex` binary
+- `helexa-neuron.spec` — installs the `neuron` binary under the package name `helexa-neuron` (renamed to avoid colliding with Fedora's NEURON neural-simulation package)
+- Systemd units in `data/cortex.service`, `data/neuron.service`
+- Example configs: `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
+
+Install:
+```sh
+dnf copr enable helexa/helexa
+dnf install cortex                # gateway host
+dnf install helexa-neuron         # GPU nodes
+```
+
+## Configuration Files
+
+### cortex.toml (gateway)
+```toml
+[gateway]
+listen = "0.0.0.0:31313"
+metrics_listen = "0.0.0.0:31314"
+
+[eviction]
+strategy = "lru"          # lru | priority
+defrag_after_cycles = 50
+
+[[neurons]]
+name = "beast"
+endpoint = "http://beast.internal:13131"
+```
+
+### models.toml (catalogue)
+```toml
+[[models]]
+id = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
+harness = "candle"
+quant = "Q4_K_M"
+vram_mb = 19000
+min_devices = 2
+min_device_vram_mb = 10000
+pinned_on = ["beast"]       # optional: never evict from these neurons
+```
+
+### neuron.toml (per-host)
+Configured via figment + env override. See `neuron.example.toml` for reference.
+
+## neuron API Endpoints
+
+```
+GET  /discovery        → hardware discovery (hostname, OS, CUDA, devices, harnesses)
+GET  /health           → runtime GPU stats (VRAM, utilization, temperature), activation state, per-model load
+GET  /models           → loaded/unloaded models with VRAM usage
+POST /models/load      → load a model with spec (quant, TP, devices)
+POST /models/unload    → unload a model, freeing device memory
+GET  /models/{id}/endpoint → inference URL for a model
+GET  /version          → build metadata (SHA, features, candle version, etc.)
+```
+
+## Sources of Truth
+
+When prose documentation conflicts with code, trust:
+1. Executable configuration (`*.toml`, `Cargo.toml` features)
+2. Type definitions in `cortex-core/`
+3. Test files in `crates/*/tests/` and `*/src/**/*_test.rs`
+4. `CLAUDE.md` for historical design rationale
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,16 +1,26 @@
-# CLAUDE.md — cortex
+# CLAUDE.md — helexa

 ## Project overview

-cortex is a Rust reverse-proxy that sits in front of multiple
-mistral.rs inference nodes and presents a unified OpenAI + Anthropic
-compatible API surface. It handles model routing, lifecycle management
-(load/unload/evict), request translation, and metrics collection.
+helexa is a self-hosted LLM serving stack for multi-node GPU inference
+clusters. It has two components:
+
+- **cortex** — the per-operator control plane and LLM proxy. A Rust
+  reverse-proxy that sits in front of the fleet and presents a unified
+  OpenAI + Anthropic compatible API surface. It handles model routing,
+  lifecycle management (load/unload/evict), request translation, and
+  metrics collection.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU
+  host, serving candle-based in-process inference and managing local
+  hardware discovery and model lifecycle.
+
+(Historical note: cortex originally proxied to mistral.rs nodes; neuron
+replaced that — see the 2026-05-18 candle-native addendum below.)

 ## Repository layout

 ```
-cortex/
+helexa/
 ├── Cargo.toml              # workspace root
 ├── cortex.toml      # example gateway config
 ├── README.md
@@ -548,7 +558,7 @@ and the hardcoded `vram_mb` per node.
 ## Revised repository layout

 ```
-cortex/
+helexa/
 ├── Cargo.toml
 ├── cortex.toml                 # gateway config (neurons only)
 ├── models.toml                 # model catalogue
@@ -754,3 +764,39 @@ Landed in four PRs:
  from Phases 2/3 deleted; `SendComm` newtype no longer needed in the
  load path. `grep -rn spawn_blocking crates/neuron/src/harness/`
  returns only deliberate CPU-fallback hits after this PR.
+
+## 2026-06-13 addendum: build metadata + helexa-bench
+
+Two coupled additions so fleet performance can be tracked automatically
+across neuron updates instead of by hand-running `script/bench.py` and
+editing `doc/benchmarks.md`.
+
+**neuron build metadata + `GET /version`.** neuron's `build.rs` now also
+captures build identity (`HELEXA_GIT_SHA` — preferring a CI/RPM-injected
+`HELEXA_BUILD_SHA`, falling back to git, else `unknown` — plus dirty
+flag, build timestamp, rustc version, profile, enabled cargo features,
+and a best-effort `candle-core` version from `Cargo.lock`). These are
+exposed as `cortex_core::build_info::BuildInfo` (new module) from a new
+`GET /version` endpoint (`neuron/src/version.rs`, wired in `api.rs`) and
+in clap's `--version` long form. The SHA is injected in CI
+(`build-prerelease.yml` build-neuron step: `export HELEXA_BUILD_SHA=$(git
+rev-parse HEAD)`) and via `--define helexa_commit` in the source-build
+spec, so tarball-built RPMs report the real SHA. `/version` is now the
+canonical "which build is live" probe (supersedes the per-host RPM-sha
+check in the fleet-validation flow).
+
+**`crates/helexa-bench`** — a new binary: a continuous, version-aware
+benchmark harness (one systemd unit, typically on the metrics host). It
+hits each neuron **directly** on `:13131`, exercises each **warm**
+(`status == "loaded"`) model with an extensible `Scenario` suite (phase
+1: the chat-latency family ported verbatim from `bench.py` — synthetic
+128/4096-tok prompts, `/no_think`, streamed TTFT + decode-window
+tok/s), and records each run into a SQLite system-of-record stamped with
+the neuron's full `BuildInfo`. The loop is **version-aware**: it skips
+any (target, build SHA, model, scenario) cell already at
+`samples_per_version`, so a steady fleet costs only cheap `/version` +
+`/models` polls until a new SHA ships. `helexa-bench report` regenerates
+the `benchmarks.md`-style table from the DB. `kind = "openai"` targets
+(mistral.rs/llama.cpp comparison) are scaffolded but not yet wired.
+Packaged as the `helexa-bench` RPM (prebuilt-binary spec, outbound-only
+so no firewalld service) via the same `build-prerelease.yml` pipeline.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -472,6 +472,12 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"

+[[package]]
+name = "byteorder-lite"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
+
 [[package]]
 name = "bytes"
 version = "1.11.1"
@@ -668,6 +674,12 @@ dependencies = [
 "cc",
 ]

+[[package]]
+name = "color_quant"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
+
 [[package]]
 name = "colorchoice"
 version = "1.0.5"
@@ -781,6 +793,7 @@ name = "cortex-gateway"
 version = "0.1.16"
 dependencies = [
 "anyhow",
+ "async-trait",
 "axum",
 "bytes",
 "chrono",
@@ -893,8 +906,7 @@ dependencies = [
 [[package]]
 name = "cudarc"
 version = "0.19.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cea5f10a99e025c1b44ae2354c2d8326b25ddbd0baf76bde8e55cfd4018a2cc"
+source = "git+https://github.com/grenade/cudarc?rev=63327a256059f8252641ae46c6bb9eefe707f382#63327a256059f8252641ae46c6bb9eefe707f382"
 dependencies = [
 "float8",
 "half",
@@ -1206,6 +1218,18 @@ dependencies = [
 "pin-project-lite",
 ]

+[[package]]
+name = "fallible-iterator"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
+
+[[package]]
+name = "fallible-streaming-iterator"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
+
 [[package]]
 name = "fancy-regex"
 version = "0.17.0"
@@ -1223,6 +1247,15 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"

+[[package]]
+name = "fdeflate"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
+dependencies = [
+ "simd-adler32",
+]
+
 [[package]]
 name = "figment"
 version = "0.10.19"
@@ -1230,8 +1263,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3"
 dependencies = [
 "atomic",
+ "parking_lot",
 "pear",
 "serde",
+ "tempfile",
 "toml",
 "uncased",
 "version_check",
@@ -1731,6 +1766,16 @@ dependencies = [
 "wasip3",
 ]

+[[package]]
+name = "gif"
+version = "0.14.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159"
+dependencies = [
+ "color_quant",
+ "weezl",
+]
+
 [[package]]
 name = "glob"
 version = "0.3.3"
@@ -1777,6 +1822,15 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"

+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.15.5"
@@ -1805,6 +1859,15 @@ version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"

+[[package]]
+name = "hashlink"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
+dependencies = [
+ "hashbrown 0.14.5",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -1835,6 +1898,30 @@ dependencies = [
 "url",
 ]

+[[package]]
+name = "helexa-bench"
+version = "0.1.16"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "axum",
+ "chrono",
+ "clap",
+ "cortex-core",
+ "eventsource-stream",
+ "figment",
+ "futures",
+ "reqwest",
+ "rusqlite",
+ "serde",
+ "serde_json",
+ "tokio",
+ "tokio-stream",
+ "tower-http",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "hermit-abi"
 version = "0.5.2"
@@ -2135,6 +2222,34 @@ dependencies = [
 "icu_properties",
 ]

+[[package]]
+name = "image"
+version = "0.25.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104"
+dependencies = [
+ "bytemuck",
+ "byteorder-lite",
+ "color_quant",
+ "gif",
+ "image-webp",
+ "moxcms",
+ "num-traits",
+ "png",
+ "zune-core",
+ "zune-jpeg",
+]
+
+[[package]]
+name = "image-webp"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3"
+dependencies = [
+ "byteorder-lite",
+ "quick-error",
+]
+
 [[package]]
 name = "indexmap"
 version = "1.9.3"
@@ -2299,6 +2414,17 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "libsqlite3-sys"
+version = "0.30.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149"
+dependencies = [
+ "cc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.12.1"
@@ -2449,6 +2575,16 @@ dependencies = [
 "serde_json",
 ]

+[[package]]
+name = "minijinja-contrib"
+version = "2.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99df5123c54391e2a228014c1dbbd85a3dab08a25e776c810526f2f47542b3de"
+dependencies = [
+ "minijinja",
+ "serde",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -2498,6 +2634,16 @@ dependencies = [
 "syn",
 ]

+[[package]]
+name = "moxcms"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b"
+dependencies = [
+ "num-traits",
+ "pxfm",
+]
+
 [[package]]
 name = "native-tls"
 version = "0.2.18"
@@ -2522,6 +2668,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "axum",
+ "base64 0.22.1",
 "candle-core",
 "candle-nn",
 "candle-transformers",
@@ -2533,7 +2680,10 @@ dependencies = [
 "futures",
 "half",
 "hf-hub",
+ "image",
 "minijinja",
+ "minijinja-contrib",
+ "rayon",
 "reqwest",
 "safetensors 0.7.0",
 "serde",
@@ -2861,6 +3011,19 @@ version = "0.3.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"

+[[package]]
+name = "png"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
+dependencies = [
+ "bitflags",
+ "crc32fast",
+ "fdeflate",
+ "flate2",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "polling"
 version = "3.11.0"
@@ -2974,6 +3137,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40e24eee682d89fb193496edf918a7f407d30175b2e785fe057e4392dfd182e0"

+[[package]]
+name = "pxfm"
+version = "0.1.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f"
+
 [[package]]
 name = "quanta"
 version = "0.12.6"
@@ -2989,6 +3158,12 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "quick-error"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
+
 [[package]]
 name = "quinn"
 version = "0.11.9"
@@ -3324,6 +3499,20 @@ dependencies = [
 "syn",
 ]

+[[package]]
+name = "rusqlite"
+version = "0.32.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e"
+dependencies = [
+ "bitflags",
+ "fallible-iterator",
+ "fallible-streaming-iterator",
+ "hashlink",
+ "libsqlite3-sys",
+ "smallvec",
+]
+
 [[package]]
 name = "rustc-hash"
 version = "2.1.2"
@@ -4627,6 +4816,12 @@ dependencies = [
 "rustls-pki-types",
 ]

+[[package]]
+name = "weezl"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
+
 [[package]]
 name = "which"
 version = "7.0.3"
@@ -5164,3 +5359,18 @@ name = "zmij"
 version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+
+[[package]]
+name = "zune-core"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
+
+[[package]]
+name = "zune-jpeg"
+version = "0.5.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296"
+dependencies = [
+ "zune-core",
+]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,13 +6,14 @@ members = [
    "crates/cortex-cli",
    "crates/neuron",
    "crates/helexa-acp",
+    "crates/helexa-bench",
 ]

 [workspace.package]
 version = "0.1.16"
 edition = "2024"
 license = "GPL-3.0-or-later"
-repository = "https://git.lair.cafe/helexa/cortex"
+repository = "https://git.lair.cafe/helexa/helexa"

 [workspace.dependencies]
 # async runtime
@@ -61,3 +62,12 @@ eventsource-stream = "0.2"
 # workspace crates
 cortex-core = { path = "crates/cortex-core" }
 cortex-gateway = { path = "crates/cortex-gateway" }
+
+# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
+# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
+# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
+# hang-recovery (abort a wedged collective from another thread, then
+# rebuild the comm). Pinned to a fork revision pending upstream review
+# (grenade/cudarc @ nccl-comm-abort).
+[patch.crates-io]
+cudarc = { git = "https://github.com/grenade/cudarc", rev = "63327a256059f8252641ae46c6bb9eefe707f382" }
--- a/README.md
+++ b/README.md
@@ -1,25 +1,68 @@
-# cortex
+# helexa

-A Rust reverse-proxy and fleet management layer for multi-node GPU inference
-clusters. Cortex sits in front of one or more `neuron` daemons (each running
-candle-based inference on a local GPU host) and presents a unified OpenAI +
-Anthropic compatible API surface.
+**Near-frontier AI for mortals.**

-## Problem
+helexa is a self-hosted LLM serving stack, written in Rust, for people
+who run open-weight models on their own consumer GPUs. It has two
+components:

-Running local LLMs across multiple GPU nodes (different VRAM tiers, different
-model affinities) requires a unified API surface that:
+- **cortex** — the per-operator control plane and LLM proxy. It sits in
+  front of your GPU fleet and presents a unified OpenAI + Anthropic
+  compatible API surface, handling model routing, lifecycle management
+  (load / unload / evict), request translation, and metrics.
+- **neuron** — the per-host LLM harness. One instance runs on every GPU
+  host, serving candle-based in-process inference and managing local
+  hardware discovery and model lifecycle.

- Presents a **single `/v1/models` catalogue** merging every model that can be
-  served by any neuron in the fleet.
- **Routes requests** to the correct node based on where a model is loaded
-  (or can be loaded), handling cold-load and eviction transparently.
- Manages **model lifecycle** — load on demand, unload cold models, pin
-  critical ones — by calling each neuron's `/models/{load,unload}` API.
- Translates between **OpenAI and Anthropic** request/response envelopes so
-  every client speaks whichever dialect it prefers.
- Captures **per-request metrics** (tokens, tok/s, TTFT, latency) and exposes
-  them as Prometheus counters/histograms.
+## Why
+
+Two principles constrain everything in this repository:
+
+1. **Frontier or close to it.** helexa serves the open-weight models
+   that get nearest to frontier capability — not every architecture
+   ever published.
+2. **Consumer hardware.** Everything must run on the cards mortals can
+   actually buy: a 3060 here, a 4090 there, a 5090 if you got lucky.
+   Mixed VRAM tiers across mismatched boxes are the expected topology,
+   not a degraded case.
+
+GPU acquisition is harder than it was a year ago, and the gap between
+what cloud providers charge and what your own silicon costs keeps
+widening. The intersection of those two principles — near-frontier
+models, squeezed onto hardware you own — is helexa's entire niche.
+
+The secondary objective is **predictable consumption**. If you own the
+hardware, your tooling shouldn't break because a cloud provider changed
+billing, deprecated a model, or reshaped an API. cortex's OpenAI and
+Anthropic surfaces are a stability contract: point your editor, agent,
+or CLI at it once, and it keeps working.
+
+## What helexa is not
+
+This is an intentionally different path from vLLM, SGLang, and peers —
+not a smaller version of them. Out of scope, permanently:
+
+- Any-model breadth. Architectures are ported because they're at or
+  near the frontier, not to complete a compatibility matrix.
+- Datacenter-class scheduling. No sophisticated continuous-batching /
+  paged-attention machinery — the workload is a handful of operators
+  and their agents, not 200 QPS.
+- Wrapping external inference engines. neuron builds directly on
+  [candle](https://github.com/huggingface/candle); every model
+  architecture it serves is implemented in this repository, ported
+  against the HuggingFace reference.
+
+One thing that is *not* a principle: CUDA exclusivity. All high-end
+consumer hardware is in scope. helexa is CUDA-only today because
+that's the hardware on the bench — nothing ships untested — and ROCm
+or other consumer accelerators join as soon as there's real hardware
+to build against.
+
+In scope, and where the engineering effort goes: aggressive
+quantization (GGUF Q4_K_M / Q6_K / Q8_0), NCCL tensor parallelism
+across heterogeneous consumer GPUs, careful CUDA failure handling, and
+single-request latency — the performance that one operator at a
+keyboard actually feels.

 ## Architecture

@@ -29,7 +72,7 @@ model affinities) requires a unified API surface that:
 └──────┬───────┘  └─────┬────┘  └──────┬─────┘  └──────┬─────┘
       │                │              │               │
       └────────────────┴──────┬───────┴───────────────┘
-                               │
+                               │  OpenAI + Anthropic APIs
                    ┌──────────▼──────────┐
                    │      cortex         │
                    │  (cortex-gateway)   │
@@ -46,40 +89,59 @@ model affinities) requires a unified API surface that:
                  private network (.internal)
 ```

+cortex discovers each neuron's hardware (devices, VRAM, compute
+capability) at runtime and matches it against a model catalogue
+(`models.toml`) to decide placement: which models fit where, what to
+evict when VRAM is tight, where to route a request right now. Adding a
+GPU host to the fleet is one `[[neurons]]` entry — no device specs in
+config.
+
 ### Crates

 | Crate | Purpose |
 |---|---|
 | `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic envelopes, harness trait, discovery types |
 | `cortex-gateway` | Axum HTTP server: proxy, router, evictor, poller, metrics exporter |
-| `neuron` | Per-node daemon: GPU discovery, in-process candle inference, model lifecycle API |
+| `neuron` | Per-host daemon: GPU discovery, in-process candle inference, NCCL tensor parallelism, model lifecycle API |
 | `cortex-cli` | CLI entrypoint (`cortex serve`, `cortex status`, etc.) |
+| `helexa-acp` | Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default |

-## Node setup
+## The engine

-Each GPU node runs `neuron` (listening on `:13131`). Neuron uses
-huggingface/candle for in-process inference — there is no external
-inference subprocess to manage.
+neuron runs inference in-process on candle — there is no external
+inference server to babysit. The parts that earn their keep:

-Inside the daemon, every CUDA device gets one dedicated OS thread
-(named `cuda-dev-N`) that owns the device's CUDA context for the
-daemon's lifetime. Model loads, forward passes, KV-cache resets,
-NCCL collectives, VRAM queries, and unloads all route through that
-thread via a job channel; tensors never escape it alive. This pins
-context binding to a known thread, makes the CUDA Drop contract
-structurally safe, and isolates driver-error poisoning to one worker
-rather than the whole process. See `CLAUDE.md` for the design
-rationale and `crates/neuron/src/harness/device_worker/` for the code.
+- **Per-device worker threads.** Every CUDA device gets one dedicated
+  OS thread that owns its CUDA context for the daemon's lifetime. All
+  loads, forward passes, KV-cache resets, NCCL collectives, VRAM
+  queries, and unloads route through it; tensors never escape it
+  alive. Context binding is pinned to a known thread, the CUDA `Drop`
+  contract is structurally safe, and a driver error poisons one worker
+  — visibly — instead of hanging the whole process.
+- **Tensor parallelism on consumer cards.** Megatron-style row/column
+  parallel layers with NCCL all-reduce, spanning the mismatched GPUs
+  you actually have. A step watchdog aborts wedged collectives instead
+  of letting a request hang forever.
+- **Current model focus: the Qwen3 family** — dense and GGUF-quantized,
+  including the hybrid linear-attention (Gated DeltaNet) generation.
+  Vision support is in progress. Each architecture is ported against
+  its HuggingFace reference implementation.

-The neuron RPM (`helexa-neuron`) ships a systemd unit:
+See `CLAUDE.md` for design rationale and
+`crates/neuron/src/harness/device_worker/` for the worker narrative.
+
+## Install
+
+Pre-built RPMs for Fedora:

 ```sh
 dnf copr enable helexa/helexa
-dnf install helexa-neuron
-systemctl enable --now neuron
+dnf install cortex            # on the gateway host
+dnf install helexa-neuron     # on each GPU host
+systemctl enable --now cortex   # or neuron, respectively
 ```

-## Gateway config
+## Configure

 ```toml
 # /etc/cortex/cortex.toml
@@ -100,29 +162,10 @@ name = "benjy"
 endpoint = "http://benjy.internal:13131"
 ```

-Model placement profiles live in `models.toml` — see `models.example.toml`.
+Model placement profiles (VRAM requirements, quant, device minimums,
+pinning) live in `models.toml` — see `models.example.toml`.

-## Building
-
-```sh
-cargo build --release
-```
-
-## CI
-
-Every push triggers format, lint, and test checks. Ensure these pass
-locally before pushing:
-
-```sh
-cargo fmt --check --all                    # must be clean
-cargo clippy --workspace -- -D warnings   # warnings are errors
-cargo test --workspace                     # all tests must pass
-```
-
-Tagged releases (`v*`) additionally build SRPMs for both `cortex` and
-`helexa-neuron` and publish to COPR.
-
-## Running
+## Run

 ```sh
 # start the gateway
@@ -131,10 +174,37 @@ cortex serve --config /etc/cortex/cortex.toml
 # check fleet status
 cortex status

-# list all models across nodes
+# one catalogue across every node
 curl http://localhost:31313/v1/models
 ```

+## Build from source
+
+```sh
+cargo build --release
+```
+
+CI runs on every push; keep it green locally:
+
+```sh
+cargo fmt --check --all                    # must be clean
+cargo clippy --workspace -- -D warnings   # warnings are errors
+cargo test --workspace                     # all tests must pass
+```
+
+Tagged releases (`v*`) build SRPMs for `cortex` and `helexa-neuron`
+and publish to COPR.
+
+## Status
+
+Pre-1.0 and moving fast. The gateway path (routing, eviction,
+translation, metrics) is stable and tested; the candle-native engine
+is under active development — expect the supported-model list to track
+the open-weight frontier, deliberately narrowly.
+
+Development happens at <https://git.lair.cafe/helexa/helexa>;
+<https://github.com/helexa-ai/helexa> is a read-only mirror.
+
 ## License

 GPL-3.0
--- a/asset/helexa-bench/bob.toml
+++ b/asset/helexa-bench/bob.toml
@@ -0,0 +1,38 @@
+# helexa-bench config for bob.hanzalova.internal.
+#
+# Synced to /etc/helexa-bench/helexa-bench.toml by script/infra-setup.sh
+# (the helexa-bench RPM ships helexa-bench.example.toml as a
+# %config(noreplace) default; this per-host file overrides it).
+#
+# bob is a client host (it also runs Agent Zero); helexa-bench here hits
+# every neuron on the fleet directly and records build-stamped results
+# into the local SQLite store.
+
+[bench]
+sweep_interval_secs = 1800
+samples_per_version = 5
+iteration_pause_secs = 2
+request_timeout_secs = 600
+db_path = "/var/lib/helexa-bench/bench.sqlite"
+
+[scenarios]
+prompt_sizes = [128, 4096]
+max_tokens = 256
+
+# Read-only JSON API for the bench UI (hosted separately) and for
+# programmatic access. Served alongside the sweep loop.
+[api]
+enabled = true
+listen = "0.0.0.0:13132"
+
+[[targets]]
+name = "beast"
+endpoint = "http://beast.hanzalova.internal:13131"
+
+[[targets]]
+name = "benjy"
+endpoint = "http://benjy.hanzalova.internal:13131"
+
+[[targets]]
+name = "quadbrat"
+endpoint = "http://quadbrat.hanzalova.internal:13131"
--- a/asset/manifest.yml
+++ b/asset/manifest.yml
@@ -1,30 +0,0 @@
-# Helexa fleet manifest.
-#
-# Drives rolling deploys via script/deploy.sh and serves as the source
-# of truth for which hosts run cortex vs neuron, and which CUDA
-# compute-capability flavour each neuron host needs.
-#
-# Flavour ↔ NVIDIA generation ↔ compute cap:
-#   ampere    sm_86   (RTX 30 series — e.g. 3060)
-#   ada       sm_89   (RTX 40 series — e.g. 4090)
-#   blackwell sm_120  (RTX 50 series — e.g. 5090)
-#
-# The flavour determines which RPM is installed on a given neuron host:
-# helexa-neuron-<flavour>. Only one flavour may be installed at a time
-# (the packages Conflict: with each other).
-
-cortex:
-  host: hanzalova.internal
-
-neurons:
-  - host: beast.hanzalova.internal
-    flavour: blackwell
-    gpu: "2x RTX 5090"
-
-  - host: benjy.hanzalova.internal
-    flavour: ada
-    gpu: "RTX 4090"
-
-  - host: quadbrat.hanzalova.internal
-    flavour: ampere
-    gpu: "RTX 3060"
--- a/asset/neuron/beast.toml
+++ b/asset/neuron/beast.toml
@@ -5,9 +5,9 @@
 # invocation: `validate-neuron.sh beast.hanzalova.internal
 # Qwen/Qwen3.6-27B q5k 2`.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml. Edits
-# take effect on the next deploy.sh run (which stops + restarts the
-# service so default_models is re-read at activation).
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. Edits
+# take effect after the next deploy workflow run restarts the service
+# (default_models is read at activation).

 port = 13131

--- a/asset/neuron/benjy.toml
+++ b/asset/neuron/benjy.toml
@@ -4,7 +4,7 @@
 # Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on
 # moderate-length contexts.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.

 port = 13131

--- a/asset/neuron/quadbrat.toml
+++ b/asset/neuron/quadbrat.toml
@@ -4,7 +4,7 @@
 # (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small
 # model still have plenty of room.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.

 port = 13131

--- a/asset/nginx/bench.helexa.ai.bootstrap.conf
+++ b/asset/nginx/bench.helexa.ai.bootstrap.conf
@@ -0,0 +1,15 @@
+# Bootstrap vhost for bench.helexa.ai — http-only, used ONLY to obtain
+# the initial Let's Encrypt cert via the webroot challenge (the full TLS
+# vhost can't load before the cert file exists). script/infra-setup.sh
+# installs this, runs certbot, then swaps in bench.helexa.ai.conf.
+server {
+    listen 80;
+    server_name bench.helexa.ai;
+
+    location /.well-known/acme-challenge/ {
+        root /var/www/bench.helexa.ai;
+    }
+    location / {
+        try_files $uri $uri/ =404;
+    }
+}
--- a/asset/nginx/bench.helexa.ai.conf
+++ b/asset/nginx/bench.helexa.ai.conf
@@ -0,0 +1,56 @@
+# Public, auth-less bench UI at https://bench.helexa.ai.
+#
+# Serves the static SPA from /var/www/bench.helexa.ai (rsynced by
+# .gitea/workflows/deploy.yml's deploy-bench-ui job) and reverse-proxies
+# /api to the helexa-bench read API on bob over the WireGuard mesh — so
+# the browser stays same-origin (no CORS) and the internal API never
+# needs to be exposed publicly.
+#
+# TLS via Let's Encrypt; the cert is obtained/renewed by certbot
+# (bootstrapped one-time in script/infra-setup.sh). Mirrors the
+# dev.swym.hanzalova.internal vhost convention on this host.
+
+server {
+    listen 80;
+    server_name bench.helexa.ai;
+
+    # Keep serving the ACME webroot so certbot can renew.
+    location /.well-known/acme-challenge/ {
+        root /var/www/bench.helexa.ai;
+    }
+    location / {
+        return 301 https://$host$request_uri;
+    }
+}
+
+server {
+    listen 443 ssl;
+    http2 on;
+    server_name bench.helexa.ai;
+
+    ssl_certificate     /etc/letsencrypt/live/bench.helexa.ai/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/bench.helexa.ai/privkey.pem;
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers HIGH:!aNULL:!MD5;
+    ssl_prefer_server_ciphers on;
+    ssl_session_cache shared:SSL:10m;
+
+    root /var/www/bench.helexa.ai;
+    index index.html;
+
+    # Bench read API on bob (internal WireGuard); browser stays same-origin.
+    location /api/ {
+        proxy_pass http://bob.hanzalova.internal:13132;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 60s;
+    }
+
+    # SPA fallback — client-side routes (/trends, /runs) resolve to index.html.
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+}
--- a/asset/nginx/bench.internal.conf
+++ b/asset/nginx/bench.internal.conf
@@ -0,0 +1,34 @@
+# Internal bench UI vhost — https://bench.internal, reachable from inside
+# the WireGuard mesh (the public bench.helexa.ai dead-ends at the OPNsense
+# LAN interface, which only port-forwards :443 from the WAN). Same SPA +
+# /api→bob proxy as bench.helexa.ai, but with an internal-CA cert
+# (smallstep "lair", renewed by step@bench.timer). Mirrors the
+# *.internal vhost convention on oolon.kosherinata.internal.
+server {
+    server_name bench.internal;
+    listen 443 ssl;
+    http2 on;
+
+    ssl_certificate /etc/nginx/tls/cert/bench.internal.pem;
+    ssl_certificate_key /etc/nginx/tls/key/bench.internal.pem;
+    ssl_trusted_certificate /etc/pki/ca-trust/source/anchors/root-internal.pem;
+    ssl_protocols TLSv1.3;
+
+    # Shared webroot with the public vhost — same built SPA.
+    root /var/www/bench.helexa.ai;
+    index index.html;
+
+    location /api/ {
+        proxy_pass http://bob.hanzalova.internal:13132;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 60s;
+    }
+
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+}
--- a/asset/sudoers.d/bench-host.conf
+++ b/asset/sudoers.d/bench-host.conf
@@ -0,0 +1,25 @@
+# Install on the bench host (bob) as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@bob to roll out helexa-bench package upgrades
+# and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+#
+# helexa-bench polls the neuron fleet (outbound) and serves a read-only
+# JSON API on tcp/13132 for the bench UI — hence the firewall-cmd grants.
+
+# NOTE(review): a `*` in sudoers command arguments matches across word
+# boundaries (including spaces), so this rule accepts any rsync options
+# and sources ahead of the destination path. Consider pinning the exact
+# option string deploy.yml uses.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/helexa-bench/helexa-bench.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now helexa-bench.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
+# `=` escaped here too, for the same reason as the dnf rules above; the
+# command the workflow runs is still `--add-service=helexa-bench`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-bench --permanent
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
--- a/asset/sudoers.d/cortex-host.conf
+++ b/asset/sudoers.d/cortex-host.conf
@@ -0,0 +1,23 @@
+# Install on the cortex gateway host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<gateway> to roll out cortex package upgrades
+# and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+
+# NOTE(review): a `*` in sudoers command arguments matches across word
+# boundaries (including spaces), so each rsync rule below accepts any
+# rsync options and sources ahead of the destination path. Consider
+# pinning the exact option string deploy.yml uses.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
+# deploy-bench-ui rsyncs the built bench SPA into the nginx webroot.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /var/www/bench.helexa.ai/
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
--- a/asset/sudoers.d/neuron-host.conf
+++ b/asset/sudoers.d/neuron-host.conf
@@ -0,0 +1,43 @@
+# Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<neuron-host> to roll out helexa-neuron-<flavour>
+# package upgrades and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+#
+# All three CUDA flavours are listed because a host's flavour can change
+# (e.g. GPU swap) and we don't want the sudoers file to need to change
+# in lockstep. Only one flavour can be installed at a time (the packages
+# Conflict: with each other), so the attack surface is bounded to "wrong
+# flavour installed" — vandalism, not privilege escalation.
+
+# NOTE(review): a `*` in sudoers command arguments matches across word
+# boundaries (including spaces), so this rule accepts any rsync options
+# and sources ahead of the destination path. Consider pinning the exact
+# option string deploy.yml uses.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
+# deploy.yml writes the per-model systemd drop-in carrying
+# NEURON_MAX_PROMPT_TOKENS: gitea_ci stages it in its own dir, then
+# installs it root-owned. Exact source/dest paths; see doc/context-limits.md.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf /etc/systemd/system/neuron.service.d/model.conf
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13
+# `=` escaped here too, for the same reason as the dnf rules above; the
+# command the workflow runs is still `--add-service=helexa-neuron`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-neuron --permanent
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
+# deploy-dev.yml fast path: install a freshly-built dev binary over the
+# packaged one. Exact source path + args; the workflow must use this
+# command form verbatim. The next deploy.yml run reconciles the host
+# back to the RPM-owned binary.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
--- a/asset/systemd/step@.service
+++ b/asset/systemd/step@.service
@@ -0,0 +1,20 @@
+# Internal-CA cert renewal for %i.internal, driven by step@%i.timer.
+# Replicated from oolon.kosherinata.internal (the kosherinata DC proxy).
+# Renews an EXISTING cert via mTLS (step ca renew) — the initial cert
+# must be issued once with a provisioner (see script/infra-setup.sh).
+# Installed to /etc/systemd/system/step@.service.
+[Unit]
+Description=step cert renew for %i.internal
+Documentation=https://smallstep.com/docs/step-ca/renewal
+
+[Service]
+# One-shot job: each activation runs the commands below once and exits.
+Type=oneshot
+# Gate: ExecStart only runs when this check exits successfully, so a cert
+# that is not yet due for renewal leaves the unit skipped rather than
+# failed.
+ExecCondition=/usr/bin/step certificate needs-renewal \
+    /etc/nginx/tls/cert/%i.internal.pem
+# Renew in place against the internal CA; the cert and key files are
+# overwritten at the same paths nginx reads them from (--force).
+ExecStart=/usr/bin/step ca renew \
+    --force \
+    --ca-url https://ca.internal \
+    --root /etc/pki/ca-trust/source/anchors/root-internal.pem \
+    /etc/nginx/tls/cert/%i.internal.pem \
+    /etc/nginx/tls/key/%i.internal.pem
+# Make nginx pick up the renewed certificate.
+ExecStartPost=/usr/bin/systemctl reload nginx.service
--- a/asset/systemd/step@.timer
+++ b/asset/systemd/step@.timer
@@ -0,0 +1,15 @@
+# Periodic internal-cert renewal for %i.internal (every 15 min, jittered).
+# Replicated from oolon.kosherinata.internal. Installed to
+# /etc/systemd/system/step@.timer; enable per-cert with
+# `systemctl enable --now step@bench.timer`.
+[Unit]
+Description=step cert renew timer for %i.internal
+
+[Timer]
+# Catch up on a run that was missed while the timer was inactive.
+Persistent=true
+# Minutes 1, 16, 31 and 46 of every hour.
+OnCalendar=*:1/15
+# Do not coalesce with other timers; the only jitter is the randomized
+# delay below (up to 5 minutes per activation).
+AccuracySec=1us
+RandomizedDelaySec=5m
+
+[Install]
+WantedBy=timers.target
--- a/bench/.gitignore
+++ b/bench/.gitignore
@@ -0,0 +1,3 @@
+node_modules
+dist
+*.local
--- a/bench/README.md
+++ b/bench/README.md
@@ -0,0 +1,45 @@
+# helexa bench UI
+
+A Vite + React (SWC, TypeScript) app that visualises the fleet benchmark
+data collected by `helexa-bench`. It reads the read-only JSON API the
+bench daemon serves (`crates/helexa-bench/src/api.rs`, default
+`:13132` on bob).
+
+Stack: React Router, react-bootstrap, Recharts.
+
+## Pages
+
+- **Overview** — latest median results per (host, model, scenario) cell.
+- **Trends** — decode-tok/s and TTFT plotted across neuron build SHAs as
+  releases roll out (the headline view). Pick model / scenario; the host
+  is resolved server-side.
+- **Runs** — filterable raw-run explorer.
+
+## Develop
+
+```sh
+cd bench
+npm install
+npm run dev      # http://localhost:5173
+```
+
+`vite.config.ts` proxies `/api` → `http://bob.hanzalova.internal:13132`,
+so the dev server talks to the live bench API with no CORS fuss. Point
+the proxy elsewhere (or run a local `helexa-bench serve`) to develop
+against other data.
+
+## Production hosting
+
+Public at **https://bench.helexa.ai** — nginx on the gateway
+(`hanzalova.internal`) serves the static `dist/` and reverse-proxies
+`/api` to the bench API on bob over WireGuard, so the SPA is same-origin
+(no CORS) and the internal API stays off the public internet.
+
+- `npm run build` is run with **no** `VITE_API_BASE` (the app calls
+  `/api/...` on its own origin; nginx proxies it to bob).
+- `.gitea/workflows/deploy.yml` (`deploy-bench-ui`) builds and rsyncs
+  `dist/` to `/var/www/bench.helexa.ai` on every deploy.
+- The nginx vhost (`asset/nginx/bench.helexa.ai.conf`) and the
+  Let's Encrypt cert are one-time host setup in `script/infra-setup.sh`.
+
+To host elsewhere instead, build with
+`VITE_API_BASE=<bob-api-origin>` and serve the static `dist/`.
--- a/bench/index.html
+++ b/bench/index.html
@@ -0,0 +1,12 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>helexa bench</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
--- a/bench/package-lock.json
+++ b/bench/package-lock.json
--- a/bench/package.json
+++ b/bench/package.json
@@ -0,0 +1,28 @@
+{
+  "name": "helexa-bench-ui",
+  "private": true,
+  "version": "0.1.0",
+  "type": "module",
+  "description": "Visualisation app for helexa-bench fleet benchmark data.",
+  "scripts": {
+    "dev": "vite",
+    "build": "tsc && vite build",
+    "preview": "vite preview"
+  },
+  "dependencies": {
+    "bootstrap": "^5.3.3",
+    "react": "^18.3.1",
+    "react-bootstrap": "^2.10.5",
+    "react-dom": "^18.3.1",
+    "react-router-dom": "^6.26.2",
+    "recharts": "^2.12.7"
+  },
+  "devDependencies": {
+    "@types/node": "^20.14.0",
+    "@types/react": "^18.3.5",
+    "@types/react-dom": "^18.3.0",
+    "@vitejs/plugin-react-swc": "^3.7.0",
+    "typescript": "^5.5.4",
+    "vite": "^5.4.0"
+  }
+}
--- a/bench/src/App.tsx
+++ b/bench/src/App.tsx
@@ -0,0 +1,30 @@
+import { Container, Nav, Navbar } from "react-bootstrap";
+import { NavLink, Outlet } from "react-router-dom";
+
+/**
+ * Application shell: dark top navbar with the three page links, and an
+ * <Outlet /> below it where the matched child route renders.
+ */
+export default function App() {
+  // Nav entries in display order. `end` limits the Overview link's active
+  // state to an exact "/" match.
+  const links = [
+    { to: "/", text: "Overview", end: true },
+    { to: "/trends", text: "Trends", end: false },
+    { to: "/runs", text: "Runs", end: false },
+  ];
+
+  const navbar = (
+    <Navbar bg="dark" variant="dark" expand="md">
+      <Container>
+        <Navbar.Brand as={NavLink} to="/">
+          helexa&nbsp;bench
+        </Navbar.Brand>
+        <Nav className="me-auto">
+          {links.map((link) => (
+            <Nav.Link key={link.to} as={NavLink} to={link.to} end={link.end}>
+              {link.text}
+            </Nav.Link>
+          ))}
+        </Nav>
+      </Container>
+    </Navbar>
+  );
+
+  return (
+    <>
+      {navbar}
+      <Container className="py-4">
+        <Outlet />
+      </Container>
+    </>
+  );
+}
--- a/bench/src/api.ts
+++ b/bench/src/api.ts
@@ -0,0 +1,45 @@
+import type { Dimensions, ReportRow, RunRow, SeriesPoint } from "./types";
+
+// Empty default → `fetch('/api/...')` hits the dev proxy (vite.config.ts)
+// or the same origin. For a separately-hosted build, set VITE_API_BASE to
+// the bob API origin (e.g. http://bob.hanzalova.internal:13132).
+const BASE = import.meta.env.VITE_API_BASE ?? "";
+
+/**
+ * GET `${BASE}${path}` and parse the response body as JSON.
+ *
+ * @param path request path including any query string, e.g. "/api/summary"
+ * @returns the parsed body, typed as T (not validated at runtime)
+ * @throws Error carrying the status code, status text and response body
+ *   when the response is not ok
+ */
+async function getJson<T>(path: string): Promise<T> {
+  const response = await fetch(`${BASE}${path}`);
+  if (response.ok) {
+    return (await response.json()) as T;
+  }
+  const detail = await response.text();
+  throw new Error(`${response.status} ${response.statusText}: ${detail}`);
+}
+
+/** Fetch the filter dimensions from `/api/dimensions`. */
+export const getDimensions = () => getJson<Dimensions>("/api/dimensions");
+/** Fetch the report rows from `/api/summary`. */
+export const getSummary = () => getJson<ReportRow[]>("/api/summary");
+
+// host is resolved server-side (each model maps to one host today), so the
+// public UI selects by model + scenario alone.
+/** Fetch the series for one (model, scenario) pair from `/api/series`. */
+export const getSeries = (model: string, scenario: string) => {
+  // Both values are percent-encoded before being placed in the query string.
+  const modelParam = encodeURIComponent(model);
+  const scenarioParam = encodeURIComponent(scenario);
+  return getJson<SeriesPoint[]>(
+    `/api/series?model=${modelParam}&scenario=${scenarioParam}`,
+  );
+};
+
+/**
+ * Optional filters for `getRuns`. Each field that is set becomes a query
+ * parameter of the same name on `/api/runs`; unset fields are omitted.
+ */
+export interface RunsParams {
+  host?: string;
+  model?: string;
+  scenario?: string;
+  sha?: string;
+  /** Sent whenever it is defined, so `false` is a real filter value. */
+  ok?: boolean;
+  /** Omitted when 0 or unset (getRuns tests it for truthiness). */
+  limit?: number;
+}
+
+/**
+ * Fetch runs from `/api/runs`, passing only the filters that are set.
+ *
+ * @param p optional filters; defaults to no filters at all
+ */
+export const getRuns = (p: RunsParams = {}) => {
+  const { host, model, scenario, sha, ok, limit } = p;
+  const query = new URLSearchParams();
+
+  // String filters: an empty string counts as "not set".
+  const textFilters: [string, string | undefined][] = [
+    ["host", host],
+    ["model", model],
+    ["scenario", scenario],
+    ["sha", sha],
+  ];
+  for (const [key, value] of textFilters) {
+    if (value) query.set(key, value);
+  }
+
+  // `ok` is compared against undefined so that `false` is still sent.
+  if (ok !== undefined) query.set("ok", String(ok));
+  // Truthiness test: a limit of 0 is omitted, same as leaving it unset.
+  if (limit) query.set("limit", String(limit));
+
+  const encoded = query.toString();
+  const suffix = encoded ? `?${encoded}` : "";
+  return getJson<RunRow[]>(`/api/runs${suffix}`);
+};
--- a/bench/src/baseline.ts
+++ b/bench/src/baseline.ts
@@ -0,0 +1,52 @@
+// Pre-helexa-bench baseline, transcribed verbatim from doc/benchmarks.md.
+//
+// IMPORTANT — different measurement regime. These were measured by
+// script/bench.py *through the cortex gateway* (so TTFT/total include a
+// proxy hop), reported as medians only, before helexa-bench existed.
+// helexa-bench measures each neuron *directly*. So these points are an
+// honest historical anchor, NOT apples-to-apples with the live series —
+// the Trends view renders them dashed + labelled, never merged into the
+// live line.
+//
+// Host is inferred from the model via the doc's Fleet table
+// (beast=27B, benjy=8B, quadbrat=1.7B). Timestamps are the two 2026-06-12
+// snapshots in the doc, ordered (08:00 = pre-#11, 16:00 = post-#11) so
+// they sort before the bench era on the shared time axis.
+
+/** One pre-helexa-bench baseline measurement for a (model, scenario) cell. */
+export interface BaselinePoint {
+  /** Inferred from the model via the doc's Fleet table (see header note). */
+  host: string;
+  model: string;
+  scenario: string;
+  git_sha: string;
+  /** Snapshot timestamp; baselineFor sorts on this string. */
+  build_timestamp: string;
+  // NOTE(review): units presumably seconds / tokens-per-second, going by the
+  // field names and the Trends chart headings — confirm against
+  // doc/benchmarks.md.
+  ttft_s: number;
+  decode_tps: number;
+  total_s: number;
+}
+
+/** Source: bench.py via cortex gateway — see doc/benchmarks.md. */
+export const BASELINE_SOURCE = "bench.py · via cortex gateway";
+
+export const BASELINE: BaselinePoint[] = [
+  // ── 8f6f1d3 — baseline (2026-06-12) ────────────────────────────────
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.658, decode_tps: 35.0, total_s: 8.981 },
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 7.067, decode_tps: 33.7, total_s: 14.63 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.884, decode_tps: 62.4, total_s: 4.938 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.818, decode_tps: 46.5, total_s: 7.27 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.685, decode_tps: 81.3, total_s: 3.741 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 2.743, decode_tps: 35.4, total_s: 9.884 },
+  // ── a1952a4 — post prefix-KV-cache (#11, 2026-06-12) ───────────────
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.355, decode_tps: 45.8, total_s: 4.147 },
+  { host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.431, decode_tps: 43.3, total_s: 4.387 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.886, decode_tps: 78.6, total_s: 2.478 },
+  { host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.824, decode_tps: 58.3, total_s: 3.969 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.702, decode_tps: 104.8, total_s: 1.895 },
+  { host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 2.749, decode_tps: 44.9, total_s: 5.534 },
+];
+
+/** Baseline points for one (model, scenario) cell, oldest first. */
+/**
+ * Collect the baseline points whose model and scenario both match exactly,
+ * ordered by `build_timestamp` ascending. Returns a fresh array; BASELINE
+ * itself is not reordered.
+ */
+export function baselineFor(model: string, scenario: string): BaselinePoint[] {
+  const matches: BaselinePoint[] = [];
+  for (const point of BASELINE) {
+    if (point.model === model && point.scenario === scenario) {
+      matches.push(point);
+    }
+  }
+  matches.sort((x, y) => x.build_timestamp.localeCompare(y.build_timestamp));
+  return matches;
+}
--- a/bench/src/main.tsx
+++ b/bench/src/main.tsx
@@ -0,0 +1,22 @@
+import React from "react";
+import ReactDOM from "react-dom/client";
+import { BrowserRouter, Route, Routes } from "react-router-dom";
+import "bootstrap/dist/css/bootstrap.min.css";
+import App from "./App";
+import Overview from "./pages/Overview";
+import Trends from "./pages/Trends";
+import Runs from "./pages/Runs";
+
+// Entry point: mount the app on the #root element from index.html.
+// App is the layout route; Overview is its index route, and Trends / Runs
+// render at /trends and /runs. The `!` asserts that #root exists.
+ReactDOM.createRoot(document.getElementById("root")!).render(
+  <React.StrictMode>
+    <BrowserRouter>
+      <Routes>
+        <Route path="/" element={<App />}>
+          <Route index element={<Overview />} />
+          <Route path="trends" element={<Trends />} />
+          <Route path="runs" element={<Runs />} />
+        </Route>
+      </Routes>
+    </BrowserRouter>
+  </React.StrictMode>,
+);
--- a/bench/src/pages/Overview.tsx
+++ b/bench/src/pages/Overview.tsx
@@ -0,0 +1,64 @@
+import { useEffect, useState } from "react";
+import { Alert, Spinner, Table } from "react-bootstrap";
+import { getSummary } from "../api";
+import type { ReportRow } from "../types";
+
+/** Format `n` with `p` decimals (default 2); a missing value renders as "—". */
+const f = (n: number | null, p = 2) => {
+  if (n == null) return "—";
+  return n.toFixed(p);
+};
+
+/**
+ * Overview page: fetches the summary once on mount and renders it as a
+ * table. Shows a spinner while loading and an alert if the fetch failed.
+ */
+export default function Overview() {
+  const [rows, setRows] = useState<ReportRow[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    const load = async () => {
+      try {
+        setRows(await getSummary());
+      } catch (e) {
+        setErr(String(e));
+      } finally {
+        setLoading(false);
+      }
+    };
+    void load();
+  }, []);
+
+  if (loading) return <Spinner animation="border" />;
+  if (err) return <Alert variant="danger">{err}</Alert>;
+
+  // Column headings in display order; numeric columns are right-aligned.
+  const columns: { title: string; numeric: boolean }[] = [
+    { title: "GPU", numeric: false },
+    { title: "model", numeric: false },
+    { title: "prompt tok", numeric: true },
+    { title: "TTFT (s)", numeric: true },
+    { title: "decode tok/s", numeric: true },
+    { title: "total (s)", numeric: true },
+    { title: "build", numeric: false },
+    { title: "n", numeric: true },
+  ];
+
+  const body = rows.map((row, index) => (
+    <tr key={index}>
+      <td>{row.gpu ?? row.target_name}</td>
+      <td>{row.model_id}</td>
+      <td className="text-end">
+        {/* Falls back to the approximate size when no token count exists. */}
+        {row.prompt_tokens ?? `~${row.prompt_size_approx}`}
+      </td>
+      <td className="text-end">{f(row.ttft_s_median, 3)}</td>
+      <td className="text-end">{f(row.decode_tps_median, 1)}</td>
+      <td className="text-end">{f(row.total_s_median, 3)}</td>
+      <td>
+        <code>{row.git_sha}</code>
+      </td>
+      <td className="text-end">{row.samples}</td>
+    </tr>
+  ));
+
+  return (
+    <>
+      <h3 className="mb-3">Latest results per cell</h3>
+      <p className="text-muted">
+        Median of each cell's samples on the most recent build seen for that
+        (host, model, scenario).
+      </p>
+      <Table striped bordered hover responsive size="sm">
+        <thead>
+          <tr>
+            {columns.map((col) => (
+              <th key={col.title} className={col.numeric ? "text-end" : undefined}>
+                {col.title}
+              </th>
+            ))}
+          </tr>
+        </thead>
+        <tbody>{body}</tbody>
+      </Table>
+    </>
+  );
+}
--- a/bench/src/pages/Runs.tsx
+++ b/bench/src/pages/Runs.tsx
@@ -0,0 +1,141 @@
+import { useEffect, useState } from "react";
+import { Alert, Badge, Col, Form, Row, Spinner, Table } from "react-bootstrap";
+import { getDimensions, getRuns } from "../api";
+import type { Dimensions, RunRow } from "../types";
+
+/** Fixed-point formatter: `p` decimals (default 2), or "—" for a missing value. */
+const f = (n: number | null, p = 2) => {
+  return n == null ? "—" : n.toFixed(p);
+};
+
+/**
+ * Labelled dropdown filter. The first entry, "(all)", has an empty value;
+ * the rest are `options` in the given order. Selecting an entry calls
+ * `set` with that entry's value.
+ */
+function Picker(props: {
+  label: string;
+  value: string;
+  set: (v: string) => void;
+  options: string[];
+}) {
+  const { label, value, set, options } = props;
+
+  const choices = options.map((choice) => (
+    <option key={choice} value={choice}>
+      {choice}
+    </option>
+  ));
+
+  return (
+    <Form.Group as={Col}>
+      <Form.Label>{label}</Form.Label>
+      <Form.Select value={value} onChange={(event) => set(event.target.value)}>
+        <option value="">(all)</option>
+        {choices}
+      </Form.Select>
+    </Form.Group>
+  );
+}
+
+/**
+ * Runs page: a filterable table of raw runs (at most 200 per request).
+ *
+ * Filter options come from the dimensions endpoint; the table is refetched
+ * whenever a filter changes. Responses belonging to a superseded request
+ * are discarded, so a slow reply for old filters can neither overwrite the
+ * rows for the current ones nor switch the spinner off early. A fetch
+ * error is shown in place of the table, leaving the filters usable so the
+ * next selection retries.
+ */
+export default function Runs() {
+  const [dims, setDims] = useState<Dimensions | null>(null);
+  const [host, setHost] = useState("");
+  const [model, setModel] = useState("");
+  const [scenario, setScenario] = useState("");
+  const [rows, setRows] = useState<RunRow[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+  const [loading, setLoading] = useState(false);
+
+  // Load the filter options once on mount.
+  useEffect(() => {
+    let ignore = false;
+    getDimensions()
+      .then((d) => {
+        if (!ignore) setDims(d);
+      })
+      .catch((e) => {
+        if (!ignore) setErr(String(e));
+      });
+    return () => {
+      ignore = true;
+    };
+  }, []);
+
+  // Refetch the runs whenever a filter changes.
+  useEffect(() => {
+    // Set by the cleanup once this request has been superseded (filters
+    // changed again, or the component unmounted).
+    let ignore = false;
+    setErr(null);
+    setLoading(true);
+    getRuns({
+      host: host || undefined,
+      model: model || undefined,
+      scenario: scenario || undefined,
+      limit: 200,
+    })
+      .then((r) => {
+        if (!ignore) setRows(r);
+      })
+      .catch((e) => {
+        if (!ignore) setErr(String(e));
+      })
+      .finally(() => {
+        if (!ignore) setLoading(false);
+      });
+    return () => {
+      ignore = true;
+    };
+  }, [host, model, scenario]);
+
+  return (
+    <>
+      <h3 className="mb-3">Runs</h3>
+      {dims && (
+        <Row className="g-3 mb-3">
+          {/* GPU filter — labelled by GPU, but filters by the underlying host. */}
+          <Form.Group as={Col}>
+            <Form.Label>GPU</Form.Label>
+            <Form.Select value={host} onChange={(e) => setHost(e.target.value)}>
+              <option value="">(all)</option>
+              {dims.hosts.map((h) => (
+                <option key={h} value={h}>
+                  {dims.host_gpus[h] ?? h}
+                </option>
+              ))}
+            </Form.Select>
+          </Form.Group>
+          <Picker
+            label="Model"
+            value={model}
+            set={setModel}
+            options={dims.models}
+          />
+          <Picker
+            label="Scenario"
+            value={scenario}
+            set={setScenario}
+            options={dims.scenarios}
+          />
+        </Row>
+      )}
+      {err ? (
+        <Alert variant="danger">{err}</Alert>
+      ) : loading ? (
+        <Spinner animation="border" />
+      ) : (
+        <Table striped bordered hover responsive size="sm">
+          <thead>
+            <tr>
+              <th>ts</th>
+              <th>GPU</th>
+              <th>model</th>
+              <th>scenario</th>
+              <th>build</th>
+              <th className="text-end">TTFT</th>
+              <th className="text-end">tok/s</th>
+              <th className="text-end">total</th>
+              <th>ok</th>
+            </tr>
+          </thead>
+          <tbody>
+            {rows.map((r) => (
+              <tr key={r.id}>
+                <td>{r.ts}</td>
+                <td>{r.gpu ?? r.host}</td>
+                <td>{r.model_id}</td>
+                <td>{r.scenario_id}</td>
+                <td>
+                  <code>{r.git_sha}</code>
+                </td>
+                <td className="text-end">{f(r.ttft_s, 3)}</td>
+                <td className="text-end">{f(r.decode_tps, 1)}</td>
+                <td className="text-end">{f(r.total_s, 3)}</td>
+                <td>
+                  {r.ok ? (
+                    <Badge bg="success">ok</Badge>
+                  ) : (
+                    // The failure text is exposed as the badge's hover title.
+                    <Badge bg="danger" title={r.error ?? ""}>
+                      fail
+                    </Badge>
+                  )}
+                </td>
+              </tr>
+            ))}
+          </tbody>
+        </Table>
+      )}
+    </>
+  );
+}
--- a/bench/src/pages/Trends.tsx
+++ b/bench/src/pages/Trends.tsx
@@ -0,0 +1,221 @@
+import { useEffect, useMemo, useState } from "react";
+import { Alert, Col, Form, Row, Spinner } from "react-bootstrap";
+import {
+  CartesianGrid,
+  Legend,
+  Line,
+  LineChart,
+  ReferenceLine,
+  ResponsiveContainer,
+  Tooltip,
+  XAxis,
+  YAxis,
+} from "recharts";
+import { getDimensions, getSeries } from "../api";
+import type { Dimensions, SeriesPoint } from "../types";
+import { BASELINE_SOURCE, baselineFor } from "../baseline";
+
+/**
+ * Labelled dropdown listing `options` in the given order (no "(all)"
+ * entry). Selecting an entry calls `set` with that entry's value.
+ */
+function Picker(props: {
+  label: string;
+  value: string;
+  set: (v: string) => void;
+  options: string[];
+}) {
+  const { label, value, set, options } = props;
+
+  const choices = options.map((choice) => (
+    <option key={choice} value={choice}>
+      {choice}
+    </option>
+  ));
+
+  return (
+    <Form.Group as={Col}>
+      <Form.Label>{label}</Form.Label>
+      <Form.Select value={value} onChange={(event) => set(event.target.value)}>
+        {choices}
+      </Form.Select>
+    </Form.Group>
+  );
+}
+
+/** One chart row: carries either the live-series keys or the baseline keys. */
+interface TrendRow {
+  label: string;
+  ttft?: number | null;
+  decode?: number | null;
+  total?: number | null;
+  baseTtft?: number;
+  baseDecode?: number;
+  baseTotal?: number;
+}
+
+/**
+ * One metric drawn as a line chart over the build labels: a solid line for
+ * the live series, an optional dashed grey line for the baseline, and an
+ * optional vertical divider at the label given by `dividerAt`.
+ */
+function TrendChart({
+  data,
+  liveKey,
+  liveName,
+  stroke,
+  baseKey,
+  showBaseline,
+  dividerAt,
+}: {
+  data: TrendRow[];
+  liveKey: keyof TrendRow;
+  liveName: string;
+  stroke: string;
+  baseKey: keyof TrendRow;
+  showBaseline: boolean;
+  dividerAt: string | undefined;
+}) {
+  return (
+    <ResponsiveContainer width="100%" height={280}>
+      <LineChart data={data} margin={{ top: 8, right: 24, bottom: 8, left: 0 }}>
+        <CartesianGrid strokeDasharray="3 3" />
+        <XAxis dataKey="label" />
+        <YAxis />
+        <Tooltip />
+        <Legend />
+        {dividerAt !== undefined && (
+          <ReferenceLine
+            x={dividerAt}
+            stroke="#bbb"
+            strokeDasharray="3 3"
+            label={{
+              value: "bench.py → helexa-bench",
+              position: "top",
+              fill: "#999",
+              fontSize: 11,
+            }}
+          />
+        )}
+        <Line
+          type="monotone"
+          dataKey={liveKey}
+          name={liveName}
+          stroke={stroke}
+          connectNulls
+        />
+        {showBaseline && (
+          <Line
+            type="monotone"
+            dataKey={baseKey}
+            name="baseline (bench.py · gateway)"
+            stroke="#888"
+            strokeDasharray="5 5"
+            connectNulls
+          />
+        )}
+      </LineChart>
+    </ResponsiveContainer>
+  );
+}
+
+/**
+ * Trends page: decode tok/s and TTFT across builds for one (model,
+ * scenario) selection, with the pre-helexa-bench baseline drawn dashed to
+ * the left of the live series.
+ *
+ * The first model and scenario returned by the dimensions endpoint are
+ * selected by default. Series responses belonging to a superseded
+ * selection are discarded, so a slow reply cannot replace the chart for
+ * the current selection.
+ */
+export default function Trends() {
+  const [dims, setDims] = useState<Dimensions | null>(null);
+  const [model, setModel] = useState("");
+  const [scenario, setScenario] = useState("");
+  const [series, setSeries] = useState<SeriesPoint[]>([]);
+  const [err, setErr] = useState<string | null>(null);
+
+  // Load the picker options once and preselect the first of each.
+  useEffect(() => {
+    let ignore = false;
+    getDimensions()
+      .then((d) => {
+        if (ignore) return;
+        setDims(d);
+        if (d.models[0]) setModel(d.models[0]);
+        if (d.scenarios[0]) setScenario(d.scenarios[0]);
+      })
+      .catch((e) => {
+        if (!ignore) setErr(String(e));
+      });
+    return () => {
+      ignore = true;
+    };
+  }, []);
+
+  // Refetch the series whenever the selection changes.
+  useEffect(() => {
+    // Set by the cleanup once this request has been superseded (selection
+    // changed again, or the component unmounted).
+    let ignore = false;
+    if (model && scenario) {
+      getSeries(model, scenario)
+        .then((s) => {
+          if (!ignore) setSeries(s);
+        })
+        .catch((e) => {
+          if (!ignore) setErr(String(e));
+        });
+    }
+    return () => {
+      ignore = true;
+    };
+  }, [model, scenario]);
+
+  // Prepend the pre-helexa-bench baseline (dashed, separate keys) so it
+  // anchors the timeline without being merged into the live line. Different
+  // measurement regime — see baseline.ts / doc/benchmarks.md.
+  const base = useMemo(
+    () => baselineFor(model, scenario),
+    [model, scenario],
+  );
+  const data = useMemo<TrendRow[]>(
+    () => [
+      ...base.map((p) => ({
+        label: p.git_sha,
+        baseTtft: p.ttft_s,
+        baseDecode: p.decode_tps,
+        baseTotal: p.total_s,
+      })),
+      ...series.map((p) => ({
+        label: p.git_sha,
+        ttft: p.ttft_s_median,
+        decode: p.decode_tps_median,
+        total: p.total_s_median,
+      })),
+    ],
+    [series, base],
+  );
+
+  // Divider marking the boundary between the two regimes (drawn at the
+  // first live build, with baseline points to its left). Only shown when
+  // both regimes have at least one point.
+  const firstLive = series[0]?.git_sha;
+  const showDivider = base.length > 0 && series.length > 0;
+  const dividerAt = showDivider && firstLive ? firstLive : undefined;
+  const hasBaseline = base.length > 0;
+
+  if (err) return <Alert variant="danger">{err}</Alert>;
+  if (!dims) return <Spinner animation="border" />;
+
+  return (
+    <>
+      <h3 className="mb-3">Trends over builds</h3>
+      <Row className="g-3 mb-4">
+        <Picker
+          label="Model"
+          value={model}
+          set={setModel}
+          options={dims.models}
+        />
+        <Picker
+          label="Scenario"
+          value={scenario}
+          set={setScenario}
+          options={dims.scenarios}
+        />
+      </Row>
+
+      {dims.model_gpus[model] && (
+        <p className="text-muted mb-3">
+          Measured on <strong>{dims.model_gpus[model]}</strong>.
+        </p>
+      )}
+
+      {data.length === 0 ? (
+        <Alert variant="info">No data for this selection yet.</Alert>
+      ) : (
+        <>
+          {hasBaseline && (
+            <p className="text-muted small mb-3">
+              Dashed = pre-helexa-bench baseline ({BASELINE_SOURCE}); solid =
+              helexa-bench (direct to neuron). Different measurement regimes —
+              see <code>doc/benchmarks.md</code>.
+            </p>
+          )}
+          <h5 className="mt-3">decode tok/s (higher is better)</h5>
+          <TrendChart
+            data={data}
+            liveKey="decode"
+            liveName="decode tok/s"
+            stroke="#0d6efd"
+            baseKey="baseDecode"
+            showBaseline={hasBaseline}
+            dividerAt={dividerAt}
+          />
+
+          <h5 className="mt-4">TTFT seconds (lower is better)</h5>
+          <TrendChart
+            data={data}
+            liveKey="ttft"
+            liveName="TTFT (s)"
+            stroke="#dc3545"
+            baseKey="baseTtft"
+            showBaseline={hasBaseline}
+            dividerAt={dividerAt}
+          />
+        </>
+      )}
+    </>
+  );
+}
--- a/bench/src/types.ts
+++ b/bench/src/types.ts
@@ -0,0 +1,69 @@
+// Mirrors the JSON served by helexa-bench's read API (crates/helexa-bench/src/api.rs).
+
+export interface BuildRef { // element type of `Dimensions.builds`
+  git_sha: string;
+  build_timestamp: string | null; // nullable; format not shown here (presumably RFC3339 — confirm)
+  package_version: string | null;
+}
+
+export interface Dimensions { // NOTE(review): presumably the distinct values to filter on — confirm in api.rs
+  hosts: string[];
+  models: string[];
+  scenarios: string[];
+  builds: BuildRef[];
+  /** host → GPU label, e.g. "2× RTX 5090". */
+  host_gpus: Record<string, string>;
+  /** model → GPU label (model maps to one host today). */
+  model_gpus: Record<string, string>;
+}
+
+/** Latest-SHA-per-cell medians (the report table). */
+export interface ReportRow {
+  target_name: string; // NOTE(review): presumably the value RunRow exposes as `host` — confirm
+  model_id: string;
+  scenario_id: string;
+  prompt_size_approx: number;
+  git_sha: string;
+  prompt_tokens: number | null;
+  ttft_s_median: number | null;
+  decode_tps_median: number | null;
+  total_s_median: number | null;
+  samples: number; // presumably how many runs the medians summarise — confirm
+  /** Public-facing resource name (the host's GPU(s)). */
+  gpu: string | null;
+}
+
+/** One point in a per-build time-series for a (host, model, scenario) cell. */
+export interface SeriesPoint {
+  git_sha: string;
+  build_timestamp: string | null;
+  package_version: string | null;
+  ttft_s_median: number | null; // nullable; presumably seconds (`_s` suffix) — confirm
+  decode_tps_median: number | null; // nullable; presumably decode tokens per second — confirm
+  total_s_median: number | null;
+  samples: number;
+}
+
+export interface RunRow { // one row per recorded run; per-run values rather than medians
+  id: number;
+  ts: string;
+  host: string;
+  /** Public-facing resource name (the host's GPU(s)). */
+  gpu: string | null;
+  hostname: string | null;
+  git_sha: string;
+  build_timestamp: string | null;
+  package_version: string; // NOTE(review): non-null here, nullable in BuildRef/SeriesPoint — confirm
+  model_id: string;
+  harness: string;
+  scenario_id: string;
+  prompt_size_approx: number;
+  prompt_tokens_actual: number | null;
+  max_tokens: number;
+  ttft_s: number | null;
+  decode_tps: number | null;
+  total_s: number | null;
+  completion_tokens: number | null;
+  ok: boolean;
+  error: string | null; // presumably set only when `ok` is false — confirm
+}
--- a/bench/src/vite-env.d.ts
+++ b/bench/src/vite-env.d.ts
@@ -0,0 +1,9 @@
+/// <reference types="vite/client" />
+
+interface ImportMetaEnv {
+  /** Base origin of the bench API. Unset or empty → use the dev proxy / same origin. */
+  readonly VITE_API_BASE?: string;
+}
+interface ImportMeta { // types `import.meta.env` with the fields declared above
+  readonly env: ImportMetaEnv;
+}
--- a/bench/tsconfig.json
+++ b/bench/tsconfig.json
@@ -0,0 +1,22 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "useDefineForClassFields": true,
+    "lib": ["ES2022", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "types": ["node", "vite/client"]
+  },
+  "include": ["src", "vite.config.ts"]
+}
--- a/bench/vite.config.ts
+++ b/bench/vite.config.ts
@@ -0,0 +1,18 @@
+import { defineConfig } from "vite";
+import react from "@vitejs/plugin-react-swc";
+
+// Dev server proxies /api to the bench API on host "bob" so `fetch('/api/...')`
+// works without CORS/mixed-origin fuss during local development.
+// For a production build hosted elsewhere, set VITE_API_BASE to the bob
+// API origin (e.g. http://bob.hanzalova.internal:13132) instead.
+export default defineConfig({
+  plugins: [react()],
+  server: {
+    proxy: {
+      "/api": {
+        target: "http://bob.hanzalova.internal:13132",
+        changeOrigin: true, // proxied requests carry the target's Host header, not the dev server's
+      },
+    },
+  },
+});
--- a/cortex.example.toml
+++ b/cortex.example.toml
@@ -5,6 +5,11 @@
 # Environment variable overrides use CORTEX_ prefix with __ separators:
 #   CORTEX_GATEWAY__LISTEN=0.0.0.0:31313

+# Path to the model catalogue (limits, cost, pinning, aliases, feasibility).
+# Defaults to the packaged location below; uncomment to override for a
+# non-packaged / local run.
+# models_config = "/etc/cortex/models.toml"
+
 [gateway]
 listen = "0.0.0.0:31313"
 metrics_listen = "0.0.0.0:31314"
@@ -43,3 +48,45 @@ vram_mb = 12288           # e.g. RTX 3060 (12 GB)
 pinned = [
    "your-org/embedding-model",
 ]
+
+# -- Entitlements (multi-tenant governance, #47) -------------------------
+# Identity + per-key token budgets. Omit this section entirely for the
+# legacy single-operator behaviour: requests are anonymous and uncapped.
+#
+# The local/static provider below is the source of truth for accounts,
+# keys, and hard caps until the upstream clearing house exists. Identity
+# is carried by standard bearer auth only — clients send
+#   Authorization: Bearer <key>
+# and nothing else: no custom headers or body fields.
+
+[entitlements]
+# Reject unauthenticated requests with 401 invalid_api_key. Leave false
+# (allow-anonymous) during rollout; flip to true once keys are issued.
+require_auth = false
+
+# One entry per API key.
+[[entitlements.keys]]
+key = "sk-example-rolling"        # the bearer token the client sends
+account_id = "team-research"      # billable account (keys may share one)
+key_id = "research-ci"            # stable label for ledger/metrics (optional)
+hard_cap = 5_000_000              # hard token cap over the window
+# Rolling window that resets — over-cap requests get 429 rate_limit_exceeded
+# + Retry-After, so well-behaved clients (opencode/AI SDK) back off and retry.
+window = { kind = "rolling", seconds = 3600 }
+
+[[entitlements.keys]]
+key = "sk-example-balance"
+account_id = "team-research"
+key_id = "research-prepaid"
+hard_cap = 20_000_000
+# Hard balance, no reset — exhaustion returns 429 insufficient_quota
+# (the client surfaces the error and stops). This is the default when
+# `window` is omitted. Budget exhaustion is never reported as 402.
+window = { kind = "balance" }
+
+[[entitlements.keys]]
+key = "sk-example-infra"
+account_id = "operator"
+key_id = "infra"
+# No hard_cap → uncapped operator infra key (own fleet, own use). Still
+# metered for visibility.
--- a/cortex.spec
+++ b/cortex.spec
@@ -4,7 +4,7 @@ Release:        1%{?dist}
 Summary:        Inference gateway for multi-node GPU clusters

 License:        GPL-3.0-or-later
-URL:            https://git.lair.cafe/helexa/cortex
+URL:            https://git.lair.cafe/helexa/helexa
 Source0:        %{name}-%{version}.tar.gz
 Source1:        %{name}-%{version}-vendor.tar.gz

--- a/crates/cortex-core/src/build_info.rs
+++ b/crates/cortex-core/src/build_info.rs
@@ -0,0 +1,119 @@
+//! Build/version metadata shared between cortex and neuron.
+//!
+//! neuron captures these facts at compile time in its `build.rs`
+//! (git SHA, enabled cargo features, rustc/candle versions, …) and
+//! serves them from `GET /version`. cortex and `helexa-bench`
+//! deserialize the same struct so a benchmark run can be attributed to
+//! the exact daemon build that produced it — not just the host's CUDA
+//! and driver versions that `/discovery` already reports.
+//!
+//! Every field beyond the always-present package version is
+//! `#[serde(default)]` so a newer reader stays compatible with an
+//! older neuron that omits a field (and vice versa) — the same
+//! forward/backward-compat discipline as
+//! [`crate::discovery::ActivationStatus`].
+
+use serde::{Deserialize, Serialize};
+
+/// Build-time identity of a neuron daemon.
+///
+/// Returned by `GET /version`. The `git_sha` is the canonical "which
+/// build is live" key — benchmark records are bucketed by it, so a
+/// regression can be pinned to a daemon change rather than a host
+/// change. When neuron is built from a source tarball with no git
+/// metadata available (and no `HELEXA_BUILD_SHA` injected by CI/RPM),
+/// `git_sha` is the string `"unknown"`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct BuildInfo {
+    /// Crate version from `CARGO_PKG_VERSION` (e.g. `"0.1.16"`); the only required field.
+    pub package_version: String,
+    /// Short git SHA; `"unknown"` when unavailable at build time or absent from the payload.
+    #[serde(default = "unknown")]
+    pub git_sha: String,
+    /// Full 40-char git SHA when available.
+    #[serde(default)]
+    pub git_sha_long: Option<String>,
+    /// Whether the working tree had uncommitted changes at build time.
+    /// `false` when the SHA is unknown (tarball build).
+    #[serde(default)]
+    pub git_dirty: bool,
+    /// RFC3339 build timestamp.
+    #[serde(default)]
+    pub build_timestamp: Option<String>,
+    /// `rustc --version` output of the compiler used.
+    #[serde(default)]
+    pub rustc_version: Option<String>,
+    /// Cargo build profile: `"release"` or `"debug"`.
+    #[serde(default)]
+    pub profile: Option<String>,
+    /// Target triple the binary was compiled for.
+    #[serde(default)]
+    pub target: Option<String>,
+    /// Enabled cargo features (e.g. `["cuda", "cudnn"]`). These define
+    /// the performance envelope, so they are recorded against every
+    /// benchmark run. Empty when the payload omits the field.
+    #[serde(default)]
+    pub features: Vec<String>,
+    /// Locked `candle-core` version, best-effort from `Cargo.lock`.
+    #[serde(default)]
+    pub candle_version: Option<String>,
+}
+
+fn unknown() -> String { // serde default for `git_sha`; reused by `BuildInfo::unknown`
+    "unknown".to_string()
+}
+
+impl BuildInfo {
+    /// Placeholder for non-neuron benchmark targets (and tests) with no build metadata:
+    /// `git_sha` is `"unknown"`; `package_version` is this crate's own compile-time version.
+    pub fn unknown() -> Self {
+        BuildInfo {
+            package_version: env!("CARGO_PKG_VERSION").to_string(),
+            git_sha: unknown(),
+            git_sha_long: None,
+            git_dirty: false,
+            build_timestamp: None,
+            rustc_version: None,
+            profile: None,
+            target: None,
+            features: Vec::new(),
+            candle_version: None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn round_trips_full() { // every field populated; serialize → deserialize must compare equal
+        let info = BuildInfo {
+            package_version: "0.1.16".into(),
+            git_sha: "30d50d6".into(),
+            git_sha_long: Some("30d50d6abc123".into()), // not a real 40-char SHA; any string works
+            git_dirty: true,
+            build_timestamp: Some("2026-06-13T10:00:00+00:00".into()),
+            rustc_version: Some("rustc 1.85.0".into()),
+            profile: Some("release".into()),
+            target: Some("x86_64-unknown-linux-gnu".into()),
+            features: vec!["cuda".into(), "cudnn".into()],
+            candle_version: Some("0.10.2".into()),
+        };
+        let json = serde_json::to_string(&info).unwrap();
+        let back: BuildInfo = serde_json::from_str(&json).unwrap();
+        assert_eq!(info, back);
+    }
+
+    #[test]
+    fn deserializes_minimal_payload() {
+        // An older neuron might send only the package version; every
+        // other field must fall back to its serde default rather than fail.
+        let back: BuildInfo = serde_json::from_str(r#"{"package_version":"0.1.0"}"#).unwrap();
+        assert_eq!(back.package_version, "0.1.0");
+        assert_eq!(back.git_sha, "unknown");
+        assert!(!back.git_dirty);
+        assert!(back.features.is_empty());
+        assert!(back.candle_version.is_none());
+    }
+}
--- a/crates/cortex-core/src/catalogue.rs
+++ b/crates/cortex-core/src/catalogue.rs
@@ -1,6 +1,7 @@
 //! Model catalogue — profiles describing how to serve each model.

 use crate::discovery::DeviceInfo;
+use crate::harness::{ModelCost, ModelLimit};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::path::Path;
@@ -24,6 +25,32 @@ pub struct ModelProfile {
    /// Neurons where this model should never be evicted.
    #[serde(default)]
    pub pinned_on: Vec<String>,
+    /// Source scheme this profile's weights come from. When set, the
+    /// router prefixes `id` with `scheme:` before forwarding the load
+    /// request to neuron, ensuring the daemon fetches from the right
+    /// registry regardless of which entry happens to match `id`.
+    ///
+    /// `None` lets neuron substitute its own `default_source` (typically
+    /// `huggingface`). Set to `"helexa"` when the model is hosted in
+    /// the helexa registry — operator-procurement-grade audit relies
+    /// on this being explicit per model rather than implicit.
+    #[serde(default)]
+    pub source: Option<String>,
+
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Per-model token budget. When present, advertised in `/v1/models`
+    /// so clients can size and compact their context automatically.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing (USD per 1M tokens). `0.0` for self-hosted.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// Static capability flags the operator wants to advertise even
+    /// before the model is loaded on any neuron (e.g. `"reasoning"`,
+    /// `"tool_call"`). Runtime-detected capabilities from the harness
+    /// are unioned with this set in the gateway's `/v1/models` response.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
 }

 fn default_min_devices() -> u32 {
@@ -140,6 +167,10 @@ mod tests {
            min_devices: 2,
            min_device_vram_mb: Some(24_000),
            pinned_on: vec![],
+            source: None,
+            limit: None,
+            cost: None,
+            capabilities: vec![],
        }
    }

@@ -197,6 +228,29 @@ mod tests {
        assert_eq!(cat.resolve_alias("Qwen/Qwen3-8B"), "Qwen/Qwen3-8B");
    }

+    #[test]
+    fn source_defaults_to_none_when_absent_from_toml() { // omitted `source` key → `None`
+        let src = r#"
+[[models]]
+id = "Qwen/Qwen3-30B"
+harness = "candle"
+"#;
+        let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
+        assert!(cat.models[0].source.is_none());
+    }
+
+    #[test]
+    fn source_round_trips_through_toml() { // parse-only despite the name: nothing is serialized back
+        let src = r#"
+[[models]]
+id = "Helexa/Qwen3.6-27B-Uncensored"
+harness = "candle"
+source = "helexa"
+"#;
+        let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
+        assert_eq!(cat.models[0].source.as_deref(), Some("helexa"));
+    }
+
    #[test]
    fn aliases_table_round_trips_through_toml() {
        let src = r#"
--- a/crates/cortex-core/src/config.rs
+++ b/crates/cortex-core/src/config.rs
@@ -1,3 +1,4 @@
+use crate::entitlements::CapWindow;
 use figment::{
    Figment,
    providers::{Env, Format, Toml},
@@ -11,13 +12,61 @@ pub struct GatewayConfig {
    pub eviction: EvictionSettings,
    /// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
    pub neurons: Vec<NeuronEndpoint>,
-    /// Path to the model catalogue file (default: "models.toml").
+    /// Path to the model catalogue file. Defaults to the packaged
+    /// location (`/etc/cortex/models.toml`); set explicitly for
+    /// non-packaged / local runs.
    #[serde(default = "default_models_path")]
    pub models_config: String,
+    /// Multi-tenant governance: auth + per-key token budgets (#47). Empty
+    /// by default — anonymous, uncapped — so existing single-operator
+    /// setups keep working until keys are configured.
+    #[serde(default)]
+    pub entitlements: EntitlementsConfig,
+}
+
+/// `[entitlements]` — the local/static [`crate::entitlements::EntitlementProvider`]
+/// source of truth (#50). Accounts, keys, and hard caps live here; the
+/// future upstream client (#57) ignores this section.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct EntitlementsConfig {
+    /// Reject unauthenticated requests with `401 invalid_api_key` when
+    /// true. Default `false` (allow-anonymous) for dev / single-operator
+    /// continuity.
+    #[serde(default)]
+    pub require_auth: bool,
+    /// Static API keys and their budgets, consumed by the local provider. Empty by default.
+    #[serde(default)]
+    pub keys: Vec<ApiKeyConfig>,
+}
+
+/// One configured API key: the bearer token, the account it bills to, and
+/// its hard cap. `[[entitlements.keys]]` in TOML.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ApiKeyConfig {
+    /// Bearer token sent as `Authorization: Bearer <key>`. Secret — the derived `Debug` prints it.
+    pub key: String,
+    /// Billable account. Multiple keys may share one account.
+    pub account_id: String,
+    /// Stable per-key identifier for ledger/metrics labels. Defaults to
+    /// `account_id` when omitted, so the secret is never used as a label.
+    #[serde(default)]
+    pub key_id: Option<String>,
+    /// Hard token cap. `None`/omitted = uncapped (e.g. operator infra key).
+    #[serde(default)]
+    pub hard_cap: Option<u64>,
+    /// Cap-window semantics. Default: a non-resetting [`CapWindow::Balance`].
+    #[serde(default)]
+    pub window: CapWindow,
 }

 fn default_models_path() -> String {
-    "models.toml".into()
+    // Absolute, so the systemd-launched binary finds the catalogue
+    // regardless of its working directory. The RPM installs the catalogue
+    // here (`cortex.spec`); a relative "models.toml" silently resolved to
+    // the service cwd and left the catalogue empty in production
+    // (pinning / aliases / limits all no-ops). Override via `models_config`
+    // in cortex.toml for local runs.
+    "/etc/cortex/models.toml".into()
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -79,6 +128,7 @@ impl Default for GatewayConfig {
            },
            neurons: vec![],
            models_config: default_models_path(),
+            entitlements: EntitlementsConfig::default(),
        }
    }
 }
--- a/crates/cortex-core/src/discovery.rs
+++ b/crates/cortex-core/src/discovery.rs
@@ -22,6 +22,23 @@ pub struct DiscoveryResponse {
    pub driver_version: Option<String>,
    pub devices: Vec<DeviceInfo>,
    pub harnesses: Vec<String>,
+    /// Set when the host has an NVIDIA stack that is currently
+    /// unusable — specifically the userspace↔kernel-module version
+    /// skew after an un-rebooted driver update ("Driver/library
+    /// version mismatch"), where every CUDA call including nvidia-smi
+    /// fails (#19). `None` on healthy hosts AND on hosts with no
+    /// NVIDIA stack at all (CPU-only is not an error). Carries an
+    /// operator-actionable description; cortex can read it to route
+    /// around the node instead of cold-loading into a guaranteed
+    /// failure.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cuda_unavailable_reason: Option<String>,
+    /// The neuron's effective maximum prompt size in tokens
+    /// (`NEURON_MAX_PROMPT_TOKENS`) — the enforced prompt cap on this
+    /// host. `#[serde(default)]` (→ 0) for forward-compat with neurons
+    /// that predate this field; cortex treats 0 as "unknown".
+    #[serde(default)]
+    pub max_prompt_tokens: u64,
 }

 /// Runtime health metrics for a single GPU device.
@@ -51,6 +68,57 @@ pub struct HealthResponse {
    pub devices: Vec<DeviceHealth>,
    #[serde(default)]
    pub activation: ActivationStatus,
+    /// Per-model admission load (#53): how many requests are running vs.
+    /// queued on each loaded model right now. Cortex's load-aware router
+    /// (#55) reads this to spread traffic across replicas and to propagate
+    /// honest backpressure. `#[serde(default)]` keeps older gateways/neurons
+    /// interoperable (absent → empty → treated as no load info).
+    #[serde(default)]
+    pub models: Vec<ModelLoad>,
+}
+
+/// Live admission load for one loaded model (#53). No field has a serde default: all are required.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLoad {
+    pub id: String, // model id, e.g. "Qwen/Qwen3.6-27B"
+    /// Requests currently running (batch-1 → 0 or 1).
+    pub in_flight: usize,
+    /// Requests waiting in the bounded admission queue.
+    pub queue_depth: usize,
+}
+
+#[cfg(test)]
+mod health_load_tests {
+    use super::*;
+
+    #[test]
+    fn health_response_without_models_field_still_deserializes() {
+        // A pre-#53 neuron's /health payload omits `models` (this one also omits
+        // `activation`); the gateway must still parse it (serde default → empty).
+        let json = r#"{"uptime_secs":42,"devices":[]}"#;
+        let resp: HealthResponse = serde_json::from_str(json).expect("back-compat parse");
+        assert_eq!(resp.uptime_secs, 42);
+        assert!(resp.models.is_empty());
+    }
+
+    #[test]
+    fn health_response_round_trips_model_load() { // serialize → deserialize keeps the load counters
+        let resp = HealthResponse {
+            uptime_secs: 1,
+            devices: vec![],
+            activation: ActivationStatus::default(),
+            models: vec![ModelLoad {
+                id: "Qwen/Qwen3.6-27B".into(),
+                in_flight: 1,
+                queue_depth: 3,
+            }],
+        };
+        let s = serde_json::to_string(&resp).unwrap();
+        let back: HealthResponse = serde_json::from_str(&s).unwrap();
+        assert_eq!(back.models.len(), 1);
+        assert_eq!(back.models[0].in_flight, 1);
+        assert_eq!(back.models[0].queue_depth, 3);
+    }
 }

 /// High-level activation state of the neuron daemon. The HTTP listener
--- a/crates/cortex-core/src/entitlements.rs
+++ b/crates/cortex-core/src/entitlements.rs
@@ -0,0 +1,145 @@
+//! Identity and entitlement primitives for multi-tenant governance (#47).
+//!
+//! Identity is the shared substrate the whole epic hangs off:
+//! `identity (principal) → accounting (spend) → policy → enforcement`. This
+//! module defines the seam — the [`EntitlementProvider`] trait and its data
+//! types — so the local/static provider (operator-config caps, in
+//! cortex-gateway) can land the auth + per-key-cap + amplification fix
+//! *before* any upstream clearing house exists. The future helexa-upstream
+//! client (#57) is just another impl of this trait.
+//!
+//! The provider owns three jobs:
+//! 1. **resolve** a bearer key to a [`Principal`] (drives auth, #49);
+//! 2. **reserve → settle/release** token budget around a request so spend
+//!    can never overshoot a hard cap under concurrency (drives budget
+//!    enforcement, #52);
+//! 3. expose a [`BudgetSnapshot`] for metering/metrics (#51).
+//!
+//! [`BudgetError`] carries the cap-window semantics so the caller can pick
+//! the correct #63 rejection (`rate_limit_exceeded` + `Retry-After` for a
+//! resetting window vs `insufficient_quota` for a hard balance) without the
+//! provider knowing anything about HTTP.
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+/// Internal header carrying the resolved account id from cortex to neuron.
+/// neuron trusts it (and [`HEADER_KEY_ID`]) over the WireGuard link (#54); cortex **strips**
+/// any client-supplied copy before stamping the authoritative value, so a client
+/// can never assert a principal directly.
+pub const HEADER_ACCOUNT_ID: &str = "x-helexa-account-id";
+/// Internal header carrying the resolved key id from cortex to neuron.
+pub const HEADER_KEY_ID: &str = "x-helexa-key-id";
+
+/// Who a request is for. Resolved once at the edge from the bearer key and
+/// carried through the request context. `account_id` is the billable owner
+/// (spendable at any operator, by decision); `key_id` identifies the
+/// specific API key for per-key hard caps and ledger/metrics labels.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct Principal {
+    pub account_id: String, // billable owner; several keys may share one account
+    pub key_id: String, // label for one key; NOTE(review): may equal `account_id` — confirm
+}
+
+/// Cap-window semantics for a key's hard cap. Determines which #63 code an
+/// over-cap reservation maps to. Internally tagged on the wire: `kind` = `balance` | `rolling`.
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")]
+pub enum CapWindow {
+    /// Hard balance — the cap never resets. Exhaustion is permanent
+    /// (`429 insufficient_quota`, no `Retry-After`). This is the `Default` variant.
+    #[default]
+    Balance,
+    /// Rolling window of `seconds` that resets. Exhaustion is transient
+    /// (`429 rate_limit_exceeded` + `Retry-After` until reset).
+    Rolling { seconds: u64 },
+}
+
+/// An outstanding budget reservation. The caller holds this opaque handle
+/// between [`EntitlementProvider::reserve`] and exactly one of
+/// [`EntitlementProvider::settle`] / [`EntitlementProvider::release`]. Not
+/// `Clone` — a reservation is consumed once (both take it by value).
+#[derive(Debug)]
+pub struct Reservation {
+    /// Provider-local handle; callers should treat it as opaque even though the field is `pub`.
+    pub id: u64,
+    /// The principal this reservation belongs to.
+    pub principal: Principal,
+    /// Tokens reserved against the cap.
+    pub reserved: u64,
+}
+
+/// A point-in-time view of a key's budget, for metering and metrics (#51). Not serde-serializable.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct BudgetSnapshot {
+    /// Hard cap in tokens. `None` means uncapped (e.g. an operator infra
+    /// key, #58).
+    pub hard_cap: Option<u64>,
+    /// Settled spend in the current window.
+    pub spent: u64,
+    /// Sum of outstanding (un-settled) reservations, in tokens.
+    pub reserved: u64,
+}
+
+/// Authentication failure — the bearer key could not be resolved. Maps to
+/// `401 invalid_api_key` (#49/#63).
+#[derive(Debug, thiserror::Error)]
+pub enum AuthError {
+    #[error("invalid or unknown API key")]
+    InvalidKey, // what `EntitlementProvider::resolve` returns for an unknown/empty key
+}
+
+/// Why a reservation was refused. Carries enough for the caller to build the
+/// correct #63 envelope without the provider touching HTTP.
+#[derive(Debug, thiserror::Error)]
+pub enum BudgetError {
+    /// A resetting window ([`CapWindow::Rolling`]) is exhausted → `429 rate_limit_exceeded` +
+    /// `Retry-After: retry_after_secs`.
+    #[error(
+        "rolling-window budget exhausted ({requested} requested, {available} available); \
+         resets in {retry_after_secs}s"
+    )]
+    RateLimited {
+        requested: u64,
+        available: u64,
+        retry_after_secs: u64,
+    },
+    /// A hard balance ([`CapWindow::Balance`]) is exhausted → `429 insufficient_quota` (no
+    /// `Retry-After`; the client surfaces the error and stops). Never `402`.
+    #[error("hard balance exhausted ({requested} requested, {available} available)")]
+    InsufficientQuota { requested: u64, available: u64 },
+}
+
+/// The seam between cortex's enforcement and whatever decides entitlement —
+/// a local/static config provider today (#50), the helexa-upstream client
+/// later (#57). All methods are async so the upstream impl can do network
+/// I/O; the local impl resolves in-process.
+#[async_trait]
+pub trait EntitlementProvider: Send + Sync {
+    /// Resolve a bearer API key to its principal. `Err(AuthError::InvalidKey)` for an
+    /// unknown/empty key.
+    async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError>;
+
+    /// Reserve up to `max_tokens` against the principal's cap. Returns a
+    /// handle on success, or a [`BudgetError`] (which the caller maps to a
+    /// #63 `429`) if the reservation would exceed the cap. Reserving the
+    /// *maximum* a request could consume before dispatch is what prevents
+    /// overshoot under concurrency.
+    async fn reserve(
+        &self,
+        principal: &Principal,
+        max_tokens: u64,
+    ) -> Result<Reservation, BudgetError>;
+
+    /// Settle a reservation (taken by value, so it cannot be reused) with the tokens
+    /// actually consumed, releasing the unused remainder back to the cap.
+    async fn settle(&self, reservation: Reservation, actual_tokens: u64);
+
+    /// Release a reservation in full — e.g. dispatch failed before any
+    /// tokens were consumed. Like `settle`, this takes the handle by value.
+    async fn release(&self, reservation: Reservation);
+
+    /// Current budget snapshot for a principal, for metering/metrics.
+    /// `None` if the provider doesn't track this principal.
+    async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot>;
+}
--- a/crates/cortex-core/src/error_envelope.rs
+++ b/crates/cortex-core/src/error_envelope.rs
@@ -0,0 +1,257 @@
+//! The OpenAI-standard error envelope (#60) and the rejection contract
+//! that rides on it (#63).
+//!
+//! Every non-2xx response cortex and neuron emit uses the shape
+//!
+//! ```json
+//! { "error": { "message": "...", "type": "...", "code": "...", "param": null } }
+//! ```
+//!
+//! because OpenAI-compatible clients (opencode, the AI SDK, litellm, the
+//! OpenAI SDKs) read `error.type` / `error.code` to decide what to do —
+//! most importantly `code == "context_length_exceeded"` triggers
+//! auto-compaction, and a `429` with `Retry-After` makes them back off and
+//! retry rather than surfacing an opaque failure. A flat `{"error":"..."}`
+//! string is invisible to that logic.
+//!
+//! This module is the single source of truth for that envelope. It is
+//! deliberately **axum-agnostic** — cortex-core is a pure types crate — so
+//! it carries the response as data (`status`, `body()`, `retry_after_secs`)
+//! and each HTTP crate (cortex-gateway, neuron) owns a tiny adapter that
+//! turns an [`OpenAiError`] into its framework's response type, setting the
+//! `Retry-After` header when present.
+//!
+//! Retryable conditions **must** carry `Retry-After` (per #63). The named
+//! constructors below encode that: [`OpenAiError::rate_limit_exceeded`] and
+//! [`OpenAiError::service_unavailable`] take a retry hint;
+//! [`OpenAiError::insufficient_quota`] (hard balance, no reset) and
+//! [`OpenAiError::context_length_exceeded`] / [`OpenAiError::invalid_api_key`]
+//! (permanent) do not. `402 Payment Required` is banned by the contract — use
+//! `429 insufficient_quota` for hard budget exhaustion.
+
+use serde_json::{Map, Value, json};
+
+/// A rejection rendered in the OpenAI error envelope.
+///
+/// Build with [`OpenAiError::new`] (or a named constructor), refine with the
+/// `with_*` builders, then hand to the consuming crate's adapter to turn into
+/// an HTTP response.
+#[derive(Debug, Clone)]
+pub struct OpenAiError {
+    /// HTTP status code (e.g. `401`, `429`, `503`).
+    pub status: u16,
+    /// Broad OpenAI category — `"invalid_request_error"`, `"api_error"`,
+    /// `"rate_limit_error"`, …
+    pub error_type: String,
+    /// Specific machine-readable code clients key on (`"invalid_api_key"`,
+    /// `"rate_limit_exceeded"`, `"context_length_exceeded"`, …). `None`
+    /// renders as JSON `null`.
+    pub code: Option<String>,
+    /// Human-readable, actionable message.
+    pub message: String,
+    /// OpenAI's `param` field — the offending request parameter, if any.
+    pub param: Option<String>,
+    /// Seconds to advertise in the `Retry-After` header. Set only on
+    /// retryable conditions; `None` means no header.
+    pub retry_after_secs: Option<u64>,
+    /// Diagnostic fields merged *inside* the `error` object (e.g.
+    /// `prompt_len`, `max`, `free_mb`) so they don't break the envelope
+    /// shape. Clients ignore unknown keys.
+    pub extra: Map<String, Value>,
+}
+
+impl OpenAiError {
+    /// Construct an envelope with an explicit code. For a `null` code use
+    /// [`OpenAiError::without_code`].
+    pub fn new(
+        status: u16,
+        error_type: impl Into<String>,
+        code: impl Into<String>,
+        message: impl Into<String>,
+    ) -> Self {
+        Self {
+            status,
+            error_type: error_type.into(),
+            code: Some(code.into()),
+            message: message.into(),
+            param: None,
+            retry_after_secs: None,
+            extra: Map::new(),
+        }
+    }
+
+    /// Construct an envelope whose `code` is `null` (e.g. an unclassified
+    /// internal error).
+    pub fn without_code(
+        status: u16,
+        error_type: impl Into<String>,
+        message: impl Into<String>,
+    ) -> Self {
+        Self {
+            status,
+            error_type: error_type.into(),
+            code: None,
+            message: message.into(),
+            param: None,
+            retry_after_secs: None,
+            extra: Map::new(),
+        }
+    }
+
+    /// Advertise a `Retry-After` (seconds). Use on retryable rejections.
+    pub fn with_retry_after(mut self, secs: u64) -> Self {
+        self.retry_after_secs = Some(secs);
+        self
+    }
+
+    /// Set the OpenAI `param` field.
+    pub fn with_param(mut self, param: impl Into<String>) -> Self {
+        self.param = Some(param.into());
+        self
+    }
+
+    /// Merge one diagnostic field into the error object.
+    pub fn with_extra(mut self, key: impl Into<String>, value: Value) -> Self {
+        self.extra.insert(key.into(), value);
+        self
+    }
+
+    /// Merge a bag of diagnostic fields into the error object.
+    pub fn with_extras(mut self, extras: Map<String, Value>) -> Self {
+        for (k, v) in extras {
+            self.extra.insert(k, v);
+        }
+        self
+    }
+
+    /// Render the `{ "error": { … } }` body. Field order is irrelevant to
+    /// clients (they parse JSON). Standard keys are inserted first, then the
+    /// extras, so an extra named like a standard key replaces that key.
+    pub fn body(&self) -> Value {
+        let mut error = Map::new();
+        error.insert("message".into(), Value::String(self.message.clone()));
+        error.insert("type".into(), Value::String(self.error_type.clone()));
+        error.insert(
+            "code".into(),
+            self.code.clone().map(Value::String).unwrap_or(Value::Null),
+        );
+        error.insert(
+            "param".into(),
+            self.param.clone().map(Value::String).unwrap_or(Value::Null),
+        );
+        for (k, v) in &self.extra {
+            error.insert(k.clone(), v.clone());
+        }
+        json!({ "error": Value::Object(error) })
+    }
+
+    // ── Named constructors for the #63 standard codes ──────────────────
+
+    /// `401 invalid_api_key` — missing/invalid bearer token (#49). Permanent.
+    pub fn invalid_api_key(message: impl Into<String>) -> Self {
+        Self::new(401, "invalid_request_error", "invalid_api_key", message)
+    }
+
+    /// `429 rate_limit_exceeded` + `Retry-After` — transient overload,
+    /// fair-share/in-flight cap, admission rejection, or a rolling budget
+    /// window that resets (#52/#53/#54/#55). Clients back off and retry.
+    pub fn rate_limit_exceeded(message: impl Into<String>, retry_after_secs: u64) -> Self {
+        Self::new(429, "rate_limit_error", "rate_limit_exceeded", message)
+            .with_retry_after(retry_after_secs)
+    }
+
+    /// `429 insufficient_quota` — hard balance exhausted, no reset (#52).
+    /// No `Retry-After`; the client surfaces and stops. (Never `402`.)
+    pub fn insufficient_quota(message: impl Into<String>) -> Self {
+        Self::new(429, "insufficient_quota", "insufficient_quota", message)
+    }
+
+    /// `400 context_length_exceeded` — prompt exceeds the model's context
+    /// window (#56/#60). Permanent for this request; opencode auto-compacts.
+    pub fn context_length_exceeded(message: impl Into<String>) -> Self {
+        Self::new(
+            400,
+            "invalid_request_error",
+            "context_length_exceeded",
+            message,
+        )
+    }
+
+    /// `503 service_unavailable` + optional `Retry-After` — transient
+    /// backend unavailability (no healthy nodes, recovery, fail-closed
+    /// upstream). Retryable when a hint is given.
+    pub fn service_unavailable(message: impl Into<String>, retry_after_secs: Option<u64>) -> Self {
+        let mut err = Self::new(503, "api_error", "service_unavailable", message);
+        err.retry_after_secs = retry_after_secs;
+        err
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn body_has_standard_envelope_shape() {
+        let env = OpenAiError::new(429, "rate_limit_error", "rate_limit_exceeded", "slow down");
+        let body = env.body();
+        let error = body.get("error").and_then(Value::as_object).unwrap();
+        assert_eq!(error["message"], "slow down");
+        assert_eq!(error["type"], "rate_limit_error");
+        assert_eq!(error["code"], "rate_limit_exceeded");
+        assert_eq!(error["param"], Value::Null);
+    }
+
+    #[test]
+    fn without_code_renders_null_code() {
+        let env = OpenAiError::without_code(500, "api_error", "kaboom");
+        assert_eq!(env.body()["error"]["code"], Value::Null);
+    }
+
+    #[test]
+    fn extras_ride_inside_the_error_object() {
+        let env = OpenAiError::context_length_exceeded("too long")
+            .with_extra("prompt_len", json!(60_000))
+            .with_extra("max", json!(49_152));
+        let error = &env.body()["error"];
+        assert_eq!(error["prompt_len"], 60_000);
+        assert_eq!(error["max"], 49_152);
+        assert_eq!(error["code"], "context_length_exceeded");
+    }
+
+    #[test]
+    fn rolling_window_rejection_carries_retry_after() {
+        let env = OpenAiError::rate_limit_exceeded("budget window", 30);
+        assert_eq!(env.status, 429);
+        assert_eq!(env.retry_after_secs, Some(30));
+    }
+
+    #[test]
+    fn hard_balance_rejection_has_no_retry_after() {
+        let env = OpenAiError::insufficient_quota("out of credit");
+        assert_eq!(env.status, 429);
+        assert_eq!(env.code.as_deref(), Some("insufficient_quota"));
+        assert_eq!(env.retry_after_secs, None);
+    }
+
+    #[test]
+    fn permanent_rejections_have_no_retry_after() {
+        assert_eq!(OpenAiError::invalid_api_key("nope").retry_after_secs, None);
+        assert_eq!(
+            OpenAiError::context_length_exceeded("too long").retry_after_secs,
+            None
+        );
+    }
+
+    #[test]
+    fn service_unavailable_retry_after_is_optional() {
+        assert_eq!(
+            OpenAiError::service_unavailable("recovering", Some(5)).retry_after_secs,
+            Some(5)
+        );
+        assert_eq!(
+            OpenAiError::service_unavailable("gone", None).retry_after_secs,
+            None
+        );
+    }
+}
--- a/crates/cortex-core/src/harness.rs
+++ b/crates/cortex-core/src/harness.rs
@@ -36,6 +36,44 @@ pub struct ModelSpec {
    pub devices: Option<Vec<u32>>,
 }

+/// Per-model token budget advertised by the catalogue or neuron.
+///
+/// `context` is the hard wall (the served max-seq-len).  `input` is the
+/// compaction trigger — when set, opencode treats it as "usable context =
+/// input − reserved".  When omitted, clients fall back to `context − output`.
+/// `output` is the maximum number of generation tokens.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLimit {
+    /// Hard wall — served max-seq-len in tokens.
+    pub context: usize,
+    /// Compaction trigger / usable input budget.  When absent clients fall
+    /// back to `context − output`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input: Option<usize>,
+    /// Maximum number of generation tokens.
+    pub output: usize,
+}
+
+/// Operator-set pricing in USD per 1M tokens.
+///
+/// Self-hosted deployments typically leave both at `0.0`.  Cache fields are
+/// optional — set when the backend supports a prefix-cache discount tier.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelCost {
+    /// USD per 1M input (prompt) tokens.
+    #[serde(default)]
+    pub input: f64,
+    /// USD per 1M output (completion) tokens.
+    #[serde(default)]
+    pub output: f64,
+    /// USD per 1M cache-hit tokens (optional).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_read: Option<f64>,
+    /// USD per 1M cache-write tokens (optional).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_write: Option<f64>,
+}
+
 /// A model as reported by a harness.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ModelInfo {
@@ -44,6 +82,33 @@ pub struct ModelInfo {
    pub status: String,
    pub devices: Vec<u32>,
    pub vram_used_mb: Option<u64>,
+    /// Modalities this loaded model supports. Today: `["text"]` for
+    /// text-only checkpoints, `["text", "vision"]` for vision-capable
+    /// ones (Stage B7). Clients like litellm / agent0 can gate
+    /// `image_url` submission on the advertised set.
+    ///
+    /// Optional in the wire format so older clients that don't read
+    /// it stay compatible. Default-empty for absent/older data, which
+    /// callers can interpret as "text".
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub capabilities: Vec<String>,
+
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Token budget advertised by the catalogue or discovered at load time.
+    /// `None` when neither the catalogue nor the loaded model can provide it.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// `true` when the model's tokenizer contains recognised tool-call
+    /// marker tokens (`<tool_call>` / `</tool_call>` convention).
+    #[serde(default)]
+    pub tool_call: bool,
+    /// `true` when the model's tokenizer contains recognised reasoning
+    /// marker tokens (`<think>` / `</think>` or similar).
+    #[serde(default)]
+    pub reasoning: bool,
 }

 /// What an inference harness must do, from neuron's perspective.
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -1,10 +1,14 @@
 pub mod anthropic;
+pub mod build_info;
 pub mod catalogue;
 pub mod config;
 pub mod discovery;
+pub mod entitlements;
+pub mod error_envelope;
 pub mod harness;
 pub mod metrics;
 pub mod node;
 pub mod openai;
 pub mod responses;
+pub mod source;
 pub mod translate;
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -1,4 +1,5 @@
-use crate::discovery::{ActivationStatus, DiscoveryResponse};
+use crate::discovery::{ActivationStatus, DiscoveryResponse, ModelLoad};
+use crate::harness::{ModelCost, ModelLimit};
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
@@ -26,6 +27,11 @@ pub struct NodeState {
    /// to synthesize `Loading` locations so clients see a catalogued
    /// model that's mid-prewarm as "loading", not "missing".
    pub activation: Option<ActivationStatus>,
+    /// Last-seen per-model admission load from this neuron's `/health`
+    /// (#53), keyed by model id. The router (#55) reads it to pick the
+    /// least-busy replica when a model is loaded on more than one neuron.
+    /// Empty until the first /health poll reports load.
+    pub model_load: HashMap<String, ModelLoad>,
 }

 /// A model registered on a node, with its runtime status.
@@ -37,6 +43,27 @@ pub struct ModelEntry {
    pub last_accessed: Option<DateTime<Utc>>,
    /// Estimated VRAM usage in MB when loaded.
    pub vram_estimate_mb: Option<u64>,
+    /// Modalities the loaded model advertises (e.g. `["text", "vision"]`),
+    /// copied verbatim from the neuron's `ModelInfo.capabilities` at poll
+    /// time. Empty when the neuron reports none. `#[serde(default)]` keeps
+    /// older persisted/serialised entries deserialisable.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
+    /// Runtime-detected capability flags from the neuron's `/models`
+    /// response (`ModelInfo`). `false` when the neuron predates these
+    /// fields or hasn't reported them yet.
+    #[serde(default)]
+    pub tool_call: bool,
+    #[serde(default)]
+    pub reasoning: bool,
+    /// Self-derived token budget the neuron computed for this loaded
+    /// model (#67), copied from `ModelInfo.limit` at poll time. `None`
+    /// when the neuron doesn't compute one (arch without a context
+    /// profile, or derivation disabled). This is the authoritative
+    /// source the gateway advertises — operator-declared catalogue
+    /// limits are no longer consulted.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
 }

 /// Model lifecycle status.
@@ -55,6 +82,12 @@ pub enum ModelStatus {
    Unloaded,
    Reloading,
    Loading,
+    /// Reported by neuron while a poisoned model auto-recovers via
+    /// unload→reload (#17/#20). Temporarily unservable but NOT
+    /// evicted: the gateway holds the route, answers with a transient
+    /// retry error instead of 404, and must not race a second
+    /// placement elsewhere.
+    Recovering,
 }

 /// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
@@ -85,6 +118,27 @@ pub struct CortexModelEntry {
    /// disjoint from) `feasible_on` depending on whether the catalogue
    /// covers this model.
    pub locations: Vec<ModelLocation>,
+    /// Union of the modalities advertised by every neuron that has this
+    /// model loaded (e.g. `["text", "vision"]`). Empty for catalogue-only
+    /// entries with no loaded location — filled from catalogue profile
+    /// capabilities when available, then unioned with runtime-detected
+    /// values from loaded neurons.
+    #[serde(default)]
+    pub capabilities: Vec<String>,
+    // ── Enrichment (issue #62) ────────────────────────────────
+    /// Per-model token budget from the catalogue profile or discovered
+    /// at load time. `None` when neither source provides it.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub limit: Option<ModelLimit>,
+    /// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cost: Option<ModelCost>,
+    /// `true` when any neuron reports this model supports tool calls.
+    #[serde(default)]
+    pub tool_call: bool,
+    /// `true` when any neuron reports this model supports reasoning tokens.
+    #[serde(default)]
+    pub reasoning: bool,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/crates/cortex-core/src/openai.rs
+++ b/crates/cortex-core/src/openai.rs
@@ -71,10 +71,18 @@ pub struct ChatCompletionChoice {

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChatCompletionChunk {
+    #[serde(default)]
    pub id: String,
+    #[serde(default)]
    pub object: String,
+    #[serde(default)]
    pub created: u64,
+    // Lenient deserialization throughout: the gateway parses chunks
+    // from arbitrary OpenAI-compatible upstreams, and some engines
+    // omit fields on special frames (e.g. usage-only final chunks).
+    #[serde(default)]
    pub model: String,
+    #[serde(default)]
    pub choices: Vec<ChunkChoice>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub usage: Option<Usage>,
@@ -98,6 +106,31 @@ pub struct Usage {
    pub prompt_tokens: u64,
    pub completion_tokens: u64,
    pub total_tokens: u64,
+    /// OpenAI-standard breakdown of `completion_tokens`. Optional and
+    /// additive — clients that don't read it are unaffected. Carries
+    /// `reasoning_tokens` for reasoning models (a sub-count of
+    /// `completion_tokens`, never added into `total_tokens`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub completion_tokens_details: Option<CompletionTokensDetails>,
+    /// OpenAI-standard breakdown of `prompt_tokens`. Populated once
+    /// prompt caching lands (#11); `None` until then.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub prompt_tokens_details: Option<PromptTokensDetails>,
+}
+
+/// Sub-counts of `Usage::completion_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CompletionTokensDetails {
+    /// Tokens generated inside the model's reasoning span.
+    pub reasoning_tokens: u64,
+}
+
+/// Sub-counts of `Usage::prompt_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromptTokensDetails {
+    /// Prompt tokens served from cache (cache-read rate). Populated
+    /// once prompt caching lands (#11).
+    pub cached_tokens: u64,
 }

 // ── Models list response ─────────────────────────────────────────────
--- a/crates/cortex-core/src/responses.rs
+++ b/crates/cortex-core/src/responses.rs
@@ -202,6 +202,30 @@ pub struct ResponsesUsage {
    pub input_tokens: u64,
    pub output_tokens: u64,
    pub total_tokens: u64,
+    /// OpenAI-standard breakdown of `output_tokens`. Optional and
+    /// additive. Carries `reasoning_tokens` for reasoning models (a
+    /// sub-count of `output_tokens`, never added into `total_tokens`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub output_tokens_details: Option<OutputTokensDetails>,
+    /// OpenAI-standard breakdown of `input_tokens`. Populated once
+    /// prompt caching lands (#11); `None` until then.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input_tokens_details: Option<InputTokensDetails>,
+}
+
+/// Sub-counts of `ResponsesUsage::output_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OutputTokensDetails {
+    /// Tokens generated inside the model's reasoning span.
+    pub reasoning_tokens: u64,
+}
+
+/// Sub-counts of `ResponsesUsage::input_tokens`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct InputTokensDetails {
+    /// Input tokens served from cache (cache-read rate). Populated
+    /// once prompt caching lands (#11).
+    pub cached_tokens: u64,
 }

 // ── Streaming event names ────────────────────────────────────────────
@@ -336,6 +360,8 @@ mod tests {
                input_tokens: 5,
                output_tokens: 3,
                total_tokens: 8,
+                output_tokens_details: None,
+                input_tokens_details: None,
            }),
        };
        let json = serde_json::to_string(&r).unwrap();
--- a/crates/cortex-core/src/source.rs
+++ b/crates/cortex-core/src/source.rs
@@ -0,0 +1,267 @@
+//! Scheme-qualified model identifiers.
+//!
+//! cortex/neuron historically resolves every model id through hf-hub
+//! against `https://huggingface.co`. Helexa is adding an EU-hosted
+//! registry (`registry.helexa.ai`) alongside HF — both speak the same
+//! HF-compatible wire format, but the bytes, jurisdiction, and trust
+//! root differ. Model ids therefore need a scheme:
+//!
+//!   - `huggingface:Qwen/Qwen3.6-27B`         — HF-hosted bytes
+//!   - `helexa:Qwen/Qwen3.6-27B-Uncensored`  — helexa registry bytes
+//!   - `helexa:SomeOperator/CustomFinetune`  — operator publishing
+//!     under the helexa namespace; same scheme handles all `org/name`
+//!     pairs hosted in that registry.
+//!
+//! Bare `org/name` parses with an empty scheme; the caller (typically
+//! a harness) substitutes its configured default scheme so existing
+//! configs keep working through the transition.
+
+use serde::{Deserialize, Serialize};
+use std::fmt;
+use std::str::FromStr;
+
+/// Parsed `scheme:org/name`. Bare `org/name` produces an empty scheme
+/// — call `with_default_scheme` (or check `is_scheme_unset`) to
+/// resolve before using.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct ModelSourceId {
+    pub scheme: String,
+    pub org: String,
+    pub name: String,
+}
+
+/// Errors from `ModelSourceId::from_str`. Every variant except `Empty`
+/// carries the offending input so log lines / API errors can echo it.
+#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
+pub enum ParseError {
+    #[error("empty model id")]
+    Empty,
+    #[error("model id '{0}' is missing the '/' between org and name")]
+    MissingSlash(String),
+    #[error("model id '{0}' has an empty scheme before ':'")]
+    EmptyScheme(String),
+    #[error("model id '{0}' has an empty org")]
+    EmptyOrg(String),
+    #[error("model id '{0}' has an empty name")]
+    EmptyName(String),
+    #[error("model id '{0}' has a scheme containing '/' which is reserved for org/name")]
+    SchemeContainsSlash(String),
+    #[error("model id '{0}' has a name containing ':' which is reserved for the scheme prefix")]
+    NameContainsColon(String),
+}
+
+impl ModelSourceId {
+    /// Construct directly from already-validated parts. Used by tests
+    /// and call sites that have the fields separately; the public API
+    /// for parsing user input is `FromStr`.
+    pub fn new(scheme: impl Into<String>, org: impl Into<String>, name: impl Into<String>) -> Self {
+        Self {
+            scheme: scheme.into(),
+            org: org.into(),
+            name: name.into(),
+        }
+    }
+
+    /// True when this id parsed from a bare `org/name` (no scheme
+    /// prefix). The harness substitutes its configured default in
+    /// `with_default_scheme` before resolving against a registry.
+    pub fn is_scheme_unset(&self) -> bool {
+        self.scheme.is_empty()
+    }
+
+    /// Substitute `default` for an empty scheme. No-op when the scheme
+    /// is already set. Returns self by value so it composes neatly:
+    /// `id.parse::<ModelSourceId>()?.with_default_scheme("huggingface")`.
+    pub fn with_default_scheme(mut self, default: &str) -> Self {
+        if self.scheme.is_empty() {
+            self.scheme = default.to_string();
+        }
+        self
+    }
+
+    /// The `org/name` half — what an hf-hub `Api::model(...)` call
+    /// expects regardless of which scheme/endpoint we're hitting.
+    pub fn repo_path(&self) -> String {
+        format!("{}/{}", self.org, self.name)
+    }
+}
+
+impl fmt::Display for ModelSourceId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.scheme.is_empty() {
+            write!(f, "{}/{}", self.org, self.name)
+        } else {
+            write!(f, "{}:{}/{}", self.scheme, self.org, self.name)
+        }
+    }
+}
+
+impl FromStr for ModelSourceId {
+    type Err = ParseError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        if s.is_empty() {
+            return Err(ParseError::Empty);
+        }
+        // Scheme split. Only the *first* colon counts — anything after
+        // belongs to org/name. A later `:` is rejected below only when it
+        // lands in the name; an org containing `:` is currently accepted.
+        let (scheme, rest) = match s.split_once(':') {
+            Some((scheme, rest)) => {
+                if scheme.is_empty() {
+                    return Err(ParseError::EmptyScheme(s.to_string()));
+                }
+                if scheme.contains('/') {
+                    return Err(ParseError::SchemeContainsSlash(s.to_string()));
+                }
+                (scheme.to_string(), rest)
+            }
+            None => (String::new(), s),
+        };
+        let (org, name) = rest
+            .split_once('/')
+            .ok_or_else(|| ParseError::MissingSlash(s.to_string()))?;
+        if org.is_empty() {
+            return Err(ParseError::EmptyOrg(s.to_string()));
+        }
+        if name.is_empty() {
+            return Err(ParseError::EmptyName(s.to_string()));
+        }
+        if name.contains(':') {
+            return Err(ParseError::NameContainsColon(s.to_string()));
+        }
+        Ok(Self {
+            scheme,
+            org: org.to_string(),
+            name: name.to_string(),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_qualified() {
+        let id: ModelSourceId = "huggingface:Qwen/Qwen3.6-27B".parse().unwrap();
+        assert_eq!(id.scheme, "huggingface");
+        assert_eq!(id.org, "Qwen");
+        assert_eq!(id.name, "Qwen3.6-27B");
+        assert_eq!(id.repo_path(), "Qwen/Qwen3.6-27B");
+        assert!(!id.is_scheme_unset());
+    }
+
+    #[test]
+    fn parses_helexa_scheme() {
+        let id: ModelSourceId = "helexa:SomeOperator/Qwen3.6-27B-Uncensored"
+            .parse()
+            .unwrap();
+        assert_eq!(id.scheme, "helexa");
+        assert_eq!(id.org, "SomeOperator");
+        assert_eq!(id.name, "Qwen3.6-27B-Uncensored");
+    }
+
+    #[test]
+    fn parses_bare_id_with_empty_scheme() {
+        let id: ModelSourceId = "Qwen/Qwen3-30B-A3B-Instruct".parse().unwrap();
+        assert_eq!(id.scheme, "");
+        assert_eq!(id.org, "Qwen");
+        assert_eq!(id.name, "Qwen3-30B-A3B-Instruct");
+        assert!(id.is_scheme_unset());
+    }
+
+    #[test]
+    fn substitutes_default_scheme_only_when_unset() {
+        let id: ModelSourceId = "Qwen/Q3".parse().unwrap();
+        assert_eq!(id.with_default_scheme("huggingface").scheme, "huggingface");
+
+        let id: ModelSourceId = "helexa:Qwen/Q3".parse().unwrap();
+        assert_eq!(
+            id.with_default_scheme("huggingface").scheme,
+            "helexa",
+            "default substitution must not override an explicit scheme"
+        );
+    }
+
+    #[test]
+    fn display_roundtrips_qualified_id() {
+        let s = "helexa:Helexa/Qwen3.6-27B";
+        let id: ModelSourceId = s.parse().unwrap();
+        assert_eq!(id.to_string(), s);
+    }
+
+    #[test]
+    fn display_roundtrips_bare_id() {
+        let s = "Qwen/Q3";
+        let id: ModelSourceId = s.parse().unwrap();
+        assert_eq!(id.to_string(), s);
+    }
+
+    #[test]
+    fn rejects_empty() {
+        assert_eq!("".parse::<ModelSourceId>().unwrap_err(), ParseError::Empty);
+    }
+
+    #[test]
+    fn rejects_missing_slash() {
+        match "Qwen".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::MissingSlash(s) => assert_eq!(s, "Qwen"),
+            other => panic!("expected MissingSlash, got {other:?}"),
+        }
+        match "huggingface:Qwen".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::MissingSlash(s) => assert_eq!(s, "huggingface:Qwen"),
+            other => panic!("expected MissingSlash, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_empty_scheme() {
+        match ":Qwen/Q3".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyScheme(s) => assert_eq!(s, ":Qwen/Q3"),
+            other => panic!("expected EmptyScheme, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_scheme_with_slash() {
+        match "hugg/ingface:Q/N".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::SchemeContainsSlash(s) => assert_eq!(s, "hugg/ingface:Q/N"),
+            other => panic!("expected SchemeContainsSlash, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_empty_org_or_name() {
+        match "huggingface:/N".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyOrg(_) => {}
+            other => panic!("expected EmptyOrg, got {other:?}"),
+        }
+        match "huggingface:Q/".parse::<ModelSourceId>().unwrap_err() {
+            ParseError::EmptyName(_) => {}
+            other => panic!("expected EmptyName, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn rejects_name_with_colon() {
+        match "huggingface:Q/N:weird"
+            .parse::<ModelSourceId>()
+            .unwrap_err()
+        {
+            ParseError::NameContainsColon(s) => assert_eq!(s, "huggingface:Q/N:weird"),
+            other => panic!("expected NameContainsColon, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn serde_roundtrips_via_struct() {
+        // We serialize as a struct (scheme/org/name fields) so the
+        // shape is self-describing in API payloads. Callers that want
+        // the compact `scheme:org/name` string use `Display`/`FromStr`.
+        let id = ModelSourceId::new("helexa", "Helexa", "Qwen3.6-27B");
+        let json = serde_json::to_string(&id).unwrap();
+        let back: ModelSourceId = serde_json::from_str(&json).unwrap();
+        assert_eq!(back, id);
+    }
+}
--- a/crates/cortex-core/src/translate.rs
+++ b/crates/cortex-core/src/translate.rs
--- a/crates/cortex-gateway/Cargo.toml
+++ b/crates/cortex-gateway/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 cortex-core.workspace = true
+async-trait.workspace = true
 tokio.workspace = true
 axum.workspace = true
 tower.workspace = true
--- a/crates/cortex-gateway/src/anthropic_sse.rs
+++ b/crates/cortex-gateway/src/anthropic_sse.rs
@@ -0,0 +1,235 @@
+//! Streaming Anthropic SSE translation (#24).
+//!
+//! The `/v1/messages` handler translates the request envelope to
+//! OpenAI before proxying (see `cortex_core::translate`); this module
+//! completes the round trip for `stream: true` — the upstream OpenAI
+//! SSE stream is re-framed, event by event, into Anthropic's
+//! `message_start` / `content_block_*` / `message_delta` /
+//! `message_stop` sequence as it arrives. True streaming: each
+//! upstream chunk is translated and forwarded immediately; nothing is
+//! buffered beyond the current SSE event's bytes.
+//!
+//! The translation state machine itself is pure and lives in
+//! [`cortex_core::translate::AnthropicStreamTranslator`]; this module
+//! owns the wire concerns — splitting the upstream byte stream into
+//! SSE events, parsing `data:` payloads, and framing the translated
+//! events as `event: <name>\ndata: <json>\n\n`.
+
+use axum::body::Body;
+use axum::http::StatusCode;
+use axum::response::Response;
+use bytes::Bytes;
+use cortex_core::openai::ChatCompletionChunk;
+use cortex_core::translate::AnthropicStreamTranslator;
+use futures::StreamExt;
+use tokio_stream::wrappers::ReceiverStream;
+
+/// Forward the translated OpenAI request to the upstream node and
+/// return the response translated to Anthropic SSE framing.
+///
+/// * `client` / `endpoint` — the request is POSTed to
+///   `{endpoint}/v1/chat/completions` with `openai_body` as its JSON body.
+/// * `model_id` / `node_name` — used for log fields only.
+/// * `inbound_headers` — handed to
+///   [`crate::auth::forward_principal_headers`] to stamp the outbound
+///   request.
+/// * `usage_sink` — when present, called with the observed
+///   `(prompt_tokens, completion_tokens)`; `(0, 0)` when the upstream never
+///   produced a stream (request failure or non-2xx) or no usage frame was
+///   seen.
+///
+/// Returns an Anthropic-shaped JSON error when the upstream request fails
+/// (`502`) or answers non-2xx (upstream status mirrored, `502` if it cannot
+/// be represented); otherwise a `200 text/event-stream` response fed by a
+/// spawned pump task.
+pub async fn stream_translated(
+    client: &reqwest::Client,
+    endpoint: &str,
+    openai_body: axum::body::Bytes,
+    model_id: &str,
+    node_name: &str,
+    inbound_headers: &axum::http::HeaderMap,
+    usage_sink: Option<crate::metering::UsageSink>,
+) -> Response {
+    let url = format!("{endpoint}/v1/chat/completions");
+    tracing::info!(
+        handler = "anthropic_messages",
+        model = %model_id,
+        node = %node_name,
+        url = %url,
+        "proxying streaming request (anthropic SSE translation)"
+    );
+
+    let request = crate::auth::forward_principal_headers(
+        client
+            .post(&url)
+            .header("content-type", "application/json")
+            .body(openai_body),
+        inbound_headers,
+    );
+    let upstream = match request.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            tracing::warn!(
+                handler = "anthropic_messages",
+                node = %node_name,
+                url = %url,
+                error = %e,
+                "anthropic stream: upstream request failed"
+            );
+            // No stream was ever produced, so nothing was spent: resolve the
+            // metering reservation here instead of dropping the sink
+            // uncalled. `(0, 0)` releases without recording spend.
+            if let Some(sink) = usage_sink {
+                sink(0, 0);
+            }
+            return anthropic_error(StatusCode::BAD_GATEWAY, "upstream request failed");
+        }
+    };
+
+    let status = upstream.status();
+    if !status.is_success() {
+        tracing::warn!(
+            handler = "anthropic_messages",
+            node = %node_name,
+            url = %url,
+            status = status.as_u16(),
+            "anthropic stream: upstream returned non-2xx"
+        );
+        // Same as above: the pump never starts on this path, so the
+        // reservation has to be resolved before returning.
+        if let Some(sink) = usage_sink {
+            sink(0, 0);
+        }
+        return anthropic_error(
+            StatusCode::from_u16(status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY),
+            "upstream returned an error",
+        );
+    }
+
+    // Bounded channel: a slow client back-pressures the pump task,
+    // which back-pressures the upstream read — same propagation
+    // discipline as neuron's own projectors.
+    let (tx, rx) = tokio::sync::mpsc::channel::<Result<Bytes, std::convert::Infallible>>(32);
+    let node = node_name.to_string();
+    let model = model_id.to_string();
+    tokio::spawn(async move {
+        let mut upstream = upstream.bytes_stream();
+        let mut translator = AnthropicStreamTranslator::new();
+        // Bytes received but not yet consumed as a complete SSE event.
+        let mut buf: Vec<u8> = Vec::new();
+        // Set once the upstream `[DONE]` sentinel has been seen and the
+        // Anthropic sequence has been closed.
+        let mut done = false;
+        // Wire-debug accounting for the stream summary emitted at the
+        // end: did the model emit a structured tool call, what was the
+        // final finish_reason, and how many upstream frames did we see.
+        let mut saw_tool_call = false;
+        let mut last_finish: Option<String> = None;
+        let mut frames = 0u64;
+        // Engine-truth usage for metering (#51), scanned from the upstream
+        // frames (neuron emits a final `usage` object on the stream, #48).
+        let mut usage_prompt = 0u64;
+        let mut usage_completion = 0u64;
+
+        'outer: while let Some(block) = upstream.next().await {
+            let block = match block {
+                Ok(b) => b,
+                Err(e) => {
+                    tracing::warn!(node = %node, error = %e, "anthropic stream: upstream read failed mid-stream");
+                    break;
+                }
+            };
+            buf.extend_from_slice(&block);
+            // SSE events are separated by a blank line.
+            while let Some(pos) = find_event_boundary(&buf) {
+                // Take the event together with its two-byte `\n\n`
+                // terminator; any partial next event stays in `buf`.
+                let event: Vec<u8> = buf.drain(..pos + 2).collect();
+                let text = String::from_utf8_lossy(&event);
+                for line in text.lines() {
+                    // Only `data:` lines carry payloads; other SSE fields
+                    // are ignored.
+                    let Some(data) = line.strip_prefix("data:") else {
+                        continue;
+                    };
+                    let data = data.trim();
+                    if data == "[DONE]" {
+                        done = true;
+                        if !send_frames(&tx, translator.finish()).await {
+                            break 'outer;
+                        }
+                        continue;
+                    }
+                    tracing::trace!(node = %node, frame = %data, "anthropic stream: upstream frame");
+                    // Capture usage for metering before translation — the
+                    // usage object rides on a late frame (often after the
+                    // last content delta).
+                    if let Some(p) = crate::proxy::last_count_for(data, "prompt_tokens") {
+                        usage_prompt = p;
+                    }
+                    if let Some(c) = crate::proxy::last_count_for(data, "completion_tokens") {
+                        usage_completion = c;
+                    }
+                    let Ok(chunk) = serde_json::from_str::<ChatCompletionChunk>(data) else {
+                        tracing::debug!(node = %node, "anthropic stream: unparsable upstream frame skipped");
+                        continue;
+                    };
+                    frames += 1;
+                    if chunk
+                        .choices
+                        .iter()
+                        .any(|c| c.delta.get("tool_calls").is_some())
+                    {
+                        saw_tool_call = true;
+                    }
+                    if let Some(fr) = chunk.choices.iter().find_map(|c| c.finish_reason.clone()) {
+                        last_finish = Some(fr);
+                    }
+                    // A `false` here means the client went away; stop
+                    // reading the upstream.
+                    if !send_frames(&tx, translator.on_chunk(&chunk)).await {
+                        break 'outer;
+                    }
+                }
+            }
+        }
+        // Upstream ended without [DONE] (error or truncation): still
+        // close the Anthropic event sequence so clients aren't left
+        // with an unterminated message.
+        if !done {
+            let _ = send_frames(&tx, translator.finish()).await;
+        }
+        // Stream summary: the streaming counterpart to the non-streaming
+        // handler's "upstream response" line. `upstream_tool_calls =
+        // false` on a tools-bearing request is the fingerprint of the
+        // model improvising an unparsed tool-call format.
+        tracing::debug!(
+            wire = "anthropic",
+            model = %model,
+            node = %node,
+            frames,
+            upstream_tool_calls = saw_tool_call,
+            finish_reason = ?last_finish,
+            terminated = done,
+            "anthropic stream complete"
+        );
+
+        // Settle metering with the observed usage (#51). Runs on every exit
+        // path of the pump — clean end, early break, or upstream error — so
+        // the reservation is always resolved. `(0, 0)` when no usage frame
+        // was seen, which releases without recording spend.
+        if let Some(sink) = usage_sink {
+            sink(usage_prompt, usage_completion);
+        }
+    });
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header("content-type", "text/event-stream")
+        .header("cache-control", "no-cache")
+        .body(Body::from_stream(ReceiverStream::new(rx)))
+        .unwrap_or_else(|_| {
+            anthropic_error(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                "failed to build response",
+            )
+        })
+}
+
+/// Index of the first byte of the first `\n\n` pair in `buf`, i.e. where
+/// the first complete SSE event ends. `None` when no pair is present yet.
+fn find_event_boundary(buf: &[u8]) -> Option<usize> {
+    // Pair every byte with its successor and look for two newlines in a row.
+    buf.iter()
+        .zip(buf.iter().skip(1))
+        .position(|(first, second)| *first == b'\n' && *second == b'\n')
+}
+
+/// Frame each `(event name, payload)` pair as
+/// `event: <name>\ndata: <json>\n\n` and push it down the channel in order.
+/// Stops at the first failed send and returns `false` (the receiver was
+/// dropped, so the client has gone away); returns `true` once every event
+/// has been delivered.
+async fn send_frames(
+    tx: &tokio::sync::mpsc::Sender<Result<Bytes, std::convert::Infallible>>,
+    events: Vec<(String, serde_json::Value)>,
+) -> bool {
+    for (event_name, data) in events {
+        let rendered = Bytes::from(format!("event: {event_name}\ndata: {data}\n\n"));
+        let delivered = tx.send(Ok(rendered)).await.is_ok();
+        if !delivered {
+            return false;
+        }
+    }
+    true
+}
+
+/// Anthropic-shaped error body (`{"type":"error","error":{...}}`).
+fn anthropic_error(status: StatusCode, message: &str) -> Response {
+    let body = serde_json::json!({
+        "type": "error",
+        "error": { "type": "api_error", "message": message }
+    });
+    Response::builder()
+        .status(status)
+        .header("content-type", "application/json")
+        .body(Body::from(body.to_string()))
+        .expect("static error response must build")
+}
--- a/crates/cortex-gateway/src/auth.rs
+++ b/crates/cortex-gateway/src/auth.rs
@@ -0,0 +1,119 @@
+//! API-key authentication + principal resolution (#49).
+//!
+//! Identity rides standard bearer auth only — `Authorization: Bearer <key>`
+//! — which is what keeps every tier OpenAI-compatible by construction (no
+//! custom required headers or body fields, per #47). The middleware resolves
+//! the key to a [`Principal`] via the [`EntitlementProvider`], carries it in
+//! the request extensions for cortex-side metering/enforcement (#51/#52), and
+//! stamps it as internal headers on the request so it reaches neuron, which
+//! trusts cortex's assertion over WireGuard (#54).
+//!
+//! Anti-spoofing: any client-supplied principal header is **stripped** before
+//! the authoritative value is stamped, so a client can never assert a
+//! principal it didn't authenticate as.
+//!
+//! Rejection contract (#63): missing key under `require_auth`, or any present
+//! but unresolvable key, yields `401 invalid_api_key` in the #60 envelope.
+
+use crate::error::envelope_response;
+use crate::state::CortexState;
+use axum::extract::{Request, State};
+use axum::http::header::AUTHORIZATION;
+use axum::http::{HeaderMap, HeaderValue};
+use axum::middleware::Next;
+use axum::response::Response;
+use cortex_core::entitlements::{HEADER_ACCOUNT_ID, HEADER_KEY_ID};
+use cortex_core::error_envelope::OpenAiError;
+use std::sync::Arc;
+
+/// Whether `path` is one of the unauthenticated liveness/readiness probe
+/// endpoints (`/health` or `/`). Every other path goes through key
+/// resolution.
+fn is_public(path: &str) -> bool {
+    matches!(path, "/health" | "/")
+}
+
+/// Pull the bearer token out of the `Authorization` header. Returns `None`
+/// when the header is absent, is not valid header text, has no space
+/// separating scheme from token, uses a scheme other than `bearer`
+/// (compared case-insensitively per RFC 7235), or carries an empty token.
+fn parse_bearer(headers: &HeaderMap) -> Option<String> {
+    let value = headers.get(AUTHORIZATION)?.to_str().ok()?;
+    let (scheme, rest) = value.split_once(' ')?;
+    if !scheme.eq_ignore_ascii_case("bearer") {
+        return None;
+    }
+    let token = rest.trim();
+    if token.is_empty() {
+        None
+    } else {
+        Some(token.to_string())
+    }
+}
+
+/// Axum middleware that authenticates a request by its bearer key, attaches
+/// the resolved principal to the request extensions, and stamps the
+/// internal principal headers. Wired in `build_app` via
+/// `from_fn_with_state`.
+///
+/// Outcomes: public paths pass straight through; a key that fails to
+/// resolve is always `401`; a missing key is `401` only when
+/// `require_auth` is set, otherwise the request continues anonymously.
+pub async fn require_principal(
+    State(fleet): State<Arc<CortexState>>,
+    mut req: Request,
+    next: Next,
+) -> Response {
+    if is_public(req.uri().path()) {
+        return next.run(req).await;
+    }
+
+    // Anti-spoof: drop any client-supplied principal headers up front.
+    for name in [HEADER_ACCOUNT_ID, HEADER_KEY_ID] {
+        req.headers_mut().remove(name);
+    }
+
+    // No credential at all: reject or continue anonymously per config.
+    let Some(key) = parse_bearer(req.headers()) else {
+        return if fleet.require_auth {
+            unauthorized("missing API key; supply 'Authorization: Bearer <key>'")
+        } else {
+            next.run(req).await
+        };
+    };
+
+    // A present-but-invalid credential is always an error, even when
+    // anonymous access is otherwise allowed.
+    let Ok(principal) = fleet.entitlements.resolve(&key).await else {
+        return unauthorized("invalid API key");
+    };
+
+    // Stamp the authoritative principal for neuron. Account/key ids come
+    // from operator config, so they're valid header values; guard anyway
+    // and skip a malformed one rather than panic.
+    let account = HeaderValue::from_str(&principal.account_id);
+    let key_id = HeaderValue::from_str(&principal.key_id);
+    if let (Ok(account), Ok(key_id)) = (account, key_id) {
+        let outbound = req.headers_mut();
+        outbound.insert(HEADER_ACCOUNT_ID, account);
+        outbound.insert(HEADER_KEY_ID, key_id);
+    }
+
+    // Carry the typed principal for cortex-side metering (#51) and budget
+    // enforcement (#52).
+    req.extensions_mut().insert(principal);
+    next.run(req).await
+}
+
+/// Build the `401 invalid_api_key` rejection in the standard envelope
+/// (#63), with `message` as the error text.
+fn unauthorized(message: &str) -> Response {
+    let rejection = OpenAiError::invalid_api_key(message);
+    envelope_response(rejection)
+}
+
+/// Copy the cortex-stamped principal headers (account id and key id) from
+/// an inbound [`HeaderMap`] onto an outbound reqwest builder; a header that
+/// is absent inbound is simply not added. Used by the Anthropic proxy
+/// paths, which construct their own upstream requests instead of going
+/// through [`crate::proxy::forward_request`] (which forwards all headers
+/// verbatim).
+pub fn forward_principal_headers(
+    builder: reqwest::RequestBuilder,
+    headers: &HeaderMap,
+) -> reqwest::RequestBuilder {
+    [HEADER_ACCOUNT_ID, HEADER_KEY_ID]
+        .into_iter()
+        .fold(builder, |outbound, name| match headers.get(name) {
+            Some(value) => outbound.header(name, value),
+            None => outbound,
+        })
+}
--- a/crates/cortex-gateway/src/entitlements_local.rs
+++ b/crates/cortex-gateway/src/entitlements_local.rs
@@ -0,0 +1,317 @@
+//! The local/static [`EntitlementProvider`] (#50).
+//!
+//! Accounts, keys, and hard caps come from operator config
+//! ([`cortex_core::config::EntitlementsConfig`]); reservations and settled
+//! spend are tracked in-process. This lands auth + per-key caps + the
+//! amplification fix before any upstream clearing house exists; the future
+//! helexa-upstream client (#57) implements the same trait.
+//!
+//! Budget math is serialized under a single [`std::sync::Mutex`] so
+//! reserve/settle/release are atomic — a key's `spent + reserved` can never
+//! exceed its hard cap even under concurrent requests (the #52 guarantee).
+//! The lock is held only for the in-memory arithmetic, never across an
+//! await.
+
+use cortex_core::config::{ApiKeyConfig, EntitlementsConfig};
+use cortex_core::entitlements::{
+    AuthError, BudgetError, BudgetSnapshot, CapWindow, EntitlementProvider, Principal, Reservation,
+};
+use std::collections::HashMap;
+use std::sync::Mutex;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Instant;
+
+/// Per-key budget configuration (resolved from [`ApiKeyConfig`]).
+struct Budget {
+    /// Token ceiling for the key; `None` means the key is uncapped.
+    hard_cap: Option<u64>,
+    /// How the cap is applied: a rolling window or a plain balance.
+    window: CapWindow,
+}
+
+/// Live, mutable accounting for one key over its current window.
+/// Created on a key's first `reserve` with all counters at zero.
+#[derive(Default)]
+struct Ledger {
+    /// Settled spend in the current window.
+    spent: u64,
+    /// Sum of outstanding (un-settled) reservations. Grows on `reserve`,
+    /// shrinks on `settle` and `release`.
+    reserved: u64,
+    /// Start of the current rolling window; `None` until the first reserve.
+    /// Unused for [`CapWindow::Balance`].
+    window_start: Option<Instant>,
+}
+
+/// In-process [`EntitlementProvider`]: the key and budget tables are fixed
+/// at construction, and only the ledgers change afterwards (behind a
+/// mutex).
+pub struct LocalEntitlementProvider {
+    /// Bearer token → principal.
+    keys: HashMap<String, Principal>,
+    /// `key_id` → budget config.
+    budgets: HashMap<String, Budget>,
+    /// `key_id` → live ledger.
+    ledgers: Mutex<HashMap<String, Ledger>>,
+    /// Monotonic source of opaque reservation handles.
+    next_id: AtomicU64,
+}
+
+impl LocalEntitlementProvider {
+    /// Construct the provider from the `[entitlements]` config section.
+    ///
+    /// Each configured key yields one token → principal entry and one
+    /// `key_id` → budget entry. When a key has no explicit `key_id`, its
+    /// `account_id` is used instead, so the secret itself is never used as
+    /// a label. Ledgers start empty and reservation ids start at 1.
+    pub fn from_config(config: &EntitlementsConfig) -> Self {
+        let mut keys = HashMap::new();
+        let mut budgets = HashMap::new();
+        for entry in &config.keys {
+            let ApiKeyConfig {
+                key,
+                account_id,
+                key_id,
+                hard_cap,
+                window,
+            } = entry;
+            // Fall back to account-level tracking when no key id is given.
+            let effective_key_id = match key_id {
+                Some(explicit) => explicit.clone(),
+                None => account_id.clone(),
+            };
+            let principal = Principal {
+                account_id: account_id.clone(),
+                key_id: effective_key_id.clone(),
+            };
+            let budget = Budget {
+                hard_cap: *hard_cap,
+                window: window.clone(),
+            };
+            keys.insert(key.clone(), principal);
+            budgets.insert(effective_key_id, budget);
+        }
+        Self {
+            keys,
+            budgets,
+            ledgers: Mutex::new(HashMap::new()),
+            next_id: AtomicU64::new(1),
+        }
+    }
+}
+
+/// Headroom left under `cap` once `spent` and `reserved` are taken out,
+/// clamped at zero. An absent cap means unlimited and yields `None`.
+fn available(cap: Option<u64>, spent: u64, reserved: u64) -> Option<u64> {
+    let limit = cap?;
+    let after_spend = limit.saturating_sub(spent);
+    Some(after_spend.saturating_sub(reserved))
+}
+
+#[async_trait::async_trait]
+impl EntitlementProvider for LocalEntitlementProvider {
+    /// Look the bearer token up in the static key table; an unknown token
+    /// is [`AuthError::InvalidKey`].
+    async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError> {
+        self.keys.get(api_key).cloned().ok_or(AuthError::InvalidKey)
+    }
+
+    /// Reserve `max_tokens` against the principal's budget.
+    ///
+    /// Fails with [`BudgetError::RateLimited`] (rolling window) or
+    /// [`BudgetError::InsufficientQuota`] (balance) when the request exceeds
+    /// what is left under the hard cap. Uncapped or unconfigured principals
+    /// always succeed.
+    async fn reserve(
+        &self,
+        principal: &Principal,
+        max_tokens: u64,
+    ) -> Result<Reservation, BudgetError> {
+        // A principal with no configured budget (or an uncapped one) always
+        // reserves; we still track spend for metrics.
+        let budget = self.budgets.get(&principal.key_id);
+        let (cap, window) = match budget {
+            Some(b) => (b.hard_cap, b.window.clone()),
+            None => (None, CapWindow::Balance),
+        };
+
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        let ledger = ledgers.entry(principal.key_id.clone()).or_default();
+
+        // Lazily reset a rolling window that has elapsed before checking.
+        let mut retry_after_secs = 0;
+        if let CapWindow::Rolling { seconds } = window {
+            let now = Instant::now();
+            match ledger.window_start {
+                Some(start) if now.duration_since(start).as_secs() < seconds => {
+                    // Still inside the window: the hint is the time left.
+                    retry_after_secs = seconds - now.duration_since(start).as_secs();
+                }
+                _ => {
+                    // First reserve, or the window has fully elapsed: reset.
+                    ledger.spent = 0;
+                    ledger.window_start = Some(now);
+                    retry_after_secs = seconds;
+                }
+            }
+        }
+
+        if let Some(avail) = available(cap, ledger.spent, ledger.reserved)
+            && max_tokens > avail
+        {
+            return Err(match window {
+                CapWindow::Rolling { .. } => BudgetError::RateLimited {
+                    requested: max_tokens,
+                    available: avail,
+                    // At least 1s so clients don't hot-loop on a sub-second
+                    // remainder.
+                    retry_after_secs: retry_after_secs.max(1),
+                },
+                CapWindow::Balance => BudgetError::InsufficientQuota {
+                    requested: max_tokens,
+                    available: avail,
+                },
+            });
+        }
+
+        // Saturate rather than overflow: for an uncapped key nothing above
+        // bounds `max_tokens`, and an overflow panic here would happen with
+        // the ledger mutex held, poisoning it for every later call.
+        ledger.reserved = ledger.reserved.saturating_add(max_tokens);
+        Ok(Reservation {
+            id: self.next_id.fetch_add(1, Ordering::Relaxed),
+            principal: principal.clone(),
+            reserved: max_tokens,
+        })
+    }
+
+    /// Convert a reservation into settled spend: the reserved amount is
+    /// handed back and `actual_tokens` is added to `spent`. A reservation
+    /// for a key with no ledger is ignored.
+    async fn settle(&self, reservation: Reservation, actual_tokens: u64) {
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
+            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
+            // Saturating for the same reason as in `reserve`: never panic
+            // while holding the ledger lock.
+            ledger.spent = ledger.spent.saturating_add(actual_tokens);
+        }
+    }
+
+    /// Hand a reservation back without recording any spend.
+    async fn release(&self, reservation: Reservation) {
+        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
+            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
+        }
+    }
+
+    /// Current cap/spent/reserved figures for the principal. Always `Some`
+    /// here; a key with no ledger yet reports zero spent and reserved.
+    async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot> {
+        let ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
+        let (spent, reserved) = ledgers
+            .get(&principal.key_id)
+            .map(|l| (l.spent, l.reserved))
+            .unwrap_or((0, 0));
+        let hard_cap = self.budgets.get(&principal.key_id).and_then(|b| b.hard_cap);
+        Some(BudgetSnapshot {
+            hard_cap,
+            spent,
+            reserved,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Fixture with three keys: a balance-capped key (1 000), a
+    /// rolling-window key (500 per 3 600 s), and an uncapped operator key.
+    fn provider() -> LocalEntitlementProvider {
+        let config = EntitlementsConfig {
+            require_auth: true,
+            keys: vec![
+                ApiKeyConfig {
+                    key: "sk-balance".into(),
+                    account_id: "acct-a".into(),
+                    key_id: Some("key-balance".into()),
+                    hard_cap: Some(1_000),
+                    window: CapWindow::Balance,
+                },
+                ApiKeyConfig {
+                    key: "sk-rolling".into(),
+                    account_id: "acct-b".into(),
+                    key_id: Some("key-rolling".into()),
+                    hard_cap: Some(500),
+                    window: CapWindow::Rolling { seconds: 3_600 },
+                },
+                ApiKeyConfig {
+                    key: "sk-infra".into(),
+                    account_id: "operator".into(),
+                    key_id: Some("key-infra".into()),
+                    hard_cap: None,
+                    window: CapWindow::Balance,
+                },
+            ],
+        };
+        LocalEntitlementProvider::from_config(&config)
+    }
+
+    #[tokio::test]
+    async fn resolves_configured_key_to_principal() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.expect("known key resolves");
+        assert_eq!(principal.account_id, "acct-a");
+        assert_eq!(principal.key_id, "key-balance");
+    }
+
+    #[tokio::test]
+    async fn unknown_key_is_invalid() {
+        let p = provider();
+        assert!(matches!(
+            p.resolve("sk-nope").await,
+            Err(AuthError::InvalidKey)
+        ));
+    }
+
+    #[tokio::test]
+    async fn reserve_settle_release_round_trip() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.unwrap();
+
+        let r = p.reserve(&principal, 400).await.expect("within cap");
+        // Reserved, not yet spent.
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.hard_cap, Some(1_000));
+        assert_eq!(snap.reserved, 400);
+        assert_eq!(snap.spent, 0);
+
+        // Used fewer tokens than reserved → remainder released, spend exact.
+        p.settle(r, 250).await;
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.reserved, 0);
+        assert_eq!(snap.spent, 250);
+
+        // A reservation that is released contributes no spend.
+        let r2 = p.reserve(&principal, 100).await.unwrap();
+        p.release(r2).await;
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.reserved, 0);
+        assert_eq!(snap.spent, 250);
+    }
+
+    #[tokio::test]
+    async fn balance_over_cap_is_insufficient_quota() {
+        let p = provider();
+        let principal = p.resolve("sk-balance").await.unwrap();
+        // Reserve most of the cap, then ask for more than remains.
+        let _r = p.reserve(&principal, 900).await.unwrap();
+        let err = p.reserve(&principal, 200).await.expect_err("over cap");
+        match err {
+            BudgetError::InsufficientQuota {
+                requested,
+                available,
+            } => {
+                // 1 000 cap minus the 900 still reserved leaves 100.
+                assert_eq!(requested, 200);
+                assert_eq!(available, 100);
+            }
+            other => panic!("expected InsufficientQuota, got {other:?}"),
+        }
+    }
+
+    #[tokio::test]
+    async fn rolling_over_cap_is_rate_limited_with_retry_after() {
+        let p = provider();
+        let principal = p.resolve("sk-rolling").await.unwrap();
+        // Exhaust the whole 500-token window, then ask for one more token.
+        let _r = p.reserve(&principal, 500).await.unwrap();
+        let err = p.reserve(&principal, 1).await.expect_err("over cap");
+        match err {
+            BudgetError::RateLimited {
+                retry_after_secs, ..
+            } => {
+                // The hint is at least 1s and never longer than the window.
+                assert!(retry_after_secs >= 1, "must advertise a retry hint");
+                assert!(retry_after_secs <= 3_600);
+            }
+            other => panic!("expected RateLimited, got {other:?}"),
+        }
+    }
+
+    #[tokio::test]
+    async fn uncapped_infra_key_never_refuses() {
+        let p = provider();
+        let principal = p.resolve("sk-infra").await.unwrap();
+        let r = p.reserve(&principal, 10_000_000).await.expect("uncapped");
+        p.settle(r, 10_000_000).await;
+        // Spend is still tracked even though no cap applies.
+        let snap = p.snapshot(&principal).await.unwrap();
+        assert_eq!(snap.hard_cap, None);
+        assert_eq!(snap.spent, 10_000_000);
+    }
+}
--- a/crates/cortex-gateway/src/error.rs
+++ b/crates/cortex-gateway/src/error.rs
@@ -0,0 +1,24 @@
+//! Gateway adapter that turns the shared, axum-agnostic
+//! [`cortex_core::error_envelope::OpenAiError`] into an axum [`Response`],
+//! setting the `Retry-After` header when the envelope carries one.
+//!
+//! cortex-core owns the envelope shape and the rejection contract (#60/#63);
+//! this is the only place the gateway crosses from that data into axum.
+
+use axum::http::{HeaderValue, StatusCode, header};
+use axum::response::{IntoResponse, Json, Response};
+use cortex_core::error_envelope::OpenAiError;
+
+/// Turn an [`OpenAiError`] into an axum response: its status code (falling
+/// back to `500` if the stored code is not a valid HTTP status), its JSON
+/// envelope as the body, and a `Retry-After` header when the error carries
+/// a retry hint.
+pub fn envelope_response(err: OpenAiError) -> Response {
+    // Read the plain fields first, before the body is built from `err`.
+    let retry_after = err.retry_after_secs;
+    let status = StatusCode::from_u16(err.status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+    let mut response = (status, Json(err.body())).into_response();
+
+    let retry_header =
+        retry_after.and_then(|secs| HeaderValue::from_str(&secs.to_string()).ok());
+    if let Some(value) = retry_header {
+        response.headers_mut().insert(header::RETRY_AFTER, value);
+    }
+    response
+}
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -11,6 +11,8 @@ use axum::http::HeaderMap;
 use axum::response::{IntoResponse, Json, Response};
 use axum::routing::{get, post};
 use chrono::Utc;
+use cortex_core::error_envelope::OpenAiError;
+use cortex_core::harness::ModelLimit;
 use cortex_core::node::{CortexModelEntry, ModelLocation};
 use serde_json::{Value, json};
 use std::sync::Arc;
@@ -33,6 +35,7 @@ async fn chat_completions(
    headers: HeaderMap,
    body: Bytes,
 ) -> Response {
+    log_inbound("openai-chat", "/v1/chat/completions", &body);
    let model_id = match extract_model(&body) {
        Some(m) => m,
        None => {
@@ -40,7 +43,12 @@ async fn chat_completions(
                handler = "chat_completions",
                "rejected: missing 'model' field in request body"
            );
-            return error_response(400, "missing 'model' field in request body");
+            return error_response(
+                400,
+                "invalid_request_error",
+                "missing_model_field",
+                "missing 'model' field in request body",
+            );
        }
    };

@@ -53,11 +61,7 @@ async fn chat_completions(
                error = %e,
                "route resolve failed"
            );
-            // RouteError's Display strings are short and informative
-            // ("model 'X' not found...", "no healthy nodes available")
-            // — fine to surface to the caller. The warn above carries
-            // any extra context for operators.
-            return error_response(404, &e.to_string());
+            return route_error_response(&e);
        }
    };

@@ -89,6 +93,7 @@ async fn responses(
    headers: HeaderMap,
    body: Bytes,
 ) -> Response {
+    log_inbound("openai-responses", "/v1/responses", &body);
    let model_id = match extract_model(&body) {
        Some(m) => m,
        None => {
@@ -96,7 +101,12 @@ async fn responses(
                handler = "responses",
                "rejected: missing 'model' field in request body"
            );
-            return error_response(400, "missing 'model' field in request body");
+            return error_response(
+                400,
+                "invalid_request_error",
+                "missing_model_field",
+                "missing 'model' field in request body",
+            );
        }
    };

@@ -109,7 +119,7 @@ async fn responses(
                error = %e,
                "route resolve failed"
            );
-            return error_response(404, &e.to_string());
+            return route_error_response(&e);
        }
    };

@@ -133,6 +143,7 @@ async fn completions(
    headers: HeaderMap,
    body: Bytes,
 ) -> Response {
+    log_inbound("openai-completions", "/v1/completions", &body);
    let model_id = match extract_model(&body) {
        Some(m) => m,
        None => {
@@ -140,7 +151,12 @@ async fn completions(
                handler = "completions",
                "rejected: missing 'model' field in request body"
            );
-            return error_response(400, "missing 'model' field in request body");
+            return error_response(
+                400,
+                "invalid_request_error",
+                "missing_model_field",
+                "missing 'model' field in request body",
+            );
        }
    };

@@ -153,11 +169,7 @@ async fn completions(
                error = %e,
                "route resolve failed"
            );
-            // RouteError's Display strings are short and informative
-            // ("model 'X' not found...", "no healthy nodes available")
-            // — fine to surface to the caller. The warn above carries
-            // any extra context for operators.
-            return error_response(404, &e.to_string());
+            return route_error_response(&e);
        }
    };

@@ -190,13 +202,48 @@ async fn anthropic_messages(
                error = %e,
                "rejected: invalid Anthropic request body"
            );
-            return error_response(400, "invalid Anthropic request body");
+            return error_response(
+                400,
+                "invalid_request_error",
+                "invalid_anthropic_body",
+                "invalid Anthropic request body",
+            );
        }
    };

    let model_id = anth_req.model.clone();
    let is_streaming = anth_req.stream.unwrap_or(false);

+    // Wire-debug: make the exercised path and request shape concrete
+    // rather than guesswork. `tool_history` flags whether the client is
+    // continuing a tool conversation (tool_use/tool_result blocks in the
+    // message history) vs. opening a fresh one. Full bodies ride at
+    // trace! (cortex/neuron ship at info; operator infra runs at debug).
+    if tracing::enabled!(tracing::Level::DEBUG) {
+        let n_tools = anth_req
+            .extra
+            .get("tools")
+            .and_then(Value::as_array)
+            .map(|a| a.len())
+            .unwrap_or(0);
+        let tool_history = anth_req
+            .messages
+            .iter()
+            .any(|m| anthropic_message_has_tool_blocks(&m.content));
+        tracing::debug!(
+            wire = "anthropic",
+            endpoint = "/v1/messages",
+            model = %model_id,
+            stream = is_streaming,
+            messages = anth_req.messages.len(),
+            tools = n_tools,
+            tool_history,
+            system = anth_req.system.is_some(),
+            "inbound request"
+        );
+    }
+    tracing::trace!(wire = "anthropic", body = %body_preview(&body), "inbound anthropic body");
+
    // Translate to OpenAI format.
    let openai_req = cortex_core::translate::anthropic_to_openai(anth_req);
    let openai_body = match serde_json::to_vec(&openai_req) {
@@ -208,7 +255,12 @@ async fn anthropic_messages(
                error = %e,
                "internal: failed to serialise translated OpenAI request"
            );
-            return error_response(500, "internal translation error");
+            return error_response(
+                500,
+                "api_error",
+                "internal_translation_error",
+                "internal translation error",
+            );
        }
    };

@@ -225,7 +277,7 @@ async fn anthropic_messages(
            // ("model 'X' not found...", "no healthy nodes available")
            // — fine to surface to the caller. The warn above carries
            // any extra context for operators.
-            return error_response(404, &e.to_string());
+            return route_error_response(&e);
        }
    };

@@ -235,6 +287,14 @@ async fn anthropic_messages(
    // neuron's harness sees a model name that matches what it has
    // loaded.
    let openai_body = rewrite_model_in_body(openai_body, &route.resolved_model_id);
+    // The translated body is what neuron actually sees — the reshaped
+    // OpenAI-form tools live here. Tracing it makes "did the tool
+    // definitions survive translation?" a log line, not a guess.
+    tracing::trace!(
+        wire = "anthropic",
+        body = %body_preview(&openai_body),
+        "translated openai body (sent upstream)"
+    );

    let labels = [
        ("model", route.resolved_model_id.clone()),
@@ -246,29 +306,49 @@ async fn anthropic_messages(
    }
    let start = Instant::now();

+    // Per-request metering + budget enforcement (#51/#52), same lifecycle as
+    // the OpenAI paths. Estimate from the translated OpenAI body (what neuron
+    // sees). Refuse over-cap before dispatch via the #63 envelope; otherwise
+    // build the sink consumed by whichever branch runs below.
+    let usage_sink = match crate::metering::principal_from_headers(&headers) {
+        Some(principal) => {
+            let advertised =
+                advertised_output_limit(&fleet, &route.node_name, &route.resolved_model_id).await;
+            let max_tokens = crate::metering::reservation_estimate(&openai_body, advertised);
+            match crate::metering::reserve_or_reject(
+                Arc::clone(&fleet.entitlements),
+                &principal,
+                max_tokens,
+            )
+            .await
+            {
+                Ok(guard) => Some(crate::metering::usage_sink(principal, guard)),
+                Err(env) => return crate::error::envelope_response(env),
+            }
+        }
+        None => None,
+    };
+
    if is_streaming {
-        // TODO: streaming Anthropic translation requires converting SSE format.
-        // For now, proxy the OpenAI SSE stream directly (clients that can handle
-        // OpenAI SSE will work; full Anthropic SSE translation is a follow-up).
-        let result = proxy::forward_request(
+        // Anthropic SSE translation (#24): upstream speaks OpenAI SSE;
+        // re-frame it event-by-event into Anthropic's message_start /
+        // content_block_* / message_delta / message_stop sequence.
+        let resp = crate::anthropic_sse::stream_translated(
            &fleet.http_client,
-            &route,
-            "/v1/chat/completions",
-            headers,
+            &route.endpoint,
            openai_body,
+            &model_id,
+            &route.node_name,
+            &headers,
+            usage_sink,
        )
        .await;
        metrics::histogram!("cortex_request_duration_seconds", &labels)
            .record(start.elapsed().as_secs_f64());
-        match result {
-            Ok(resp) => resp,
-            Err(e) => {
+        if !resp.status().is_success() {
            metrics::counter!("cortex_request_errors_total", &labels).increment(1);
-                // forward_request already warn'd with the wire-level
-                // detail; no need to log again here.
-                e.into_response()
-            }
        }
+        resp
    } else {
        // Non-streaming: proxy, buffer full response, translate back to Anthropic.
        let target_url = format!("{}/v1/chat/completions", route.endpoint);
@@ -280,11 +360,14 @@ async fn anthropic_messages(
            cold_start = route.cold_start,
            "proxying request"
        );
-        let upstream_resp = fleet
+        let upstream_resp = crate::auth::forward_principal_headers(
+            fleet
                .http_client
                .post(&target_url)
                .body(openai_body)
-            .header("content-type", "application/json")
+                .header("content-type", "application/json"),
+            &headers,
+        )
        .send()
        .await;

@@ -300,7 +383,12 @@ async fn anthropic_messages(
                    error = %e,
                    "upstream request failed (network)"
                );
-                return error_response(502, "upstream request failed");
+                return error_response(
+                    502,
+                    "api_error",
+                    "upstream_connection_error",
+                    "upstream request failed",
+                );
            }
        };

@@ -319,7 +407,12 @@ async fn anthropic_messages(
                body = %body_snippet,
                "upstream returned non-2xx"
            );
-            return error_response(status, &format!("upstream returned {status}"));
+            return error_response(
+                status,
+                "api_error",
+                "upstream_error",
+                &format!("upstream returned {status}"),
+            );
        }

        let body_bytes = match upstream_resp.bytes().await {
@@ -334,7 +427,12 @@ async fn anthropic_messages(
                    error = %e,
                    "failed to read upstream response body"
                );
-                return error_response(502, "failed to read upstream response");
+                return error_response(
+                    502,
+                    "api_error",
+                    "upstream_connection_error",
+                    "failed to read upstream response",
+                );
            }
        };

@@ -356,17 +454,68 @@ async fn anthropic_messages(
                        body = %body_snippet,
                        "failed to parse upstream response as OpenAI ChatCompletionResponse"
                    );
-                    return error_response(502, "malformed upstream response");
+                    return error_response(
+                        502,
+                        "api_error",
+                        "upstream_malformed_response",
+                        "malformed upstream response",
+                    );
                }
            };

        metrics::histogram!("cortex_request_duration_seconds", &labels)
            .record(start.elapsed().as_secs_f64());
+        // Settle metering with the upstream usage (#51). Scanned from the
+        // raw body — same engine-truth source as the streaming path — so we
+        // don't depend on the typed usage struct's optionality.
+        if let Some(sink) = usage_sink {
+            let tail = String::from_utf8_lossy(&body_bytes);
+            let prompt = proxy::last_count_for(&tail, "prompt_tokens").unwrap_or(0);
+            let completion = proxy::last_count_for(&tail, "completion_tokens").unwrap_or(0);
+            sink(prompt, completion);
+        }
+        // Did the model actually produce a structured tool call, or just
+        // text? This is the single most useful signal for "is tool
+        // calling working end-to-end" — a `false` here alongside a
+        // request that carried tools means the model improvised an
+        // unparsed format (the original failure mode).
+        let upstream_tool_calls = openai_resp.choices.iter().any(|c| {
+            c.message
+                .extra
+                .get("tool_calls")
+                .and_then(Value::as_array)
+                .map(|a| !a.is_empty())
+                .unwrap_or(false)
+        });
+        let finish_reason = openai_resp
+            .choices
+            .first()
+            .and_then(|c| c.finish_reason.clone());
+        tracing::debug!(
+            wire = "anthropic",
+            model = %model_id,
+            node = %route.node_name,
+            upstream_tool_calls,
+            finish_reason = ?finish_reason,
+            "upstream non-streaming response"
+        );
        let anthropic_resp = cortex_core::translate::openai_to_anthropic(openai_resp);
        Json(json!(anthropic_resp)).into_response()
    }
 }

+/// Merge the self-derived limits that two neurons report for the same
+/// model (#67), keeping the one with the smaller `context` so the
+/// advertised limit fits the most-constrained deployment. A `None` side
+/// (no limit reported) yields the other side; ties keep `a`.
+fn tightest_limit(a: Option<ModelLimit>, b: Option<ModelLimit>) -> Option<ModelLimit> {
+    let (first, second) = match (a, b) {
+        (Some(first), Some(second)) => (first, second),
+        (lone, None) | (None, lone) => return lone,
+    };
+    Some(if second.context < first.context { second } else { first })
+}
+
 /// `GET /v1/models` — union of (catalogue × topology feasibility) and
 /// (currently loaded somewhere). The result is what the fleet *could*
 /// serve, not just what's already loaded — so OpenAI-compatible tools
@@ -414,6 +563,20 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
                loaded: false,
                feasible_on,
                locations: Vec::new(),
+                // Start with catalogue-declared capabilities; Pass 2 unions
+                // runtime-detected ones from loaded neurons.
+                capabilities: profile.capabilities.clone(),
+                // `limit` is no longer operator-declared (#67): the neuron
+                // self-derives it from live VRAM + throughput and reports it
+                // per loaded model — Pass 2 fills it from the neuron's
+                // ModelEntry. A catalogue `limit`, if present, is ignored
+                // (it can't track hot-swapped models or live capacity).
+                // `cost` stays operator-set and flows from the catalogue.
+                limit: None,
+                cost: profile.cost.clone(),
+                // Runtime-detected — will be OR-ed in Pass 2 from neuron data.
+                tool_call: false,
+                reasoning: false,
            },
        );
    }
@@ -438,6 +601,23 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
                    if was_loaded {
                        e.loaded = true;
                    }
+                    // Union the per-node capabilities so a model loaded
+                    // on several neurons reports every modality any of
+                    // them advertises.
+                    for cap in &entry.capabilities {
+                        if !e.capabilities.contains(cap) {
+                            e.capabilities.push(cap.clone());
+                        }
+                    }
+                    // OR-in runtime-detected capability flags from the neuron.
+                    e.tool_call = e.tool_call || entry.tool_call;
+                    e.reasoning = e.reasoning || entry.reasoning;
+                    // Adopt the neuron's self-derived limit (#67). When a
+                    // model is loaded on several neurons with different
+                    // headroom, advertise the tightest (smallest context)
+                    // so a client never overflows the most-constrained
+                    // deployment that might serve it.
+                    e.limit = tightest_limit(e.limit.take(), entry.limit.clone());
                })
                .or_insert_with(|| CortexModelEntry {
                    id: model_id.clone(),
@@ -449,6 +629,11 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
                    // feasibility; leave empty.
                    feasible_on: Vec::new(),
                    locations: vec![location],
+                    capabilities: entry.capabilities.clone(),
+                    limit: entry.limit.clone(),
+                    cost: None,
+                    tool_call: entry.tool_call,
+                    reasoning: entry.reasoning,
                });
        }
    }
@@ -498,6 +683,13 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
                    loaded: false,
                    feasible_on: Vec::new(),
                    locations: vec![location],
+                    // A model that's only mid-prewarm has no loaded
+                    // location to read capabilities from yet.
+                    capabilities: Vec::new(),
+                    limit: None,
+                    cost: None,
+                    tool_call: false,
+                    reasoning: false,
                });
        }
    }
@@ -527,6 +719,11 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
                loaded: target_entry.loaded,
                feasible_on: target_entry.feasible_on,
                locations: target_entry.locations,
+                capabilities: target_entry.capabilities,
+                limit: target_entry.limit.clone(),
+                cost: target_entry.cost.clone(),
+                tool_call: target_entry.tool_call,
+                reasoning: target_entry.reasoning,
            },
        );
    }
@@ -574,8 +771,42 @@ async fn proxy_with_metrics(
        metrics::counter!("cortex_cold_starts_total", &labels).increment(1);
    }

+    // Per-request metering + budget enforcement (#51/#52): reconstruct the
+    // principal from the middleware-stamped headers, reserve the request's
+    // upper-bound cost (prompt estimate + max output), and build the
+    // completion sink that settles actual spend when the response finishes.
+    // A reservation over the hard cap is refused *before* dispatch with the
+    // #63 envelope. Anonymous requests skip all of this. Must happen before
+    // `headers`/`body` are moved into the proxy.
+    let usage_sink = match crate::metering::principal_from_headers(&headers) {
+        Some(principal) => {
+            let advertised = advertised_output_limit(fleet, &route.node_name, model_id).await;
+            let max_tokens = crate::metering::reservation_estimate(&body, advertised);
+            match crate::metering::reserve_or_reject(
+                Arc::clone(&fleet.entitlements),
+                &principal,
+                max_tokens,
+            )
+            .await
+            {
+                Ok(guard) => Some(crate::metering::usage_sink(principal, guard)),
+                Err(env) => return crate::error::envelope_response(env),
+            }
+        }
+        None => None,
+    };
+
    let start = Instant::now();
-    let result = proxy::forward_request(&fleet.http_client, route, path, headers, body).await;
+    let result = proxy::forward_request(
+        &fleet.http_client,
+        route,
+        path,
+        headers,
+        body,
+        model_id,
+        usage_sink,
+    )
+    .await;
    let duration = start.elapsed();

    match result {
@@ -594,6 +825,25 @@ async fn proxy_with_metrics(
    }
 }

+/// Look up the model's advertised `limit.output` (#62) on a given node.
+/// Budget reservations (#52) use it as the default output budget when the
+/// request omits `max_(completion_)tokens`. Returns `None` when the node,
+/// the model, or its limit is unknown — callers then fall back to
+/// [`crate::metering::FALLBACK_MAX_OUTPUT`]. Takes the read lock on
+/// `fleet.nodes` for the duration of the lookup.
+async fn advertised_output_limit(
+    fleet: &CortexState,
+    node_name: &str,
+    model_id: &str,
+) -> Option<u64> {
+    let nodes = fleet.nodes.read().await;
+    // Each `?` bails out with `None` at the first missing link in the chain.
+    let node = nodes.get(node_name)?;
+    let model = node.models.get(model_id)?;
+    let limit = model.limit.as_ref()?;
+    Some(limit.output as u64)
+}
+
 /// Update `last_accessed` timestamp for a model on a node (drives LRU eviction).
 async fn touch_model(fleet: &CortexState, node_name: &str, model_id: &str) {
    let mut nodes = fleet.nodes.write().await;
@@ -609,6 +859,57 @@ fn extract_model(body: &[u8]) -> Option<String> {
    v.get("model")?.as_str().map(|s| s.to_string())
 }

+/// Emit a uniform wire-debug summary for an OpenAI-family inbound
+/// request (chat/completions, completions, responses). Makes which
+/// surface a client exercised — and whether it sent tools / asked for
+/// streaming — a concrete log line. The full body rides at trace!, even
+/// when it is not valid JSON (only the debug summary is skipped then).
+///
+/// Parsing is gated on the debug level being enabled so info-level
+/// deployments pay nothing.
+fn log_inbound(wire: &str, endpoint: &str, body: &[u8]) {
+    if tracing::enabled!(tracing::Level::DEBUG) {
+        // An unparseable body skips the summary only; the trace below still runs.
+        if let Ok(v) = serde_json::from_slice::<Value>(body) {
+            let model = v.get("model").and_then(Value::as_str).unwrap_or("?");
+            let stream = v.get("stream").and_then(Value::as_bool).unwrap_or(false);
+            let tools = v
+                .get("tools")
+                .and_then(Value::as_array)
+                .map(|a| a.len())
+                .unwrap_or(0);
+            tracing::debug!(wire, endpoint, model, stream, tools, "inbound request");
+        }
+    }
+    tracing::trace!(wire, endpoint, body = %body_preview(body), "inbound body");
+}
+
+/// Report whether an Anthropic message carries a `tool_use` or
+/// `tool_result` content block, i.e. the client is continuing a tool
+/// conversation. Plain-text content never does.
+fn anthropic_message_has_tool_blocks(content: &cortex_core::anthropic::AnthropicContent) -> bool {
+    use cortex_core::anthropic::AnthropicContent::{Blocks, Text};
+    let is_tool = |kind: &str| kind == "tool_use" || kind == "tool_result";
+    match content {
+        Text(_) => false,
+        Blocks(blocks) => blocks.iter().any(|b| is_tool(b.block_type.as_str())),
+    }
+}
+
+/// Render a UTF-8-safe, length-capped preview of a request/response
+/// body for trace logging. The cap counts characters and the cut lands on
+/// a character boundary, so it can never split a multi-byte codepoint.
+fn body_preview(body: &[u8]) -> String {
+    const MAX_CHARS: usize = 8192;
+    let text = String::from_utf8_lossy(body);
+    // Byte offset of char MAX_CHARS; present only when the text exceeds the cap.
+    let cut = text.char_indices().nth(MAX_CHARS).map(|(at, _)| at);
+    match cut {
+        Some(at) => format!("{}…<truncated, {} bytes total>", &text[..at], body.len()),
+        None => text.into_owned(),
+    }
+}
+
 /// Rewrite the `model` field of an OpenAI-style JSON request body to
 /// the resolved concrete id. Returns the original bytes if `new_model`
 /// matches what's already there or the body fails to parse — the
@@ -641,14 +942,16 @@ fn rewrite_model_in_body(body: Bytes, new_model: &str) -> Bytes {
    }
 }

-fn error_response(status: u16, message: &str) -> Response {
-    let code = axum::http::StatusCode::from_u16(status)
-        .unwrap_or(axum::http::StatusCode::INTERNAL_SERVER_ERROR);
-    let body = json!({
-        "error": {
-            "message": message,
-            "type": "gateway_error",
+fn error_response(status: u16, typ: &str, code: &str, message: &str) -> Response {
+    crate::error::envelope_response(OpenAiError::new(status, typ, code, message))
 }
-    });
-    (code, Json(body)).into_response()
+
+/// Convert a [`RouteError`] into the standard error envelope; variants that
+/// report a retry delay also get `Retry-After` attached (#63).
+fn route_error_response(e: &router::RouteError) -> Response {
+    let env = OpenAiError::new(e.http_status(), e.broad_type(), e.code(), e.to_string());
+    crate::error::envelope_response(match e.retry_after_secs() {
+        Some(secs) => env.with_retry_after(secs),
+        None => env,
+    })
 }
--- a/crates/cortex-gateway/src/lib.rs
+++ b/crates/cortex-gateway/src/lib.rs
@@ -1,5 +1,10 @@
+pub mod anthropic_sse;
+pub mod auth;
+pub mod entitlements_local;
+pub mod error;
 pub mod evictor;
 pub mod handlers;
+pub mod metering;
 pub mod metrics;
 pub mod poller;
 pub mod proxy;
@@ -8,15 +13,26 @@ pub mod state;

 use anyhow::Result;
 use axum::Router;
+use axum::middleware::from_fn_with_state;
 use cortex_core::config::GatewayConfig;
 use std::sync::Arc;
 use tower_http::cors::CorsLayer;
 use tower_http::trace::TraceLayer;

 /// Build the Axum application router with all routes wired up.
+///
+/// Layer order (outermost first): trace → CORS → auth → handlers. CORS is
+/// outer to auth so preflight `OPTIONS` short-circuits before resolution;
+/// auth (`require_principal`) resolves the bearer key, attaches the
+/// principal, and stamps the internal principal headers before any handler
+/// runs.
 pub fn build_app(fleet: Arc<state::CortexState>) -> Router {
    Router::new()
        .merge(handlers::api_routes())
+        .layer(from_fn_with_state(
+            Arc::clone(&fleet),
+            auth::require_principal,
+        ))
        .layer(CorsLayer::permissive())
        .layer(TraceLayer::new_for_http())
        .with_state(fleet)
--- a/crates/cortex-gateway/src/metering.rs
+++ b/crates/cortex-gateway/src/metering.rs
@@ -0,0 +1,219 @@
+//! Per-request token metering (#51).
+//!
+//! Captures the real `(prompt, completion)` usage of every request and feeds
+//! it to two places: the [`EntitlementProvider`] spend ledger (via
+//! reserve→settle) and per-principal Prometheus counters. The principal is
+//! reconstructed from the internal headers the auth middleware stamped (#49),
+//! so this works uniformly across every proxy path without threading the
+//! typed principal through each handler.
+//!
+//! The reserve→settle lifecycle also carries budget enforcement (#52):
+//! [`reserve_or_reject`] reserves the request's upper-bound cost (see
+//! [`reservation_estimate`]: prompt estimate + max output) *before*
+//! dispatch and maps a [`BudgetError`] refusal to the error envelope;
+//! settle then corrects the ledger to the tokens actually consumed.
+//!
+//! [`ReservationGuard`] makes leaks impossible: settling records actual
+//! spend and releases the unused remainder; dropping a guard that was never
+//! settled releases the whole reservation. So an early return, error path,
+//! or dropped stream can't strand a reservation.
+
+use axum::http::HeaderMap;
+use cortex_core::entitlements::{
+    BudgetError, EntitlementProvider, HEADER_ACCOUNT_ID, HEADER_KEY_ID, Principal,
+};
+use cortex_core::error_envelope::OpenAiError;
+use std::sync::Arc;
+
+/// Fallback output-token budget when neither the request nor the model's
+/// advertised limit gives one. Bounds the reservation so a capped key is
+/// still gated even on under-specified requests (#52).
+pub const FALLBACK_MAX_OUTPUT: u64 = 4096;
+
+/// Invoked exactly once at request completion with best-effort
+/// `(prompt_tokens, completion_tokens)`. When no usage could be observed
+/// (e.g. a pre-dispatch failure or a dropped stream) it is dropped unused —
+/// which releases the held reservation via [`ReservationGuard`]'s `Drop`.
+pub type UsageSink = Box<dyn FnOnce(u64, u64) + Send>;
+
+/// Rebuild the principal from the internal headers stamped by the auth
+/// middleware, which strips client copies first, so they can be trusted
+/// inside cortex. `None` for anonymous (unauthenticated) requests.
+pub fn principal_from_headers(headers: &HeaderMap) -> Option<Principal> {
+    let owned = |v: &axum::http::HeaderValue| v.to_str().ok().map(str::to_owned);
+    let account_id = headers.get(HEADER_ACCOUNT_ID).and_then(owned)?;
+    let key_id = headers.get(HEADER_KEY_ID).and_then(owned)?;
+    Some(Principal { account_id, key_id })
+}
+
+/// Emit per-principal spend counters (#51). Labelled by account/key only —
+/// both are operator-bounded, so cardinality is controlled.
+pub fn record_spend(principal: &Principal, prompt: u64, completion: u64) {
+    let labels = [
+        ("account", principal.account_id.clone()),
+        ("key", principal.key_id.clone()),
+    ];
+    metrics::counter!("cortex_spend_tokens_total", &labels).increment(prompt + completion);
+    metrics::counter!("cortex_spend_prompt_tokens_total", &labels).increment(prompt);
+    metrics::counter!("cortex_spend_completion_tokens_total", &labels).increment(completion);
+}
+
+/// Holds a budget reservation for the life of a request. [`settle`] records
+/// actual spend and releases the remainder; an un-settled guard releases the
+/// whole reservation when dropped. Anonymous requests carry an empty guard,
+/// where every operation is a no-op.
+///
+/// [`settle`]: ReservationGuard::settle
+pub struct ReservationGuard {
+    provider: Arc<dyn EntitlementProvider>,
+    reservation: Option<cortex_core::entitlements::Reservation>,
+}
+
+impl ReservationGuard {
+    /// An empty guard for an anonymous request — no reservation to resolve.
+    pub fn anonymous(provider: Arc<dyn EntitlementProvider>) -> Self {
+        Self {
+            provider,
+            reservation: None,
+        }
+    }
+
+    /// Wrap an already-acquired reservation.
+    fn held(
+        provider: Arc<dyn EntitlementProvider>,
+        reservation: cortex_core::entitlements::Reservation,
+    ) -> Self {
+        Self {
+            provider,
+            reservation: Some(reservation),
+        }
+    }
+
+    /// Settle with the tokens actually consumed, disarming the drop-release.
+    /// Spawns the (fast, in-process for the local provider) settle so the
+    /// caller — which may be a sync stream-completion callback — needn't
+    /// await.
+    pub fn settle(mut self, actual_tokens: u64) {
+        if let Some(reservation) = self.reservation.take() {
+            let provider = Arc::clone(&self.provider);
+            tokio::spawn(async move {
+                provider.settle(reservation, actual_tokens).await;
+            });
+        }
+    }
+}
+
+impl Drop for ReservationGuard {
+    fn drop(&mut self) {
+        // `tokio::spawn` panics outside a runtime; a destructor must not panic.
+        let runtime = tokio::runtime::Handle::try_current().ok();
+        if let (Some(reservation), Some(runtime)) = (self.reservation.take(), runtime) {
+            let provider = Arc::clone(&self.provider);
+            runtime.spawn(async move { provider.release(reservation).await });
+        }
+    }
+}
+
+/// Build the completion sink for an authenticated request: it records spend
+/// and settles the reservation; dropped unused, the guard releases it.
+pub fn usage_sink(principal: Principal, guard: ReservationGuard) -> UsageSink {
+    let on_complete = move |prompt: u64, completion: u64| {
+        record_spend(&principal, prompt, completion);
+        guard.settle(prompt + completion);
+    };
+    Box::new(on_complete)
+}
+
+/// Reserve the request's upper-bound token cost for the principal *before*
+/// dispatch (#52). `Ok` carries the guard the caller settles with actual
+/// usage. A refusal from the provider comes back as the #63 envelope
+/// (`rate_limit_exceeded` + `Retry-After` for a resetting window,
+/// `insufficient_quota` for a hard balance — never `402`).
+pub async fn reserve_or_reject(
+    provider: Arc<dyn EntitlementProvider>,
+    principal: &Principal,
+    max_tokens: u64,
+) -> Result<ReservationGuard, OpenAiError> {
+    let reserved = provider.reserve(principal, max_tokens).await;
+    reserved
+        .map(|reservation| ReservationGuard::held(provider, reservation))
+        .map_err(budget_error_to_envelope)
+}
+
+/// Map a [`BudgetError`] to the #63 envelope. The provider chose the window
+/// semantics; this only translates them to HTTP.
+fn budget_error_to_envelope(err: BudgetError) -> OpenAiError {
+    match err {
+        BudgetError::RateLimited {
+            retry_after_secs, ..
+        } => OpenAiError::rate_limit_exceeded(err.to_string(), retry_after_secs),
+        BudgetError::InsufficientQuota { .. } => OpenAiError::insufficient_quota(err.to_string()),
+    }
+}
+
+/// Upper bound of tokens to reserve for a request (#52): the prompt
+/// estimate plus the maximum output. The output cap is the request's own
+/// value, else `advertised_output` (the model's `limit.output`, #62), else
+/// [`FALLBACK_MAX_OUTPUT`]. Over-reserving is safe — settle corrects it.
+pub fn reservation_estimate(body: &[u8], advertised_output: Option<u64>) -> u64 {
+    let prompt_tokens = estimate_prompt_tokens(body);
+    let requested_or_advertised = requested_max_output(body).or(advertised_output);
+    let max_output = requested_or_advertised.unwrap_or(FALLBACK_MAX_OUTPUT);
+    prompt_tokens.saturating_add(max_output)
+}
+
+/// The client's requested output cap: `max_completion_tokens`, else the
+/// legacy `max_tokens` (also when the former is null or not an integer).
+fn requested_max_output(body: &[u8]) -> Option<u64> {
+    let v: serde_json::Value = serde_json::from_slice(body).ok()?;
+    ["max_completion_tokens", "max_tokens"]
+        .iter()
+        .find_map(|key| v.get(*key).and_then(serde_json::Value::as_u64))
+}
+
+/// Rough prompt-token estimate: one token per 4 bytes of the whole body,
+/// never less than 1. cortex has no tokenizer; counting the JSON overhead
+/// pads the figure, and neuron remains the exact context wall (#56/#60).
+/// Settle reconciles to the real usage afterward.
+fn estimate_prompt_tokens(body: &[u8]) -> u64 {
+    std::cmp::max(1, body.len() as u64 / 4)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn requested_max_output_prefers_max_completion_tokens() {
+        let body = br#"{"model":"m","max_completion_tokens":256,"max_tokens":99}"#;
+        assert_eq!(requested_max_output(body), Some(256));
+    }
+
+    #[test]
+    fn requested_max_output_falls_back_to_legacy_max_tokens() {
+        let body = br#"{"model":"m","max_tokens":128}"#;
+        assert_eq!(requested_max_output(body), Some(128));
+    }
+
+    #[test]
+    fn estimate_uses_requested_output_when_present() {
+        // Requested output dominates; prompt estimate is small for a tiny body.
+        let body = br#"{"model":"m","max_tokens":1000}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 1000 && est < 1100, "est was {est}");
+    }
+
+    #[test]
+    fn estimate_uses_advertised_output_when_request_omits_it() {
+        let body = br#"{"model":"m","messages":[]}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 8192, "est was {est}");
+    }
+
+    #[test]
+    fn estimate_falls_back_when_nothing_advertised() {
+        let body = br#"{"model":"m"}"#;
+        let est = reservation_estimate(body, None);
+        assert!(est >= FALLBACK_MAX_OUTPUT, "est was {est}");
+    }
+}
--- a/crates/cortex-gateway/src/metrics.rs
+++ b/crates/cortex-gateway/src/metrics.rs
@@ -46,6 +46,14 @@ fn describe_metrics() {
        "Generation throughput in tokens per second"
    );
    metrics::describe_counter!("cortex_requests_total", "Total number of proxied requests");
+    metrics::describe_counter!(
+        "cortex_prompt_tokens_total",
+        "Total prompt tokens reported by upstream usage objects"
+    );
+    metrics::describe_counter!(
+        "cortex_completion_tokens_total",
+        "Total completion tokens reported by upstream usage objects"
+    );
    metrics::describe_counter!(
        "cortex_request_errors_total",
        "Total number of failed proxy requests"
@@ -55,4 +63,16 @@ fn describe_metrics() {
        "cortex_cold_starts_total",
        "Total number of cold-start model loads"
    );
+    metrics::describe_counter!(
+        "cortex_spend_tokens_total",
+        "Total metered tokens (prompt + completion) per principal, labelled by account/key (#51)"
+    );
+    metrics::describe_counter!(
+        "cortex_spend_prompt_tokens_total",
+        "Metered prompt tokens per principal, labelled by account/key (#51)"
+    );
+    metrics::describe_counter!(
+        "cortex_spend_completion_tokens_total",
+        "Metered completion tokens per principal, labelled by account/key (#51)"
+    );
 }
--- a/crates/cortex-gateway/src/poller.rs
+++ b/crates/cortex-gateway/src/poller.rs
@@ -26,14 +26,23 @@ pub async fn poll_once(fleet: &CortexState) {
    }
 }

-/// One-shot fetch of `GET /discovery`. Cached on the NodeState forever
-/// after the first success — topology is invariant for a given neuron
-/// process. Skipped when the cache is already populated.
+/// Fetch `GET /discovery` and cache it on the NodeState — topology is
+/// invariant for a given neuron process, so a successful fetch is kept.
+/// Re-polled only while `max_prompt_tokens` is still unknown (0): on a
+/// rolling deploy cortex can win the race and cache a neuron's discovery
+/// before that neuron reports the field (it deserialises to 0). Re-polling
+/// until a real cap arrives self-heals that without periodic polling.
 async fn maybe_poll_discovery(fleet: &CortexState, name: &str, endpoint: &str) {
    {
        let nodes = fleet.nodes.read().await;
        match nodes.get(name) {
-            Some(n) if n.discovery.is_some() => return,
+            Some(n)
+                if n.discovery
+                    .as_ref()
+                    .is_some_and(|d| d.max_prompt_tokens > 0) =>
+            {
+                return;
+            }
            _ => {}
        }
    }
@@ -107,12 +116,22 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
                            .and_modify(|e| {
                                e.status = status;
                                e.vram_estimate_mb = upstream.vram_used_mb;
+                                e.capabilities = upstream.capabilities.clone();
+                                e.tool_call = upstream.tool_call;
+                                e.reasoning = upstream.reasoning;
+                                // Neuron's self-derived limit (#67) — the
+                                // authoritative source the gateway advertises.
+                                e.limit = upstream.limit.clone();
                            })
                            .or_insert_with(|| ModelEntry {
                                id: upstream.id.clone(),
                                status,
                                last_accessed: None,
                                vram_estimate_mb: upstream.vram_used_mb,
+                                capabilities: upstream.capabilities.clone(),
+                                tool_call: upstream.tool_call,
+                                reasoning: upstream.reasoning,
+                                limit: upstream.limit.clone(),
                            });
                    }

@@ -181,6 +200,9 @@ async fn poll_health(fleet: &CortexState, name: &str, endpoint: &str) {
            let mut nodes = fleet.nodes.write().await;
            if let Some(node) = nodes.get_mut(name) {
                node.activation = Some(h.activation);
+                // Per-model admission load (#53) → keyed by id for the
+                // load-aware router (#55).
+                node.model_load = h.models.into_iter().map(|m| (m.id.clone(), m)).collect();
            }
        }
        Err(e) => {
@@ -195,6 +217,7 @@ fn parse_status(s: &str) -> ModelStatus {
        "unloaded" => ModelStatus::Unloaded,
        "reloading" => ModelStatus::Reloading,
        "loading" => ModelStatus::Loading,
+        "recovering" => ModelStatus::Recovering,
        _ => ModelStatus::Loaded,
    }
 }
--- a/crates/cortex-gateway/src/proxy.rs
+++ b/crates/cortex-gateway/src/proxy.rs
@@ -9,7 +9,12 @@ use anyhow::Result;
 use axum::body::Body;
 use axum::http::{HeaderMap, StatusCode};
 use axum::response::{IntoResponse, Response};
+use futures::Stream;
+use futures::stream::BoxStream;
 use reqwest::Client;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use std::time::Instant;

 /// Proxy a request body to the resolved backend node and stream the response.
 ///
@@ -25,7 +30,10 @@ pub async fn forward_request(
    path: &str,
    headers: HeaderMap,
    body: bytes::Bytes,
+    model_id: &str,
+    usage_sink: Option<crate::metering::UsageSink>,
 ) -> Result<Response, ProxyError> {
+    let request_start = Instant::now();
    let url = format!("{}{}", route.endpoint, path);
    tracing::info!(
        node = %route.node_name,
@@ -73,7 +81,10 @@ pub async fn forward_request(
    let status = StatusCode::from_u16(upstream_status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);

    let resp_headers = upstream_resp.headers().clone();
-    let stream = upstream_resp.bytes_stream();
+    let stream = TokenMetricsStream::new(
+        Box::pin(upstream_resp.bytes_stream()),
+        TokenMetrics::new(model_id, &route.node_name, request_start, usage_sink),
+    );

    let body = Body::from_stream(stream);

@@ -103,19 +114,263 @@ pub enum ProxyError {

 impl IntoResponse for ProxyError {
    fn into_response(self) -> Response {
-        let (status, message) = match &self {
-            ProxyError::Upstream(_) => (StatusCode::BAD_GATEWAY, "upstream request failed"),
+        let (status, code, message) = match &self {
+            ProxyError::Upstream(_) => (
+                StatusCode::BAD_GATEWAY,
+                "upstream_connection_error",
+                "upstream request failed",
+            ),
            ProxyError::ResponseBuild(_) => (
                StatusCode::INTERNAL_SERVER_ERROR,
+                "internal_server_error",
                "failed to build response",
            ),
        };
-        let body = serde_json::json!({
-            "error": {
-                "message": message,
-                "type": "proxy_error",
-            }
-        });
-        (status, axum::Json(body)).into_response()
+        crate::error::envelope_response(cortex_core::error_envelope::OpenAiError::new(
+            status.as_u16(),
+            "api_error",
+            code,
+            message,
+        ))
+    }
+}
+
+// ── Per-request token metrics (#21) ─────────────────────────────────
+//
+// The proxy never buffers or re-serialises the upstream body — chunks
+// are forwarded verbatim. For metrics it observes each chunk's arrival
+// time and keeps a bounded tail of the body text, from which the final
+// OpenAI `usage` object (present on the last SSE chunk and on
+// non-streaming JSON bodies alike) yields engine-truth token counts.
+//
+// Emitted per request, labelled {model, node}:
+//   cortex_time_to_first_token_seconds  (histogram) — first body chunk
+//   cortex_tokens_per_second            (histogram) — completion tokens
+//       over the decode window (first→last chunk); falls back to the
+//       full request duration for single-chunk (non-streaming) bodies
+//   cortex_prompt_tokens_total / cortex_completion_tokens_total (counters)
+
+/// Cap (bytes) on the retained body tail. The usage object rides on the
+/// final chunk, so a generous tail is plenty; the cap bounds memory on huge
+/// non-streaming bodies (past it, `observe` keeps only the newest half).
+const TAIL_CAP_BYTES: usize = 64 * 1024;
+
+/// Return the integer following the LAST `"key":` in `tail` whose value is a
+/// run of ASCII digits; occurrences with any other value are passed over.
+/// Returns `None` when no occurrence qualifies.
+/// Pure and chunk-boundary-safe (the tail is contiguous appended text).
+/// Because the needle includes both quotes, `completion_tokens` never
+/// matches `completion_tokens_details`.
+pub(crate) fn last_count_for(tail: &str, key: &str) -> Option<u64> {
+    let quoted_key = format!("\"{key}\"");
+    tail.match_indices(quoted_key.as_str())
+        .filter_map(|(start, matched)| {
+            // After the key: optional whitespace, a colon, optional
+            // whitespace, then the value. No colon means not a key.
+            let after_key = tail[start + matched.len()..].trim_start();
+            let value = after_key.strip_prefix(':')?.trim_start();
+            // Parse only the leading digit run; an empty run (null, a
+            // string, a negative number) fails to parse and is skipped.
+            let digits_end = value
+                .find(|c: char| !c.is_ascii_digit())
+                .unwrap_or(value.len());
+            value[..digits_end].parse::<u64>().ok()
+        })
+        // Matches come in text order, so the last survivor is the answer.
+        .last()
+}
+
+/// Per-request recorder for the token metrics: chunk arrival times plus a
+/// bounded tail of the body text that `finish` scans for the usage counts.
+struct TokenMetrics {
+    /// `{model, node}` label pair attached to every metric emitted here.
+    labels: [(&'static str, String); 2],
+    /// Start-of-request instant handed to `new`.
+    request_start: Instant,
+    /// Arrival time of the first body chunk; `None` until one arrives.
+    first_chunk: Option<Instant>,
+    /// Arrival time of the most recent body chunk.
+    last_chunk: Option<Instant>,
+    /// Lossily decoded end of the body, trimmed once past `TAIL_CAP_BYTES`.
+    tail: String,
+    /// Set by the first `finish` call; later calls return immediately.
+    finished: bool,
+    /// Per-principal metering hook (#51). Invoked exactly once in `finish`
+    /// with the observed `(prompt, completion)` so the reservation can be
+    /// settled and spend recorded. `None` for anonymous requests.
+    usage_sink: Option<crate::metering::UsageSink>,
+}
+
+impl TokenMetrics {
+    fn new(
+        model_id: &str,
+        node_name: &str,
+        request_start: Instant,
+        usage_sink: Option<crate::metering::UsageSink>,
+    ) -> Self {
+        Self {
+            request_start,
+            usage_sink,
+            labels: [
+                ("model", model_id.to_owned()),
+                ("node", node_name.to_owned()),
+            ],
+            first_chunk: None,
+            last_chunk: None,
+            tail: String::new(),
+            finished: false,
+        }
+    }
+
+    /// Record one body chunk: stamp its arrival and append it to the tail.
+    fn observe(&mut self, chunk: &[u8]) {
+        let arrived = Instant::now();
+        self.first_chunk = self.first_chunk.or(Some(arrived));
+        self.last_chunk = Some(arrived);
+        self.tail.push_str(&String::from_utf8_lossy(chunk));
+        if self.tail.len() > TAIL_CAP_BYTES {
+            // Keep the newest half — the usage object sits at the very
+            // end of the body — cutting at the next char boundary.
+            let keep_from = self.tail.len() - TAIL_CAP_BYTES / 2;
+            let cut = (keep_from..=self.tail.len())
+                .find(|&at| self.tail.is_char_boundary(at))
+                .unwrap_or(self.tail.len());
+            self.tail.drain(..cut);
+        }
+    }
+
+    /// Emit the metrics exactly once — called on clean stream end and from
+    /// Drop, so a client disconnect mid-stream still records what we saw.
+    fn finish(&mut self) {
+        if std::mem::replace(&mut self.finished, true) {
+            return;
+        }
+        let prompt = last_count_for(&self.tail, "prompt_tokens");
+        let completion = last_count_for(&self.tail, "completion_tokens");
+
+        // Per-model metrics — only when body chunks actually arrived.
+        if let Some(first) = self.first_chunk {
+            let ttft = first.duration_since(self.request_start).as_secs_f64();
+            metrics::histogram!("cortex_time_to_first_token_seconds", &self.labels).record(ttft);
+            if let Some(tokens) = prompt {
+                metrics::counter!("cortex_prompt_tokens_total", &self.labels).increment(tokens);
+            }
+            if let Some(generated) = completion.filter(|&count| count > 0) {
+                metrics::counter!("cortex_completion_tokens_total", &self.labels)
+                    .increment(generated);
+                let last = self.last_chunk.unwrap_or(first);
+                // Streaming: rate over the decode window (first→last chunk).
+                // Non-streaming bodies arrive as ~one chunk (window ≈ 0), so
+                // the only honest denominator is the full request duration.
+                let secs = match last.duration_since(first).as_secs_f64() {
+                    window if window >= 0.1 => window,
+                    _ => last.duration_since(self.request_start).as_secs_f64(),
+                };
+                if secs > 0.0 {
+                    metrics::histogram!("cortex_tokens_per_second", &self.labels)
+                        .record(generated as f64 / secs);
+                }
+            }
+        }
+
+        // Per-principal metering + reservation settle (#51). Always runs, even
+        // with no usage/body observed: sink(0, 0) → settle 0 → release.
+        if let Some(sink) = self.usage_sink.take() {
+            sink(prompt.unwrap_or(0), completion.unwrap_or(0));
+        }
+    }
+}
+
+/// Pass-through stream wrapper that feeds [`TokenMetrics`]: every upstream
+/// item is returned unchanged. Metrics are emitted at clean end-of-stream;
+/// the Drop impl covers client disconnects and any other early exit.
+struct TokenMetricsStream {
+    inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
+    metrics: TokenMetrics,
+}
+
+impl TokenMetricsStream {
+    fn new(
+        inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
+        metrics: TokenMetrics,
+    ) -> Self {
+        Self { inner, metrics }
+    }
+}
+
+impl Stream for TokenMetricsStream {
+    type Item = Result<bytes::Bytes, reqwest::Error>;
+
+    /// Poll upstream and hand its answer back as-is, peeking at it first:
+    /// a successful chunk is observed, end-of-stream finalises the metrics.
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        let polled = this.inner.as_mut().poll_next(cx);
+        match &polled {
+            Poll::Ready(Some(Ok(chunk))) => this.metrics.observe(chunk),
+            Poll::Ready(None) => this.metrics.finish(),
+            // Errors and Pending pass straight through untouched.
+            Poll::Ready(Some(Err(_))) | Poll::Pending => {}
+        }
+        polled
+    }
+}
+
+impl Drop for TokenMetricsStream {
+    /// Finalise on drop; `finish` returns early if it has already run.
+    fn drop(&mut self) {
+        self.metrics.finish();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::last_count_for;
+
+    #[test]
+    fn extracts_counts_from_final_sse_usage_chunk() {
+        let sse = concat!(
+            "data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n",
+            "data: {\"choices\":[],\"usage\":{\"prompt_tokens\":225,",
+            "\"completion_tokens\":42,\"total_tokens\":267}}\n\n",
+            "data: [DONE]\n\n"
+        );
+        assert_eq!(Some(225), last_count_for(sse, "prompt_tokens"));
+        assert_eq!(Some(42), last_count_for(sse, "completion_tokens"));
+    }
+
+    #[test]
+    fn extracts_counts_from_non_streaming_body() {
+        let json = "{\"choices\":[{\"message\":{\"content\":\"hi\"}}],\
+                    \"usage\":{\"prompt_tokens\": 12, \"completion_tokens\": 7}}";
+        assert_eq!(Some(12), last_count_for(json, "prompt_tokens"));
+        assert_eq!(Some(7), last_count_for(json, "completion_tokens"));
+    }
+
+    #[test]
+    fn ignores_details_variants_and_takes_last_occurrence() {
+        // `completion_tokens_details` must not be read as `completion_tokens`,
+        // and the final usage object wins over an earlier one (e.g. content
+        // that echoed a usage-shaped string earlier in the stream).
+        let sse = concat!(
+            "data: {\"usage\":{\"completion_tokens\":1}}\n\n",
+            "data: {\"usage\":{\"completion_tokens\":99,",
+            "\"completion_tokens_details\":{\"reasoning_tokens\":3}}}\n\n"
+        );
+        assert_eq!(Some(99), last_count_for(sse, "completion_tokens"));
+    }
+
+    #[test]
+    fn absent_keys_yield_none() {
+        // (tail, key) pairs with nothing countable: the key is missing, the
+        // tail is empty, or the key is present with a non-numeric value.
+        let cases = [
+            ("data: [DONE]\n\n", "completion_tokens"),
+            ("", "prompt_tokens"),
+            ("\"completion_tokens\": null", "completion_tokens"),
+        ];
+        for (tail, key) in cases {
+            assert_eq!(last_count_for(tail, key), None);
+        }
    }
 }
--- a/crates/cortex-gateway/src/router.rs
+++ b/crates/cortex-gateway/src/router.rs
@@ -56,6 +56,59 @@ pub enum RouteError {
        node: String,
        message: String,
    },
+    #[error(
+        "model '{model_id}' is recovering on node '{node}' (device context rebuild in progress) — retry shortly"
+    )]
+    ModelRecovering { model_id: String, node: String },
+}
+
+impl RouteError {
+    /// HTTP status the gateway answers with. The transient variants
+    /// (`NoHealthyNodes`, `ModelRecovering`) map to 503 — the same request
+    /// is safe to retry — and every other variant maps to 404.
+    pub fn http_status(&self) -> u16 {
+        match self {
+            Self::NoHealthyNodes | Self::ModelRecovering { .. } => 503,
+            _ => 404,
+        }
+    }
+
+    /// Broad OpenAI error category placed in the JSON envelope.
+    pub fn broad_type(&self) -> &'static str {
+        match self {
+            Self::ModelNotFound(_) => "invalid_request_error",
+            Self::NoHealthyNodes
+            | Self::EndpointResolveFailed(_, _)
+            | Self::NoFeasibleNeuron { .. }
+            | Self::ColdLoadFailed { .. }
+            | Self::ModelRecovering { .. } => "api_error",
+        }
+    }
+
+    /// Specific machine-readable error code for the JSON envelope.
+    pub fn code(&self) -> &'static str {
+        match self {
+            Self::ModelNotFound(_) => "model_not_found",
+            Self::NoHealthyNodes
+            | Self::EndpointResolveFailed(_, _)
+            | Self::NoFeasibleNeuron { .. }
+            | Self::ColdLoadFailed { .. }
+            | Self::ModelRecovering { .. } => "service_unavailable",
+        }
+    }
+
+    /// Seconds to advertise in `Retry-After` (#63), for the transient
+    /// variants only. `NoHealthyNodes` may clear once the poller re-marks a
+    /// node healthy; `ModelRecovering` clears once the device context has
+    /// been rebuilt — both are safe to retry. Every other variant is
+    /// permanent for this request (404) and carries no hint.
+    pub fn retry_after_secs(&self) -> Option<u64> {
+        match self {
+            Self::ModelRecovering { .. } => Some(2),
+            Self::NoHealthyNodes => Some(5),
+            _ => None,
+        }
+    }
 }

 /// Resolve which node should serve a request for the given model.
@@ -76,11 +129,14 @@ pub async fn resolve(
            "alias resolved"
        );
    }
-    // Snapshot loaded / unloaded state from the poller cache.
-    let (loaded_route, unloaded_route, any_healthy) = {
+    // Snapshot loaded / unloaded / recovering state from the poller cache.
+    let (loaded_route, unloaded_route, recovering_node, any_healthy) = {
        let nodes = fleet.nodes.read().await;
-        let mut loaded_route = None;
+        // All healthy nodes with the model loaded, each with its current
+        // admission load (#53) so we can pick the least-busy replica (#55).
+        let mut loaded_candidates: Vec<(String, String, usize)> = Vec::new();
        let mut unloaded_route = None;
+        let mut recovering_node = None;
        let mut any_healthy = false;
        for node in nodes.values() {
            if !node.healthy {
@@ -90,14 +146,32 @@ pub async fn resolve(
            if let Some(entry) = node.models.get(model_id) {
                match entry.status {
                    ModelStatus::Loaded | ModelStatus::Reloading => {
-                        loaded_route = Some((node.name.clone(), node.endpoint.clone(), false));
-                        break;
+                        // Least-busy score: in-flight + queued from the
+                        // neuron's last /health (#53). Unknown load (no poll
+                        // yet) scores 0 so the replica stays eligible.
+                        let score = node
+                            .model_load
+                            .get(model_id)
+                            .map(|l| l.in_flight + l.queue_depth)
+                            .unwrap_or(0);
+                        loaded_candidates.push((node.name.clone(), node.endpoint.clone(), score));
                    }
                    ModelStatus::Unloaded => {
                        if unloaded_route.is_none() {
                            unloaded_route = Some((node.name.clone(), node.endpoint.clone(), true));
                        }
                    }
+                    // Auto-recovering (#17/#20): the model is rebuilding
+                    // its device context on this node. Hold the route —
+                    // answer "retry shortly" rather than 404, and do NOT
+                    // fall through to the catalogue cold-load, which
+                    // would race a second placement (and a second copy's
+                    // worth of VRAM) against the in-flight recovery.
+                    ModelStatus::Recovering => {
+                        if recovering_node.is_none() {
+                            recovering_node = Some(node.name.clone());
+                        }
+                    }
                    // Loading is gateway-synthesised from neuron's
                    // activation snapshot; it never appears on the
                    // wire from neuron's `/models`. Skip — the model
@@ -110,7 +184,13 @@ pub async fn resolve(
                }
            }
        }
-        (loaded_route, unloaded_route, any_healthy)
+        // Pick the least-busy loaded replica; ties break by node name for
+        // deterministic routing. `false` = not a cold start.
+        let loaded_route = loaded_candidates
+            .into_iter()
+            .min_by(|a, b| a.2.cmp(&b.2).then_with(|| a.0.cmp(&b.0)))
+            .map(|(name, endpoint, _score)| (name, endpoint, false));
+        (loaded_route, unloaded_route, recovering_node, any_healthy)
    };

    if !any_healthy {
@@ -122,12 +202,20 @@ pub async fn resolve(
        return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
    }

-    // Priority 2: known to neuron but unloaded (neuron's lazy load).
+    // Priority 2: recovering somewhere — transient hold, not a reroute.
+    if let Some(node) = recovering_node {
+        return Err(RouteError::ModelRecovering {
+            model_id: model_id.to_string(),
+            node,
+        });
+    }
+
+    // Priority 3: known to neuron but unloaded (neuron's lazy load).
    if let Some((node_name, neuron_endpoint, cold_start)) = unloaded_route {
        return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
    }

-    // Priority 3: catalogue × topology cold-load.
+    // Priority 4: catalogue × topology cold-load.
    if let Some(profile) = fleet.catalogue.get(model_id) {
        let (node_name, neuron_endpoint) = pick_feasible_neuron(fleet, profile).await?;
        cold_load(fleet, &node_name, &neuron_endpoint, profile).await?;
@@ -244,6 +332,10 @@ async fn cold_load(
                    status: ModelStatus::Loaded,
                    last_accessed: Some(chrono::Utc::now()),
                    vram_estimate_mb: profile.vram_mb,
+                    capabilities: Vec::new(),
+                    tool_call: false,
+                    reasoning: false,
+                    limit: None,
                },
            );
        }
@@ -292,7 +384,7 @@ async fn profile_to_spec(
    };

    ModelSpec {
-        model_id: profile.id.clone(),
+        model_id: qualified_model_id(profile),
        harness: profile.harness.clone(),
        quant: profile.quant.clone(),
        tensor_parallel,
@@ -300,6 +392,22 @@ async fn profile_to_spec(
    }
 }

+/// Build the model id placed in the `ModelSpec`. When the profile declares
+/// a non-empty `source`, the id is prefixed as `<source>:<id>` so neuron
+/// resolves the load against that registry; without the prefix a profile
+/// pointing at the helexa registry would resolve via neuron's
+/// `default_source` (typically `huggingface`) and fetch bytes from the
+/// wrong place. Profiles with no (or an empty) `source` pass the bare id
+/// through, preserving the pre-Phase-3 contract.
+///
+/// Kept at module scope so unit tests can call it without CortexState.
+fn qualified_model_id(profile: &ModelProfile) -> String {
+    match profile.source.as_deref().filter(|s| !s.is_empty()) {
+        Some(scheme) => format!("{scheme}:{}", profile.id),
+        None => profile.id.clone(),
+    }
+}
+
 /// Resolve neuron's `/models/{id}/endpoint` to its inference URL and
 /// build the final `RouteDecision`. Shared by all three priority
 /// branches above.
@@ -375,7 +483,46 @@ fn rewrite_loopback_host(inference_url: &str, neuron_endpoint: &str) -> Option<S

 #[cfg(test)]
 mod tests {
-    use super::rewrite_loopback_host;
+    use super::{ModelProfile, qualified_model_id, rewrite_loopback_host};
+
+    fn bare_profile(id: &str, source: Option<&str>) -> ModelProfile {
+        ModelProfile {
+            id: id.to_owned(), // varied per test
+            source: source.map(str::to_owned), // varied per test
+            harness: "candle".into(),
+            min_devices: 1,
+            pinned_on: Vec::new(),
+            capabilities: Vec::new(),
+            quant: None,
+            vram_mb: None,
+            min_device_vram_mb: None,
+            limit: None,
+            cost: None,
+        }
+    }
+
+    #[test]
+    fn qualified_id_passes_through_when_source_absent() {
+        let profile = bare_profile("Qwen/Qwen3-30B", None);
+        assert_eq!("Qwen/Qwen3-30B", qualified_model_id(&profile));
+    }
+
+    #[test]
+    fn qualified_id_prefixes_when_source_set() {
+        // A declared scheme is prepended to the catalogue id, separated by
+        // a single colon.
+        let profile = bare_profile("Helexa/Qwen3.6-27B-Uncensored", Some("helexa"));
+        let qualified = qualified_model_id(&profile);
+        assert_eq!(qualified, "helexa:Helexa/Qwen3.6-27B-Uncensored");
+    }
+
+    #[test]
+    fn qualified_id_passes_through_when_source_is_empty_string() {
+        // An empty scheme counts as absent, so the bare id goes through and
+        // neuron's default_source substitution kicks in.
+        let profile = bare_profile("Qwen/Qwen3-30B", Some(""));
+        assert_eq!("Qwen/Qwen3-30B", qualified_model_id(&profile));
+    }

    #[test]
    fn rewrites_localhost_keeps_port_and_path() {
--- a/crates/cortex-gateway/src/state.rs
+++ b/crates/cortex-gateway/src/state.rs
@@ -1,7 +1,10 @@
+use crate::entitlements_local::LocalEntitlementProvider;
 use cortex_core::catalogue::ModelCatalogue;
 use cortex_core::config::{EvictionSettings, GatewayConfig, NeuronEndpoint};
+use cortex_core::entitlements::EntitlementProvider;
 use cortex_core::node::NodeState;
 use std::collections::HashMap;
+use std::sync::Arc;
 use tokio::sync::RwLock;

 /// Shared fleet state, protected by a RwLock for concurrent reader access.
@@ -11,6 +14,12 @@ pub struct CortexState {
    pub eviction: EvictionSettings,
    pub catalogue: ModelCatalogue,
    pub http_client: reqwest::Client,
+    /// Resolves bearer keys to principals and enforces token budgets (#47).
+    /// A local/static provider today (#50); the upstream client later (#57).
+    pub entitlements: Arc<dyn EntitlementProvider>,
+    /// Whether to reject unauthenticated requests (#49). Read by the auth
+    /// middleware once it lands.
+    pub require_auth: bool,
 }

 impl CortexState {
@@ -28,12 +37,16 @@ impl CortexState {
                    last_poll: None,
                    discovery: None,
                    activation: None,
+                    model_load: HashMap::new(),
                },
            );
        }

        let catalogue = ModelCatalogue::load(&config.models_config);

+        let entitlements: Arc<dyn EntitlementProvider> =
+            Arc::new(LocalEntitlementProvider::from_config(&config.entitlements));
+
        Self {
            nodes: RwLock::new(nodes),
            neuron_configs: config.neurons.clone(),
@@ -43,6 +56,8 @@ impl CortexState {
                .timeout(std::time::Duration::from_secs(300))
                .build()
                .expect("failed to build HTTP client"),
+            entitlements,
+            require_auth: config.entitlements.require_auth,
        }
    }
 }
--- a/crates/cortex-gateway/tests/aliases.rs
+++ b/crates/cortex-gateway/tests/aliases.rs
@@ -56,6 +56,7 @@ async fn test_alias_resolves_in_chat_completions() {
            endpoint: mock_url,
        }],
        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -74,6 +75,10 @@ async fn test_alias_resolves_in_chat_completions() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -137,6 +142,7 @@ async fn test_aliases_surface_in_v1_models() {
            endpoint: mock_url,
        }],
        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -154,6 +160,10 @@ async fn test_aliases_surface_in_v1_models() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: Some(2000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -221,6 +231,7 @@ async fn test_alias_falls_through_for_unmapped_model() {
            endpoint: mock_url,
        }],
        models_config: models_path.to_string_lossy().to_string(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -235,6 +246,10 @@ async fn test_alias_falls_through_for_unmapped_model() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
--- a/crates/cortex-gateway/tests/anthropic.rs
+++ b/crates/cortex-gateway/tests/anthropic.rs
@@ -123,3 +123,212 @@ async fn test_anthropic_invalid_request() {

    assert_eq!(resp.status(), 400);
 }
+
+/// Regression test for Anthropic tool translation on `/v1/messages`.
+///
+/// The request mirrors the Claude Code shape: tools declared as
+/// `{name, description, input_schema}` and a history containing `tool_use`
+/// and `tool_result` blocks. The body captured by the mock neuron must carry
+/// OpenAI function-tool definitions, the mapped `tool_choice`, assistant
+/// `tool_calls`, and a `role:"tool"` message. This guards the failure where
+/// the model received malformed tool defs and improvised an unparseable
+/// `<tool_use_name>` format.
+#[tokio::test]
+async fn test_anthropic_tools_reshaped_for_upstream() {
+    let (mock_url, captured) = common::spawn_capturing_mock_neuron().await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let payload = json!({
+        "model": "test-model",
+        "max_tokens": 100,
+        "tools": [{
+            "name": "Read",
+            "description": "Read a file from disk",
+            "input_schema": {
+                "type": "object",
+                "properties": {"path": {"type": "string"}},
+                "required": ["path"]
+            }
+        }],
+        "tool_choice": {"type": "auto"},
+        "messages": [
+            {"role": "user", "content": "read /etc/hosts"},
+            {"role": "assistant", "content": [
+                {"type": "text", "text": "Reading it."},
+                {"type": "tool_use", "id": "toolu_42", "name": "Read",
+                 "input": {"path": "/etc/hosts"}}
+            ]},
+            {"role": "user", "content": [
+                {"type": "tool_result", "tool_use_id": "toolu_42",
+                 "content": "127.0.0.1 localhost"}
+            ]}
+        ]
+    });
+
+    let response = reqwest::Client::new()
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&payload)
+        .send()
+        .await
+        .expect("request should succeed");
+    assert_eq!(response.status(), 200);
+
+    // Clone the most recent captured body; the mutex guard is a temporary
+    // and is released at the end of this statement.
+    let upstream = captured
+        .lock()
+        .unwrap()
+        .last()
+        .cloned()
+        .expect("upstream received a request");
+
+    // Tool definitions arrive in OpenAI function form, with the Anthropic
+    // `input_schema` key gone.
+    let tool_defs = upstream["tools"].as_array().expect("tools array");
+    let function = &tool_defs[0]["function"];
+    assert_eq!(tool_defs[0]["type"], "function");
+    assert_eq!(function["name"], "Read");
+    assert_eq!(
+        function["parameters"]["properties"]["path"]["type"],
+        "string"
+    );
+    assert!(function.get("input_schema").is_none());
+
+    // `{"type": "auto"}` is flattened to the plain string.
+    assert_eq!(upstream["tool_choice"], "auto");
+
+    // Only the assistant and tool turns are inspected below; the number and
+    // order of forwarded messages is not asserted.
+    let history = upstream["messages"].as_array().expect("messages array");
+    let assistant_turn = history
+        .iter()
+        .find(|turn| turn["role"] == "assistant")
+        .expect("assistant turn");
+    let call = &assistant_turn["tool_calls"][0];
+    assert_eq!(call["id"], "toolu_42");
+    assert_eq!(call["function"]["name"], "Read");
+    // `arguments` must be the parsed object rather than a JSON string: the
+    // Qwen3.6 chat template iterates `tool_call.arguments | items`.
+    assert_eq!(
+        call["function"]["arguments"],
+        json!({"path": "/etc/hosts"})
+    );
+
+    let tool_turn = history
+        .iter()
+        .find(|turn| turn["role"] == "tool")
+        .expect("tool result turn");
+    assert_eq!(tool_turn["tool_call_id"], "toolu_42");
+    assert_eq!(tool_turn["content"], "127.0.0.1 localhost");
+}
+
+/// #24: with `"stream": true`, `/v1/messages` must answer with a translated
+/// Anthropic SSE stream instead of passing raw OpenAI frames through.
+/// Checks the content type, the exact event order, the reassembled text and
+/// the `message_delta` fallbacks.
+#[tokio::test]
+async fn test_anthropic_streaming_sse_translation() {
+    let chunk_delay = std::time::Duration::from_millis(20);
+    let mock_url = common::spawn_streaming_mock_neuron(4, chunk_delay).await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let response = reqwest::Client::new()
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "max_tokens": 64,
+            "stream": true,
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(response.status(), 200);
+    let content_type = response
+        .headers()
+        .get("content-type")
+        .and_then(|value| value.to_str().ok())
+        .unwrap_or("");
+    assert!(
+        content_type.starts_with("text/event-stream"),
+        "anthropic stream must be SSE"
+    );
+
+    let body = response.text().await.expect("stream should complete");
+    assert!(
+        !body.contains("chat.completion.chunk"),
+        "raw OpenAI frames must not leak through:\n{body}"
+    );
+
+    // Four upstream chunks => four content_block_delta events between the
+    // start/stop bookends.
+    let event_names: Vec<&str> = body
+        .lines()
+        .filter_map(|line| line.strip_prefix("event: "))
+        .collect();
+    assert_eq!(
+        event_names,
+        [
+            "message_start",
+            "content_block_start",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_delta",
+            "content_block_stop",
+            "message_delta",
+            "message_stop",
+        ],
+        "unexpected event sequence:\n{body}"
+    );
+
+    // Decode every `data: ` line that holds valid JSON once; both checks
+    // below read from this list.
+    let frames: Vec<serde_json::Value> = body
+        .lines()
+        .filter_map(|line| line.strip_prefix("data: "))
+        .filter_map(|raw| serde_json::from_str(raw).ok())
+        .collect();
+
+    // Reassemble the text deltas: the mock emits token0..token3.
+    let text: String = frames
+        .iter()
+        .filter(|frame| frame["type"] == "content_block_delta")
+        .filter_map(|frame| frame["delta"]["text"].as_str())
+        .collect();
+    assert_eq!(text, "token0token1token2token3");
+
+    // The mock sends no finish_reason, so stop_reason defaults to end_turn
+    // and output_tokens falls back to the delta count.
+    let message_delta = frames
+        .iter()
+        .find(|frame| frame["type"] == "message_delta")
+        .expect("message_delta event present");
+    assert_eq!(message_delta["delta"]["stop_reason"], "end_turn");
+    assert_eq!(message_delta["usage"]["output_tokens"], 4);
+}
+
+/// #24: when the upstream stream ends with a usage frame (the
+/// `stream_options.include_usage` shape), its counts must surface in the
+/// translated `message_delta` as `input_tokens` / `output_tokens`.
+#[tokio::test]
+async fn test_anthropic_streaming_usage_propagation() {
+    // Mock: 3 content chunks, then a usage frame of 225 prompt tokens and
+    // 42 completion tokens.
+    let chunk_delay = std::time::Duration::from_millis(10);
+    let mock_url =
+        common::spawn_streaming_mock_neuron_with_usage(3, chunk_delay, 225, 42).await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let response = reqwest::Client::new()
+        .post(format!("{gw_url}/v1/messages"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "max_tokens": 64,
+            "stream": true,
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+    let body = response.text().await.expect("stream should complete");
+
+    // First `data: ` frame that parses as JSON and is typed message_delta.
+    let message_delta = body
+        .lines()
+        .filter_map(|line| line.strip_prefix("data: "))
+        .filter_map(|raw| serde_json::from_str::<serde_json::Value>(raw).ok())
+        .find(|frame| frame["type"] == "message_delta")
+        .expect("message_delta event present");
+
+    let usage = &message_delta["usage"];
+    assert_eq!(usage["output_tokens"], 42);
+    assert_eq!(usage["input_tokens"], 225);
+}
--- a/crates/cortex-gateway/tests/auth.rs
+++ b/crates/cortex-gateway/tests/auth.rs
@@ -0,0 +1,250 @@
+//! Integration tests for API-key auth + principal resolution (#49).
+//!
+//! Verifies the #63 rejection contract (401 invalid_api_key via the #60
+//! envelope) and that an authenticated request reaches neuron carrying the
+//! internal principal headers — while a client-supplied principal header is
+//! stripped (anti-spoofing).
+
+use axum::Json;
+use axum::extract::Path;
+use axum::http::HeaderMap;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, HEADER_ACCOUNT_ID, HEADER_KEY_ID};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::{Arc, Mutex};
+use tokio::net::TcpListener;
+
+/// What the mock neuron observed on the inbound `/v1/chat/completions`
+/// request: the principal headers cortex stamped (or didn't).
+///
+/// Both fields are overwritten on every request the mock handles, so they
+/// reflect the most recent one. The default (both `None`) is also the state
+/// before any request has arrived.
+#[derive(Default)]
+struct Seen {
+    // `HEADER_ACCOUNT_ID` value, or `None` when the header was absent or not
+    // readable as a string.
+    account_id: Option<String>,
+    // `HEADER_KEY_ID` value, or `None` when the header was absent or not
+    // readable as a string.
+    key_id: Option<String>,
+}
+
+/// Spawn a mock neuron that records the principal headers it receives and
+/// returns a trivial chat completion. Returns (base_url, observed).
+///
+/// The server listens on `127.0.0.1` with port 0 and runs in a detached
+/// tokio task. `observed` is shared with the handler, so a test reads what
+/// the latest `/v1/chat/completions` request carried by locking it.
+async fn spawn_capturing_neuron() -> (String, Arc<Mutex<Seen>>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    // The endpoint lookup below answers with the mock's own base URL.
+    let inference_url = base_url.clone();
+    let seen: Arc<Mutex<Seen>> = Arc::new(Mutex::new(Seen::default()));
+    // Second handle to the same `Seen`, moved into the request handler.
+    let sink = Arc::clone(&seen);
+
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            // The model id is ignored: every model resolves to this mock.
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |headers: HeaderMap, Json(body): Json<Value>| {
+                let sink = Arc::clone(&sink);
+                async move {
+                    // Inner scope so the std mutex guard is dropped before
+                    // the response is built.
+                    {
+                        let mut s = sink.lock().unwrap();
+                        s.account_id = headers
+                            .get(HEADER_ACCOUNT_ID)
+                            .and_then(|v| v.to_str().ok())
+                            .map(str::to_string);
+                        s.key_id = headers
+                            .get(HEADER_KEY_ID)
+                            .and_then(|v| v.to_str().ok())
+                            .map(str::to_string);
+                    }
+                    // Echo the requested model name; "m" when it is missing
+                    // or not a string.
+                    let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
+                    Json(json!({
+                        "id": "chatcmpl-auth-001",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "ok"},
+                            "finish_reason": "stop"
+                        }],
+                        "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}
+                    }))
+                }
+            }),
+        )
+        .with_state(());
+
+    // Detached server task; a serve error panics inside that task.
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    (base_url, seen)
+}
+
+/// Start a gateway on `127.0.0.1` (port 0) and return its base URL.
+///
+/// The gateway is configured with one neuron, `mock-node`, at `neuron_url`
+/// and with the supplied `entitlements`. Before serving, `mock-node` is
+/// marked healthy and `test-model` is inserted as loaded (build_app spawns
+/// no poller).
+async fn spawn_gateway(neuron_url: &str, entitlements: EntitlementsConfig) -> String {
+    let gateway = GatewaySettings {
+        listen: "127.0.0.1:0".into(),
+        metrics_listen: "127.0.0.1:0".into(),
+    };
+    let eviction = EvictionSettings {
+        strategy: EvictionStrategy::Lru,
+        defrag_after_cycles: 0,
+    };
+    let mock_node = NeuronEndpoint {
+        name: "mock-node".into(),
+        endpoint: neuron_url.to_string(),
+    };
+    let config = GatewayConfig {
+        gateway,
+        eviction,
+        neurons: vec![mock_node],
+        models_config: "/dev/null".into(),
+        entitlements,
+    };
+
+    let state = Arc::new(CortexState::from_config(&config));
+    let loaded_model = ModelEntry {
+        id: "test-model".into(),
+        status: ModelStatus::Loaded,
+        last_accessed: None,
+        vram_estimate_mb: Some(8000),
+        capabilities: Vec::new(),
+        tool_call: false,
+        reasoning: false,
+        limit: None,
+    };
+    // Scoped so the write lock is released before the app is built.
+    {
+        let mut nodes = state.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert("test-model".into(), loaded_model);
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&state));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    format!("http://{addr}")
+}
+
+/// Entitlements holding exactly one API key: `sk-good`, mapped to account
+/// `acct-1` / key id `key-1`, with no hard cap. `require_auth` is passed
+/// through unchanged.
+fn one_key_config(require_auth: bool) -> EntitlementsConfig {
+    let good_key = ApiKeyConfig {
+        key: "sk-good".into(),
+        account_id: "acct-1".into(),
+        key_id: Some("key-1".into()),
+        hard_cap: None,
+        window: CapWindow::Balance,
+    };
+
+    EntitlementsConfig {
+        require_auth,
+        keys: vec![good_key],
+    }
+}
+
+/// Minimal chat-completions request body: `test-model` with a single user
+/// message.
+fn chat_body() -> Value {
+    json!({
+        "model": "test-model",
+        "messages": [{"role": "user", "content": "hi"}]
+    })
+}
+
+/// With `require_auth` on, a request carrying no credential is answered with
+/// 401 and an error envelope of code `invalid_api_key`, type
+/// `invalid_request_error`.
+#[tokio::test]
+async fn missing_key_when_required_is_401_invalid_api_key() {
+    let (neuron, _seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client.post(url).json(&chat_body()).send().await.unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
+
+    let envelope: Value = resp.json().await.unwrap();
+    let error = &envelope["error"];
+    assert_eq!(error["code"], "invalid_api_key");
+    assert_eq!(error["type"], "invalid_request_error");
+}
+
+/// A credential that is present but unknown is rejected with 401
+/// `invalid_api_key`, even though `require_auth` is off.
+#[tokio::test]
+async fn invalid_key_is_401_even_when_auth_not_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    // `require_auth = false`: a present-but-wrong credential is still an error.
+    let gateway = spawn_gateway(&neuron, one_key_config(false)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(url)
+        .bearer_auth("sk-wrong")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
+
+    let envelope: Value = resp.json().await.unwrap();
+    assert_eq!(envelope["error"]["code"], "invalid_api_key");
+
+    // Meant to show the request was rejected before dispatch.
+    // NOTE(review): `Seen` records only principal headers, so this check
+    // would also pass for a request dispatched without them — confirm
+    // whether a hit counter is wanted here.
+    let observed = seen.lock().unwrap();
+    assert!(observed.account_id.is_none());
+}
+
+/// A valid key is served, and neuron receives the principal headers derived
+/// from that key (`acct-1` / `key-1`) — not the account id the client tried
+/// to inject.
+#[tokio::test]
+async fn valid_key_reaches_neuron_with_principal_headers() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(url)
+        .bearer_auth("sk-good")
+        // Spoof attempt: this header must be stripped, not forwarded.
+        .header(HEADER_ACCOUNT_ID, "attacker")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+
+    let observed = seen.lock().unwrap();
+    assert_eq!(observed.account_id.as_deref(), Some("acct-1"));
+    assert_eq!(observed.key_id.as_deref(), Some("key-1"));
+}
+
+/// With default entitlements, a request without credentials is served and
+/// reaches neuron carrying neither principal header.
+#[tokio::test]
+async fn anonymous_allowed_when_auth_not_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, EntitlementsConfig::default()).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client.post(url).json(&chat_body()).send().await.unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+
+    // No principal resolved, so nothing was stamped on the upstream request.
+    let observed = seen.lock().unwrap();
+    assert!(observed.account_id.is_none());
+    assert!(observed.key_id.is_none());
+}
+
+/// `GET /health` answers 200 without credentials even when `require_auth`
+/// is on.
+#[tokio::test]
+async fn health_is_public_even_when_auth_required() {
+    let (neuron, _seen) = spawn_capturing_neuron().await;
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+    let url = format!("{gateway}/health");
+
+    let client = reqwest::Client::new();
+    let resp = client.get(url).send().await.unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+}
--- a/crates/cortex-gateway/tests/budget_enforcement.rs
+++ b/crates/cortex-gateway/tests/budget_enforcement.rs
@@ -0,0 +1,253 @@
+//! Integration tests for budget enforcement (#52) — the A0 seatbelt.
+//!
+//! A reservation over the key's hard cap is refused *before* neuron is hit,
+//! with the #63 code matching the cap-window semantics (rate_limit_exceeded
+//! + Retry-After for a resetting window, insufficient_quota for a hard
+//! balance). Spend never exceeds the cap. No 402, ever.
+
+use axum::Json;
+use axum::extract::Path;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, Principal};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use tokio::net::TcpListener;
+
+/// Mock neuron with a hit counter on the inference path, so a test can prove
+/// a request was (or wasn't) dispatched.
+///
+/// Returns `(base_url, hits)`. The server listens on `127.0.0.1` with port 0
+/// in a detached tokio task; `hits` is incremented once per
+/// `POST /v1/chat/completions` the mock handles.
+async fn spawn_counting_neuron() -> (String, Arc<AtomicU64>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    // The endpoint lookup below answers with the mock's own base URL.
+    let inference_url = base_url.clone();
+    let hits = Arc::new(AtomicU64::new(0));
+    // Second handle to the same counter, moved into the request handler.
+    let sink = Arc::clone(&hits);
+
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            // The model id is ignored: every model resolves to this mock.
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| {
+                let sink = Arc::clone(&sink);
+                async move {
+                    // Count the dispatch before building the reply.
+                    sink.fetch_add(1, Ordering::SeqCst);
+                    // Echo the requested model name; "m" when it is missing
+                    // or not a string.
+                    let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
+                    Json(json!({
+                        "id": "chatcmpl-budget",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}],
+                        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+                    }))
+                }
+            }),
+        );
+    // Detached server task; a serve error panics inside that task.
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (base_url, hits)
+}
+
+/// Start a gateway on `127.0.0.1` (port 0) with auth required and `key` as
+/// its only API key. Returns the shared state together with the base URL so
+/// a test can inspect entitlement accounting afterwards.
+///
+/// One neuron, `mock-node`, is configured at `neuron_url`; it is marked
+/// healthy and `test-model` is inserted as loaded before serving.
+async fn spawn_gateway(neuron_url: &str, key: ApiKeyConfig) -> (Arc<CortexState>, String) {
+    let gateway = GatewaySettings {
+        listen: "127.0.0.1:0".into(),
+        metrics_listen: "127.0.0.1:0".into(),
+    };
+    let eviction = EvictionSettings {
+        strategy: EvictionStrategy::Lru,
+        defrag_after_cycles: 0,
+    };
+    let mock_node = NeuronEndpoint {
+        name: "mock-node".into(),
+        endpoint: neuron_url.to_string(),
+    };
+    let entitlements = EntitlementsConfig {
+        require_auth: true,
+        keys: vec![key],
+    };
+    let config = GatewayConfig {
+        gateway,
+        eviction,
+        neurons: vec![mock_node],
+        models_config: "/dev/null".into(),
+        entitlements,
+    };
+
+    let state = Arc::new(CortexState::from_config(&config));
+    let loaded_model = ModelEntry {
+        id: "test-model".into(),
+        status: ModelStatus::Loaded,
+        last_accessed: None,
+        vram_estimate_mb: Some(8000),
+        capabilities: Vec::new(),
+        tool_call: false,
+        reasoning: false,
+        limit: None,
+    };
+    // Scoped so the write lock is released before the app is built.
+    {
+        let mut nodes = state.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert("test-model".into(), loaded_model);
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&state));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    (state, format!("http://{addr}"))
+}
+
+/// API key `sk-cap` for account `acct-cap` / key id `key-cap`, limited to
+/// `hard_cap` under the given cap `window`.
+fn key(window: CapWindow, hard_cap: u64) -> ApiKeyConfig {
+    ApiKeyConfig {
+        key: "sk-cap".into(),
+        account_id: "acct-cap".into(),
+        key_id: Some("key-cap".into()),
+        hard_cap: Some(hard_cap),
+        window,
+    }
+}
+
+/// Chat-completions request body for `test-model` with a single user
+/// message and the given `max_tokens`.
+fn chat(max_tokens: u64) -> Value {
+    json!({
+        "model": "test-model",
+        "max_tokens": max_tokens,
+        "messages": [{"role": "user", "content": "hi"}]
+    })
+}
+
+/// A request whose reservation exceeds a hard-balance cap is refused with
+/// 429 `insufficient_quota`, without `Retry-After`, and never reaches
+/// neuron.
+#[tokio::test]
+async fn balance_over_cap_is_429_insufficient_quota_before_dispatch() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    // Cap far below a single request's reservation (max_tokens 1000).
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 10)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(url)
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+
+    // A hard balance does not reset, so no Retry-After is offered.
+    let has_retry_after = resp.headers().contains_key(reqwest::header::RETRY_AFTER);
+    assert!(!has_retry_after);
+
+    let envelope: Value = resp.json().await.unwrap();
+    assert_eq!(envelope["error"]["code"], "insufficient_quota");
+
+    // Refused before dispatch: the mock's hit counter is untouched.
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+/// A request whose reservation exceeds a rolling-window cap is refused with
+/// 429 `rate_limit_exceeded` plus a `Retry-After` of at least one second,
+/// and never reaches neuron.
+#[tokio::test]
+async fn rolling_over_cap_is_429_rate_limited_with_retry_after() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let window = CapWindow::Rolling { seconds: 3600 };
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(window, 10)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(url)
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+
+    // The header must be present and parse as whole seconds.
+    let retry_after_secs = resp
+        .headers()
+        .get(reqwest::header::RETRY_AFTER)
+        .expect("rolling-window rejection must carry Retry-After")
+        .to_str()
+        .unwrap()
+        .parse::<u64>()
+        .unwrap();
+    assert!(retry_after_secs >= 1);
+
+    let envelope: Value = resp.json().await.unwrap();
+    assert_eq!(envelope["error"]["code"], "rate_limit_exceeded");
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+/// A request well inside the cap is served and dispatched to neuron exactly
+/// once.
+#[tokio::test]
+async fn within_cap_is_served() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 1_000_000)).await;
+    let url = format!("{gateway}/v1/chat/completions");
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(url)
+        .bearer_auth("sk-cap")
+        .json(&chat(50))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+
+    // Read the body to completion before checking the counter.
+    let _ = resp.bytes().await.unwrap();
+    assert_eq!(hits.load(Ordering::SeqCst), 1);
+}
+
+/// The A0 seatbelt: an Agent-Zero-style key with a modest balance cap (100)
+/// is hit with 20 sequential requests of `max_tokens` 20. Some must be
+/// served, some must be refused with `insufficient_quota`, refused requests
+/// must never reach neuron, and recorded spend must stay within the cap.
+#[tokio::test]
+async fn a0_seatbelt_caps_a_runaway_fan_out() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 100)).await;
+    let client = reqwest::Client::new();
+
+    let mut served = 0;
+    let mut refused = 0;
+    for _ in 0..20 {
+        let resp = client
+            .post(format!("{gateway}/v1/chat/completions"))
+            .bearer_auth("sk-cap")
+            .json(&chat(20))
+            .send()
+            .await
+            .unwrap();
+        match resp.status() {
+            reqwest::StatusCode::OK => {
+                // Read the body to completion before the next request.
+                let _ = resp.bytes().await.unwrap();
+                served += 1;
+            }
+            reqwest::StatusCode::TOO_MANY_REQUESTS => {
+                let envelope: Value = resp.json().await.unwrap();
+                assert_eq!(envelope["error"]["code"], "insufficient_quota");
+                refused += 1;
+            }
+            other => panic!("unexpected status {other}"),
+        }
+    }
+
+    assert!(served >= 1, "some requests should be served");
+    assert!(refused >= 1, "the cap must eventually refuse the fan-out");
+    assert_eq!(
+        hits.load(Ordering::SeqCst),
+        served,
+        "refused requests never dispatched"
+    );
+
+    // Spend must not exceed the hard cap (reservation prevents overshoot).
+    // Poll for up to 50 x 20 ms until no reservation is outstanding, then
+    // take a fresh snapshot for the final check.
+    let principal = Principal {
+        account_id: "acct-cap".into(),
+        key_id: "key-cap".into(),
+    };
+    for _ in 0..50 {
+        let pending = fleet.entitlements.snapshot(&principal).await.unwrap();
+        if pending.reserved == 0 {
+            break;
+        }
+        tokio::time::sleep(std::time::Duration::from_millis(20)).await;
+    }
+    let settled = fleet.entitlements.snapshot(&principal).await.unwrap();
+    assert!(settled.spent <= 100, "spent {} exceeded cap", settled.spent);
+}
--- a/crates/cortex-gateway/tests/common/mod.rs
+++ b/crates/cortex-gateway/tests/common/mod.rs
@@ -54,9 +54,64 @@ pub async fn spawn_mock_neuron() -> String {
    base_url
 }

+/// Like [`spawn_mock_neuron`] but captures the JSON body of every
+/// `POST /v1/chat/completions` it receives into the returned handle, so
+/// a test can assert what the gateway *actually forwarded upstream*
+/// (e.g. that Anthropic-shaped tools were reshaped to OpenAI form).
+///
+/// Returns `(base_url, captured)`. Bodies are appended in arrival order, so
+/// `captured.lock().unwrap().last()` is the most recent request.
+pub async fn spawn_capturing_mock_neuron() -> (String, Arc<std::sync::Mutex<Vec<Value>>>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    // The endpoint lookup below answers with the mock's own base URL.
+    let inference_url = base_url.clone();
+    let captured: Arc<std::sync::Mutex<Vec<Value>>> = Arc::new(std::sync::Mutex::new(Vec::new()));
+    // Second handle to the same list, moved into the request handler.
+    let sink = captured.clone();
+
+    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            // The model id is ignored: every model resolves to this mock.
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| {
+                let sink = sink.clone();
+                async move {
+                    // Echo the requested model name; "unknown" when it is
+                    // missing or not a string.
+                    let model = body
+                        .get("model")
+                        .and_then(|v| v.as_str())
+                        .unwrap_or("unknown");
+                    let resp = json!({
+                        "id": "chatcmpl-capture-001",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "Hello from mock backend"},
+                            "finish_reason": "stop"
+                        }],
+                        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+                    });
+                    // The response is built first because `model` borrows
+                    // from `body`, which is moved into the list here.
+                    sink.lock().unwrap().push(body);
+                    Json(resp)
+                }
+            }),
+        );
+
+    // Detached server task; a serve error panics inside that task.
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    (base_url, captured)
+}
+
 async fn mock_neuron_list_models() -> Json<Value> {
    Json(json!([
-        {"id": "test-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
+        {"id": "test-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000, "capabilities": ["text"], "tool_call": false, "reasoning": false}
    ]))
 }

@@ -196,6 +251,91 @@ pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Durati
    base_url
 }

+/// Like `spawn_streaming_mock_neuron`, but the stream ends with an
+/// OpenAI `stream_options.include_usage`-style final chunk (empty
+/// choices + usage object) before `[DONE]` — the shape the gateway's
+/// token metrics (#21) extract counts from.
+///
+/// The stream carries `chunk_count` content chunks (`token0`, `token1`, …),
+/// then the usage chunk reporting `prompt_tokens` / `completion_tokens`
+/// (and their sum as `total_tokens`), then `[DONE]`. Every frame, including
+/// the last two, is preceded by a `chunk_delay` sleep. Returns the mock's
+/// base URL.
+pub async fn spawn_streaming_mock_neuron_with_usage(
+    chunk_count: usize,
+    chunk_delay: Duration,
+    prompt_tokens: u64,
+    completion_tokens: u64,
+) -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    // The endpoint lookup below answers with the mock's own base URL.
+    let inference_url = base_url.clone();
+
+    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            // The model id is ignored: every model resolves to this mock.
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| async move {
+                // Echo the requested model name; "unknown" when it is
+                // missing or not a string.
+                let model = body
+                    .get("model")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("unknown")
+                    .to_string();
+
+                // Pre-render every SSE frame as a `data: …\n\n` string.
+                let mut chunks: Vec<String> = (0..chunk_count)
+                    .map(|i| {
+                        let chunk = json!({
+                            "id": "chatcmpl-stream-002",
+                            "object": "chat.completion.chunk",
+                            "created": 1700000000_u64,
+                            "model": model,
+                            "choices": [{
+                                "index": 0,
+                                "delta": { "content": format!("token{i}") },
+                                "finish_reason": null
+                            }]
+                        });
+                        format!("data: {chunk}\n\n")
+                    })
+                    .collect();
+                // Final usage frame: no choices, only the token counts.
+                let usage_chunk = json!({
+                    "id": "chatcmpl-stream-002",
+                    "object": "chat.completion.chunk",
+                    "created": 1700000000_u64,
+                    "model": model,
+                    "choices": [],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens,
+                        "completion_tokens": completion_tokens,
+                        "total_tokens": prompt_tokens + completion_tokens
+                    }
+                });
+                chunks.push(format!("data: {usage_chunk}\n\n"));
+                chunks.push("data: [DONE]\n\n".to_string());
+
+                // Sleep before each frame so the body arrives incrementally.
+                let delay = chunk_delay;
+                let stream = stream::iter(chunks).then(move |chunk| async move {
+                    tokio::time::sleep(delay).await;
+                    Ok::<_, std::convert::Infallible>(chunk)
+                });
+
+                Response::builder()
+                    .header(header::CONTENT_TYPE, "text/event-stream")
+                    .header(header::CACHE_CONTROL, "no-cache")
+                    .body(Body::from_stream(stream))
+                    .unwrap()
+            }),
+        );
+
+    // Detached server task; a serve error panics inside that task.
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    base_url
+}
+
 /// Spawns a mock neuron with a custom models list.
 pub async fn spawn_mock_neuron_with_models(models_response: Value) -> String {
    spawn_mock_neuron_with_models_and_health(models_response, default_health_response()).await
@@ -289,6 +429,7 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
            endpoint: mock_url.to_string(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -305,6 +446,10 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
--- a/crates/cortex-gateway/tests/error_envelope.rs
+++ b/crates/cortex-gateway/tests/error_envelope.rs
@@ -0,0 +1,140 @@
+mod common;
+
+use serde_json::json;
+
+/// An unknown model id is rejected with a 404 `model_not_found` error envelope.
+#[tokio::test]
+async fn error_response_model_not_found() {
+    let mock_neuron = common::spawn_mock_neuron().await;
+    let gateway_url = common::spawn_gateway(&mock_neuron).await;
+
+    // Ask for a model id that the mock neuron does not have loaded.
+    let client = reqwest::Client::new();
+    let response = client
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "model": "nonexistent-model",
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(response.status(), axum::http::StatusCode::NOT_FOUND);
+
+    let reply: serde_json::Value = response.json().await.expect("valid json");
+    let error = reply.get("error").expect("response has error object");
+
+    // Broad error category.
+    assert_eq!(error.get("type").unwrap(), "invalid_request_error");
+    // Fine-grained, machine-readable reason.
+    assert_eq!(
+        error.get("code").unwrap().as_str().unwrap(),
+        "model_not_found"
+    );
+    // `param` must be present and null.
+    assert!(error.get("param").unwrap().is_null());
+}
+
+/// A body without `model` is rejected with a 400 `missing_model_field` error envelope.
+#[tokio::test]
+async fn error_response_missing_model_field() {
+    let mock_neuron = common::spawn_mock_neuron().await;
+    let gateway_url = common::spawn_gateway(&mock_neuron).await;
+
+    // Send only `messages`, leaving out the required `model` field.
+    let client = reqwest::Client::new();
+    let response = client
+        .post(format!("{gateway_url}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(response.status(), axum::http::StatusCode::BAD_REQUEST);
+
+    let reply: serde_json::Value = response.json().await.expect("valid json");
+    let error = reply.get("error").expect("response has error object");
+
+    assert_eq!(error.get("type").unwrap(), "invalid_request_error");
+    assert_eq!(
+        error.get("code").unwrap().as_str().unwrap(),
+        "missing_model_field"
+    );
+    assert!(error.get("param").unwrap().is_null());
+}
+
+/// No healthy node: the gateway answers 503 `service_unavailable` with `Retry-After`.
+#[tokio::test]
+async fn error_response_no_healthy_nodes() {
+    use cortex_core::config::{EvictionSettings, GatewayConfig, GatewaySettings, NeuronEndpoint};
+    use std::sync::Arc;
+
+    // Single neuron on an unreachable port; this test never marks it healthy.
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: cortex_core::config::EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "dead-node".into(),
+            endpoint: "http://127.0.0.1:1".into(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(cortex_gateway::state::CortexState::from_config(&config));
+
+    let app = cortex_gateway::build_app(fleet);
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    // No settle delay: `build_app` does not spawn the poller, so nothing alters
+    // the node's health between `from_config` and the request below.
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .header("Content-Type", "application/json")
+        .json(&json!({
+            "model": "any-model",
+            "messages": [{"role": "user", "content": "hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), axum::http::StatusCode::SERVICE_UNAVAILABLE);
+
+    // Transient 503 — the gateway advertises Retry-After so OpenAI-compatible
+    // clients back off and retry rather than surfacing an opaque error (#63).
+    let retry_after = resp
+        .headers()
+        .get(reqwest::header::RETRY_AFTER)
+        .expect("transient 503 must carry Retry-After")
+        .to_str()
+        .unwrap()
+        .to_string();
+    assert_eq!(retry_after, "5");
+
+    let body: serde_json::Value = resp.json().await.expect("valid json");
+    let err = body.get("error").expect("response has error object");
+
+    assert_eq!(err.get("type").unwrap(), "api_error");
+    assert_eq!(
+        err.get("code").unwrap().as_str().unwrap(),
+        "service_unavailable"
+    );
+    assert!(err.get("param").unwrap().is_null());
+}
--- a/crates/cortex-gateway/tests/eviction.rs
+++ b/crates/cortex-gateway/tests/eviction.rs
@@ -71,6 +71,7 @@ fn make_fleet(endpoint: &str, defrag_after: u32) -> Arc<CortexState> {
            endpoint: endpoint.to_string(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };
    Arc::new(CortexState::from_config(&config))
 }
@@ -91,6 +92,10 @@ async fn test_evict_lru_model() {
                status: ModelStatus::Loaded,
                last_accessed: Some(Utc::now() - chrono::Duration::hours(2)),
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
        node.models.insert(
@@ -100,6 +105,10 @@ async fn test_evict_lru_model() {
                status: ModelStatus::Loaded,
                last_accessed: Some(Utc::now()),
                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -163,6 +172,10 @@ async fn test_eviction_increments_lifecycle_cycles() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
--- a/crates/cortex-gateway/tests/load_routing.rs
+++ b/crates/cortex-gateway/tests/load_routing.rs
@@ -0,0 +1,189 @@
+//! Load-aware routing across replicas (#55).
+//!
+//! When a model is loaded on more than one healthy neuron, the router picks
+//! the least-busy replica using the per-model admission load each neuron
+//! reports on `GET /health` (#53), rather than always taking the first.
+
+mod common;
+
+use axum::Json;
+use axum::extract::Path;
+use axum::http::{StatusCode, header};
+use axum::response::IntoResponse;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::discovery::ModelLoad;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// Mark `node` healthy with `test-model` loaded and the given admission load.
+async fn seed_loaded(fleet: &CortexState, node: &str, in_flight: usize, queue_depth: usize) {
+    let mut guard = fleet.nodes.write().await;
+    let entry = guard.get_mut(node).expect("node exists");
+    entry.healthy = true;
+    entry.model_load.insert(
+        "test-model".into(),
+        ModelLoad {
+            id: "test-model".into(),
+            in_flight,
+            queue_depth,
+        },
+    );
+    entry.models.insert(
+        "test-model".into(),
+        ModelEntry {
+            id: "test-model".into(),
+            status: ModelStatus::Loaded,
+            last_accessed: None,
+            vram_estimate_mb: Some(8000),
+            capabilities: Vec::new(),
+            tool_call: false,
+            reasoning: false,
+            limit: None,
+        },
+    );
+}
+
+/// Build gateway state over two configured neurons, `node-a` and `node-b`.
+///
+/// No poller is started here; tests seed node state themselves.
+async fn two_neuron_fleet(endpoint_a: &str, endpoint_b: &str) -> Arc<CortexState> {
+    let neurons = [("node-a", endpoint_a), ("node-b", endpoint_b)]
+        .into_iter()
+        .map(|(name, endpoint)| NeuronEndpoint {
+            name: name.into(),
+            endpoint: endpoint.to_string(),
+        })
+        .collect();
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons,
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+    Arc::new(CortexState::from_config(&config))
+}
+
+/// `resolve` follows the lighter admission load when two replicas hold the same model.
+#[tokio::test]
+async fn routes_to_least_busy_replica() {
+    let mock_a = common::spawn_mock_neuron().await;
+    let mock_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&mock_a, &mock_b).await;
+
+    // Node A carries load (1 in flight + 3 queued); node B has none.
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+    seed_loaded(&fleet, "node-a", 1, 3).await;
+    let first = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("model is loaded on both nodes");
+    assert_eq!(first.node_name, "node-b", "should pick the idle replica");
+
+    // Swap the loads so that B becomes the busy replica.
+    seed_loaded(&fleet, "node-b", 1, 5).await;
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    let after = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("still loaded");
+    assert_eq!(after.node_name, "node-a", "should follow the lighter load");
+}
+
+/// Mock of a saturated neuron: every chat completion is answered with the #63
+/// backpressure envelope (503 + `Retry-After: 6`). Returns the mock's base URL.
+async fn spawn_busy_neuron() -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let advertised = base_url.clone();
+    let app = axum::Router::new()
+        .route(
+            "/v1/chat/completions",
+            post(|| async {
+                let envelope = json!({"error": {
+                    "message": "model is busy (admission queue full); retry shortly",
+                    "type": "rate_limit_error",
+                    "code": "rate_limit_exceeded",
+                    "param": null
+                }});
+                (
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    [(header::RETRY_AFTER, "6")],
+                    Json(envelope),
+                )
+                    .into_response()
+            }),
+        )
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let target = advertised.clone();
+                async move { Json(json!({ "url": target })) }
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    base_url
+}
+
+/// A saturated neuron's 503 + Retry-After + envelope must reach the client
+/// verbatim — not unwrapped, remapped, or stripped (#55 / #63).
+#[tokio::test]
+async fn neuron_backpressure_is_propagated_intact() {
+    let saturated = spawn_busy_neuron().await;
+    let fleet = two_neuron_fleet(&saturated, &saturated).await;
+    seed_loaded(&fleet, "node-a", 1, 8).await;
+
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let resp = reqwest::Client::new()
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::SERVICE_UNAVAILABLE);
+    assert_eq!(
+        resp.headers()
+            .get(reqwest::header::RETRY_AFTER)
+            .and_then(|value| value.to_str().ok()),
+        Some("6"),
+        "Retry-After must survive the proxy"
+    );
+    let envelope: Value = resp.json().await.unwrap();
+    assert_eq!(envelope["error"]["code"], "rate_limit_exceeded");
+}
+
+/// With equal load on both replicas, `resolve` picks the lowest node name.
+#[tokio::test]
+async fn ties_break_deterministically_by_name() {
+    let mock_a = common::spawn_mock_neuron().await;
+    let mock_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&mock_a, &mock_b).await;
+
+    // Both replicas idle, so the pick must not depend on seeding order.
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    let pick = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("loaded");
+    assert_eq!(pick.node_name, "node-a", "ties break by name");
+}
--- a/crates/cortex-gateway/tests/metering.rs
+++ b/crates/cortex-gateway/tests/metering.rs
@@ -0,0 +1,207 @@
+//! Integration tests for per-request token metering (#51).
+//!
+//! Drives authenticated requests through the gateway to a mock neuron that
+//! reports a fixed `usage` object, then asserts the EntitlementProvider's
+//! spend ledger reflects cumulative per-key spend and that reservations
+//! settle to actual (no outstanding reserved tokens once requests complete).
+
+mod common;
+
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, Principal};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::json;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::net::TcpListener;
+
+const ACCOUNT: &str = "acct-meter";
+const KEY_ID: &str = "key-meter";
+const BEARER: &str = "sk-meter";
+
+/// The mock neuron (common::spawn_mock_neuron) reports this fixed usage on
+/// every chat completion.
+const PROMPT_PER_REQ: u64 = 10;
+const COMPLETION_PER_REQ: u64 = 5;
+
+/// Auth-required gateway with one configured key (`BEARER`); returns (state, base URL).
+async fn spawn_metered_gateway(neuron_url: &str) -> (Arc<CortexState>, String) {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron_url.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig {
+            require_auth: true,
+            keys: vec![ApiKeyConfig {
+                key: BEARER.into(),
+                account_id: ACCOUNT.into(),
+                key_id: Some(KEY_ID.into()),
+                hard_cap: Some(1_000_000),
+                window: CapWindow::Balance,
+            }],
+        },
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut guard = fleet.nodes.write().await;
+        let mock = guard.get_mut("mock-node").unwrap();
+        mock.healthy = true;
+        mock.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (fleet, format!("http://{addr}"))
+}
+
+/// Principal carrying the `ACCOUNT` / `KEY_ID` pair configured for the test key.
+fn principal() -> Principal {
+    let account_id = ACCOUNT.into();
+    let key_id = KEY_ID.into();
+    Principal { account_id, key_id }
+}
+
+/// Poll settled spend every 20 ms until it reaches `expected`; settling runs in a spawned
+/// task after the response stream ends. After 100 misses, returns one final reading.
+async fn await_spent(fleet: &CortexState, expected: u64) -> u64 {
+    let who = principal();
+    for attempt in 0..=100 {
+        let spent = fleet.entitlements.snapshot(&who).await.unwrap().spent;
+        if spent >= expected || attempt == 100 {
+            return spent;
+        }
+        tokio::time::sleep(Duration::from_millis(20)).await;
+    }
+    unreachable!("attempt 100 always returns")
+}
+
+/// Three authenticated requests accumulate on the key's ledger and leave nothing reserved.
+#[tokio::test]
+async fn cumulative_spend_is_metered_per_key() {
+    let neuron = common::spawn_mock_neuron().await;
+    let (fleet, gateway) = spawn_metered_gateway(&neuron).await;
+    let client = reqwest::Client::new();
+    const REQUESTS: u64 = 3;
+    for _ in 0..REQUESTS {
+        let reply = client
+            .post(format!("{gateway}/v1/chat/completions"))
+            .bearer_auth(BEARER)
+            .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+            .send()
+            .await
+            .unwrap();
+        assert_eq!(reply.status(), reqwest::StatusCode::OK);
+        // Consume the whole body so the response stream finishes and metering settles.
+        let _ = reply.bytes().await.unwrap();
+    }
+
+    let expected = REQUESTS * (PROMPT_PER_REQ + COMPLETION_PER_REQ);
+    let spent = await_spent(&fleet, expected).await;
+    assert_eq!(
+        spent, expected,
+        "ledger must reflect cumulative per-key spend"
+    );
+
+    // Every reservation was settled to its actual cost; none remain open.
+    let after = fleet.entitlements.snapshot(&principal()).await.unwrap();
+    assert_eq!(after.reserved, 0, "all reservations must settle/release");
+    assert_eq!(after.hard_cap, Some(1_000_000));
+}
+
+/// With `require_auth` off (default entitlements) an unauthenticated request is
+/// served, but having no principal it must not touch any ledger.
+#[tokio::test]
+async fn anonymous_request_records_no_spend() {
+    let neuron = common::spawn_mock_neuron().await;
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron.clone(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut guard = fleet.nodes.write().await;
+        let mock = guard.get_mut("mock-node").unwrap();
+        mock.healthy = true;
+        mock.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let reply = reqwest::Client::new()
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(reply.status(), reqwest::StatusCode::OK);
+    let _ = reply.bytes().await.unwrap();
+
+    // A principal that was never configured reads back as zero spend.
+    let ledger = fleet
+        .entitlements
+        .snapshot(&Principal {
+            account_id: "nobody".into(),
+            key_id: "nobody".into(),
+        })
+        .await
+        .unwrap();
+    assert_eq!(ledger.spent, 0);
+}
--- a/crates/cortex-gateway/tests/metrics.rs
+++ b/crates/cortex-gateway/tests/metrics.rs
@@ -1,20 +1,26 @@
 mod common;

 use serde_json::json;
+use std::sync::OnceLock;
+
+/// Install the Prometheus test recorder once and hand out the shared handle.
+///
+/// The recorder is process-global, so every test in this binary renders from
+/// the same install; assert on metric names and lower bounds, never on absence.
+fn recorder() -> &'static metrics_exporter_prometheus::PrometheusHandle {
+    static SHARED: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
+    SHARED.get_or_init(|| {
+        cortex_gateway::metrics::install_test_recorder().expect("recorder should install")
+    })
+}

 #[tokio::test]
 async fn test_metrics_emitted_after_proxy() {
-    let handle = cortex_gateway::metrics::install_test_recorder().expect("recorder should install");
+    let handle = recorder();

    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

-    let before = handle.render();
-    assert!(
-        !before.contains("cortex_requests_total"),
-        "no request metrics before any requests"
-    );
-
    let client = reqwest::Client::new();
    let resp = client
        .post(format!("{gw_url}/v1/chat/completions"))
@@ -44,3 +50,72 @@ async fn test_metrics_emitted_after_proxy() {
        "no errors expected for a successful request"
    );
 }
+
+#[tokio::test]
+async fn test_token_metrics_emitted_for_streamed_request() {
+    // #21: a streamed chat completion with a final usage chunk must
+    // produce TTFT + tok/s histograms and prompt/completion token
+    // counters, labelled with model and node.
+    //
+    // The metrics recorder is global per process, and this binary holds
+    // two tests that share it through `recorder()` (the other one is
+    // test_metrics_emitted_after_proxy). Both render from the same
+    // recorder, so assert on metric names and lower bounds rather than
+    // on exact totals.
+    let handle = recorder();
+
+    let mock_url = common::spawn_streaming_mock_neuron_with_usage(
+        5,
+        std::time::Duration::from_millis(40),
+        225,
+        42,
+    )
+    .await;
+    let gw_url = common::spawn_gateway(&mock_url).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/chat/completions"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "test-model",
+            "messages": [{"role": "user", "content": "Hi"}],
+            "stream": true
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+    assert_eq!(resp.status(), 200);
+    let body = resp.text().await.expect("stream should complete");
+    assert!(body.contains("[DONE]"));
+
+    let rendered = handle.render();
+    for needle in [
+        "cortex_time_to_first_token_seconds",
+        "cortex_tokens_per_second",
+    ] {
+        assert!(
+            rendered.contains(needle),
+            "{needle} should be present.\nMetrics:\n{rendered}"
+        );
+    }
+    // The recorder is shared with the sibling test (same model/node
+    // labels), so counters are lower bounds, not exact values: this
+    // request contributed prompt=225 / completion=42.
+    let counter_value = |name: &str| -> u64 {
+        rendered
+            .lines()
+            .find(|l| l.starts_with(name) && l.contains(r#"model="test-model""#))
+            .and_then(|l| l.rsplit(' ').next())
+            .and_then(|v| v.parse().ok())
+            .unwrap_or_else(|| panic!("{name} should be present.\nMetrics:\n{rendered}"))
+    };
+    assert!(
+        counter_value("cortex_prompt_tokens_total") >= 225,
+        "prompt token counter should include this request's 225.\nMetrics:\n{rendered}"
+    );
+    assert!(
+        counter_value("cortex_completion_tokens_total") >= 42,
+        "completion token counter should include this request's 42.\nMetrics:\n{rendered}"
+    );
+}
--- a/crates/cortex-gateway/tests/model_limits.rs
+++ b/crates/cortex-gateway/tests/model_limits.rs
@@ -0,0 +1,132 @@
+//! Issue #62 / #67: `GET /v1/models` advertises a per-model serving budget so
+//! an OpenAI-compatible client (opencode's helexa provider) can size and
+//! compact its context without hand-configuration.
+//!
+//! Asserts the composition sources land on the response:
+//!   - `limit` from the neuron's self-derived value (#67) — NOT the catalogue;
+//!     an operator-declared catalogue `limit` is deliberately ignored.
+//!   - `cost` from the catalogue profile (operator-set pricing).
+//!   - `tool_call` / `reasoning` from the neuron's runtime detection (OR-ed in)
+//!
+//! Also a regression guard for the removal of `max_model_len` — the misnamed,
+//! unconsumed vLLM-ism that this contract replaces.
+
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::harness::ModelLimit;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// `/v1/models` reports the neuron's `limit`, the catalogue's `cost`, and the runtime flags.
+#[tokio::test]
+async fn v1_models_surfaces_limit_cost_and_capability_flags() {
+    // Catalogue declares pricing + an operator `limit` that must be IGNORED
+    // (#67): the neuron's self-derived limit is authoritative.
+    let models_toml = r#"
+[[models]]
+id = "test-model"
+harness = "candle"
+limit.context = 999999
+limit.input = 999999
+limit.output = 999999
+cost.input = 0.0
+cost.output = 0.0
+capabilities = ["text"]
+"#;
+    let cat_path = std::env::temp_dir().join("cortex_test_issue62_models.toml");
+    std::fs::write(&cat_path, models_toml).unwrap();
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            // Never contacted: build_app does not spawn the poller, so the
+            // seeded state below is authoritative for /v1/models.
+            endpoint: "http://127.0.0.1:1".into(),
+        }],
+        models_config: cat_path.to_string_lossy().into_owned(),
+        entitlements: Default::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+
+    // Seed the model as loaded on the node with runtime-detected flags set —
+    // these must OR into the catalogue entry, not be lost.
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node exists");
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: vec!["text".into()],
+                tool_call: true,
+                reasoning: true,
+                // Neuron's self-derived limit (#67) — the authoritative
+                // source. Distinct from the catalogue's (ignored) values.
+                limit: Some(ModelLimit {
+                    context: 49152,
+                    input: Some(40960),
+                    output: 8192,
+                }),
+            },
+        );
+    }
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let body: serde_json::Value = reqwest::Client::new()
+        .get(format!("http://{addr}/v1/models"))
+        .send()
+        .await
+        .unwrap()
+        .json()
+        .await
+        .unwrap();
+
+    let entry = body["data"]
+        .as_array()
+        .expect("data is an array")
+        .iter()
+        .find(|m| m["id"] == "test-model")
+        .expect("test-model present in /v1/models");
+
+    // `limit` is the neuron's self-derived value (#67), NOT the catalogue's
+    // (which declared 999999 and must be ignored). `cost` still flows from
+    // the catalogue.
+    assert_eq!(entry["limit"]["context"], 49152);
+    assert_eq!(entry["limit"]["input"], 40960);
+    assert_eq!(entry["limit"]["output"], 8192);
+    assert_eq!(entry["cost"]["input"], 0.0);
+    assert_eq!(entry["cost"]["output"], 0.0);
+
+    // Runtime-detected capability flags OR-ed in from the neuron's ModelEntry.
+    assert_eq!(entry["tool_call"], true);
+    assert_eq!(entry["reasoning"], true);
+
+    // Regression guard: the removed, unconsumed vLLM-ism must not reappear.
+    assert!(
+        entry.get("max_model_len").is_none(),
+        "max_model_len was removed; /v1/models must not advertise it"
+    );
+    // Best-effort cleanup of the fixed-name temp file; skipped if an assertion above panics.
+    let _ = std::fs::remove_file(&cat_path);
+}
--- a/crates/cortex-gateway/tests/poller.rs
+++ b/crates/cortex-gateway/tests/poller.rs
@@ -31,6 +31,7 @@ async fn test_poller_discovers_models() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -82,6 +83,7 @@ async fn test_poller_updates_gateway_models_endpoint() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -118,6 +120,88 @@ async fn test_poller_updates_gateway_models_endpoint() {
    }
 }

+#[tokio::test]
+async fn test_models_endpoint_unions_capabilities_across_nodes() {
+    // C3: two neurons each have the same model loaded but advertise
+    // different capability sets. The gateway's /v1/models must report
+    // the union — a model loaded text-only on one node and
+    // text+vision on another is vision-capable to the fleet.
+    let node_a = common::spawn_mock_neuron_with_models(json!([
+        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null, "capabilities": ["text"]}
+    ]))
+    .await;
+    let node_b = common::spawn_mock_neuron_with_models(json!([
+        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null, "capabilities": ["text", "vision"]}
+    ]))
+    .await;
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![
+            NeuronEndpoint {
+                name: "node-a".into(),
+                endpoint: node_a,
+            },
+            NeuronEndpoint {
+                name: "node-b".into(),
+                endpoint: node_b,
+            },
+        ],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    cortex_gateway::poller::poll_once(&fleet).await;
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let client = reqwest::Client::new();
+    let body: serde_json::Value = client
+        .get(format!("http://{addr}/v1/models"))
+        .send()
+        .await
+        .expect("request should succeed")
+        .json()
+        .await
+        .unwrap();
+
+    let model = body["data"]
+        .as_array()
+        .expect("data array")
+        .iter()
+        .find(|m| m["id"] == "shared-model")
+        .expect("shared-model should be present");
+
+    let caps: Vec<&str> = model["capabilities"]
+        .as_array()
+        .expect("capabilities array")
+        .iter()
+        .filter_map(|c| c.as_str())
+        .collect();
+    assert!(caps.contains(&"text"), "union must include text: {caps:?}");
+    assert!(
+        caps.contains(&"vision"),
+        "union must include vision: {caps:?}"
+    );
+    assert_eq!(caps.len(), 2, "union must not duplicate text: {caps:?}");
+
+    // Both nodes hold the model, so two locations regardless of caps.
+    assert_eq!(model["locations"].as_array().unwrap().len(), 2);
+}
+
 #[tokio::test]
 async fn test_poller_marks_unreachable_node_unhealthy() {
    let config = GatewayConfig {
@@ -134,6 +218,7 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
            endpoint: "http://127.0.0.1:1".into(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -171,6 +256,7 @@ async fn test_poller_removes_stale_models() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -201,6 +287,7 @@ async fn test_poller_removes_stale_models() {
            endpoint: new_mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet2 = Arc::new(CortexState::from_config(&config2));
@@ -216,6 +303,10 @@ async fn test_poller_removes_stale_models() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
        node.models.insert(
@@ -225,6 +316,10 @@ async fn test_poller_removes_stale_models() {
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: None,
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
            },
        );
    }
@@ -274,6 +369,7 @@ async fn test_poller_captures_activation_from_health() {
            endpoint: mock_url,
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
@@ -292,3 +388,40 @@ async fn test_poller_captures_activation_from_health() {
    assert_eq!(activation.in_progress.as_deref(), Some("Qwen/model-x"));
    assert_eq!(activation.pending, vec!["Qwen/model-y".to_string()]);
 }
+
+#[tokio::test]
+async fn test_poller_parses_recovering_status() {
+    // #20: a model auto-recovering on a neuron (poisoned → unload →
+    // reload, #17) is reported with status "recovering" and must land
+    // in gateway state as the dedicated Recovering status — not fall
+    // through the parser's catch-all to Loaded.
+    let mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "model-r", "harness": "candle", "status": "recovering", "devices": [0, 1], "vram_used_mb": null}
+    ]))
+    .await;
+
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "test-node".into(),
+            endpoint: mock_url,
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+
+    let fleet = Arc::new(CortexState::from_config(&config));
+    cortex_gateway::poller::poll_once(&fleet).await;
+
+    let nodes = fleet.nodes.read().await;
+    let node = nodes.get("test-node").unwrap();
+    let model_r = node.models.get("model-r").expect("model-r should exist");
+    assert_eq!(model_r.status, ModelStatus::Recovering);
+}
--- a/crates/cortex-gateway/tests/proxy_basic.rs
+++ b/crates/cortex-gateway/tests/proxy_basic.rs
@@ -117,6 +117,7 @@ async fn test_no_healthy_nodes() {
            endpoint: "http://127.0.0.1:1".into(),
        }],
        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
    };
    let fleet = std::sync::Arc::new(cortex_gateway::state::CortexState::from_config(&config));

@@ -139,7 +140,7 @@ async fn test_no_healthy_nodes() {
        .await
        .expect("request should succeed");

-    assert_eq!(resp.status(), 404);
+    assert_eq!(resp.status(), 503);

    let body: serde_json::Value = resp.json().await.unwrap();
    assert!(
@@ -171,3 +172,67 @@ async fn test_missing_model_field() {
    let body: serde_json::Value = resp.json().await.unwrap();
    assert!(body["error"]["message"].as_str().unwrap().contains("model"));
 }
+
+#[tokio::test]
+async fn test_recovering_model_returns_503_and_stays_listed() {
+    // #20: while a model auto-recovers on a neuron, the gateway must
+    // hold the route — transient 503 ("retry shortly"), not the 404
+    // "not found on any node" that makes a recovering model look
+    // evicted — and keep listing it on /v1/models.
+    let mock_url = common::spawn_mock_neuron().await;
+    let (fleet, gw_url) = common::spawn_gateway_with_state(&mock_url).await;
+
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").expect("node must exist");
+        node.models.insert(
+            "recovering-model".into(),
+            cortex_core::node::ModelEntry {
+                id: "recovering-model".into(),
+                status: cortex_core::node::ModelStatus::Recovering,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .post(format!("{gw_url}/v1/chat/completions"))
+        .header("content-type", "application/json")
+        .json(&json!({
+            "model": "recovering-model",
+            "messages": [{"role": "user", "content": "Hi"}]
+        }))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 503);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    let message = body["error"]["message"].as_str().unwrap();
+    assert!(
+        message.contains("recovering") && message.contains("retry"),
+        "503 body must say recovering/retry, got: {message}"
+    );
+
+    // The model must still be visible on the unified models endpoint.
+    let models: serde_json::Value = client
+        .get(format!("{gw_url}/v1/models"))
+        .send()
+        .await
+        .expect("models request should succeed")
+        .json()
+        .await
+        .unwrap();
+    let listed = models["data"]
+        .as_array()
+        .unwrap()
+        .iter()
+        .any(|m| m["id"] == "recovering-model");
+    assert!(listed, "recovering model must stay listed on /v1/models");
+}
--- a/crates/helexa-acp/Cargo.toml
+++ b/crates/helexa-acp/Cargo.toml
@@ -3,7 +3,7 @@ name = "helexa-acp"
 version = "0.1.16"
 edition = "2024"
 license = "Apache-2.0"
-repository = "https://git.lair.cafe/helexa/cortex"
+repository = "https://git.lair.cafe/helexa/helexa"
 description = """
 Agent Client Protocol bridge for the helexa self-hosted LLM stack.
 Speaks ACP to ACP-compatible editor clients (Zed, etc.) and forwards
--- a/crates/helexa-acp/README.md
+++ b/crates/helexa-acp/README.md
@@ -58,8 +58,8 @@ one vendor's agent client.
 ### From source

 ```sh
-git clone https://git.lair.cafe/helexa/cortex.git
-cd cortex
+git clone https://git.lair.cafe/helexa/helexa.git
+cd helexa
 cargo install --path crates/helexa-acp
 # Binary lands at ~/.cargo/bin/helexa-acp
 ```
@@ -536,7 +536,7 @@ Cargo.toml-only.

 ## Contributing

-Repository: https://git.lair.cafe/helexa/cortex (`crates/helexa-acp/`).
+Repository: https://git.lair.cafe/helexa/helexa (`crates/helexa-acp/`).
 Issues / PRs welcome. The canonical staged plan is in
 `~/.claude/plans/plan-the-per-device-worker-abstract-micali.md` on
 the maintainer's machine; the substages 3a–3e and 6a/6b that the
--- a/crates/helexa-bench/Cargo.toml
+++ b/crates/helexa-bench/Cargo.toml
@@ -0,0 +1,41 @@
+[package]
+name = "helexa-bench"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+repository.workspace = true
+
+[[bin]]
+name = "helexa-bench"
+path = "src/main.rs"
+
+[dependencies]
+cortex-core = { workspace = true }
+
+tokio = { workspace = true }
+reqwest = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+figment = { workspace = true }
+anyhow = { workspace = true }
+async-trait = { workspace = true }
+clap = { workspace = true }
+tracing = { workspace = true }
+tracing-subscriber = { workspace = true }
+chrono = { workspace = true }
+futures = { workspace = true }
+tokio-stream = { workspace = true }
+eventsource-stream = { workspace = true }
+
+# read-only JSON API (api.rs)
+axum = { workspace = true }
+tower-http = { workspace = true }
+
+# SQLite system-of-record. `bundled` compiles SQLite from source so the
+# binary has no libsqlite3 runtime dependency — matches the project's
+# single-static-binary packaging.
+rusqlite = { version = "0.32", features = ["bundled"] }
+
+[dev-dependencies]
+# Jail (isolated cwd + env) for config tests.
+figment = { workspace = true, features = ["test"] }
--- a/crates/helexa-bench/src/api.rs
+++ b/crates/helexa-bench/src/api.rs
@@ -0,0 +1,119 @@
+//! Read-only JSON API over the bench SQLite store.
+//!
+//! Consumed by the `bench/` visualisation app and for programmatic
+//! access. Served by the `run` daemon (alongside the sweep loop) and by
+//! the standalone `serve` subcommand. CORS is permissive because the UI
+//! is hosted separately (different origin); the API is internal-only
+//! (WireGuard + firewalld) and read-only, so this predates the auth epic.
+
+use crate::store::{RunFilter, Store};
+use anyhow::Result;
+use axum::Router;
+use axum::extract::{Query, State};
+use axum::http::StatusCode;
+use axum::response::Json;
+use axum::routing::get;
+use serde::Deserialize;
+use serde_json::json;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+use tower_http::cors::CorsLayer;
+
+/// Shared API state: a dedicated read connection to the store, guarded
+/// (rusqlite `Connection` isn't `Sync`). Separate from the sweep's
+/// writer connection — WAL lets them run concurrently.
+pub type ApiState = Arc<Mutex<Store>>;
+
+/// Open the store at `db_path` and wrap it in the shared, mutex-guarded [`ApiState`].
+pub fn open_state(db_path: &str) -> Result<ApiState> {
+    Ok(Arc::new(Mutex::new(Store::open(db_path)?)))
+}
+
+/// Build the API router: five `GET` routes under `/api`, permissive CORS, shared `state`.
+pub fn api_routes(state: ApiState) -> Router {
+    Router::new()
+        .route("/api/health", get(health))
+        .route("/api/dimensions", get(dimensions))
+        .route("/api/summary", get(summary))
+        .route("/api/series", get(series))
+        .route("/api/runs", get(runs))
+        .layer(CorsLayer::permissive())
+        .with_state(state)
+}
+
+/// Bind `listen`, log it, and serve the API until the process exits (or serving fails).
+pub async fn serve(listen: &str, state: ApiState) -> Result<()> {
+    let socket = tokio::net::TcpListener::bind(listen).await?;
+    tracing::info!(%listen, "bench API listening");
+    let router = api_routes(state);
+    Ok(axum::serve(socket, router).await?)
+}
+
+type ApiError = (StatusCode, String);
+
+fn err500(e: anyhow::Error) -> ApiError {
+    (StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}"))
+}
+
+/// `GET /api/health` — reports `"status": "ok"` together with the store's `run_count()`.
+async fn health(State(state): State<ApiState>) -> Result<Json<serde_json::Value>, ApiError> {
+    let run_count = state.lock().await.run_count().map_err(err500)?;
+    Ok(Json(json!({ "status": "ok", "run_count": run_count })))
+}
+
+async fn dimensions(State(s): State<ApiState>) -> Result<Json<crate::store::Dimensions>, ApiError> {
+    let store = s.lock().await;
+    store.dimensions().map(Json).map_err(err500)
+}
+
+async fn summary(
+    State(s): State<ApiState>,
+) -> Result<Json<Vec<crate::store::ReportRow>>, ApiError> {
+    let store = s.lock().await;
+    store.summary().map(Json).map_err(err500)
+}
+
+#[derive(Debug, Deserialize)]
+struct SeriesQuery {
+    /// Optional — when omitted the store resolves the host serving this model.
+    host: Option<String>,
+    model: String,
+    scenario: String,
+}
+
+/// `GET /api/series` — points for one model/scenario pair; the `host` parameter may be omitted.
+async fn series(
+    State(state): State<ApiState>,
+    Query(q): Query<SeriesQuery>,
+) -> Result<Json<Vec<crate::store::SeriesPoint>>, ApiError> {
+    let host = q.host.as_deref();
+    let guard = state.lock().await;
+    let points = guard.series(host, &q.model, &q.scenario);
+    points.map(Json).map_err(err500)
+}
+
+#[derive(Debug, Deserialize)]
+struct RunsQuery {
+    host: Option<String>,
+    model: Option<String>,
+    scenario: Option<String>,
+    sha: Option<String>,
+    ok: Option<bool>,
+    limit: Option<u32>,
+}
+
+/// `GET /api/runs` — rows matching the optional query filters, handed to the store as a `RunFilter`.
+async fn runs(
+    State(state): State<ApiState>,
+    Query(query): Query<RunsQuery>,
+) -> Result<Json<Vec<crate::store::RunRow>>, ApiError> {
+    let filter = RunFilter {
+        host: query.host,
+        model: query.model,
+        scenario: query.scenario,
+        sha: query.sha,
+        ok: query.ok,
+        limit: query.limit,
+    };
+    state.lock().await.runs(&filter).map(Json).map_err(err500)
+}
--- a/crates/helexa-bench/src/client.rs
+++ b/crates/helexa-bench/src/client.rs
@@ -0,0 +1,163 @@
+//! Outbound calls to a benchmark target: build identity, host discovery,
+//! and warm-model enumeration. Neuron targets use the native neuron API;
+//! `openai` targets use the OpenAI-compatible surface (preliminary).
+
+use crate::config::{TargetConfig, TargetKind};
+use anyhow::{Context, Result};
+use cortex_core::build_info::BuildInfo;
+use cortex_core::discovery::DiscoveryResponse;
+use cortex_core::harness::ModelInfo;
+use cortex_core::openai::ModelsResponse;
+use std::time::Duration;
+
+/// How long to wait on the cheap metadata polls (version/discovery/models).
+const META_TIMEOUT: Duration = Duration::from_secs(10);
+
+pub struct TargetClient {
+    http: reqwest::Client,
+}
+
+impl TargetClient {
+    /// Build a client whose requests default to `request_timeout`; the
+    /// metadata polls override it per request with `META_TIMEOUT`.
+    pub fn new(request_timeout: Duration) -> Result<Self> {
+        let builder = reqwest::Client::builder().timeout(request_timeout);
+        let http = builder.build().context("building HTTP client")?;
+        Ok(Self { http })
+    }
+
+    pub fn http(&self) -> &reqwest::Client {
+        &self.http
+    }
+
+    /// Chat-completions URL for `target`; trailing `/` on the endpoint is ignored.
+    pub fn chat_url(&self, target: &TargetConfig) -> String {
+        let suffix = match target.kind {
+            // neuron exposes OpenAI routes under /v1.
+            TargetKind::Neuron => "/v1/chat/completions",
+            // openai endpoint is the /v1 base already (bench.py convention).
+            TargetKind::Openai => "/chat/completions",
+        };
+        format!("{}{suffix}", target.endpoint.trim_end_matches('/'))
+    }
+
+    /// Build identity. Neuron: `GET /version`. Openai: a synthetic
+    /// placeholder keyed by `"external"` so the version-aware skip logic
+    /// treats it as one stable build (comparison runs are manual anyway).
+    pub async fn fetch_version(&self, target: &TargetConfig) -> Result<BuildInfo> {
+        match target.kind {
+            TargetKind::Neuron => {
+                let base = target.endpoint.trim_end_matches('/');
+                let info = self
+                    .http
+                    .get(format!("{base}/version"))
+                    .timeout(META_TIMEOUT)
+                    .send()
+                    .await
+                    .context("GET /version")?
+                    .error_for_status()
+                    .context("GET /version status")?
+                    .json::<BuildInfo>()
+                    .await
+                    .context("decoding /version")?;
+                Ok(info)
+            }
+            TargetKind::Openai => {
+                let mut info = BuildInfo::unknown();
+                info.git_sha = "external".to_string();
+                Ok(info)
+            }
+        }
+    }
+
+    /// Host discovery (neuron only).
+    pub async fn fetch_discovery(
+        &self,
+        target: &TargetConfig,
+    ) -> Result<Option<DiscoveryResponse>> {
+        if target.kind != TargetKind::Neuron {
+            return Ok(None);
+        }
+        let base = target.endpoint.trim_end_matches('/');
+        let disco = self
+            .http
+            .get(format!("{base}/discovery"))
+            .timeout(META_TIMEOUT)
+            .send()
+            .await
+            .context("GET /discovery")?
+            .error_for_status()
+            .context("GET /discovery status")?
+            .json::<DiscoveryResponse>()
+            .await
+            .context("decoding /discovery")?;
+        Ok(Some(disco))
+    }
+
+    /// Warm models — those ready to serve without a cold load.
+    ///
+    /// Neuron: `GET /models` filtered to `status == "loaded"` (skips
+    /// `recovering`/`poisoned`). Openai: `GET /models`, honouring the
+    /// helexa `loaded` extension when present, else treating all listed
+    /// models as warm.
+    pub async fn warm_models(&self, target: &TargetConfig) -> Result<Vec<ModelInfo>> {
+        let base = target.endpoint.trim_end_matches('/');
+        match target.kind {
+            TargetKind::Neuron => {
+                let models = self
+                    .http
+                    .get(format!("{base}/models"))
+                    .timeout(META_TIMEOUT)
+                    .send()
+                    .await
+                    .context("GET /models")?
+                    .error_for_status()
+                    .context("GET /models status")?
+                    .json::<Vec<ModelInfo>>()
+                    .await
+                    .context("decoding /models")?;
+                Ok(models
+                    .into_iter()
+                    .filter(|m| m.status == "loaded")
+                    .collect())
+            }
+            TargetKind::Openai => {
+                let resp = self
+                    .http
+                    .get(format!("{base}/models"))
+                    .timeout(META_TIMEOUT)
+                    .send()
+                    .await
+                    .context("GET /models")?
+                    .error_for_status()
+                    .context("GET /models status")?
+                    .json::<ModelsResponse>()
+                    .await
+                    .context("decoding /models")?;
+                Ok(resp
+                    .data
+                    .into_iter()
+                    .filter(|m| {
+                        // honour the helexa `loaded` extension if present
+                        m.extra
+                            .get("loaded")
+                            .and_then(|v| v.as_bool())
+                            .unwrap_or(true)
+                    })
+                    .map(|m| ModelInfo {
+                        id: m.id,
+                        harness: "openai".to_string(),
+                        status: "loaded".to_string(),
+                        devices: Vec::new(),
+                        vram_used_mb: None,
+                        capabilities: Vec::new(),
+                        limit: None,
+                        cost: None,
+                        tool_call: false,
+                        reasoning: false,
+                    })
+                    .collect())
+            }
+        }
+    }
+}
--- a/crates/helexa-bench/src/config.rs
+++ b/crates/helexa-bench/src/config.rs
@@ -0,0 +1,240 @@
+//! Bench configuration: loaded from `helexa-bench.toml` with figment,
+//! `BENCH_`-prefixed env overrides (mirrors `NeuronConfig::load`).
+
+use figment::{
+    Figment,
+    providers::{Env, Format, Toml},
+};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+use std::time::Duration;
+
+/// Top-level bench config.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BenchConfig {
+    #[serde(default)]
+    pub bench: BenchSettings,
+    #[serde(default)]
+    pub scenarios: ScenarioConfig,
+    /// Read-only JSON API (consumed by the bench UI + programmatic access).
+    #[serde(default)]
+    pub api: ApiSettings,
+    /// Endpoints to benchmark. At least one is required for `run`/`once`.
+    #[serde(default)]
+    pub targets: Vec<TargetConfig>,
+}
+
+/// The read-only HTTP API the `run` daemon (and the `serve` subcommand)
+/// exposes over the SQLite store.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ApiSettings {
+    /// Whether to bind the API at all.
+    #[serde(default = "default_api_enabled")]
+    pub enabled: bool,
+    /// Listen address for the API.
+    #[serde(default = "default_api_listen")]
+    pub listen: String,
+}
+
+impl Default for ApiSettings {
+    fn default() -> Self {
+        ApiSettings {
+            enabled: default_api_enabled(),
+            listen: default_api_listen(),
+        }
+    }
+}
+
+/// Loop/timing knobs.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct BenchSettings {
+    /// Pause between full sweeps of all targets.
+    #[serde(default = "default_sweep_interval")]
+    pub sweep_interval_secs: u64,
+    /// Target number of measured samples to record for a given
+    /// (target, build SHA, model, scenario). Once met, later sweeps skip
+    /// that cell — so a fully-sampled build costs only cheap version
+    /// polls until a new SHA ships.
+    #[serde(default = "default_samples")]
+    pub samples_per_version: u32,
+    /// Pause between successive measured iterations against one model.
+    #[serde(default = "default_iter_pause")]
+    pub iteration_pause_secs: u64,
+    /// Per-request timeout (cold lazy-loads can be slow; generous like
+    /// bench.py's 600s default).
+    #[serde(default = "default_timeout")]
+    pub request_timeout_secs: u64,
+    /// SQLite system-of-record path.
+    #[serde(default = "default_db_path")]
+    pub db_path: String,
+}
+
+impl Default for BenchSettings {
+    fn default() -> Self {
+        BenchSettings {
+            sweep_interval_secs: default_sweep_interval(),
+            samples_per_version: default_samples(),
+            iteration_pause_secs: default_iter_pause(),
+            request_timeout_secs: default_timeout(),
+            db_path: default_db_path(),
+        }
+    }
+}
+
+impl BenchSettings {
+    pub fn iteration_pause(&self) -> Duration {
+        Duration::from_secs(self.iteration_pause_secs)
+    }
+    pub fn request_timeout(&self) -> Duration {
+        Duration::from_secs(self.request_timeout_secs)
+    }
+    pub fn sweep_interval(&self) -> Duration {
+        Duration::from_secs(self.sweep_interval_secs)
+    }
+}
+
+/// Which scenarios to run and their shared parameters.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ScenarioConfig {
+    /// Approximate prompt sizes (in tokens) — one chat-latency scenario
+    /// is generated per size, e.g. `chat:128`, `chat:4096`. This is the
+    /// per-cell dimension that the version-aware skip logic keys on.
+    #[serde(default = "default_prompt_sizes")]
+    pub prompt_sizes: Vec<u32>,
+    /// Max generated tokens per request.
+    #[serde(default = "default_max_tokens")]
+    pub max_tokens: u64,
+}
+
+impl Default for ScenarioConfig {
+    fn default() -> Self {
+        ScenarioConfig {
+            prompt_sizes: default_prompt_sizes(),
+            max_tokens: default_max_tokens(),
+        }
+    }
+}
+
+/// One endpoint to benchmark.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TargetConfig {
+    /// Stable label used as the engine column and in the DB.
+    pub name: String,
+    /// Which protocol/metadata surface the target exposes.
+    #[serde(default)]
+    pub kind: TargetKind,
+    /// Base URL. For `neuron`: the daemon root (e.g.
+    /// `http://beast.internal:13131`). For `openai`: the OpenAI `/v1`
+    /// base (e.g. `http://host:8080/v1`).
+    pub endpoint: String,
+    /// Optional display label override for reports (defaults to `name`).
+    #[serde(default)]
+    pub label: Option<String>,
+}
+
+impl TargetConfig {
+    pub fn display_label(&self) -> &str {
+        self.label.as_deref().unwrap_or(&self.name)
+    }
+}
+
+/// The two target surfaces. `neuron` gets rich build metadata and warm
+/// model discovery via the native neuron API; `openai` is the seam for
+/// later comparison against mistral.rs / llama.cpp / vLLM (phase 1
+/// implements `neuron` fully; `openai` is preliminary plumbing).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum TargetKind {
+    #[default]
+    Neuron,
+    Openai,
+}
+
+impl BenchConfig {
+    /// Load the TOML at `path`, then overlay `BENCH_`-prefixed env vars (`__` splits nesting).
+    pub fn load(path: impl AsRef<Path>) -> Result<Self, Box<figment::Error>> {
+        let file = Toml::file(path);
+        let env = Env::prefixed("BENCH_").split("__");
+        let layered = Figment::new().merge(file).merge(env);
+        layered.extract().map_err(Box::new)
+    }
+}
+
+fn default_sweep_interval() -> u64 {
+    1800
+}
+fn default_samples() -> u32 {
+    5
+}
+fn default_iter_pause() -> u64 {
+    2
+}
+fn default_timeout() -> u64 {
+    600
+}
+fn default_db_path() -> String {
+    "/var/lib/helexa-bench/bench.sqlite".to_string()
+}
+fn default_api_enabled() -> bool {
+    true
+}
+fn default_api_listen() -> String {
+    "0.0.0.0:13132".to_string()
+}
+fn default_prompt_sizes() -> Vec<u32> {
+    vec![128, 4096]
+}
+fn default_max_tokens() -> u64 {
+    256
+}
+
+#[cfg(test)]
+// Jail's closure must return figment::Result; the large-Err type is
+// figment's, not ours, so suppress the lint here.
+#[allow(clippy::result_large_err)]
+mod tests {
+    use super::*;
+    use figment::Jail;
+
+    #[test]
+    fn loads_minimal_with_defaults() {
+        Jail::expect_with(|jail| {
+            jail.create_file(
+                "helexa-bench.toml",
+                r#"
+                [[targets]]
+                name = "beast"
+                endpoint = "http://beast.internal:13131"
+                "#,
+            )?;
+            let cfg = BenchConfig::load("helexa-bench.toml").unwrap();
+            assert_eq!(cfg.targets.len(), 1);
+            assert_eq!(cfg.targets[0].kind, TargetKind::Neuron);
+            assert_eq!(cfg.bench.samples_per_version, 5);
+            assert_eq!(cfg.scenarios.prompt_sizes, vec![128, 4096]);
+            Ok(())
+        });
+    }
+
+    #[test]
+    fn env_overrides_apply() {
+        Jail::expect_with(|jail| {
+            jail.create_file(
+                "helexa-bench.toml",
+                r#"
+                [bench]
+                samples_per_version = 3
+                [[targets]]
+                name = "benjy"
+                kind = "openai"
+                endpoint = "http://benjy:8080/v1"
+                "#,
+            )?;
+            jail.set_env("BENCH_BENCH__SAMPLES_PER_VERSION", "9");
+            let cfg = BenchConfig::load("helexa-bench.toml").unwrap();
+            assert_eq!(cfg.bench.samples_per_version, 9);
+            assert_eq!(cfg.targets[0].kind, TargetKind::Openai);
+            Ok(())
+        });
+    }
+}
--- a/crates/helexa-bench/src/lib.rs
+++ b/crates/helexa-bench/src/lib.rs
@@ -0,0 +1,13 @@
+//! helexa-bench — a continuous, version-aware benchmark harness for the
+//! neuron fleet. It hits each neuron directly, exercises an extensible
+//! scenario suite against every warm model, and records each run with
+//! full build/version provenance into SQLite so improvements can be
+//! tracked automatically across neuron implementation updates.
+
+pub mod api;
+pub mod client;
+pub mod config;
+pub mod report;
+pub mod scenario;
+pub mod store;
+pub mod sweep;
--- a/crates/helexa-bench/src/main.rs
+++ b/crates/helexa-bench/src/main.rs
@@ -0,0 +1,153 @@
+//! helexa-bench CLI.
+//!
+//! - `run`    — continuous daemon (systemd default): sweep, sleep, repeat.
+//! - `once`   — a single sweep, then exit (manual / CI).
+//! - `serve`  — the read-only JSON API only, no sweeping.
+//! - `report` — render the SQLite store as a results table.
+//!
+//! Runs on a single-threaded runtime: the workload is batch-1 sequential
+//! (one request at a time, the regime we measure), and it lets the
+//! SQLite connection live across awaits without `Sync` gymnastics.
+
+use anyhow::{Context, Result};
+use clap::{Parser, Subcommand};
+use helexa_bench::api;
+use helexa_bench::config::BenchConfig;
+use helexa_bench::report;
+use helexa_bench::store::Store;
+use helexa_bench::sweep::Sweeper;
+use tracing_subscriber::EnvFilter;
+
+// Top-level command-line interface; clap derives the argument parser from
+// this struct. Plain `//` comments are used deliberately: clap turns `///`
+// doc comments on derived items into help text, so adding them would
+// change what `--help` prints.
+#[derive(Parser)]
+#[command(name = "helexa-bench")]
+#[command(about = "Continuous version-aware benchmark harness for the neuron fleet")]
+#[command(version)]
+struct Cli {
+    // The selected subcommand; one is always required.
+    #[command(subcommand)]
+    command: Command,
+}
+
+// The subcommands dispatched by `run()`. The existing `///` comments are
+// the user-visible help strings, so they are left untouched; explanatory
+// notes for maintainers use `//`.
+#[derive(Subcommand)]
+enum Command {
+    /// Run sweeps continuously, pausing `sweep_interval_secs` between them.
+    Run {
+        // Path of the TOML config file (`-c` / `--config`).
+        #[arg(short, long, default_value = "helexa-bench.toml")]
+        config: String,
+    },
+    /// Run a single sweep over all targets, then exit.
+    Once {
+        // Path of the TOML config file (`-c` / `--config`).
+        #[arg(short, long, default_value = "helexa-bench.toml")]
+        config: String,
+    },
+    /// Serve the read-only JSON API only (no sweeping).
+    Serve {
+        // Path of the TOML config file (`-c` / `--config`).
+        #[arg(short, long, default_value = "helexa-bench.toml")]
+        config: String,
+    },
+    /// Render recorded results. Uses `--db` if given, else the db_path
+    /// from `--config`.
+    Report {
+        // Only read when `--db` is absent.
+        #[arg(short, long, default_value = "helexa-bench.toml")]
+        config: String,
+        /// Override the SQLite path (skips reading the config file).
+        #[arg(long)]
+        db: Option<String>,
+        /// Output format.
+        #[arg(long, default_value = "md")]
+        format: Format,
+    },
+}
+
+// Output format accepted by `report --format`. `Md` is rendered with
+// `report::render_markdown`, `Json` with `report::render_json`.
+#[derive(Clone, Copy, clap::ValueEnum)]
+enum Format {
+    Md,
+    Json,
+}
+
+/// Process entry point: install logging, parse the command line, then drive
+/// the async dispatcher to completion on a current-thread tokio runtime.
+fn main() -> Result<()> {
+    // Take the log filter from the environment when one is set and valid;
+    // otherwise log at `info`.
+    let filter = match EnvFilter::try_from_default_env() {
+        Ok(from_env) => from_env,
+        Err(_) => EnvFilter::new("info"),
+    };
+    tracing_subscriber::fmt().with_env_filter(filter).init();
+
+    let cli = Cli::parse();
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .context("building tokio runtime")?;
+    runtime.block_on(run(cli))
+}
+
+/// Execute the subcommand selected on the command line.
+///
+/// `Run` and `Once` refuse to start without at least one configured target;
+/// `Serve` refuses to start when the API is disabled in the config. Any
+/// error from config loading, the store, the sweeper or the API server is
+/// returned to `main`.
+async fn run(cli: Cli) -> Result<()> {
+    match cli.command {
+        Command::Run { config } => {
+            let cfg = load_config(&config)?;
+            require_targets(&cfg)?;
+            // Bind the read API alongside the sweep loop (one bob service
+            // does both). Its own store connection; WAL keeps the sweep
+            // writer and the API readers from blocking each other.
+            if cfg.api.enabled {
+                // Opened before `cfg` is moved into the sweeper below.
+                let state = api::open_state(&cfg.bench.db_path)?;
+                let listen = cfg.api.listen.clone();
+                // The task is detached: an API failure is logged but does
+                // not stop the sweep loop.
+                tokio::spawn(async move {
+                    if let Err(e) = api::serve(&listen, state).await {
+                        tracing::error!(error = %format!("{e:#}"), "bench API server exited");
+                    }
+                });
+            }
+            let sweeper = Sweeper::new(cfg)?;
+            tracing::info!("helexa-bench started; entering continuous sweep loop");
+            sweeper.run_forever().await
+        }
+        Command::Serve { config } => {
+            let cfg = load_config(&config)?;
+            if !cfg.api.enabled {
+                anyhow::bail!("[api] enabled = false — nothing to serve");
+            }
+            let state = api::open_state(&cfg.bench.db_path)?;
+            tracing::info!("helexa-bench serving API only");
+            // Runs in the foreground; the command ends when the server does.
+            api::serve(&cfg.api.listen, state).await
+        }
+        Command::Once { config } => {
+            let cfg = load_config(&config)?;
+            require_targets(&cfg)?;
+            let sweeper = Sweeper::new(cfg)?;
+            let summary = sweeper.run_once().await?;
+            tracing::info!(
+                measured = summary.measured,
+                skipped = summary.skipped,
+                failed = summary.failed,
+                unreachable = summary.targets_unreachable,
+                "single sweep complete"
+            );
+            Ok(())
+        }
+        Command::Report { config, db, format } => {
+            // `--db` wins; the config file is only read when it is absent.
+            let db_path = match db {
+                Some(p) => p,
+                None => load_config(&config)?.bench.db_path,
+            };
+            let store = Store::open(&db_path)?;
+            let rows = store.report_rows()?;
+            let rendered = match format {
+                Format::Md => report::render_markdown(&rows),
+                Format::Json => report::render_json(&rows)?,
+            };
+            println!("{rendered}");
+            Ok(())
+        }
+    }
+}
+
+/// Load the bench configuration from `path`.
+///
+/// The loader's error is flattened to its display text and then wrapped
+/// with a `loading config <path>` context line.
+fn load_config(path: &str) -> Result<BenchConfig> {
+    match BenchConfig::load(path) {
+        Ok(cfg) => Ok(cfg),
+        Err(cause) => {
+            Err(anyhow::anyhow!("{cause}")).with_context(|| format!("loading config {path}"))
+        }
+    }
+}
+
+/// Fail fast when the configuration lists no benchmark targets, since a
+/// sweep would then have nothing to measure.
+fn require_targets(cfg: &BenchConfig) -> Result<()> {
+    match cfg.targets.len() {
+        0 => Err(anyhow::anyhow!(
+            "no targets configured — add at least one [[targets]] entry"
+        )),
+        _ => Ok(()),
+    }
+}
--- a/crates/helexa-bench/src/report.rs
+++ b/crates/helexa-bench/src/report.rs
@@ -0,0 +1,109 @@
+//! Render the SQLite store as a results table — the automated
+//! replacement for hand-editing `doc/benchmarks.md`. Columns match that
+//! doc: engine, model, prompt tok, TTFT (s), decode tok/s, total (s),
+//! plus the build SHA each cell was measured against and the number of
+//! samples (`n`) behind it.
+
+use crate::store::ReportRow;
+use anyhow::Result;
+
+/// Render `rows` as a Markdown table: a header, an alignment row, then one
+/// line per report row.
+///
+/// The prompt-token column shows the measured count when one was recorded
+/// and otherwise the approximate target prefixed with `~`. Missing metrics
+/// render as an em dash (see `fmt_opt`). Text cells are escaped so that a
+/// literal `|` in a target name, model id or SHA cannot break the table.
+pub fn render_markdown(rows: &[ReportRow]) -> String {
+    // A bare `|` inside a cell is read as a column separator and shifts
+    // every following column; `\|` is the table-safe spelling, including
+    // inside a code span.
+    fn cell(text: &str) -> String {
+        text.replace('|', "\\|")
+    }
+
+    let mut out = String::new();
+    out.push_str(
+        "| engine | model | prompt tok | TTFT (s) | decode tok/s | total (s) | build | n |\n",
+    );
+    out.push_str("|---|---|---:|---:|---:|---:|---|---:|\n");
+    for r in rows {
+        let ptok = r
+            .prompt_tokens
+            .map(|t| t.to_string())
+            .unwrap_or_else(|| format!("~{}", r.prompt_size_approx));
+        out.push_str(&format!(
+            "| {} | {} | {} | {} | {} | {} | `{}` | {} |\n",
+            cell(&r.target_name),
+            cell(&r.model_id),
+            ptok,
+            fmt_opt(r.ttft_s_median, 3),
+            fmt_opt(r.decode_tps_median, 1),
+            fmt_opt(r.total_s_median, 3),
+            cell(&r.git_sha),
+            r.samples,
+        ));
+    }
+    out
+}
+
+/// Render `rows` as a pretty-printed JSON array with one object per row.
+///
+/// Unlike the Markdown table, the JSON form also carries the scenario id,
+/// the approximate prompt size and the GPU label.
+pub fn render_json(rows: &[ReportRow]) -> Result<String> {
+    let mut cells: Vec<serde_json::Value> = Vec::with_capacity(rows.len());
+    for row in rows {
+        cells.push(serde_json::json!({
+            "engine": row.target_name,
+            "model": row.model_id,
+            "scenario": row.scenario_id,
+            "prompt_size_approx": row.prompt_size_approx,
+            "prompt_tokens": row.prompt_tokens,
+            "ttft_s_median": row.ttft_s_median,
+            "decode_tps_median": row.decode_tps_median,
+            "total_s_median": row.total_s_median,
+            "git_sha": row.git_sha,
+            "samples": row.samples,
+            "gpu": row.gpu,
+        }));
+    }
+    let rendered = serde_json::to_string_pretty(&cells)?;
+    Ok(rendered)
+}
+
+/// Format an optional metric with `places` decimal digits; an absent value
+/// is shown as an em dash so the table cell is never empty.
+fn fmt_opt(v: Option<f64>, places: usize) -> String {
+    v.map_or_else(|| "—".to_string(), |x| format!("{:.*}", places, x))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A fully-populated cell, as a healthy measured run would produce.
+    fn measured_row() -> ReportRow {
+        ReportRow {
+            target_name: "beast".into(),
+            model_id: "Qwen/Qwen3.6-27B".into(),
+            scenario_id: "chat:128".into(),
+            prompt_size_approx: 128,
+            git_sha: "30d50d6".into(),
+            prompt_tokens: Some(130),
+            ttft_s_median: Some(0.123),
+            decode_tps_median: Some(45.6),
+            total_s_median: Some(1.234),
+            samples: 5,
+            gpu: Some("2× RTX 5090".into()),
+        }
+    }
+
+    /// A cell with no usage data and no decode rate.
+    fn sparse_row() -> ReportRow {
+        ReportRow {
+            target_name: "benjy".into(),
+            model_id: "m".into(),
+            scenario_id: "chat:128".into(),
+            prompt_size_approx: 128,
+            git_sha: "abc".into(),
+            prompt_tokens: None,
+            ttft_s_median: Some(0.1),
+            decode_tps_median: None,
+            total_s_median: Some(0.5),
+            samples: 1,
+            gpu: None,
+        }
+    }
+
+    #[test]
+    fn markdown_has_header_and_row() {
+        let table = render_markdown(&[measured_row()]);
+        for needle in ["| engine |", "beast", "`30d50d6`", "0.123"] {
+            assert!(table.contains(needle));
+        }
+    }
+
+    #[test]
+    fn missing_decode_renders_dash() {
+        let table = render_markdown(&[sparse_row()]);
+        // No measured token count: fall back to the approximate size.
+        assert!(table.contains("~128"));
+        // No decode rate: the cell is an em dash.
+        assert!(table.contains("—"));
+    }
+}
--- a/crates/helexa-bench/src/scenario.rs
+++ b/crates/helexa-bench/src/scenario.rs
@@ -0,0 +1,238 @@
+//! The extensible test suite.
+//!
+//! A [`Scenario`] puts one warm model through one shaped request and
+//! reports operator-felt metrics (TTFT, decode tok/s, total). Phase 1
+//! ships the chat-latency family ported faithfully from `script/bench.py`;
+//! the trait is the seam for future families (vision, concurrency,
+//! long-generation, cold-start) selected per model via [`Scenario::applies_to`].
+
+use crate::config::ScenarioConfig;
+use anyhow::{Context, Result, anyhow};
+use async_trait::async_trait;
+use cortex_core::harness::ModelInfo;
+use cortex_core::openai::ChatCompletionChunk;
+use eventsource_stream::Eventsource;
+use futures::StreamExt;
+use serde_json::json;
+use std::time::{Duration, Instant};
+
+/// A paragraph of filler re-used to synthesise prompts of a target
+/// approximate token count (~4 chars/token heuristic — close enough for
+/// bucketing; real token counts are read back from the usage object).
+/// Mirrors `script/bench.py::FILLER`.
+///
+/// The text is plain ASCII, so `build_prompt` can cut it at any byte
+/// offset without landing inside a multi-byte character.
+const FILLER: &str = "The quick brown fox jumps over the lazy dog while the band plays \
+a slow waltz in the background and somebody counts the beats. ";
+
+/// The instruction appended after the filler body by `build_prompt`.
+///
+/// `/no_think`: Qwen3-family soft switch keeping thinking models from
+/// burning the token budget invisibly. Harmless for non-thinking models.
+const QUESTION: &str = "\n\nRetell the scene above as a vivid story of about 300 words. /no_think";
+
+/// Synthesise a prompt of roughly `approx_tokens` tokens.
+///
+/// The body is the filler paragraph repeated and cut to four bytes per
+/// requested token (targets below 16 tokens are raised to 16), followed by
+/// the fixed closing question. Ported from `bench.py::build_prompt`.
+pub fn build_prompt(approx_tokens: u32) -> String {
+    let budget = 4 * (approx_tokens.max(16) as usize);
+    let mut prompt = String::new();
+    // Over-fill with whole paragraphs, then trim back to the exact budget.
+    while prompt.len() <= budget {
+        prompt.push_str(FILLER);
+    }
+    prompt.truncate(budget);
+    prompt.push_str(QUESTION);
+    prompt
+}
+
+/// Per-request inputs shared by every scenario.
+pub struct RunCtx<'a> {
+    /// HTTP client used to send the request; borrowed so one client can be
+    /// reused across requests.
+    pub client: &'a reqwest::Client,
+    /// Fully-qualified chat-completions URL for the target.
+    pub chat_url: String,
+    /// Sent as the `model` field of the request payload.
+    pub model_id: String,
+    /// Sent as the `max_tokens` field of the request payload.
+    pub max_tokens: u64,
+    /// Upper bound on the whole request (send plus streaming); exceeding it
+    /// turns the run into a timeout error.
+    pub timeout: Duration,
+}
+
+/// Operator-felt metrics for a single measured request.
+///
+/// All durations are measured from just before the request is sent.
+#[derive(Debug, Clone)]
+pub struct ScenarioMetrics {
+    /// Time to first content chunk (seconds).
+    pub ttft_s: f64,
+    /// Completion tokens / decode window. `None` when the window is too
+    /// short to be honest (≤ 200 ms), matching bench.py. The window runs
+    /// from the first to the last non-empty content chunk.
+    pub decode_tps: Option<f64>,
+    /// Wall-clock for the whole request (seconds), up to the end of the
+    /// response stream.
+    pub total_s: f64,
+    /// Prompt tokens from the final `usage` object, if the server sent one.
+    pub prompt_tokens: Option<u64>,
+    /// Completion tokens: from `usage` when present, else content-chunk count.
+    pub completion_tokens: u64,
+}
+
+/// One benchmark shape: a single kind of request that can be issued
+/// against a model and measured.
+///
+/// Implementations must be `Send + Sync`; `build_scenarios` hands them out
+/// as boxed trait objects.
+#[async_trait]
+pub trait Scenario: Send + Sync {
+    /// Stable id, e.g. `chat:128`. Used as the version-aware skip key
+    /// dimension and recorded against every run.
+    fn id(&self) -> &str;
+
+    /// Approximate prompt size in tokens (the cell dimension), recorded
+    /// for reporting.
+    fn prompt_size(&self) -> u32;
+
+    /// Whether this scenario should run against the given model. Default
+    /// runs against everything; vision/audio scenarios will gate on
+    /// [`ModelInfo::capabilities`].
+    fn applies_to(&self, _model: &ModelInfo) -> bool {
+        true
+    }
+
+    /// Issue one shaped request and measure it.
+    ///
+    /// Returns an error when the request cannot be sent, is rejected, or
+    /// yields nothing measurable; the exact conditions are up to the
+    /// implementation.
+    async fn run(&self, ctx: &RunCtx) -> Result<ScenarioMetrics>;
+}
+
+/// Assemble the scenario suite described by `cfg`: one chat-latency
+/// scenario per configured prompt size, in configuration order, each with
+/// the id `chat:<size>`.
+pub fn build_scenarios(cfg: &ScenarioConfig) -> Vec<Box<dyn Scenario>> {
+    let mut suite: Vec<Box<dyn Scenario>> = Vec::with_capacity(cfg.prompt_sizes.len());
+    for size in cfg.prompt_sizes.iter().copied() {
+        suite.push(Box::new(ChatLatencyScenario {
+            id: format!("chat:{size}"),
+            approx_prompt_tokens: size,
+        }));
+    }
+    suite
+}
+
+/// Streamed single-request chat-completions latency probe — the batch-1
+/// regime bench.py measures.
+pub struct ChatLatencyScenario {
+    /// Stable scenario id; `build_scenarios` sets it to `chat:<size>`.
+    id: String,
+    /// Approximate prompt size, in tokens, passed to `build_prompt`.
+    approx_prompt_tokens: u32,
+}
+
+#[async_trait]
+impl Scenario for ChatLatencyScenario {
+    /// The id assigned at construction (`chat:<size>`).
+    fn id(&self) -> &str {
+        self.id.as_str()
+    }
+
+    /// The approximate prompt size this scenario was built for.
+    fn prompt_size(&self) -> u32 {
+        self.approx_prompt_tokens
+    }
+
+    /// Send one streamed, temperature-0 chat request with a synthetic
+    /// prompt and measure it, giving up once `ctx.timeout` has elapsed.
+    async fn run(&self, ctx: &RunCtx) -> Result<ScenarioMetrics> {
+        let payload = json!({
+            "model": ctx.model_id,
+            "messages": [{"role": "user", "content": build_prompt(self.approx_prompt_tokens)}],
+            "max_tokens": ctx.max_tokens,
+            "temperature": 0,
+            "stream": true,
+            "stream_options": {"include_usage": true},
+        });
+
+        // The deadline covers the send and the whole streamed response.
+        match tokio::time::timeout(ctx.timeout, stream_and_measure(ctx, &payload)).await {
+            Ok(measured) => measured,
+            Err(_elapsed) => Err(anyhow!("request timed out after {:?}", ctx.timeout)),
+        }
+    }
+}
+
+/// The SSE-timing core, ported from `bench.py::one_run`. Kept free of the
+/// `Scenario` trait so it can be exercised on its own.
+///
+/// POSTs `payload` to `ctx.chat_url`, reads the response as a
+/// server-sent-event stream, and timestamps each non-empty content chunk.
+///
+/// # Errors
+///
+/// Fails when the request cannot be sent, the response status is not a
+/// success (the body is included in the message), the stream breaks
+/// mid-read, or the stream ends without a single content chunk.
+async fn stream_and_measure(
+    ctx: &RunCtx<'_>,
+    payload: &serde_json::Value,
+) -> Result<ScenarioMetrics> {
+    // The clock starts before the request leaves, so TTFT includes the
+    // connection and request time.
+    let start = Instant::now();
+    let resp = ctx
+        .client
+        .post(&ctx.chat_url)
+        .json(payload)
+        .send()
+        .await
+        .context("sending chat request")?;
+    if !resp.status().is_success() {
+        let status = resp.status();
+        // Best effort: an unreadable body still reports the status.
+        let body = resp.text().await.unwrap_or_default();
+        return Err(anyhow!("upstream returned {status}: {}", body.trim()));
+    }
+
+    let mut stream = resp.bytes_stream().eventsource();
+    // Arrival times of the first and the most recent content chunk.
+    let mut first: Option<Instant> = None;
+    let mut last: Option<Instant> = None;
+    let mut chunk_count: u64 = 0;
+    let mut prompt_tokens: Option<u64> = None;
+    let mut completion_tokens: Option<u64> = None;
+
+    while let Some(event) = stream.next().await {
+        let event = event.context("reading SSE stream")?;
+        // Stamp the arrival before any parsing so JSON decoding cost does
+        // not leak into the timings.
+        let now = Instant::now();
+        let data = event.data.trim();
+        if data.is_empty() || data == "[DONE]" {
+            continue;
+        }
+        let chunk: ChatCompletionChunk = match serde_json::from_str(data) {
+            Ok(c) => c,
+            Err(_) => continue, // tolerate non-JSON keepalive frames
+        };
+        // Only chunks whose first choice carries non-empty `content` count
+        // toward timing and the chunk tally.
+        if let Some(choice) = chunk.choices.first()
+            && choice
+                .delta
+                .get("content")
+                .and_then(|c| c.as_str())
+                .is_some_and(|s| !s.is_empty())
+        {
+            if first.is_none() {
+                first = Some(now);
+            }
+            last = Some(now);
+            chunk_count += 1;
+        }
+        // A later usage frame overwrites an earlier one; the last wins.
+        if let Some(usage) = chunk.usage {
+            prompt_tokens = Some(usage.prompt_tokens);
+            completion_tokens = Some(usage.completion_tokens);
+        }
+    }
+    let end = Instant::now();
+
+    let first = first.ok_or_else(|| anyhow!("no content chunks received"))?;
+
+    // neuron emits one SSE chunk per visible token, so chunk_count is an
+    // engine-truth count when no usage frame is sent.
+    // A reported count of zero is also treated as "not reported".
+    let tokens = completion_tokens.filter(|&t| t > 0).unwrap_or(chunk_count);
+    // decode rate is only meaningful over a real inter-chunk window.
+    let window = last
+        .filter(|&l| l > first)
+        .map(|l| (l - first).as_secs_f64())
+        .unwrap_or(0.0);
+    Ok(ScenarioMetrics {
+        ttft_s: (first - start).as_secs_f64(),
+        // Windows of 200 ms or less are too short to give a trustworthy rate.
+        decode_tps: if window > 0.2 {
+            Some(tokens as f64 / window)
+        } else {
+            None
+        },
+        total_s: (end - start).as_secs_f64(),
+        prompt_tokens,
+        completion_tokens: tokens,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A larger token target yields a longer prompt, and the prompt always
+    /// ends with the closing question.
+    #[test]
+    fn prompt_grows_with_token_target() {
+        let (small, big) = (build_prompt(128), build_prompt(4096));
+        assert!(small.len() < big.len());
+        // ~4 chars/token + the trailing question.
+        assert!(small.len() >= 4 * 128);
+        assert!(small.ends_with("/no_think"));
+    }
+
+    /// Targets below 16 tokens are raised to 16, so even a target of 0
+    /// yields a non-trivial prompt.
+    #[test]
+    fn prompt_floor_for_tiny_targets() {
+        let tiny = build_prompt(0);
+        assert!(tiny.len() >= 4 * 16);
+    }
+}
--- a/crates/helexa-bench/src/store.rs
+++ b/crates/helexa-bench/src/store.rs
@@ -0,0 +1,768 @@
+//! SQLite system-of-record. One row per measured iteration, keyed so a
+//! benchmark can be attributed to the exact neuron build that produced
+//! it. Replaces hand edits to `doc/benchmarks.md`.
+//!
+//! Calls are synchronous (SQLite is local and the sweep is batch-1
+//! sequential), so the connection is used inline between `await` points,
+//! never held across one.
+
+use anyhow::{Context, Result};
+use rusqlite::{Connection, OptionalExtension, params};
+use std::path::Path;
+
+/// A single measured (or failed) iteration, with full provenance.
+///
+/// Field order mirrors the column order of the `runs` table and of the
+/// `INSERT` in `Store::insert_run`. The `*_json` fields are stored as
+/// opaque text (presumably pre-serialised JSON — confirm at the call site
+/// that builds the record).
+#[derive(Debug, Clone)]
+pub struct RunRecord {
+    pub ts: String, // RFC3339
+    // target
+    pub target_name: String,
+    pub target_kind: String,
+    pub endpoint: String,
+    // host (from /discovery)
+    pub hostname: Option<String>,
+    pub driver_version: Option<String>,
+    pub cuda_version: Option<String>,
+    pub gpus_json: Option<String>,
+    // neuron build (from /version)
+    pub git_sha: String,
+    pub git_sha_long: Option<String>,
+    pub package_version: String,
+    pub git_dirty: bool, // stored as 0/1
+    pub build_timestamp: Option<String>,
+    pub rustc_version: Option<String>,
+    pub profile: Option<String>,
+    pub features_json: String,
+    pub candle_version: Option<String>,
+    // bench's own build
+    pub bench_version: String,
+    pub bench_sha: String,
+    // model
+    pub model_id: String,
+    pub harness: String,
+    pub capabilities_json: String,
+    pub devices_json: String,
+    // scenario
+    pub scenario_id: String,
+    pub prompt_size_approx: u32,
+    pub prompt_tokens_actual: Option<u64>,
+    pub max_tokens: u64,
+    // metrics
+    pub ttft_s: Option<f64>,
+    pub decode_tps: Option<f64>,
+    pub total_s: Option<f64>,
+    pub completion_tokens: Option<u64>,
+    // outcome
+    pub ok: bool, // stored as 0/1; only `ok` rows count as samples
+    pub error: Option<String>,
+}
+
+/// Handle on the benchmark database: a thin wrapper around one SQLite
+/// connection whose schema has already been initialised.
+pub struct Store {
+    conn: Connection,
+}
+
+impl Store {
+    /// Open the database at `path`, creating its parent directory and the
+    /// schema when they do not exist yet.
+    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
+        let db_path = path.as_ref();
+        // A bare filename has an empty parent; nothing to create then.
+        let dir = db_path.parent().filter(|p| !p.as_os_str().is_empty());
+        if let Some(dir) = dir {
+            std::fs::create_dir_all(dir)
+                .with_context(|| format!("creating db dir {}", dir.display()))?;
+        }
+        let conn = Connection::open(db_path)
+            .with_context(|| format!("opening sqlite db {}", db_path.display()))?;
+        Self::init(&conn)?;
+        Ok(Self { conn })
+    }
+
+    /// Throwaway in-memory store with the full schema, for unit tests.
+    #[cfg(test)]
+    pub fn open_in_memory() -> Result<Self> {
+        let store = Store {
+            conn: Connection::open_in_memory()?,
+        };
+        Self::init(&store.conn)?;
+        Ok(store)
+    }
+
+    /// Switch the database to WAL journaling and create the `runs` table
+    /// and its lookup index if they are missing.
+    ///
+    /// Every statement is `IF NOT EXISTS`, so calling this on an already
+    /// initialised database is harmless. There is no migration step: an
+    /// existing table with a different shape is left as it is.
+    fn init(conn: &Connection) -> Result<()> {
+        conn.execute_batch(
+            r#"
+            -- WAL so the read-only API connection never blocks the
+            -- sweep writer (and vice versa).
+            PRAGMA journal_mode=WAL;
+            CREATE TABLE IF NOT EXISTS runs (
+                id                   INTEGER PRIMARY KEY AUTOINCREMENT,
+                ts                   TEXT NOT NULL,
+                target_name          TEXT NOT NULL,
+                target_kind          TEXT NOT NULL,
+                endpoint             TEXT NOT NULL,
+                hostname             TEXT,
+                driver_version       TEXT,
+                cuda_version         TEXT,
+                gpus_json            TEXT,
+                git_sha              TEXT NOT NULL,
+                git_sha_long         TEXT,
+                package_version      TEXT NOT NULL,
+                git_dirty            INTEGER NOT NULL,
+                build_timestamp      TEXT,
+                rustc_version        TEXT,
+                profile              TEXT,
+                features_json        TEXT NOT NULL,
+                candle_version       TEXT,
+                bench_version        TEXT NOT NULL,
+                bench_sha            TEXT NOT NULL,
+                model_id             TEXT NOT NULL,
+                harness              TEXT NOT NULL,
+                capabilities_json    TEXT NOT NULL,
+                devices_json         TEXT NOT NULL,
+                scenario_id          TEXT NOT NULL,
+                prompt_size_approx   INTEGER NOT NULL,
+                prompt_tokens_actual INTEGER,
+                max_tokens           INTEGER NOT NULL,
+                ttft_s               REAL,
+                decode_tps           REAL,
+                total_s              REAL,
+                completion_tokens    INTEGER,
+                ok                   INTEGER NOT NULL,
+                error                TEXT
+            );
+            -- The version-aware skip query keys on this tuple. scenario_id
+            -- encodes the prompt size (chat:<n>), so it subsumes the cell.
+            CREATE INDEX IF NOT EXISTS idx_runs_cell
+                ON runs (target_name, git_sha, model_id, scenario_id, ok);
+            "#,
+        )
+        .context("initialising sqlite schema")?;
+        Ok(())
+    }
+
+    /// Count successful samples already recorded for a cell. Only `ok`
+    /// rows count toward the per-version target so transient failures
+    /// don't permanently starve a cell.
+    ///
+    /// The cell is identified by target, build SHA, model and scenario.
+    /// The result saturates at `u32::MAX` rather than wrapping, so an
+    /// oversized count can never read as a small one and make a full cell
+    /// look empty.
+    pub fn count_samples(
+        &self,
+        target_name: &str,
+        git_sha: &str,
+        model_id: &str,
+        scenario_id: &str,
+    ) -> Result<u32> {
+        let n: i64 = self.conn.query_row(
+            "SELECT COUNT(*) FROM runs WHERE target_name=?1 AND git_sha=?2 \
+             AND model_id=?3 AND scenario_id=?4 AND ok=1",
+            params![target_name, git_sha, model_id, scenario_id],
+            |row| row.get(0),
+        )?;
+        // After the clamp the value is within u32 range, so the cast is
+        // lossless.
+        Ok(n.clamp(0, i64::from(u32::MAX)) as u32)
+    }
+
+    /// Append one iteration to the `runs` table.
+    ///
+    /// The placeholders `?1`..`?33` are positional: the `params!` list
+    /// below must stay in exactly the same order as the column list. The
+    /// two booleans (`git_dirty`, `ok`) are written as 0/1 integers.
+    pub fn insert_run(&self, r: &RunRecord) -> Result<()> {
+        self.conn.execute(
+            "INSERT INTO runs (
+                ts, target_name, target_kind, endpoint,
+                hostname, driver_version, cuda_version, gpus_json,
+                git_sha, git_sha_long, package_version, git_dirty,
+                build_timestamp, rustc_version, profile, features_json, candle_version,
+                bench_version, bench_sha,
+                model_id, harness, capabilities_json, devices_json,
+                scenario_id, prompt_size_approx, prompt_tokens_actual, max_tokens,
+                ttft_s, decode_tps, total_s, completion_tokens,
+                ok, error
+            ) VALUES (
+                ?1, ?2, ?3, ?4,
+                ?5, ?6, ?7, ?8,
+                ?9, ?10, ?11, ?12,
+                ?13, ?14, ?15, ?16, ?17,
+                ?18, ?19,
+                ?20, ?21, ?22, ?23,
+                ?24, ?25, ?26, ?27,
+                ?28, ?29, ?30, ?31,
+                ?32, ?33
+            )",
+            params![
+                r.ts,
+                r.target_name,
+                r.target_kind,
+                r.endpoint,
+                r.hostname,
+                r.driver_version,
+                r.cuda_version,
+                r.gpus_json,
+                r.git_sha,
+                r.git_sha_long,
+                r.package_version,
+                r.git_dirty as i64,
+                r.build_timestamp,
+                r.rustc_version,
+                r.profile,
+                r.features_json,
+                r.candle_version,
+                r.bench_version,
+                r.bench_sha,
+                r.model_id,
+                r.harness,
+                r.capabilities_json,
+                r.devices_json,
+                r.scenario_id,
+                r.prompt_size_approx,
+                r.prompt_tokens_actual,
+                r.max_tokens,
+                r.ttft_s,
+                r.decode_tps,
+                r.total_s,
+                r.completion_tokens,
+                r.ok as i64,
+                r.error,
+            ],
+        )?;
+        Ok(())
+    }
+
+    /// Build the report table: one row per (target, model, scenario) cell,
+    /// holding the median metrics of the most-recently-seen build SHA.
+    ///
+    /// This method only fetches the successful runs, grouped by cell and in
+    /// insertion order within each cell; the per-cell reduction itself is
+    /// done by `aggregate`.
+    pub fn report_rows(&self) -> Result<Vec<ReportRow>> {
+        let mut stmt = self.conn.prepare(
+            "SELECT target_name, model_id, scenario_id, prompt_size_approx, git_sha,
+                    ttft_s, decode_tps, total_s, prompt_tokens_actual, gpus_json
+             FROM runs
+             WHERE ok=1
+             ORDER BY target_name, model_id, scenario_id, id",
+        )?;
+        let mut raws: Vec<RawRow> = Vec::new();
+        let mut cursor = stmt.query([])?;
+        while let Some(row) = cursor.next()? {
+            raws.push(RawRow {
+                target_name: row.get(0)?,
+                model_id: row.get(1)?,
+                scenario_id: row.get(2)?,
+                prompt_size_approx: row.get(3)?,
+                git_sha: row.get(4)?,
+                ttft_s: row.get(5)?,
+                decode_tps: row.get(6)?,
+                total_s: row.get(7)?,
+                prompt_tokens_actual: row.get(8)?,
+                gpus_json: row.get(9)?,
+            });
+        }
+        Ok(aggregate(raws))
+    }
+
+    // ── Read API surface (consumed by api.rs) ─────────────────────────
+
+    /// Number of rows in `runs`, successful or not (for `/api/health`).
+    pub fn run_count(&self) -> Result<u64> {
+        let total = self
+            .conn
+            .query_row("SELECT COUNT(*) FROM runs", [], |row| row.get::<_, i64>(0))?;
+        Ok(total as u64)
+    }
+
+    /// Collect the distinct hosts, models, scenarios and builds present in
+    /// the store, for populating UI filters.
+    ///
+    /// Builds are ordered by their earliest build timestamp, falling back
+    /// to the run's wall-clock timestamp where no build timestamp was
+    /// recorded. Hosts and models are additionally mapped to a GPU label
+    /// taken from their most recent run; entries with no usable label are
+    /// left out of those maps.
+    pub fn dimensions(&self) -> Result<Dimensions> {
+        // Run a query that yields one text column and gather its values.
+        let distinct = |sql: &str| -> Result<Vec<String>> {
+            let mut stmt = self.conn.prepare(sql)?;
+            let mut values = Vec::new();
+            for value in stmt.query_map([], |r| r.get::<_, String>(0))? {
+                values.push(value?);
+            }
+            Ok(values)
+        };
+        // host/model → GPU label, taken from each one's most recent run.
+        let latest_gpus = |group_col: &str| -> Result<std::collections::HashMap<String, String>> {
+            let sql = format!(
+                "SELECT {group_col}, gpus_json FROM runs \
+                 WHERE id IN (SELECT MAX(id) FROM runs GROUP BY {group_col})"
+            );
+            let mut stmt = self.conn.prepare(&sql)?;
+            let pairs = stmt.query_map([], |r| {
+                Ok((r.get::<_, String>(0)?, r.get::<_, Option<String>>(1)?))
+            })?;
+            let mut labels = std::collections::HashMap::new();
+            for pair in pairs {
+                let (key, gpus_json) = pair?;
+                let Some(raw) = gpus_json.as_deref() else {
+                    continue;
+                };
+                if let Some(label) = gpu_label(raw) {
+                    labels.insert(key, label);
+                }
+            }
+            Ok(labels)
+        };
+
+        let hosts = distinct("SELECT DISTINCT target_name FROM runs ORDER BY target_name")?;
+        let models = distinct("SELECT DISTINCT model_id FROM runs ORDER BY model_id")?;
+        let scenarios = distinct("SELECT DISTINCT scenario_id FROM runs ORDER BY scenario_id")?;
+
+        let mut build_stmt = self.conn.prepare(
+            "SELECT git_sha, MAX(build_timestamp), MAX(package_version), MIN(COALESCE(build_timestamp, ts)) AS ord
+             FROM runs GROUP BY git_sha ORDER BY ord",
+        )?;
+        let build_rows = build_stmt.query_map([], |r| {
+            Ok(BuildRef {
+                git_sha: r.get(0)?,
+                build_timestamp: r.get(1)?,
+                package_version: r.get(2)?,
+            })
+        })?;
+        let builds = build_rows.collect::<rusqlite::Result<_>>()?;
+
+        let host_gpus = latest_gpus("target_name")?;
+        let model_gpus = latest_gpus("model_id")?;
+
+        Ok(Dimensions {
+            hosts,
+            models,
+            scenarios,
+            builds,
+            host_gpus,
+            model_gpus,
+        })
+    }
+
+    /// Latest-SHA-per-cell medians (the report table as JSON).
+    ///
+    /// Returns exactly what `report_rows` returns; this name exists for the
+    /// read API.
+    pub fn summary(&self) -> Result<Vec<ReportRow>> {
+        self.report_rows()
+    }
+
+    /// The "over time" series for one (model, scenario) cell: per-build
+    /// median metrics, ordered chronologically by build.
+    ///
+    /// `host` is optional. When omitted, the host of the most recent
+    /// successful run for this (model, scenario) is used, which lets
+    /// callers (the public UI) select by model alone; each model is served
+    /// by a single host today, so the series stays single-host. If no such
+    /// run exists the result is an empty series.
+    pub fn series(
+        &self,
+        host: Option<&str>,
+        model: &str,
+        scenario: &str,
+    ) -> Result<Vec<SeriesPoint>> {
+        let host: String = if let Some(explicit) = host {
+            explicit.to_string()
+        } else {
+            let newest: Option<String> = self
+                .conn
+                .query_row(
+                    "SELECT target_name FROM runs WHERE ok=1 AND model_id=?1 \
+                     AND scenario_id=?2 ORDER BY id DESC LIMIT 1",
+                    params![model, scenario],
+                    |r| r.get(0),
+                )
+                .optional()?;
+            let Some(found) = newest else {
+                return Ok(Vec::new());
+            };
+            found
+        };
+
+        let mut stmt = self.conn.prepare(
+            "SELECT git_sha, build_timestamp, package_version, ttft_s, decode_tps, total_s, ts
+             FROM runs
+             WHERE ok=1 AND target_name=?1 AND model_id=?2 AND scenario_id=?3
+             ORDER BY id",
+        )?;
+        let mapped = stmt.query_map(params![host, model, scenario], |r| {
+            Ok(SeriesRaw {
+                git_sha: r.get(0)?,
+                build_timestamp: r.get(1)?,
+                package_version: r.get(2)?,
+                ttft_s: r.get(3)?,
+                decode_tps: r.get(4)?,
+                total_s: r.get(5)?,
+                ts: r.get(6)?,
+            })
+        })?;
+        let mut raws: Vec<SeriesRaw> = Vec::new();
+        for raw in mapped {
+            raws.push(raw?);
+        }
+        Ok(aggregate_series(raws))
+    }
+
+    /// Raw run rows, newest first, filtered by the fields set in `f` (all optional, ANDed).
+    pub fn runs(&self, f: &RunFilter) -> Result<Vec<RunRow>> {
+        let mut sql = String::from(
+            "SELECT id, ts, target_name, hostname, git_sha, build_timestamp, package_version,
+                    model_id, harness, scenario_id, prompt_size_approx, prompt_tokens_actual,
+                    max_tokens, ttft_s, decode_tps, total_s, completion_tokens, ok, error,
+                    gpus_json
+             FROM runs",
+        );
+        let mut conds: Vec<String> = Vec::new();
+        let mut args: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
+        let bind = |col: &str,
+                    val: Option<&str>,
+                    conds: &mut Vec<String>,
+                    args: &mut Vec<Box<dyn rusqlite::ToSql>>| {
+            if let Some(v) = val {
+                args.push(Box::new(v.to_string()));
+                conds.push(format!("{col}=?{}", args.len())); // index of the value just pushed
+            }
+        };
+        bind("target_name", f.host.as_deref(), &mut conds, &mut args);
+        bind("model_id", f.model.as_deref(), &mut conds, &mut args);
+        bind("scenario_id", f.scenario.as_deref(), &mut conds, &mut args);
+        bind("git_sha", f.sha.as_deref(), &mut conds, &mut args);
+        if let Some(ok) = f.ok {
+            args.push(Box::new(ok as i64)); // bool bound as 0/1
+            conds.push(format!("ok=?{}", args.len()));
+        }
+        if !conds.is_empty() {
+            sql.push_str(" WHERE ");
+            sql.push_str(&conds.join(" AND "));
+        }
+        sql.push_str(" ORDER BY id DESC");
+        let limit = f.limit.unwrap_or(500).min(5000); // default 500 rows, never above 5000
+        args.push(Box::new(limit as i64));
+        sql.push_str(&format!(" LIMIT ?{}", args.len()));
+
+        let mut stmt = self.conn.prepare(&sql)?;
+        let rows = stmt
+            .query_map(rusqlite::params_from_iter(args.iter()), |r| {
+                let gpus_json: Option<String> = r.get(19)?; // last column of the SELECT
+                Ok(RunRow {
+                    id: r.get(0)?,
+                    ts: r.get(1)?,
+                    host: r.get(2)?,
+                    gpu: gpus_json.as_deref().and_then(gpu_label),
+                    hostname: r.get(3)?,
+                    git_sha: r.get(4)?,
+                    build_timestamp: r.get(5)?,
+                    package_version: r.get(6)?,
+                    model_id: r.get(7)?,
+                    harness: r.get(8)?,
+                    scenario_id: r.get(9)?,
+                    prompt_size_approx: r.get(10)?,
+                    prompt_tokens_actual: r.get(11)?,
+                    max_tokens: r.get(12)?,
+                    ttft_s: r.get(13)?,
+                    decode_tps: r.get(14)?,
+                    total_s: r.get(15)?,
+                    completion_tokens: r.get(16)?,
+                    ok: r.get::<_, i64>(17)? != 0,
+                    error: r.get(18)?,
+                })
+            })?
+            .collect::<rusqlite::Result<_>>()?;
+        Ok(rows)
+    }
+}
+
+// ── Read-API serde types ──────────────────────────────────────────────
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct Dimensions {
+    pub hosts: Vec<String>,
+    pub models: Vec<String>,
+    pub scenarios: Vec<String>,
+    pub builds: Vec<BuildRef>,
+    /// host → GPU label (latest run), so the UI can show the GPU as the
+    /// resource name instead of the internal hostname.
+    pub host_gpus: std::collections::HashMap<String, String>,
+    /// model → GPU label (latest run); model maps to one host today.
+    pub model_gpus: std::collections::HashMap<String, String>,
+}
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct BuildRef {
+    pub git_sha: String,
+    pub build_timestamp: Option<String>,
+    pub package_version: Option<String>,
+}
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct SeriesPoint {
+    pub git_sha: String,
+    pub build_timestamp: Option<String>,
+    pub package_version: Option<String>,
+    pub ttft_s_median: Option<f64>,
+    pub decode_tps_median: Option<f64>,
+    pub total_s_median: Option<f64>,
+    pub samples: usize,
+}
+
+struct SeriesRaw {
+    git_sha: String,
+    build_timestamp: Option<String>,
+    package_version: Option<String>,
+    ttft_s: Option<f64>,
+    decode_tps: Option<f64>,
+    total_s: Option<f64>,
+    ts: String,
+}
+
+/// Group id-ordered rows by build SHA, median each metric per group, and sort the points
+/// by each group's smallest per-row key (`build_timestamp`, else that row's `ts`).
+fn aggregate_series(raws: Vec<SeriesRaw>) -> Vec<SeriesPoint> {
+    use std::collections::BTreeMap;
+    // First-seen SHA order: the initial point order, kept by the stable sort when keys tie.
+    let mut order: Vec<String> = Vec::new();
+    let mut groups: BTreeMap<String, Vec<SeriesRaw>> = BTreeMap::new();
+    for r in raws {
+        if !groups.contains_key(&r.git_sha) {
+            order.push(r.git_sha.clone());
+        }
+        groups.entry(r.git_sha.clone()).or_default().push(r);
+    }
+    let mut points: Vec<(String, SeriesPoint)> = order
+        .into_iter()
+        .map(|sha| {
+            let rows = &groups[&sha];
+            let sort_key = rows
+                .iter()
+                .map(|r| r.build_timestamp.clone().unwrap_or_else(|| r.ts.clone()))
+                .min()
+                .unwrap_or_default();
+            let point = SeriesPoint {
+                git_sha: sha,
+                build_timestamp: rows.iter().find_map(|r| r.build_timestamp.clone()),
+                package_version: rows.iter().find_map(|r| r.package_version.clone()),
+                ttft_s_median: median(rows.iter().filter_map(|r| r.ttft_s)),
+                decode_tps_median: median(rows.iter().filter_map(|r| r.decode_tps)),
+                total_s_median: median(rows.iter().filter_map(|r| r.total_s)),
+                samples: rows.len(),
+            };
+            (sort_key, point)
+        })
+        .collect();
+    points.sort_by(|a, b| a.0.cmp(&b.0)); // keys are compared as plain strings
+    points.into_iter().map(|(_, p)| p).collect()
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct RunFilter {
+    pub host: Option<String>, // exact match on `target_name`
+    pub model: Option<String>, // exact match on `model_id`
+    pub scenario: Option<String>, // exact match on `scenario_id`
+    pub sha: Option<String>, // exact match on `git_sha`
+    pub ok: Option<bool>, // match on the `ok` flag
+    pub limit: Option<u32>, // maximum number of rows returned
+}
+
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct RunRow {
+    pub id: i64,
+    pub ts: String,
+    pub host: String,
+    /// Public-facing resource name (the host's GPU(s)), e.g. "RTX 4090".
+    pub gpu: Option<String>,
+    pub hostname: Option<String>,
+    pub git_sha: String,
+    pub build_timestamp: Option<String>,
+    pub package_version: String,
+    pub model_id: String,
+    pub harness: String,
+    pub scenario_id: String,
+    pub prompt_size_approx: u32,
+    pub prompt_tokens_actual: Option<u64>,
+    pub max_tokens: u64,
+    pub ttft_s: Option<f64>,
+    pub decode_tps: Option<f64>,
+    pub total_s: Option<f64>,
+    pub completion_tokens: Option<u64>,
+    pub ok: bool,
+    pub error: Option<String>,
+}
+
+struct RawRow {
+    target_name: String,
+    model_id: String,
+    scenario_id: String,
+    prompt_size_approx: u32,
+    git_sha: String,
+    ttft_s: Option<f64>,
+    decode_tps: Option<f64>,
+    total_s: Option<f64>,
+    prompt_tokens_actual: Option<u64>,
+    gpus_json: Option<String>,
+}
+
+/// An aggregated cell ready for the report table.
+#[derive(Debug, Clone, PartialEq, serde::Serialize)]
+pub struct ReportRow {
+    pub target_name: String,
+    pub model_id: String,
+    pub scenario_id: String,
+    pub prompt_size_approx: u32,
+    pub git_sha: String,
+    pub prompt_tokens: Option<u64>,
+    pub ttft_s_median: Option<f64>,
+    pub decode_tps_median: Option<f64>,
+    pub total_s_median: Option<f64>,
+    pub samples: usize,
+    /// Public-facing resource name (the host's GPU(s)), e.g. "2× RTX 5090".
+    pub gpu: Option<String>,
+}
+
+/// Group by (target, model, scenario), keep only the latest SHA's rows
+/// (latest = the SHA of the last-inserted row, since input is id-ordered),
+/// and median each metric.
+fn aggregate(raws: Vec<RawRow>) -> Vec<ReportRow> {
+    use std::collections::BTreeMap;
+    // (target, model, scenario) -> every row of that cell, in input order
+    let mut groups: BTreeMap<(String, String, String), Vec<RawRow>> = BTreeMap::new();
+    for r in raws {
+        groups
+            .entry((
+                r.target_name.clone(),
+                r.model_id.clone(),
+                r.scenario_id.clone(),
+            ))
+            .or_default()
+            .push(r);
+    }
+    let mut out = Vec::new();
+    for ((target_name, model_id, scenario_id), rows) in groups {
+        // id-ordered, so the last row carries the latest SHA.
+        let latest_sha = rows.last().map(|r| r.git_sha.clone()).unwrap_or_default();
+        let cell: Vec<&RawRow> = rows.iter().filter(|r| r.git_sha == latest_sha).collect();
+        let prompt_size_approx = cell.first().map(|r| r.prompt_size_approx).unwrap_or(0);
+        out.push(ReportRow {
+            target_name,
+            model_id,
+            scenario_id,
+            prompt_size_approx,
+            git_sha: latest_sha,
+            prompt_tokens: cell.iter().find_map(|r| r.prompt_tokens_actual),
+            ttft_s_median: median(cell.iter().filter_map(|r| r.ttft_s)),
+            decode_tps_median: median(cell.iter().filter_map(|r| r.decode_tps)),
+            total_s_median: median(cell.iter().filter_map(|r| r.total_s)),
+            samples: cell.len(), // rows of the latest SHA only
+            gpu: cell
+                .iter()
+                .find_map(|r| r.gpus_json.as_deref().and_then(gpu_label)),
+        });
+    }
+    out
+}
+
+/// Compact GPU label from a run's stored `gpus_json` (the discovery device
+/// list) — e.g. "2× RTX 5090", "RTX 4090". `None` when the JSON is not an
+/// array or is empty. Public-facing resource name used instead of hostnames.
+fn gpu_label(gpus_json: &str) -> Option<String> {
+    let devices: Vec<serde_json::Value> = serde_json::from_str(gpus_json).ok()?;
+    if devices.is_empty() {
+        return None;
+    }
+    let mut order: Vec<String> = Vec::new();
+    let mut counts: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+    for d in &devices {
+        let name = d.get("name").and_then(|v| v.as_str()).unwrap_or("GPU"); // no string name
+        let short = name
+            .trim_start_matches("NVIDIA GeForce ")
+            .trim_start_matches("NVIDIA ") // NOTE(review): strips repeated prefixes too
+            .to_string();
+        if !counts.contains_key(&short) {
+            order.push(short.clone());
+        }
+        *counts.entry(short).or_insert(0) += 1;
+    }
+    Some(
+        order
+            .iter()
+            .map(|n| {
+                let c = counts[n];
+                if c > 1 {
+                    format!("{c}× {n}")
+                } else {
+                    n.clone()
+                }
+            })
+            .collect::<Vec<_>>()
+            .join(" + "), // distinct names, in first-seen order
+    )
+}
+
+fn median(values: impl Iterator<Item = f64>) -> Option<f64> {
+    // Median of `values` (mean of the two middle values for even counts); `None` if empty.
+    let mut v: Vec<f64> = values.collect();
+    if v.is_empty() {
+        return None;
+    }
+    v.sort_by(f64::total_cmp); // total order even with NaN, as `sort_by` requires
+    // lo == hi for odd lengths; they straddle the centre for even ones (no `% 2` branch).
+    let lo = (v.len() - 1) / 2;
+    let hi = v.len() / 2;
+    Some((v[lo] + v[hi]) / 2.0)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn rec(target: &str, sha: &str, model: &str, scenario: &str, ok: bool) -> RunRecord {
+        RunRecord {
+            ts: "2026-06-13T00:00:00Z".into(),
+            target_name: target.into(),
+            target_kind: "neuron".into(),
+            endpoint: "http://x:13131".into(),
+            hostname: Some("x".into()),
+            driver_version: None,
+            cuda_version: None,
+            gpus_json: None,
+            git_sha: sha.into(),
+            git_sha_long: None,
+            package_version: "0.1.16".into(),
+            git_dirty: false,
+            build_timestamp: None,
+            rustc_version: None,
+            profile: None,
+            features_json: "[]".into(),
+            candle_version: None,
+            bench_version: "0.1.16".into(),
+            bench_sha: "deadbee".into(),
+            model_id: model.into(),
+            harness: "candle".into(),
+            capabilities_json: "[]".into(),
+            devices_json: "[]".into(),
+            scenario_id: scenario.into(),
+            prompt_size_approx: 128,
+            prompt_tokens_actual: Some(130),
+            max_tokens: 256,
+            ttft_s: Some(0.1),
+            decode_tps: Some(50.0),
+            total_s: Some(1.0),
+            completion_tokens: Some(50),
+            ok,
+            error: if ok { None } else { Some("boom".into()) },
+        }
+    }
+
+    #[test]
+    fn counts_only_successful_samples() {
+        let s = Store::open_in_memory().unwrap();
+        s.insert_run(&rec("beast", "abc", "m", "chat:128", true))
+            .unwrap();
+        s.insert_run(&rec("beast", "abc", "m", "chat:128", true))
+            .unwrap();
+        s.insert_run(&rec("beast", "abc", "m", "chat:128", false))
+            .unwrap();
+        assert_eq!(s.count_samples("beast", "abc", "m", "chat:128").unwrap(), 2);
+        // Different SHA is a different cell.
+        assert_eq!(s.count_samples("beast", "xyz", "m", "chat:128").unwrap(), 0);
+    }
+
+    #[test]
+    fn report_uses_latest_sha_per_cell() {
+        let s = Store::open_in_memory().unwrap();
+        // old build
+        s.insert_run(&rec("beast", "old", "m", "chat:128", true))
+            .unwrap();
+        // new build, two samples
+        let mut r = rec("beast", "new", "m", "chat:128", true);
+        r.ttft_s = Some(0.2);
+        s.insert_run(&r).unwrap();
+        r.ttft_s = Some(0.4);
+        s.insert_run(&r).unwrap();
+        let rows = s.report_rows().unwrap();
+        assert_eq!(rows.len(), 1);
+        assert_eq!(rows[0].git_sha, "new");
+        assert_eq!(rows[0].samples, 2);
+        assert!((rows[0].ttft_s_median.unwrap() - 0.3).abs() < 1e-9);
+    }
+
+    #[test]
+    fn gpu_label_formats() {
+        let two = r#"[{"name":"NVIDIA GeForce RTX 5090"},{"name":"NVIDIA GeForce RTX 5090"}]"#;
+        assert_eq!(gpu_label(two).as_deref(), Some("2× RTX 5090"));
+        let one = r#"[{"name":"NVIDIA GeForce RTX 4090"}]"#;
+        assert_eq!(gpu_label(one).as_deref(), Some("RTX 4090"));
+        let dc = r#"[{"name":"NVIDIA H100"}]"#;
+        assert_eq!(gpu_label(dc).as_deref(), Some("H100"));
+        assert_eq!(gpu_label("[]"), None);
+    }
+}
--- a/crates/helexa-bench/src/sweep.rs
+++ b/crates/helexa-bench/src/sweep.rs
@@ -0,0 +1,250 @@
+//! The version-aware sweep loop.
+//!
+//! Each sweep visits every configured target, polls its build identity
+//! and warm models, and tops up benchmark samples per
+//! (target, build SHA, model, scenario) to `samples_per_version`. Cells
+//! already at target are skipped — so once every neuron's current build
+//! is fully sampled, sweeps cost only the cheap metadata polls until a
+//! new SHA ships. Runs are recorded to SQLite with full provenance.
+
+use crate::client::TargetClient;
+use crate::config::{BenchConfig, TargetConfig, TargetKind};
+use crate::scenario::{RunCtx, build_scenarios};
+use crate::store::{RunRecord, Store};
+use anyhow::Result;
+use cortex_core::build_info::BuildInfo;
+use cortex_core::discovery::DiscoveryResponse;
+use cortex_core::harness::ModelInfo;
+
+/// helexa-bench's own build version.
+fn bench_version() -> String {
+    env!("CARGO_PKG_VERSION").to_string()
+}
+
+/// Build SHA of helexa-bench itself, taken from `HELEXA_BUILD_SHA` when the
+/// crate is compiled; an unset or empty value falls back to `"unknown"`.
+fn bench_sha() -> String {
+    match option_env!("HELEXA_BUILD_SHA") {
+        Some(sha) if !sha.is_empty() => sha.to_string(),
+        _ => "unknown".to_string(),
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct SweepSummary {
+    pub measured: usize, // successful runs recorded
+    pub skipped: usize, // cells that needed no further samples
+    pub failed: usize, // failed iterations recorded
+    pub targets_unreachable: usize, // targets whose sweep returned an error
+}
+
+pub struct Sweeper {
+    cfg: BenchConfig,
+    client: TargetClient,
+    store: Store,
+}
+
+impl Sweeper {
+    pub fn new(cfg: BenchConfig) -> Result<Self> {
+        let client = TargetClient::new(cfg.bench.request_timeout())?;
+        let store = Store::open(&cfg.bench.db_path)?;
+        Ok(Sweeper { cfg, client, store })
+    }
+
+    /// Run sweeps forever, pausing `sweep_interval` between them.
+    pub async fn run_forever(&self) -> ! {
+        loop {
+            match self.run_once().await {
+                Ok(s) => tracing::info!(
+                    measured = s.measured,
+                    skipped = s.skipped,
+                    failed = s.failed,
+                    unreachable = s.targets_unreachable,
+                    "sweep complete"
+                ),
+                Err(e) => tracing::error!(error = %format!("{e:#}"), "sweep errored"),
+            }
+            tracing::debug!(
+                secs = self.cfg.bench.sweep_interval_secs,
+                "sleeping until next sweep"
+            );
+            tokio::time::sleep(self.cfg.bench.sweep_interval()).await;
+        }
+    }
+
+    /// One full pass over all targets; a target whose sweep errors is counted and skipped.
+    pub async fn run_once(&self) -> Result<SweepSummary> {
+        let mut summary = SweepSummary::default();
+        for target in &self.cfg.targets {
+            if let Err(e) = self.sweep_target(target, &mut summary).await {
+                summary.targets_unreachable += 1; // any error kind lands here
+                tracing::warn!(target = %target.name, error = %format!("{e:#}"), "target skipped");
+            }
+        }
+        Ok(summary)
+    }
+
+    async fn sweep_target(&self, target: &TargetConfig, summary: &mut SweepSummary) -> Result<()> {
+        let build = self.client.fetch_version(target).await?;
+        let discovery = self.client.fetch_discovery(target).await.unwrap_or(None); // best-effort
+        let models = self.client.warm_models(target).await?;
+
+        tracing::info!(
+            target = %target.name,
+            sha = %build.git_sha,
+            warm_models = models.len(),
+            "sweeping target"
+        );
+
+        let scenarios = build_scenarios(&self.cfg.scenarios);
+        for model in &models {
+            for scenario in scenarios.iter().filter(|s| s.applies_to(model)) {
+                let have = self.store.count_samples(
+                    &target.name,
+                    &build.git_sha,
+                    &model.id,
+                    scenario.id(),
+                )?;
+                let need = self.cfg.bench.samples_per_version.saturating_sub(have); // shortfall
+                if need == 0 {
+                    summary.skipped += 1;
+                    tracing::debug!(
+                        target = %target.name, model = %model.id, scenario = scenario.id(),
+                        sha = %build.git_sha, "cell already satisfied, skipping"
+                    );
+                    continue;
+                }
+
+                let ctx = RunCtx {
+                    client: self.client.http(),
+                    chat_url: self.client.chat_url(target),
+                    model_id: model.id.clone(),
+                    max_tokens: self.cfg.scenarios.max_tokens,
+                    timeout: self.cfg.bench.request_timeout(),
+                };
+
+                // One unmeasured warmup when the cell is empty (matches
+                // bench.py — first run after a load hits cold caches).
+                if have == 0 {
+                    tracing::debug!(model = %model.id, scenario = scenario.id(), "warmup run");
+                    let _ = scenario.run(&ctx).await; // outcome discarded, even an error
+                }
+
+                for i in 0..need {
+                    match scenario.run(&ctx).await {
+                        Ok(m) => {
+                            let rec = self.build_record(
+                                target,
+                                &build,
+                                discovery.as_ref(),
+                                model,
+                                scenario.id(),
+                                scenario.prompt_size(),
+                                Ok(&m),
+                            );
+                            self.store.insert_run(&rec)?;
+                            summary.measured += 1;
+                            tracing::info!(
+                                target = %target.name, model = %model.id, scenario = scenario.id(),
+                                ttft_s = m.ttft_s, decode_tps = ?m.decode_tps, total_s = m.total_s,
+                                "{}/{} recorded", have + i + 1, self.cfg.bench.samples_per_version
+                            );
+                        }
+                        Err(e) => {
+                            let msg = format!("{e:#}");
+                            let rec = self.build_record(
+                                target,
+                                &build,
+                                discovery.as_ref(),
+                                model,
+                                scenario.id(),
+                                scenario.prompt_size(),
+                                Err(&msg),
+                            );
+                            self.store.insert_run(&rec)?; // failures are recorded too
+                            summary.failed += 1;
+                            tracing::warn!(
+                                target = %target.name, model = %model.id, scenario = scenario.id(),
+                                error = %msg, "iteration failed"
+                            );
+                        }
+                    }
+                    tokio::time::sleep(self.cfg.bench.iteration_pause()).await; // incl. last
+                }
+            }
+        }
+        Ok(())
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    fn build_record(
+        &self,
+        target: &TargetConfig,
+        build: &BuildInfo,
+        discovery: Option<&DiscoveryResponse>,
+        model: &ModelInfo,
+        scenario_id: &str,
+        prompt_size: u32,
+        result: Result<&crate::scenario::ScenarioMetrics, &str>,
+    ) -> RunRecord {
+        let (ok, error, ttft, decode, total, prompt_tokens, completion) = match result {
+            Ok(m) => (
+                true,
+                None,
+                Some(m.ttft_s),
+                m.decode_tps,
+                Some(m.total_s),
+                m.prompt_tokens,
+                Some(m.completion_tokens),
+            ),
+            Err(e) => (false, Some(e.to_string()), None, None, None, None, None),
+        };
+
+        RunRecord {
+            ts: chrono::Utc::now().to_rfc3339(),
+            target_name: target.name.clone(),
+            target_kind: kind_str(target.kind).to_string(),
+            endpoint: target.endpoint.clone(),
+            hostname: discovery.map(|d| d.hostname.clone()),
+            driver_version: discovery.and_then(|d| d.driver_version.clone()),
+            cuda_version: discovery.and_then(|d| d.cuda_version.clone()),
+            gpus_json: discovery
+                .map(|d| serde_json::to_string(&d.devices).unwrap_or_else(|_| "[]".to_string())),
+            git_sha: build.git_sha.clone(),
+            git_sha_long: build.git_sha_long.clone(),
+            package_version: build.package_version.clone(),
+            git_dirty: build.git_dirty,
+            build_timestamp: build.build_timestamp.clone(),
+            rustc_version: build.rustc_version.clone(),
+            profile: build.profile.clone(),
+            features_json: serde_json::to_string(&build.features)
+                .unwrap_or_else(|_| "[]".to_string()),
+            candle_version: build.candle_version.clone(),
+            bench_version: bench_version(),
+            bench_sha: bench_sha(),
+            model_id: model.id.clone(),
+            harness: model.harness.clone(),
+            capabilities_json: serde_json::to_string(&model.capabilities)
+                .unwrap_or_else(|_| "[]".to_string()),
+            devices_json: serde_json::to_string(&model.devices)
+                .unwrap_or_else(|_| "[]".to_string()),
+            scenario_id: scenario_id.to_string(),
+            prompt_size_approx: prompt_size,
+            prompt_tokens_actual: prompt_tokens,
+            max_tokens: self.cfg.scenarios.max_tokens,
+            ttft_s: ttft,
+            decode_tps: decode,
+            total_s: total,
+            completion_tokens: completion,
+            ok,
+            error,
+        }
+    }
+}
+
+fn kind_str(kind: TargetKind) -> &'static str {
+    match kind {
+        TargetKind::Neuron => "neuron", // label stored in `RunRecord::target_kind`
+        TargetKind::Openai => "openai",
+    }
+}
--- a/crates/helexa-bench/tests/api.rs
+++ b/crates/helexa-bench/tests/api.rs
@@ -0,0 +1,219 @@
+//! Read-API tests: seed a temp store, serve the router, assert JSON.
+
+use helexa_bench::api;
+use helexa_bench::store::{RunRecord, Store};
+use serde_json::Value;
+
+#[allow(clippy::too_many_arguments)]
+fn rec(
+    host: &str,
+    sha: &str,
+    build_ts: Option<&str>,
+    model: &str,
+    scenario: &str,
+    ttft: f64,
+    ok: bool,
+) -> RunRecord {
+    RunRecord {
+        ts: "2026-06-13T00:00:00Z".into(),
+        target_name: host.into(),
+        target_kind: "neuron".into(),
+        endpoint: format!("http://{host}:13131"),
+        hostname: Some(host.into()),
+        driver_version: Some("580.159".into()),
+        cuda_version: Some("13.0".into()),
+        gpus_json: Some("[]".into()),
+        git_sha: sha.into(),
+        git_sha_long: None,
+        package_version: "0.1.16".into(),
+        git_dirty: false,
+        build_timestamp: build_ts.map(|s| s.to_string()),
+        rustc_version: None,
+        profile: Some("release".into()),
+        features_json: "[\"cuda\"]".into(),
+        candle_version: Some("0.10.2".into()),
+        bench_version: "0.1.16".into(),
+        bench_sha: "deadbee".into(),
+        model_id: model.into(),
+        harness: "candle".into(),
+        capabilities_json: "[\"text\"]".into(),
+        devices_json: "[0]".into(),
+        scenario_id: scenario.into(),
+        prompt_size_approx: 128,
+        prompt_tokens_actual: Some(130),
+        max_tokens: 64,
+        ttft_s: if ok { Some(ttft) } else { None },
+        decode_tps: if ok { Some(30.0) } else { None },
+        total_s: if ok { Some(2.0) } else { None },
+        completion_tokens: if ok { Some(60) } else { None },
+        ok,
+        error: if ok { None } else { Some("boom".into()) },
+    }
+}
+
+/// Seed a temp db, return its path.
+fn seed(tag: &str) -> String {
+    let path = std::env::temp_dir().join(format!("hb-api-{}-{tag}.sqlite", std::process::id()));
+    let _ = std::fs::remove_file(&path);
+    let p = path.to_string_lossy().to_string();
+    let store = Store::open(&p).unwrap();
+    // beast / m / chat:128 across two builds (old then new).
+    store
+        .insert_run(&rec(
+            "beast",
+            "old",
+            Some("2026-06-01T00:00:00Z"),
+            "m",
+            "chat:128",
+            0.20,
+            true,
+        ))
+        .unwrap();
+    store
+        .insert_run(&rec(
+            "beast",
+            "new",
+            Some("2026-06-10T00:00:00Z"),
+            "m",
+            "chat:128",
+            0.10,
+            true,
+        ))
+        .unwrap();
+    store
+        .insert_run(&rec(
+            "beast",
+            "new",
+            Some("2026-06-10T00:00:00Z"),
+            "m",
+            "chat:128",
+            0.12,
+            true,
+        ))
+        .unwrap();
+    // a failed row (must not count in series/summary medians)
+    store
+        .insert_run(&rec(
+            "beast",
+            "new",
+            Some("2026-06-10T00:00:00Z"),
+            "m",
+            "chat:128",
+            0.0,
+            false,
+        ))
+        .unwrap();
+    // a different host for the runs filter
+    store
+        .insert_run(&rec(
+            "benjy",
+            "new",
+            Some("2026-06-10T00:00:00Z"),
+            "n",
+            "chat:128",
+            0.15,
+            true,
+        ))
+        .unwrap();
+    p
+}
+
+async fn spawn(db: &str) -> String {
+    let state = api::open_state(db).unwrap();
+    let app = api::api_routes(state);
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap(); // actual address; bind asked for port 0
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap(); // serves in a background task
+    });
+    format!("http://{addr}") // base URL the tests prepend to each path
+}
+
+async fn get(base: &str, path: &str) -> Value {
+    reqwest::get(format!("{base}{path}"))
+        .await
+        .unwrap()
+        .json()
+        .await
+        .unwrap()
+}
+
+#[tokio::test]
+async fn health_reports_run_count() {
+    let base = spawn(&seed("health")).await;
+    let v = get(&base, "/api/health").await;
+    assert_eq!(v["status"], "ok");
+    assert_eq!(v["run_count"], 5);
+}
+
+#[tokio::test]
+async fn dimensions_lists_distinct_values_and_builds_chronologically() {
+    let base = spawn(&seed("dims")).await;
+    let v = get(&base, "/api/dimensions").await;
+    let hosts: Vec<&str> = v["hosts"]
+        .as_array()
+        .unwrap()
+        .iter()
+        .map(|x| x.as_str().unwrap())
+        .collect();
+    assert_eq!(hosts, vec!["beast", "benjy"]);
+    assert_eq!(v["models"].as_array().unwrap().len(), 2);
+    // builds ordered by earliest build_timestamp: old before new
+    let builds = v["builds"].as_array().unwrap();
+    assert_eq!(builds[0]["git_sha"], "old");
+    assert_eq!(builds[1]["git_sha"], "new");
+}
+
+#[tokio::test]
+async fn summary_uses_latest_sha_and_ignores_failures() {
+    let base = spawn(&seed("summary")).await;
+    let v = get(&base, "/api/summary").await;
+    let rows = v.as_array().unwrap();
+    let beast = rows
+        .iter()
+        .find(|r| r["target_name"] == "beast" && r["scenario_id"] == "chat:128")
+        .unwrap();
+    assert_eq!(beast["git_sha"], "new");
+    assert_eq!(beast["samples"], 2); // two ok rows on "new"; failure excluded
+    // median of 0.10 and 0.12
+    assert!((beast["ttft_s_median"].as_f64().unwrap() - 0.11).abs() < 1e-9);
+}
+
+#[tokio::test]
+async fn series_is_chronological_per_build() {
+    let base = spawn(&seed("series")).await;
+    let v = get(&base, "/api/series?host=beast&model=m&scenario=chat:128").await;
+    let pts = v.as_array().unwrap();
+    assert_eq!(pts.len(), 2);
+    assert_eq!(pts[0]["git_sha"], "old");
+    assert_eq!(pts[1]["git_sha"], "new");
+    assert_eq!(pts[0]["samples"], 1);
+    assert_eq!(pts[1]["samples"], 2);
+}
+
+#[tokio::test]
+async fn series_resolves_host_when_omitted() {
+    // The public UI selects by model alone; the store resolves the host.
+    let base = spawn(&seed("series-nohost")).await;
+    let v = get(&base, "/api/series?model=m&scenario=chat:128").await;
+    let pts = v.as_array().unwrap();
+    assert_eq!(pts.len(), 2);
+    assert_eq!(pts[0]["git_sha"], "old");
+    assert_eq!(pts[1]["git_sha"], "new");
+}
+
+#[tokio::test]
+async fn runs_filters_by_host() {
+    let base = spawn(&seed("runs")).await;
+    let all = get(&base, "/api/runs").await;
+    assert_eq!(all.as_array().unwrap().len(), 5);
+    let beast = get(&base, "/api/runs?host=beast").await;
+    let rows = beast.as_array().unwrap();
+    assert_eq!(rows.len(), 4);
+    assert!(rows.iter().all(|r| r["host"] == "beast"));
+    // failed row carries its error + ok=false
+    assert!(
+        rows.iter()
+            .any(|r| r["ok"] == false && r["error"] == "boom")
+    );
+}
--- a/crates/helexa-bench/tests/sweep_integration.rs
+++ b/crates/helexa-bench/tests/sweep_integration.rs
@@ -0,0 +1,133 @@
+//! End-to-end sweep against a mock neuron: a sweep records samples, a
+//! second sweep skips the satisfied cell, and bumping the reported build
+//! SHA resumes fresh sampling.
+
+use axum::Router;
+use axum::extract::State;
+use axum::http::header;
+use axum::response::{IntoResponse, Json};
+use axum::routing::{get, post};
+use helexa_bench::config::{BenchConfig, BenchSettings, ScenarioConfig, TargetConfig, TargetKind};
+use helexa_bench::sweep::Sweeper;
+use serde_json::json;
+use std::sync::{Arc, Mutex};
+
+/// State shared with the mock neuron's handlers.
+#[derive(Clone)]
+struct MockState {
+    // Build SHA served by `/version`; behind a mutex so a test can replace
+    // it while the server is running.
+    sha: Arc<Mutex<String>>,
+}
+
+/// Mock `GET /version`: a fixed build identity, except that `git_sha` is read
+/// from the shared handle on every request so a test can change it.
+async fn version(State(mock): State<MockState>) -> Json<serde_json::Value> {
+    let git_sha = mock.sha.lock().unwrap().clone();
+    let body = json!({
+        "package_version": "0.1.16",
+        "git_sha": git_sha,
+        "git_dirty": false,
+        "features": ["cuda", "cudnn"],
+        "candle_version": "0.10.2",
+    });
+    Json(body)
+}
+
+/// Mock `GET /discovery`: a canned single-GPU host description.
+async fn discovery() -> Json<serde_json::Value> {
+    let gpu = json!({"index": 0, "name": "RTX 5090", "vram_total_mb": 32614, "compute_capability": "12.0"});
+    let body = json!({
+        "hostname": "mock-beast",
+        "os": "Linux",
+        "kernel": "6.19.0",
+        "cuda_version": "13.0",
+        "driver_version": "580.159",
+        "devices": [gpu],
+        "harnesses": ["candle"],
+    });
+    Json(body)
+}
+
+/// Mock `GET /models`: one loaded model plus one that is still recovering.
+async fn models() -> Json<serde_json::Value> {
+    let warm = json!({"id": "Qwen/Qwen3.6-27B", "harness": "candle", "status": "loaded", "devices": [0], "capabilities": ["text"]});
+    // A non-warm model the bench must ignore.
+    let cold = json!({"id": "Qwen/cold", "harness": "candle", "status": "recovering", "devices": [0]});
+    Json(json!([warm, cold]))
+}
+
+/// Mock `POST /v1/chat/completions`: replies with a fixed SSE transcript —
+/// two content deltas, a final chunk carrying `usage`, then `[DONE]`.
+async fn chat() -> impl IntoResponse {
+    const SSE_TRANSCRIPT: &str = concat!(
+        "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"Hello\"},\"finish_reason\":null}]}\n\n",
+        "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\" world\"},\"finish_reason\":null}]}\n\n",
+        "data: {\"choices\":[{\"index\":0,\"delta\":{},\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":130,\"completion_tokens\":2,\"total_tokens\":132}}\n\n",
+        "data: [DONE]\n\n",
+    );
+    let headers = [(header::CONTENT_TYPE, "text/event-stream")];
+    (headers, SSE_TRANSCRIPT)
+}
+
+/// Start the mock neuron on an ephemeral loopback port.
+///
+/// Returns the server's base URL together with the shared SHA handle that
+/// `/version` reads, so the caller can change the reported build mid-test.
+async fn spawn_mock(sha: &str) -> (String, Arc<Mutex<String>>) {
+    let sha_handle = Arc::new(Mutex::new(String::from(sha)));
+
+    // Port 0 lets the OS pick a free port; read it back for the URL.
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let base_url = format!("http://{}", listener.local_addr().unwrap());
+
+    let router = Router::new()
+        .route("/version", get(version))
+        .route("/discovery", get(discovery))
+        .route("/models", get(models))
+        .route("/v1/chat/completions", post(chat))
+        .with_state(MockState {
+            sha: Arc::clone(&sha_handle),
+        });
+    tokio::spawn(async move {
+        axum::serve(listener, router).await.unwrap();
+    });
+
+    (base_url, sha_handle)
+}
+
+/// Build a bench configuration that targets a single neuron at `endpoint`
+/// and stores results in the database at `db_path`.
+fn config_for(endpoint: String, db_path: String) -> BenchConfig {
+    let bench = BenchSettings {
+        sweep_interval_secs: 1,
+        samples_per_version: 2,
+        iteration_pause_secs: 0,
+        request_timeout_secs: 30,
+        db_path,
+    };
+    // A single prompt size means a single scenario, which keeps the
+    // assertions in the test simple.
+    let scenarios = ScenarioConfig {
+        prompt_sizes: vec![128],
+        max_tokens: 16,
+    };
+    let mock_target = TargetConfig {
+        name: "mock".into(),
+        kind: TargetKind::Neuron,
+        endpoint,
+        label: None,
+    };
+    BenchConfig {
+        bench,
+        scenarios,
+        api: Default::default(),
+        targets: vec![mock_target],
+    }
+}
+
+/// Three consecutive sweeps against the mock neuron: the first records
+/// samples, the second finds the cell satisfied and skips it, and the third
+/// samples again once the mock reports a different build SHA.
+#[tokio::test]
+async fn sweep_records_skips_and_resumes_on_new_sha() {
+    let (endpoint, sha_handle) = spawn_mock("aaaaaaa").await;
+
+    // Name the database after the bound port so each run gets its own file,
+    // and clear any leftover from an earlier run.
+    let port = endpoint.rsplit(':').next().unwrap();
+    let db_path = std::env::temp_dir().join(format!("helexa-bench-it-{port}.sqlite"));
+    let _ = std::fs::remove_file(&db_path);
+    let db_str = db_path.to_string_lossy().into_owned();
+
+    let sweeper = Sweeper::new(config_for(endpoint, db_str)).unwrap();
+
+    // Sweep 1: one warm model × one scenario × 2 samples.
+    let first = sweeper.run_once().await.unwrap();
+    assert_eq!(
+        first.measured, 2,
+        "should record samples_per_version samples"
+    );
+    assert_eq!(first.skipped, 0);
+    assert_eq!(first.failed, 0);
+
+    // Sweep 2, same SHA: the cell is already satisfied, so nothing is measured.
+    let second = sweeper.run_once().await.unwrap();
+    assert_eq!(second.measured, 0, "satisfied cell must be skipped");
+    assert_eq!(second.skipped, 1);
+
+    // Sweep 3: a new reported SHA is a new cell, so sampling starts over.
+    *sha_handle.lock().unwrap() = "bbbbbbb".to_string();
+    let third = sweeper.run_once().await.unwrap();
+    assert_eq!(third.measured, 2, "new SHA must resume sampling");
+    assert_eq!(third.skipped, 0);
+
+    let _ = std::fs::remove_file(&db_path);
+}
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -60,6 +60,11 @@ tokio-stream.workspace = true
 figment.workspace = true
 toml.workspace = true

+# Parallel in-situ quantization (#1): fans candle's per-block k-quant
+# math across the CPU pool at model-load time. Already in the tree
+# transitively via candle-core.
+rayon = "1"
+
 # candle for in-process inference. CUDA support is gated behind the
 # crate's `cuda` feature (default off) so the workspace builds on
 # non-CUDA hosts and CI runners.
@@ -76,20 +81,31 @@ cudarc = { version = "0.19", optional = true, default-features = false, features
 half = { version = "2.5", optional = true }
 tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
 hf-hub = { version = "0.4", features = ["tokio"] }
-# Jinja-compatible template renderer for the model's
-# `tokenizer_config.json::chat_template`. Hugging Face's chat
-# templates use a strict subset of Jinja2 that minijinja supports
-# out of the box. ~80KB compiled; pure Rust, no async surface.
-# Features: `builtins` for the `is defined` / `default` filters HF
-# templates use; `json` for `tojson` (some Qwen3 templates emit
-# tool definitions via tojson); `serde` so we can hand it a
-# serde_json::Value as the context.
+# Jinja-compatible template renderer for the model's chat template
+# (standalone `chat_template.jinja` or `tokenizer_config.json::chat_template`).
+# Hugging Face's chat templates lean on Python string semantics; we
+# bridge them with `minijinja-contrib`'s `pycompat` callback (str
+# methods like `startswith`/`split`/`strip`) plus a `raise_exception`
+# global. Features: `builtins` for `is defined` / `default`; `json`
+# for `tojson`; `serde` so we can hand it a serde_json::Value context.
 minijinja = { version = "2", features = ["builtins", "json", "serde"] }
+# Python-compatibility shim: the Qwen3-VL / Qwen3.6 template uses
+# `content.startswith(...)`, `.endswith(...)`, `.split(...)`,
+# `.rstrip(...)`, `.lstrip(...)` — Python str methods minijinja doesn't
+# implement natively. `pycompat::unknown_method_callback` supplies them.
+minijinja-contrib = { version = "2", features = ["pycompat"] }
 # Direct dep on `safetensors` (re-exported by candle but its `TensorView`
 # / `slice::IndexOp` types are public-but-not-re-exported). Used by the
 # tp `fused_load` module to read per-rank slices of fused QKV tensors
 # without materialising the full tensor on device.
 safetensors = "0.7"
+# Vision capability for Qwen3.6 (Stage A of the vision plan in
+# doc/vision-qwen3_6-spec.md). `image` decodes PNG/JPEG/etc from
+# the bytes embedded in `data:image/...;base64,...` content parts;
+# `base64` does the URI decode. Default-features off on `image` to
+# avoid pulling in audio/video formats we don't need.
+image = { version = "0.25", default-features = false, features = ["png", "jpeg", "webp", "bmp", "gif"] }
+base64 = "0.22"

 [dev-dependencies]
 tokio = { workspace = true, features = ["test-util"] }
--- a/crates/neuron/build.rs
+++ b/crates/neuron/build.rs
@@ -1,10 +1,16 @@
-//! Build script: compile the CUDA kernels in `src/cuda/*.cu` into a
-//! static library and link it under the `cuda` feature.
+//! Build script: capture build/version metadata for `GET /version`,
+//! and (under the `cuda` feature) compile the CUDA kernels in
+//! `src/cuda/*.cu` into a static library and link it.
 //!
-//! Patterned on `EricLBuehler/mistral.rs::mistralrs-core/build.rs` —
-//! same `cudaforge::KernelBuilder` invocation, same NVCC flag set.
+//! The CUDA portion is patterned on
+//! `EricLBuehler/mistral.rs::mistralrs-core/build.rs` — same
+//! `cudaforge::KernelBuilder` invocation, same NVCC flag set.
+
+use std::process::Command;

 fn main() {
+    emit_build_metadata();
+
    #[cfg(feature = "cuda")]
    {
        use std::path::PathBuf;
@@ -64,3 +70,127 @@ fn main() {
        }
    }
 }
+
+/// Emit `cargo:rustc-env=` vars consumed by `env!()` in `src/version.rs`
+/// so the daemon can report its own build identity from `GET /version`.
+///
+/// We re-run only when HEAD moves or the SHA override changes — not on
+/// every compile — so the captured timestamp is stable for a given
+/// build input rather than churning on each `cargo build`.
+///
+/// Cargo resolves a relative `rerun-if-changed` path against this
+/// package's directory and treats a path that does not exist as always
+/// stale. This crate lives below the repository root, so the git files
+/// are located by asking git itself, and a path is only announced when it
+/// exists. Tarball builds without `.git` therefore watch no git files at
+/// all instead of re-running on every compile.
+fn emit_build_metadata() {
+    println!("cargo:rerun-if-env-changed=HELEXA_BUILD_SHA");
+
+    // Files whose change means "HEAD moved":
+    //   HEAD        — rewritten on checkout / detach.
+    //   logs/HEAD   — the reflog, appended on every HEAD update (when
+    //                 reflogs are enabled, git's default for work trees).
+    //   packed-refs — where refs live after `git gc` / `git pack-refs`.
+    //   branch ref  — on a branch, HEAD is symbolic and a commit rewrites
+    //                 the branch's own ref file rather than HEAD.
+    let mut git_files = vec![
+        "HEAD".to_string(),
+        "logs/HEAD".to_string(),
+        "packed-refs".to_string(),
+    ];
+    if let Some(branch_ref) = git(&["symbolic-ref", "--quiet", "HEAD"]) {
+        git_files.push(branch_ref);
+    }
+    for name in &git_files {
+        // `--git-path` resolves each name to its real on-disk location,
+        // including for linked worktrees.
+        if let Some(path) = git(&["rev-parse", "--git-path", name.as_str()])
+            && std::path::Path::new(&path).exists()
+        {
+            println!("cargo:rerun-if-changed={path}");
+        }
+    }
+
+    // SHA: prefer the CI/RPM-injected override (tarball builds have no
+    // .git), then fall back to git, then to "unknown".
+    let (sha_short, sha_long, dirty) = match std::env::var("HELEXA_BUILD_SHA") {
+        Ok(s) if !s.trim().is_empty() => {
+            let s = s.trim().to_string();
+            let short = s.chars().take(7).collect::<String>();
+            (short, Some(s), false)
+        }
+        _ => {
+            let long = git(&["rev-parse", "HEAD"]);
+            let short = git(&["rev-parse", "--short", "HEAD"]);
+            let dirty = git(&["status", "--porcelain"])
+                .map(|s| !s.trim().is_empty())
+                .unwrap_or(false);
+            match short {
+                Some(short) => (short, long, dirty),
+                None => ("unknown".to_string(), None, false),
+            }
+        }
+    };
+    println!("cargo:rustc-env=HELEXA_GIT_SHA={sha_short}");
+    println!(
+        "cargo:rustc-env=HELEXA_GIT_SHA_LONG={}",
+        sha_long.unwrap_or_default()
+    );
+    println!("cargo:rustc-env=HELEXA_GIT_DIRTY={dirty}");
+
+    // RFC3339 build timestamp. `date` is universally present on the
+    // Linux hosts neuron targets; empty if it ever isn't.
+    let ts = Command::new("date")
+        .args(["-u", "+%Y-%m-%dT%H:%M:%SZ"])
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
+        .unwrap_or_default();
+    println!("cargo:rustc-env=HELEXA_BUILD_TIMESTAMP={ts}");
+
+    // Compiler version: cargo sets $RUSTC to the rustc it invokes.
+    let rustc = std::env::var("RUSTC").unwrap_or_else(|_| "rustc".to_string());
+    let rustc_version = Command::new(rustc)
+        .arg("--version")
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
+        .unwrap_or_default();
+    println!("cargo:rustc-env=HELEXA_RUSTC_VERSION={rustc_version}");
+
+    println!(
+        "cargo:rustc-env=HELEXA_BUILD_PROFILE={}",
+        std::env::var("PROFILE").unwrap_or_default()
+    );
+    println!(
+        "cargo:rustc-env=HELEXA_TARGET={}",
+        std::env::var("TARGET").unwrap_or_default()
+    );
+
+    // Enabled features: cargo exports CARGO_FEATURE_<NAME> for each.
+    // Reverse the mangling (uppercase, '-'→'_') best-effort for display.
+    let mut features: Vec<String> = std::env::vars()
+        .filter_map(|(k, _)| k.strip_prefix("CARGO_FEATURE_").map(|f| f.to_string()))
+        .map(|f| f.to_lowercase().replace('_', "-"))
+        // `default` is the meta-feature, not a perf-relevant flag.
+        .filter(|f| f != "default")
+        .collect();
+    features.sort();
+    println!("cargo:rustc-env=HELEXA_FEATURES={}", features.join(","));
+
+    println!(
+        "cargo:rustc-env=HELEXA_CANDLE_VERSION={}",
+        candle_version().unwrap_or_default()
+    );
+}
+
+/// Run `git` with `args` and return its trimmed standard output.
+///
+/// Yields `None` when git cannot be started, exits unsuccessfully, or
+/// prints nothing but whitespace.
+fn git(args: &[&str]) -> Option<String> {
+    let output = Command::new("git").args(args).output().ok()?;
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let trimmed = stdout.trim();
+    (output.status.success() && !trimmed.is_empty()).then(|| trimmed.to_string())
+}
+
+/// Best-effort lookup of the locked `candle-core` version in the workspace
+/// `Cargo.lock`, which sits two directories above this crate's manifest.
+///
+/// Returns `None` when `CARGO_MANIFEST_DIR` is unset, the lockfile cannot
+/// be read (e.g. some packaging flows), or no `candle-core` entry with a
+/// version line is found.
+fn candle_version() -> Option<String> {
+    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").ok()?;
+    let lock_path = std::path::Path::new(&manifest_dir)
+        .join("..")
+        .join("..")
+        .join("Cargo.lock");
+    println!("cargo:rerun-if-changed={}", lock_path.display());
+    let contents = std::fs::read_to_string(lock_path).ok()?;
+
+    // Cargo.lock entries are `[[package]]\nname = "x"\nversion = "y"`.
+    let mut lines = contents.lines().map(str::trim);
+    while let Some(line) = lines.next() {
+        if line != "name = \"candle-core\"" {
+            continue;
+        }
+        // Inside the candle-core entry: take the first version line seen
+        // before the next package header starts.
+        for entry_line in lines.by_ref() {
+            if entry_line == "[[package]]" {
+                break;
+            }
+            if let Some(rest) = entry_line.strip_prefix("version = \"") {
+                return Some(rest.trim_end_matches('"').to_string());
+            }
+        }
+    }
+    None
+}
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -41,6 +41,7 @@ pub struct NeuronState {
 /// Build the neuron API router.
 pub fn neuron_routes() -> Router<Arc<NeuronState>> {
    Router::new()
+        .route("/version", get(version_handler))
        .route("/discovery", get(discovery_handler))
        .route("/health", get(health_handler))
        .route("/models", get(list_models))
@@ -51,6 +52,14 @@ pub fn neuron_routes() -> Router<Arc<NeuronState>> {
        .route("/v1/responses", post(responses))
 }

+/// `GET /version` — reports the build identity of the running daemon (git
+/// SHA, enabled features, rustc/candle versions) as JSON. The handler takes
+/// no state: the information is fixed for the lifetime of the process. It is
+/// the canonical "which build is live" probe for fleet validation and
+/// benchmark attribution.
+async fn version_handler() -> Json<cortex_core::build_info::BuildInfo> {
+    let info = crate::version::build_info();
+    Json(info)
+}
+
 async fn discovery_handler(State(state): State<Arc<NeuronState>>) -> Json<DiscoveryResponse> {
    Json(state.discovery.clone())
 }
@@ -62,6 +71,12 @@ async fn health_handler(State(state): State<Arc<NeuronState>>) -> Json<HealthRes
    // know about activation lifecycle.
    let mut snapshot = state.health_cache.snapshot().await;
    snapshot.activation = state.activation.snapshot().await;
+    // Per-model admission load (#53) — read live from the candle harness so
+    // cortex's load-aware router (#55) can spread traffic and propagate
+    // backpressure. Absent when no candle harness is present.
+    if let Some(candle) = &state.candle {
+        snapshot.models = candle.load_snapshot().await;
+    }
    Json(snapshot)
 }

@@ -81,6 +96,21 @@ async fn load_model(
    State(state): State<Arc<NeuronState>>,
    Json(spec): Json<ModelSpec>,
 ) -> impl IntoResponse {
+    // Driver/library mismatch preflight (#19): every CUDA load is
+    // guaranteed to fail until the host reboots. Reject up front with
+    // the operator-actionable reason instead of letting the load die
+    // minutes later inside cuInit/NCCL with a cryptic error.
+    if let Some(reason) = &state.discovery.cuda_unavailable_reason {
+        tracing::warn!(model = %spec.model_id, reason = %reason, "load_model rejected: CUDA unavailable");
+        return (
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({
+                "error": reason,
+                "code": "cuda_unavailable",
+            })),
+        )
+            .into_response();
+    }
    let registry = state.registry.read().await;
    match registry.load_model(&spec).await {
        Ok(()) => Json(json!({"status": "loaded"})).into_response(),
@@ -174,13 +204,43 @@ async fn model_endpoint(
    }
 }

+/// Fill in `chat_template_kwargs.enable_thinking` with `include_thinking`
+/// unless the client already supplied that key, in which case the request
+/// is left exactly as sent. Other `chat_template_kwargs` entries are kept.
+/// The call site in [`chat_completions`] explains why generation is coupled
+/// to surfacing (reasoning eating the token budget for clients that drop it).
+fn default_enable_thinking(req: &mut ChatCompletionRequest, include_thinking: bool) {
+    let client_chose = req
+        .extra
+        .get("chat_template_kwargs")
+        .and_then(|kwargs| kwargs.get("enable_thinking"))
+        .is_some();
+    if client_chose {
+        return; // client chose explicitly — respect it
+    }
+
+    // Make sure there is an object to write into.
+    if req.extra.as_object().is_none() {
+        req.extra = json!({});
+    }
+    let Some(extra) = req.extra.as_object_mut() else {
+        return;
+    };
+
+    let slot = extra
+        .entry("chat_template_kwargs")
+        .or_insert_with(|| json!({}));
+    match slot.as_object_mut() {
+        Some(kwargs) => {
+            kwargs.insert("enable_thinking".into(), json!(include_thinking));
+        }
+        // A non-object `chat_template_kwargs` is replaced wholesale.
+        None => *slot = json!({"enable_thinking": include_thinking}),
+    }
+}
+
 /// OpenAI-compatible chat completions. Dispatches to streaming SSE when
 /// `stream: true` is set on the request; otherwise returns a single
 /// `ChatCompletionResponse`.
 async fn chat_completions(
    State(state): State<Arc<NeuronState>>,
    headers: axum::http::HeaderMap,
-    Json(req): Json<ChatCompletionRequest>,
+    Json(mut req): Json<ChatCompletionRequest>,
 ) -> impl IntoResponse {
    let Some(candle) = state.candle.as_ref().map(Arc::clone) else {
        return (
@@ -205,6 +265,18 @@ async fn chat_completions(
        reasoning_markers: None, // filled in from the loaded model inside candle
    };

+    // Couple reasoning *generation* to reasoning *surfacing*. Reasoning
+    // models (Qwen3.6) think by default, and that `<think>` block can
+    // consume the entire `max_tokens` budget — which, when we then drop
+    // it (`include_thinking == false`, the default for OpenAI/Anthropic
+    // clients like Claude Code), leaves the visible answer empty or
+    // truncated. So when the caller isn't going to see the reasoning,
+    // don't generate it: default `enable_thinking` to `include_thinking`.
+    // A client that explicitly set `chat_template_kwargs.enable_thinking`
+    // wins; thinking-aware clients (helexa-acp, `x-include-thinking:
+    // true`) keep reasoning on.
+    default_enable_thinking(&mut req, include_thinking);
+
    if req.stream.unwrap_or(false) {
        match candle.chat_completion_stream_with(req, chat_config).await {
            Ok(rx) => {
@@ -220,80 +292,12 @@ async fn chat_completions(
                    .keep_alive(KeepAlive::default())
                    .into_response()
            }
-            Err(InferenceError::ModelNotLoaded(id)) => (
-                StatusCode::NOT_FOUND,
-                Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
-            )
-                .into_response(),
-            Err(InferenceError::PromptTooLong { prompt_len, max }) => (
-                StatusCode::BAD_REQUEST,
-                Json(json!({
-                    "error": format!("prompt has {prompt_len} tokens but max is {max}"),
-                    "code": "prompt_too_long",
-                    "prompt_len": prompt_len,
-                    "max": max,
-                })),
-            )
-                .into_response(),
-            Err(InferenceError::InsufficientVram {
-                free_mb,
-                required_mb,
-            }) => (
-                StatusCode::SERVICE_UNAVAILABLE,
-                Json(json!({
-                    "error": format!(
-                        "insufficient free VRAM: {free_mb} MiB free, need at least {required_mb} MiB"
-                    ),
-                    "code": "insufficient_vram",
-                    "free_mb": free_mb,
-                    "required_mb": required_mb,
-                })),
-            )
-                .into_response(),
-            Err(InferenceError::Other(e)) => (
-                StatusCode::INTERNAL_SERVER_ERROR,
-                Json(json!({"error": format!("{e:#}")})),
-            )
-                .into_response(),
+            Err(e) => inference_error_response(e),
        }
    } else {
        match candle.chat_completion(req).await {
            Ok(resp) => Json(resp).into_response(),
-            Err(InferenceError::ModelNotLoaded(id)) => (
-                StatusCode::NOT_FOUND,
-                Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
-            )
-                .into_response(),
-            Err(InferenceError::PromptTooLong { prompt_len, max }) => (
-                StatusCode::BAD_REQUEST,
-                Json(json!({
-                    "error": format!("prompt has {prompt_len} tokens but max is {max}"),
-                    "code": "prompt_too_long",
-                    "prompt_len": prompt_len,
-                    "max": max,
-                })),
-            )
-                .into_response(),
-            Err(InferenceError::InsufficientVram {
-                free_mb,
-                required_mb,
-            }) => (
-                StatusCode::SERVICE_UNAVAILABLE,
-                Json(json!({
-                    "error": format!(
-                        "insufficient free VRAM: {free_mb} MiB free, need at least {required_mb} MiB"
-                    ),
-                    "code": "insufficient_vram",
-                    "free_mb": free_mb,
-                    "required_mb": required_mb,
-                })),
-            )
-                .into_response(),
-            Err(InferenceError::Other(e)) => (
-                StatusCode::INTERNAL_SERVER_ERROR,
-                Json(json!({"error": format!("{e:#}")})),
-            )
-                .into_response(),
+            Err(e) => inference_error_response(e),
        }
    }
 }
@@ -392,6 +396,9 @@ async fn responses(
                    input_tokens: u.prompt_tokens,
                    output_tokens: u.completion_tokens,
                    total_tokens: u.prompt_tokens + u.completion_tokens,
+                    // Non-streaming reasoning accounting deferred (#64).
+                    output_tokens_details: None,
+                    input_tokens_details: None,
                });
                let meta = openai_responses::ResponseMeta {
                    response_id: mint_response_id(),
@@ -418,46 +425,103 @@ fn finish_reason_from_str(s: &str) -> crate::wire::FinishReason {
 }

 /// Centralised mapping from [`InferenceError`] to an HTTP response.
-/// Lifted out so the chat-completions and responses handlers stay
-/// readable and changes to error-code semantics happen in one spot.
+///
+/// Emits the OpenAI-standard *nested* error envelope:
+///
+/// ```json
+/// { "error": { "message": "...", "type": "...", "code": "...", "param": null } }
+/// ```
+///
+/// OpenAI-compatible clients (opencode, the openai SDK) reach into
+/// `error.type` / `error.code` to drive behaviour — most importantly,
+/// `code == "context_length_exceeded"` triggers auto-compaction and
+/// retry rather than a hard failure. A flat `{"error": "..."}` string
+/// is invisible to that logic, so every variant nests here. Diagnostic
+/// extras (prompt_len, free_mb, …) ride *inside* the error object so
+/// they don't break the envelope shape.
 fn inference_error_response(err: InferenceError) -> axum::response::Response {
-    match err {
-        InferenceError::ModelNotLoaded(id) => (
-            StatusCode::NOT_FOUND,
-            Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
+    use cortex_core::error_envelope::OpenAiError;
+    let env = match err {
+        InferenceError::ModelNotLoaded(id) => OpenAiError::new(
+            404,
+            "invalid_request_error",
+            "model_not_found",
+            format!("model '{id}' not loaded on this neuron"),
        )
-            .into_response(),
-        InferenceError::PromptTooLong { prompt_len, max } => (
-            StatusCode::BAD_REQUEST,
-            Json(json!({
-                "error": format!("prompt has {prompt_len} tokens but max is {max}"),
-                "code": "prompt_too_long",
-                "prompt_len": prompt_len,
-                "max": max,
-            })),
-        )
-            .into_response(),
+        .with_extra("model_id", json!(id)),
+        // OpenAI's canonical context-overflow error. opencode keys on
+        // `code == "context_length_exceeded"` and the message phrasing
+        // ("maximum context length is N tokens") to auto-compact+retry.
+        InferenceError::PromptTooLong { prompt_len, max } => {
+            OpenAiError::context_length_exceeded(format!(
+                "This model's maximum context length is {max} tokens. \
+                 However, your messages resulted in {prompt_len} tokens. \
+                 Please reduce the length of the messages."
+            ))
+            .with_extra("prompt_len", json!(prompt_len))
+            .with_extra("max", json!(max))
+        }
+        // VRAM frees as the in-flight request(s) complete, so this is a
+        // transient 503 — advertise a short Retry-After (#63).
        InferenceError::InsufficientVram {
            free_mb,
            required_mb,
-        } => (
-            StatusCode::SERVICE_UNAVAILABLE,
-            Json(json!({
-                "error": format!(
-                    "insufficient free VRAM: {free_mb} MiB free, need at least {required_mb} MiB"
+        } => OpenAiError::new(
+            503,
+            "api_error",
+            "insufficient_vram",
+            format!("insufficient free VRAM: {free_mb} MiB free, need at least {required_mb} MiB"),
+        )
+        .with_retry_after(5)
+        .with_extra("free_mb", json!(free_mb))
+        .with_extra("required_mb", json!(required_mb)),
+        InferenceError::VisionUnsupported { model_id } => OpenAiError::new(
+            400,
+            "invalid_request_error",
+            "vision_unsupported",
+            format!("model '{model_id}' does not support image input"),
+        )
+        .with_extra("model_id", json!(model_id))
+        .with_extra(
+            "suggestion",
+            json!("load a vision-capable model or remove image_url content parts"),
        ),
-                "code": "insufficient_vram",
-                "free_mb": free_mb,
-                "required_mb": required_mb,
-            })),
+        InferenceError::TemplateRenderFailed { detail } => OpenAiError::new(
+            422,
+            "invalid_request_error",
+            "template_render_failed",
+            format!("chat template could not render this request: {detail}"),
+        ),
+        // Admission control refused (#53): a fast, retryable "busy" signal.
+        // 503 (service busy) + Retry-After; opencode/AI SDK back off.
+        InferenceError::Overloaded { retry_after_secs } => OpenAiError::new(
+            503,
+            "rate_limit_error",
+            "rate_limit_exceeded",
+            "model is busy (admission queue full); retry shortly",
        )
-            .into_response(),
-        InferenceError::Other(e) => (
-            StatusCode::INTERNAL_SERVER_ERROR,
-            Json(json!({"error": format!("{e:#}")})),
-        )
-            .into_response(),
+        .with_retry_after(retry_after_secs),
+        InferenceError::Other(e) => OpenAiError::without_code(500, "api_error", format!("{e:#}")),
+    };
+    envelope_response(env)
 }
+
+/// Neuron adapter from the shared [`cortex_core::error_envelope::OpenAiError`]
+/// to an axum response.
+///
+/// The envelope's numeric status becomes the HTTP status (falling back to 500
+/// when it is not a valid status code), its body is sent as JSON, and a
+/// `Retry-After` header is attached when the envelope carries a retry hint.
+/// cortex-core owns the envelope shape (#60/#63); this is the only crossing
+/// from that data into axum on the neuron side.
+fn envelope_response(err: cortex_core::error_envelope::OpenAiError) -> axum::response::Response {
+    use axum::http::{HeaderValue, header::RETRY_AFTER};
+
+    // Read the hint before `body()` is called on the envelope.
+    let retry_header = err
+        .retry_after_secs
+        .and_then(|secs| HeaderValue::from_str(&secs.to_string()).ok());
+    let status = match StatusCode::from_u16(err.status) {
+        Ok(code) => code,
+        Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
+    };
+
+    let mut response = (status, Json(err.body())).into_response();
+    if let Some(value) = retry_header {
+        response.headers_mut().insert(RETRY_AFTER, value);
+    }
+    response
 }

 fn mint_response_id() -> String {
@@ -481,3 +545,193 @@ fn unix_subsec_nanos() -> u64 {
        .map(|d| d.as_nanos() as u64)
        .unwrap_or(0)
 }
+
+#[cfg(test)]
+mod thinking_tests {
+    use super::*;
+
+    fn req(value: serde_json::Value) -> ChatCompletionRequest {
+        serde_json::from_value(value).expect("valid ChatCompletionRequest")
+    }
+
+    fn enable_thinking(r: &ChatCompletionRequest) -> Option<bool> {
+        r.extra
+            .get("chat_template_kwargs")
+            .and_then(|k| k.get("enable_thinking"))
+            .and_then(|v| v.as_bool())
+    }
+
+    #[test]
+    fn defaults_enable_thinking_to_include_thinking_false() {
+        let mut r = req(json!({"model": "m", "messages": []}));
+        default_enable_thinking(&mut r, false);
+        assert_eq!(enable_thinking(&r), Some(false));
+    }
+
+    #[test]
+    fn defaults_enable_thinking_true_when_surfacing() {
+        let mut r = req(json!({"model": "m", "messages": []}));
+        default_enable_thinking(&mut r, true);
+        assert_eq!(enable_thinking(&r), Some(true));
+    }
+
+    #[test]
+    fn explicit_client_choice_is_respected() {
+        let mut r = req(json!({
+            "model": "m", "messages": [],
+            "chat_template_kwargs": {"enable_thinking": true}
+        }));
+        // include_thinking=false would normally force false; explicit wins.
+        default_enable_thinking(&mut r, false);
+        assert_eq!(enable_thinking(&r), Some(true));
+    }
+
+    #[test]
+    fn preserves_other_chat_template_kwargs() {
+        let mut r = req(json!({
+            "model": "m", "messages": [],
+            "chat_template_kwargs": {"some_other": 42}
+        }));
+        default_enable_thinking(&mut r, false);
+        assert_eq!(enable_thinking(&r), Some(false));
+        assert_eq!(
+            r.extra["chat_template_kwargs"]["some_other"],
+            json!(42),
+            "existing kwargs must survive"
+        );
+    }
+}
+
+#[cfg(test)]
+mod error_envelope_tests {
+    use super::*;
+    use axum::http::StatusCode;
+
+    /// Drive an `InferenceError` through the mapper and decode the
+    /// `(status, json)` pair it produces.
+    async fn map(err: InferenceError) -> (StatusCode, Value) {
+        let resp = inference_error_response(err);
+        let status = resp.status();
+        let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .expect("buffer error body");
+        let body: Value = serde_json::from_slice(&bytes).expect("error body is JSON");
+        (status, body)
+    }
+
+    #[tokio::test]
+    async fn prompt_too_long_is_context_length_exceeded() {
+        let (status, body) = map(InferenceError::PromptTooLong {
+            prompt_len: 60_000,
+            max: 49_152,
+        })
+        .await;
+
+        assert_eq!(status, StatusCode::BAD_REQUEST);
+        // The envelope must be nested under `error`, not a flat string.
+        let error = body
+            .get("error")
+            .and_then(Value::as_object)
+            .expect("error object");
+        assert_eq!(error["type"], "invalid_request_error");
+        assert_eq!(
+            error["code"], "context_length_exceeded",
+            "opencode keys on this code to auto-compact and retry"
+        );
+        assert_eq!(error["param"], Value::Null);
+        // Phrasing opencode/openai clients pattern-match on.
+        let msg = error["message"].as_str().unwrap();
+        assert!(
+            msg.contains("maximum context length is 49152 tokens"),
+            "message was: {msg}"
+        );
+        // Diagnostics ride inside the error object.
+        assert_eq!(error["prompt_len"], 60_000);
+        assert_eq!(error["max"], 49_152);
+    }
+
+    #[tokio::test]
+    async fn model_not_loaded_is_404_model_not_found() {
+        let (status, body) = map(InferenceError::ModelNotLoaded("Qwen/X".into())).await;
+        assert_eq!(status, StatusCode::NOT_FOUND);
+        let error = &body["error"];
+        assert_eq!(error["type"], "invalid_request_error");
+        assert_eq!(error["code"], "model_not_found");
+        assert_eq!(error["model_id"], "Qwen/X");
+    }
+
+    #[tokio::test]
+    async fn insufficient_vram_is_503_api_error() {
+        let (status, body) = map(InferenceError::InsufficientVram {
+            free_mb: 1_024,
+            required_mb: 8_192,
+        })
+        .await;
+        assert_eq!(status, StatusCode::SERVICE_UNAVAILABLE);
+        let error = &body["error"];
+        assert_eq!(error["type"], "api_error");
+        assert_eq!(error["code"], "insufficient_vram");
+        assert_eq!(error["free_mb"], 1_024);
+        assert_eq!(error["required_mb"], 8_192);
+    }
+
+    #[tokio::test]
+    async fn overloaded_is_503_rate_limited_with_retry_after() {
+        // Admission rejection (#53) → fast, retryable backpressure.
+        let resp = inference_error_response(InferenceError::Overloaded {
+            retry_after_secs: 7,
+        });
+        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
+        let retry = resp
+            .headers()
+            .get(axum::http::header::RETRY_AFTER)
+            .expect("admission rejection must advertise Retry-After");
+        assert_eq!(retry.to_str().unwrap(), "7");
+
+        let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body: Value = serde_json::from_slice(&bytes).unwrap();
+        assert_eq!(body["error"]["code"], "rate_limit_exceeded");
+    }
+
+    #[tokio::test]
+    async fn insufficient_vram_carries_retry_after() {
+        // Transient 503 — VRAM frees as in-flight requests finish, so the
+        // client should back off and retry (#63).
+        let resp = inference_error_response(InferenceError::InsufficientVram {
+            free_mb: 1_024,
+            required_mb: 8_192,
+        });
+        let retry = resp
+            .headers()
+            .get(axum::http::header::RETRY_AFTER)
+            .expect("transient 503 must advertise Retry-After");
+        assert_eq!(retry.to_str().unwrap(), "5");
+    }
+
+    #[tokio::test]
+    async fn permanent_rejections_have_no_retry_after() {
+        // context_length_exceeded is permanent for this request — no hint.
+        let resp = inference_error_response(InferenceError::PromptTooLong {
+            prompt_len: 60_000,
+            max: 49_152,
+        });
+        assert!(
+            resp.headers()
+                .get(axum::http::header::RETRY_AFTER)
+                .is_none(),
+            "permanent rejection must not advertise Retry-After"
+        );
+    }
+
+    #[tokio::test]
+    async fn other_is_500_with_null_code() {
+        let (status, body) = map(InferenceError::Other(anyhow::anyhow!("kaboom"))).await;
+        assert_eq!(status, StatusCode::INTERNAL_SERVER_ERROR);
+        let error = &body["error"];
+        assert_eq!(error["type"], "api_error");
+        assert_eq!(error["code"], Value::Null);
+        assert!(error["message"].as_str().unwrap().contains("kaboom"));
+    }
+}
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -6,8 +6,18 @@ use figment::{
    providers::{Env, Format, Toml},
 };
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 use std::path::{Path, PathBuf};

+/// Default scheme name applied to bare `org/name` model ids when no
+/// `[harness.candle.default_source]` is set. Keeps existing operator
+/// configs (which know nothing about schemes) working unchanged.
+pub const DEFAULT_SOURCE_SCHEME: &str = "huggingface";
+
+/// Endpoint URL for the default huggingface source, used when no
+/// `[harness.candle.sources.huggingface]` is configured.
+pub const DEFAULT_HF_ENDPOINT: &str = "https://huggingface.co";
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct NeuronConfig {
    #[serde(default = "default_port")]
@@ -37,8 +47,279 @@ pub struct HarnessSettings {
 pub struct CandleHarnessConfig {
    /// HuggingFace cache directory for model weights.
    /// When unset, defers to hf-hub's default (~/.cache/huggingface).
+    ///
+    /// Retained for back-compat — operators with existing
+    /// `hf_cache = "..."` configs continue to work. Treated as the
+    /// `huggingface` source's cache_dir when a sources table isn't
+    /// provided.
    #[serde(default)]
    pub hf_cache: Option<PathBuf>,
+
+    /// Default source scheme applied to bare `org/name` model ids
+    /// (those without an explicit `scheme:` prefix). When unset, falls
+    /// back to `DEFAULT_SOURCE_SCHEME` ("huggingface").
+    #[serde(default)]
+    pub default_source: Option<String>,
+
+    /// Per-scheme source endpoints. Each entry maps a scheme name
+    /// (`huggingface`, `helexa`, an operator's mirror tag, …) to its
+    /// endpoint URL, optional auth env var, and optional cache
+    /// directory.
+    ///
+    /// When absent or missing the `huggingface` key, the loader
+    /// synthesises a `huggingface` entry pointing at
+    /// `https://huggingface.co` with `hf_cache` (above) as its
+    /// cache_dir. This keeps single-source configs ergonomic.
+    #[serde(default)]
+    pub sources: HashMap<String, SourceConfig>,
+
+    /// Prefix KV cache across requests (#11). Applies per loaded
+    /// model, on architectures that support cache snapshots (qwen3_5).
+    #[serde(default)]
+    pub prefix_cache: PrefixCacheConfig,
+
+    /// Self-derived context/token limits (#67). The neuron computes the
+    /// most-efficient `limit{context,input,output}` that still allows
+    /// coherent agentic performance from model architecture + live free
+    /// VRAM + a self-measured throughput ceiling, advertises it on
+    /// `/models`, and enforces it. These knobs tune that derivation.
+    #[serde(default)]
+    pub context_limit: ContextLimitConfig,
+
+    /// Admission control (#53): bounds the per-model wait queue so a busy
+    /// model returns a fast, retryable `429`/`503` instead of stalling new
+    /// requests until their client times out.
+    #[serde(default)]
+    pub admission: AdmissionConfig,
+}
+
+/// `[harness.candle.admission]` settings (#53).
+///
+/// Inference is batch-1, so `max_in_flight` is 1 in practice; the queue
+/// (`max_queue_depth`) absorbs short bursts, and `max_wait_secs` caps how
+/// long a queued request waits before it's refused with backpressure.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AdmissionConfig {
+    /// Concurrent running requests per model. Batch-1 inference → 1.
+    #[serde(default = "default_admission_max_in_flight")]
+    pub max_in_flight: usize,
+    /// Queued (waiting) requests allowed beyond the in-flight one. The
+    /// `(max_in_flight + max_queue_depth + 1)`-th request is refused
+    /// immediately with `429`/`503` + `Retry-After`.
+    #[serde(default = "default_admission_max_queue_depth")]
+    pub max_queue_depth: usize,
+    /// Maximum seconds a queued request waits for the in-flight slot before
+    /// it is refused (turns the old ~300s client-side hang into a fast,
+    /// honest signal).
+    #[serde(default = "default_admission_max_wait_secs")]
+    pub max_wait_secs: u64,
+}
+
+impl Default for AdmissionConfig {
+    fn default() -> Self {
+        Self {
+            max_in_flight: default_admission_max_in_flight(),
+            max_queue_depth: default_admission_max_queue_depth(),
+            max_wait_secs: default_admission_max_wait_secs(),
+        }
+    }
+}
+
+fn default_admission_max_in_flight() -> usize {
+    1
+}
+
+fn default_admission_max_queue_depth() -> usize {
+    8
+}
+
+fn default_admission_max_wait_secs() -> u64 {
+    30
+}
+
+/// `[harness.candle.prefix_cache]` settings.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PrefixCacheConfig {
+    /// Master switch. On by default — set `false` to restore the
+    /// clear-every-request behaviour.
+    #[serde(default = "default_prefix_cache_enabled")]
+    pub enabled: bool,
+    /// Snapshot byte budget per loaded model, in MiB. Snapshots live
+    /// on the model's device, so this comes out of the same VRAM that
+    /// serves inference — size it against the device's headroom after
+    /// the model weights.
+    #[serde(default = "default_prefix_cache_budget_mb")]
+    pub budget_mb: u64,
+    /// Maximum live snapshots per loaded model, regardless of budget.
+    #[serde(default = "default_prefix_cache_max_entries")]
+    pub max_entries: usize,
+}
+
+impl Default for PrefixCacheConfig {
+    fn default() -> Self {
+        Self {
+            enabled: default_prefix_cache_enabled(),
+            budget_mb: default_prefix_cache_budget_mb(),
+            max_entries: default_prefix_cache_max_entries(),
+        }
+    }
+}
+
+fn default_prefix_cache_enabled() -> bool {
+    true
+}
+
+fn default_prefix_cache_budget_mb() -> u64 {
+    1024
+}
+
+fn default_prefix_cache_max_entries() -> usize {
+    8
+}
+
+/// `[harness.candle.context_limit]` settings (#67).
+///
+/// The derived limit is `context = min(max_position_embeddings,
+/// vram_ceiling, throughput_ceiling)`, then `input = context −
+/// output_reserve`. `vram_ceiling` and `throughput_ceiling` read live
+/// state, so the advertised/enforced limit tracks the resident model and
+/// rises automatically as efficiency work (e.g. prefix caching, #11)
+/// frees headroom or speeds prefill — no operator action.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ContextLimitConfig {
+    /// Master switch. On by default — set `false` to fall back to the
+    /// static `NEURON_MAX_PROMPT_TOKENS` cap with no advertised limit.
+    #[serde(default = "default_context_limit_enabled")]
+    pub enabled: bool,
+
+    /// Coherence target: the longest prefill-per-turn latency (seconds)
+    /// considered acceptable agentic performance. The throughput ceiling
+    /// is `target_prefill_latency_secs × measured_prefill_tok_per_sec`.
+    /// Raise it once cross-request prefix caching (#11) makes long
+    /// contexts cheap to re-prefill.
+    #[serde(default = "default_target_prefill_latency_secs")]
+    pub target_prefill_latency_secs: f64,
+
+    /// Cold-start prefill speed (tokens/sec) used for the throughput
+    /// ceiling until the model has served enough requests to measure its
+    /// own rate. A conservative estimate; the live EMA supersedes it.
+    #[serde(default = "default_bootstrap_prefill_tok_per_sec")]
+    pub bootstrap_prefill_tok_per_sec: f64,
+
+    /// VRAM (MiB) reserved per card for prefill activations on top of the
+    /// resident weights and the KV cache, before computing the VRAM
+    /// context ceiling.
+    #[serde(default = "default_activation_headroom_mb")]
+    pub activation_headroom_mb: u64,
+
+    /// Free-VRAM floor (MiB) kept available per card — the VRAM ceiling
+    /// leaves at least this much unused. Mirrors `NEURON_MIN_FREE_VRAM_MB`.
+    #[serde(default = "default_context_min_free_floor_mb")]
+    pub min_free_floor_mb: u64,
+
+    /// Generation reserve (tokens) left below the context wall:
+    /// `input = context − output_reserve_tokens`. Defaults to neuron's
+    /// default `max_tokens`.
+    #[serde(default = "default_output_reserve_tokens")]
+    pub output_reserve_tokens: usize,
+}
+
+impl Default for ContextLimitConfig {
+    fn default() -> Self {
+        Self {
+            enabled: default_context_limit_enabled(),
+            target_prefill_latency_secs: default_target_prefill_latency_secs(),
+            bootstrap_prefill_tok_per_sec: default_bootstrap_prefill_tok_per_sec(),
+            activation_headroom_mb: default_activation_headroom_mb(),
+            min_free_floor_mb: default_context_min_free_floor_mb(),
+            output_reserve_tokens: default_output_reserve_tokens(),
+        }
+    }
+}
+
+fn default_context_limit_enabled() -> bool {
+    true
+}
+
+fn default_target_prefill_latency_secs() -> f64 {
+    // ~2 min/turn is the coherence wall observed pre-#11 on beast
+    // (the issue's worked example). Raisable once prefix caching lands.
+    120.0
+}
+
+fn default_bootstrap_prefill_tok_per_sec() -> f64 {
+    // beast Qwen3.6-27B TP=2 measured ~850 tok/s prefill; a conservative
+    // floor so the cold-start ceiling isn't wildly optimistic.
+    800.0
+}
+
+fn default_activation_headroom_mb() -> u64 {
+    2048
+}
+
+fn default_context_min_free_floor_mb() -> u64 {
+    1500
+}
+
+fn default_output_reserve_tokens() -> usize {
+    8192
+}
+
+/// Per-scheme source configuration. Mirrors the shape `hf_hub::ApiBuilder`
+/// needs: endpoint URL, optional auth token (read from an env var so
+/// secrets stay out of the config file), and optional cache directory
+/// disambiguated per source to prevent mirror-vs-canonical collisions.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct SourceConfig {
+    /// Base URL of the registry. Must speak the HF-compatible wire
+    /// format (siblings listing at
+    /// `/api/models/{org}/{name}[/revision/{rev}]`, blob fetch at
+    /// `/{org}/{name}/resolve/{rev}/{path}`).
+    pub endpoint: String,
+
+    /// Environment variable name to read for the bearer token used
+    /// against this source. `None` = anonymous. Reading from env
+    /// (vs. literal token in the config) keeps secrets out of TOML.
+    #[serde(default)]
+    pub auth_env: Option<String>,
+
+    /// Cache directory for this source. The hf-hub
+    /// `models--{org}--{name}/snapshots/...` tree lives directly
+    /// under this path, so distinct sources serving the same
+    /// `org/name` cannot collide on disk.
+    ///
+    /// `None` means "share the harness `hf_cache` directory" — only
+    /// safe when the operator has exactly one source configured.
+    #[serde(default)]
+    pub cache_dir: Option<PathBuf>,
+}
+
+impl CandleHarnessConfig {
+    /// Effective sources map: the operator's `sources` plus a synthesised
+    /// `huggingface` entry (default endpoint, `HF_TOKEN`, legacy `hf_cache`)
+    /// when that key is absent. An explicit entry is never overwritten.
+    /// Returns a fresh map and leaves `self` untouched, so the config as
+    /// typed by the operator can still be serialized for diagnostics.
+    pub fn effective_sources(&self) -> HashMap<String, SourceConfig> {
+        let mut resolved = self.sources.clone();
+        if !resolved.contains_key(DEFAULT_SOURCE_SCHEME) {
+            let synthesised = SourceConfig {
+                endpoint: DEFAULT_HF_ENDPOINT.to_owned(),
+                auth_env: Some(String::from("HF_TOKEN")),
+                cache_dir: self.hf_cache.clone(),
+            };
+            resolved.insert(DEFAULT_SOURCE_SCHEME.to_owned(), synthesised);
+        }
+        resolved
+    }
+
+    /// Scheme applied to bare model ids: the operator's `default_source`
+    /// when set, otherwise `DEFAULT_SOURCE_SCHEME`. Borrowed from `self`
+    /// or the constant, so nothing is allocated.
+    pub fn effective_default_source(&self) -> &str {
+        let pinned = self.default_source.as_deref();
+        pinned.unwrap_or(DEFAULT_SOURCE_SCHEME)
+    }
 }

 fn default_port() -> u16 {
@@ -65,3 +346,109 @@ impl Default for NeuronConfig {
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn effective_sources_synthesises_huggingface_when_absent() {
+        let cfg = CandleHarnessConfig::default();
+        let sources = cfg.effective_sources();
+        assert!(sources.contains_key("huggingface"));
+        let hf = &sources["huggingface"];
+        assert_eq!(hf.endpoint, DEFAULT_HF_ENDPOINT);
+        assert_eq!(hf.auth_env.as_deref(), Some("HF_TOKEN"));
+        assert!(hf.cache_dir.is_none());
+    }
+
+    #[test]
+    fn effective_sources_carries_legacy_hf_cache_into_synth_entry() {
+        // Existing operator configs only set `hf_cache = "/archive3/..."`
+        // — the synth must pick that up so the loader keeps using the
+        // operator's storage.
+        let cfg = CandleHarnessConfig {
+            hf_cache: Some(PathBuf::from("/archive3/llm-cache")),
+            ..Default::default()
+        };
+        let sources = cfg.effective_sources();
+        assert_eq!(
+            sources["huggingface"].cache_dir.as_deref(),
+            Some(Path::new("/archive3/llm-cache"))
+        );
+    }
+
+    #[test]
+    fn effective_sources_preserves_explicit_huggingface_entry() {
+        // When an operator types out `[harness.candle.sources.huggingface]`
+        // explicitly, we must not clobber it with the synth defaults.
+        let mut sources = HashMap::new();
+        sources.insert(
+            "huggingface".to_string(),
+            SourceConfig {
+                endpoint: "https://huggingface.example.org".into(),
+                auth_env: Some("MY_TOKEN".into()),
+                cache_dir: Some(PathBuf::from("/operator-cache")),
+            },
+        );
+        let cfg = CandleHarnessConfig {
+            hf_cache: Some(PathBuf::from("/legacy-cache")),
+            sources,
+            ..Default::default()
+        };
+        let effective = cfg.effective_sources();
+        assert_eq!(
+            effective["huggingface"].endpoint,
+            "https://huggingface.example.org"
+        );
+        assert_eq!(
+            effective["huggingface"].auth_env.as_deref(),
+            Some("MY_TOKEN")
+        );
+        assert_eq!(
+            effective["huggingface"].cache_dir.as_deref(),
+            Some(Path::new("/operator-cache"))
+        );
+    }
+
+    #[test]
+    fn effective_sources_includes_helexa_alongside_synth_huggingface() {
+        let mut sources = HashMap::new();
+        sources.insert(
+            "helexa".to_string(),
+            SourceConfig {
+                endpoint: "https://registry.helexa.ai".into(),
+                auth_env: Some("HELEXA_TOKEN".into()),
+                cache_dir: Some(PathBuf::from("/archive3/llm-cache/helexa")),
+            },
+        );
+        let cfg = CandleHarnessConfig {
+            hf_cache: Some(PathBuf::from("/archive3/llm-cache/huggingface")),
+            sources,
+            ..Default::default()
+        };
+        let effective = cfg.effective_sources();
+        assert_eq!(effective.len(), 2);
+        assert_eq!(effective["helexa"].endpoint, "https://registry.helexa.ai");
+        // huggingface still gets synth-derived from legacy hf_cache.
+        assert_eq!(
+            effective["huggingface"].cache_dir.as_deref(),
+            Some(Path::new("/archive3/llm-cache/huggingface"))
+        );
+    }
+
+    #[test]
+    fn effective_default_source_falls_back() {
+        let cfg = CandleHarnessConfig::default();
+        assert_eq!(cfg.effective_default_source(), DEFAULT_SOURCE_SCHEME);
+    }
+
+    #[test]
+    fn effective_default_source_honours_explicit() {
+        let cfg = CandleHarnessConfig {
+            default_source: Some("helexa".into()),
+            ..Default::default()
+        };
+        assert_eq!(cfg.effective_default_source(), "helexa");
+    }
+}
--- a/crates/neuron/src/discovery.rs
+++ b/crates/neuron/src/discovery.rs
@@ -100,6 +100,87 @@ pub fn parse_health_info(csv_output: &str) -> Result<Vec<DeviceHealth>> {
    Ok(devices)
 }

+// ── Driver/library mismatch preflight (#19) ─────────────────────────
+
+/// Decide whether failed nvidia-smi output is the well-known
+/// "Driver/library version mismatch" condition (userspace libraries
+/// upgraded while the old kernel module is still loaded, so CUDA stays
+/// broken until the host reboots). On a mismatch, yields the userspace
+/// version from the first "NVML library version:" line, or
+/// `Some("unknown")` when that line is missing or has no value.
+/// Any other failure (no devices, permissions, ...) yields `None` so it
+/// is never reported with the loud mismatch diagnosis.
+pub fn classify_driver_mismatch(combined_output: &str) -> Option<String> {
+    const MARKER: &str = "Driver/library version mismatch";
+    const VERSION_KEY: &str = "NVML library version:";
+    let is_mismatch = combined_output.contains(MARKER);
+    let mut versions = combined_output
+        .lines()
+        .filter_map(|line| line.trim().strip_prefix(VERSION_KEY));
+    is_mismatch.then(|| match versions.next().map(str::trim) {
+        Some(version) if !version.is_empty() => version.to_owned(),
+        _ => String::from("unknown"),
+    })
+}
+
+/// Pull the loaded kernel module's driver version out of the text of
+/// `/proc/driver/nvidia/version`: on the `NVRM version:` line, the first
+/// token whose two leading dot-separated fields are all digits. Example:
+/// ```text
+/// NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  580.159.03  Release Build  (...)
+/// ```
+pub fn parse_kernel_module_version(proc_contents: &str) -> Option<String> {
+    let all_digits = |part: &str| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit());
+    let nvrm = proc_contents
+        .lines()
+        .find(|line| line.starts_with("NVRM version:"))?;
+    nvrm.split_whitespace()
+        .find(|token| {
+            let mut fields = token.split('.').take(2);
+            fields.clone().count() == 2 && fields.all(all_digits)
+        })
+        .map(str::to_owned)
+}
+
+/// Build the operator-facing mismatch text carried in
+/// `DiscoveryResponse::cuda_unavailable_reason`; no kernel version → "unknown".
+pub fn mismatch_reason(userspace: &str, kernel_module: Option<&str>) -> String {
+    let kmod = kernel_module.unwrap_or("unknown");
+    format!(
+        "host NVIDIA driver/library mismatch (userspace NVML {userspace} vs loaded kernel \
+         module {kmod}) — reboot the host to reload the kernel module; all CUDA inference is \
+         unavailable until then"
+    )
+}
+
+/// Outcome of an nvidia-smi invocation: `Absent` when the binary is
+/// not on PATH (CPU-only host, not an error), `Failed` when it could
+/// not run or exited non-zero (possible driver mismatch — classify it).
+enum SmiOutcome {
+    Ok(String),
+    Failed(String),
+    Absent,
+}
+
+async fn run_nvidia_smi(args: &[&str]) -> SmiOutcome {
+    match tokio::process::Command::new("nvidia-smi")
+        .args(args)
+        .output()
+        .await
+    {
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => SmiOutcome::Absent,
+        Err(e) => SmiOutcome::Failed(format!("failed to execute nvidia-smi: {e}")),
+        Ok(out) if out.status.success() => {
+            SmiOutcome::Ok(String::from_utf8_lossy(&out.stdout).to_string())
+        }
+        Ok(out) => {
+            let stdout = String::from_utf8_lossy(&out.stdout);
+            let stderr = String::from_utf8_lossy(&out.stderr);
+            SmiOutcome::Failed(format!("{stdout}\n{stderr}"))
+        }
+    }
+}
+
 // ── Command execution wrappers ──────────────────────────────────────

 async fn run_command(cmd: &str, args: &[&str]) -> Result<String> {
@@ -139,23 +220,42 @@ pub async fn discover_system() -> Result<DiscoveryResponse> {
        .trim()
        .to_string();

-    let (devices, driver_version) = match run_command_optional(
-        "nvidia-smi",
-        &[
+    let (devices, driver_version, cuda_unavailable_reason) = match run_nvidia_smi(&[
        &format!("--query-gpu={NVIDIA_SMI_DISCOVERY_QUERY}"),
        "--format=csv,noheader,nounits",
-        ],
-    )
+    ])
    .await
    {
-        Some(output) => {
+        SmiOutcome::Ok(output) => {
            let devs = parse_gpu_info(&output).unwrap_or_default();
            let driver = parse_driver_version(&output);
-            (devs, driver)
+            (devs, driver, None)
        }
-        None => {
+        SmiOutcome::Absent => {
            tracing::info!("nvidia-smi not found — no GPU devices discovered");
-            (vec![], None)
+            (vec![], None, None)
+        }
+        SmiOutcome::Failed(combined) => {
+            // nvidia-smi exists but can't talk to the driver. The case
+            // worth diagnosing precisely is the userspace↔kernel-module
+            // version skew after an un-rebooted driver update (#19) —
+            // every CUDA call on the host fails until a reboot, and
+            // without this classification it surfaces as a cryptic
+            // NCCL/cuInit error deep inside the first model load.
+            let reason = classify_driver_mismatch(&combined).map(|userspace| {
+                let kmod = std::fs::read_to_string("/proc/driver/nvidia/version")
+                    .ok()
+                    .as_deref()
+                    .and_then(parse_kernel_module_version);
+                mismatch_reason(&userspace, kmod.as_deref())
+            });
+            if reason.is_none() {
+                tracing::warn!(
+                    output = %combined.trim(),
+                    "nvidia-smi present but failing — no GPU devices discovered"
+                );
+            }
+            (vec![], None, reason)
        }
    };

@@ -172,6 +272,8 @@ pub async fn discover_system() -> Result<DiscoveryResponse> {
        driver_version,
        devices,
        harnesses: vec![], // populated by harness registry in Phase 8
+        cuda_unavailable_reason,
+        max_prompt_tokens: crate::harness::candle::max_prompt_tokens() as u64,
    })
 }

@@ -272,4 +374,63 @@ mod tests {
        assert_eq!(health[1].vram_used_mb, 4096);
        assert_eq!(health[1].temp_c, 58);
    }
+
+    // ── #19 driver/library mismatch preflight ────────────────────────
+
+    #[test]
+    fn classify_driver_mismatch_detects_and_extracts_nvml_version() {
+        // Verbatim shape of nvidia-smi's failure output on a host
+        // whose userspace libs were updated without a reboot.
+        let out = "Failed to initialize NVML: Driver/library version mismatch\n\
+                   NVML library version: 580.159\n";
+        assert_eq!(classify_driver_mismatch(out).as_deref(), Some("580.159"));
+    }
+
+    #[test]
+    fn classify_driver_mismatch_without_version_line() {
+        let out = "Failed to initialize NVML: Driver/library version mismatch\n";
+        assert_eq!(classify_driver_mismatch(out).as_deref(), Some("unknown"));
+    }
+
+    #[test]
+    fn classify_driver_mismatch_ignores_other_failures() {
+        // Other nvidia-smi failures must NOT be diagnosed as the
+        // mismatch (no false positives on healthy or odd hosts).
+        for out in [
+            "No devices were found\n",
+            "Failed to initialize NVML: Insufficient Permissions\n",
+            "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver.\n",
+            "",
+        ] {
+            assert_eq!(
+                classify_driver_mismatch(out),
+                None,
+                "false positive on: {out:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn parse_kernel_module_version_from_proc() {
+        let proc = "NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  580.159.03  Release Build  (dvs-builder@U22-I3-AE24-12-2)  Tue May 12 21:03:35 UTC 2026\n\
+                    GCC version:  gcc version 15.2.1 20251022 (Red Hat 15.2.1-3) (GCC)\n";
+        assert_eq!(
+            parse_kernel_module_version(proc).as_deref(),
+            Some("580.159.03")
+        );
+    }
+
+    #[test]
+    fn parse_kernel_module_version_absent() {
+        assert_eq!(parse_kernel_module_version(""), None);
+        assert_eq!(parse_kernel_module_version("GCC version: gcc 15\n"), None);
+    }
+
+    #[test]
+    fn mismatch_reason_is_operator_actionable() {
+        let reason = mismatch_reason("580.159", Some("580.159.03"));
+        assert!(reason.contains("580.159"));
+        assert!(reason.contains("580.159.03"));
+        assert!(reason.contains("reboot"));
+    }
 }
--- a/crates/neuron/src/harness/admission.rs
+++ b/crates/neuron/src/harness/admission.rs
@@ -0,0 +1,202 @@
+//! Per-model admission control (#53).
+//!
+//! Inference against a loaded model is batch-1: one request runs at a time,
+//! serialized by the model's `inference_lock` (single-GPU) / `pool` mutex
+//! (TP). Before this, the wait for that lock was an **unbounded FIFO of
+//! mutex waiters with no timeout** — a busy model made every new request
+//! hang until its client gave up (~300s) with an opaque error.
+//!
+//! [`AdmissionController`] replaces that implicit unbounded wait with an
+//! explicit bounded scheduler: at most `max_in_flight` running (1, batch-1)
+//! plus a bounded queue of `max_queue_depth` waiters, each waiting at most
+//! `max_wait`. When the queue is full or the wait elapses, the request is
+//! rejected *immediately* — an honest, fast, retryable "busy" signal
+//! (`429`/`503` + `Retry-After` per #63) instead of a silent stall.
+//!
+//! The controller is pure async (no CUDA), so the inference paths just call
+//! [`AdmissionController::enter`] before taking the inference lock and hold
+//! the returned [`AdmissionPermit`] for the request's lifetime. Its counters
+//! ([`in_flight`](AdmissionController::in_flight) /
+//! [`queue_depth`](AdmissionController::queue_depth)) are lock-free, so
+//! `/health` can read live load without contending with inference.
+
+use crate::config::AdmissionConfig;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore};
+
+/// Why admission was refused. Both map to the #63 backpressure envelope
+/// (`429`/`503` + `rate_limit_exceeded` + `Retry-After`); they differ only
+/// in cause, for logging.
+///
+/// Each variant carries the `Retry-After` hint (in seconds) computed at the
+/// moment of rejection. The type is `Copy` and comparable, so callers and
+/// tests can match on it or assert equality directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AdmissionRejection {
+    /// The bounded wait queue was already full.
+    QueueFull { retry_after_secs: u64 },
+    /// A queue slot was taken but the in-flight slot didn't free within
+    /// `max_wait`.
+    Timeout { retry_after_secs: u64 },
+}
+
+impl AdmissionRejection {
+    /// The `Retry-After` hint, in seconds, carried by this rejection
+    /// regardless of which variant it is.
+    pub fn retry_after_secs(&self) -> u64 {
+        match *self {
+            Self::QueueFull { retry_after_secs } => retry_after_secs,
+            Self::Timeout { retry_after_secs } => retry_after_secs,
+        }
+    }
+}
+
+/// Bounded batch-1 scheduler for one loaded model.
+///
+/// Requests are admitted through [`AdmissionController::enter`]; the live
+/// load is readable via [`AdmissionController::in_flight`] and
+/// [`AdmissionController::queue_depth`].
+pub struct AdmissionController {
+    /// In-flight slots — `max_in_flight` permits (1 for batch-1).
+    slots: Arc<Semaphore>,
+    /// Queued + in-flight count, for fast rejection and load reporting.
+    /// Shared (`Arc`) with every issued permit so the permit can decrement
+    /// it when dropped.
+    pending: Arc<AtomicUsize>,
+    /// `max_in_flight + max_queue_depth` — the rejection threshold.
+    max_pending: usize,
+    /// Number of permits `slots` was created with (clamped to at least 1 in
+    /// `new`); used to derive the in-flight count from the permits left.
+    max_in_flight: usize,
+    /// Longest a queued request waits for an in-flight slot before it is
+    /// rejected with a timeout.
+    max_wait: Duration,
+}
+
+impl AdmissionController {
+    /// Build a controller from a model's admission settings.
+    ///
+    /// `max_in_flight` is clamped to at least 1, and the rejection threshold
+    /// (`max_in_flight + max_queue_depth`) saturates instead of overflowing
+    /// when the queue depth is configured near `usize::MAX`.
+    pub fn new(cfg: &AdmissionConfig) -> Self {
+        // A controller with zero in-flight slots would deadlock; clamp.
+        let max_in_flight = cfg.max_in_flight.max(1);
+        Self {
+            slots: Arc::new(Semaphore::new(max_in_flight)),
+            pending: Arc::new(AtomicUsize::new(0)),
+            max_pending: max_in_flight.saturating_add(cfg.max_queue_depth),
+            max_in_flight,
+            max_wait: Duration::from_secs(cfg.max_wait_secs),
+        }
+    }
+
+    /// Admit a request: reserve a queue slot (fast-rejecting if full), then
+    /// wait up to `max_wait` for an in-flight slot. The returned permit must
+    /// be held for the request's lifetime; dropping it frees both slots.
+    ///
+    /// # Errors
+    ///
+    /// * [`AdmissionRejection::QueueFull`] — the pending count was already at
+    ///   the threshold; returned without waiting.
+    /// * [`AdmissionRejection::Timeout`] — no in-flight slot freed within
+    ///   `max_wait`.
+    ///
+    /// # Cancel safety
+    ///
+    /// If the returned future is dropped while it is parked waiting for a
+    /// slot (e.g. the client disconnected and the handler was cancelled), the
+    /// queue reservation is rolled back, so cancelled waiters never eat into
+    /// the queue capacity.
+    pub async fn enter(&self) -> Result<AdmissionPermit, AdmissionRejection> {
+        // Undoes the `pending` reservation when dropped. This covers the two
+        // explicit rejection paths *and* cancellation of this future at the
+        // `.await` below, which would otherwise leak the reservation.
+        struct Reservation<'a> {
+            pending: &'a AtomicUsize,
+        }
+
+        impl Drop for Reservation<'_> {
+            fn drop(&mut self) {
+                self.pending.fetch_sub(1, Ordering::AcqRel);
+            }
+        }
+
+        // Reserve a pending slot up front so concurrent callers can't all
+        // slip past the threshold check. Roll back if we're over capacity.
+        let prev = self.pending.fetch_add(1, Ordering::AcqRel);
+        let reservation = Reservation {
+            pending: &self.pending,
+        };
+        if prev >= self.max_pending {
+            // Release first so the retry hint reflects the real backlog.
+            drop(reservation);
+            return Err(AdmissionRejection::QueueFull {
+                retry_after_secs: self.retry_hint(),
+            });
+        }
+
+        match tokio::time::timeout(self.max_wait, Arc::clone(&self.slots).acquire_owned()).await {
+            Ok(Ok(permit)) => {
+                // The permit takes over the reservation: its own `Drop`
+                // performs the decrement, so the guard must not run too.
+                std::mem::forget(reservation);
+                Ok(AdmissionPermit {
+                    _permit: permit,
+                    pending: Arc::clone(&self.pending),
+                })
+            }
+            // Semaphore is never closed; treat a closed/elapsed wait the same.
+            Ok(Err(_)) | Err(_) => {
+                drop(reservation);
+                Err(AdmissionRejection::Timeout {
+                    retry_after_secs: self.retry_hint(),
+                })
+            }
+        }
+    }
+
+    /// Requests currently running (holding an in-flight slot), derived from
+    /// how many of the semaphore's permits are checked out.
+    pub fn in_flight(&self) -> usize {
+        self.max_in_flight
+            .saturating_sub(self.slots.available_permits())
+    }
+
+    /// Requests waiting for an in-flight slot: the pending count minus the
+    /// running ones. The two reads are not taken atomically together, so the
+    /// value is a best-effort snapshot (saturating at zero).
+    pub fn queue_depth(&self) -> usize {
+        self.pending
+            .load(Ordering::Acquire)
+            .saturating_sub(self.in_flight())
+    }
+
+    /// Rough `Retry-After`: scale with how backed-up the model is, clamped to
+    /// a sane band. Without per-request timing this is a heuristic, but it
+    /// gives well-behaved clients (opencode/AI SDK) a sensible backoff.
+    fn retry_hint(&self) -> u64 {
+        ((self.queue_depth() as u64 + 1) * 2).clamp(1, 120)
+    }
+}
+
+/// Held for a request's lifetime; frees the in-flight + queue slot on drop.
+///
+/// Bind the permit to a named variable that lives as long as the inference
+/// call. Discarding it (for example `ctrl.enter().await?;`) releases the slot
+/// immediately and lets the request run without admission control, hence the
+/// `#[must_use]`.
+#[derive(Debug)]
+#[must_use = "dropping the permit immediately releases the admission slot"]
+pub struct AdmissionPermit {
+    /// The in-flight slot; returned to the semaphore when the permit drops.
+    _permit: OwnedSemaphorePermit,
+    /// The controller's shared pending counter, decremented on drop.
+    pending: Arc<AtomicUsize>,
+}
+
+impl Drop for AdmissionPermit {
+    /// Give back the pending (queue + in-flight) reservation taken when the
+    /// request was admitted. The semaphore permit field releases the
+    /// in-flight slot through its own drop.
+    fn drop(&mut self) {
+        let counter: &AtomicUsize = &self.pending;
+        counter.fetch_sub(1, Ordering::AcqRel);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build an `AdmissionConfig` from the three knobs these tests vary.
+    fn cfg(max_in_flight: usize, max_queue_depth: usize, max_wait_secs: u64) -> AdmissionConfig {
+        AdmissionConfig {
+            max_in_flight,
+            max_queue_depth,
+            max_wait_secs,
+        }
+    }
+
+    /// A single request is admitted immediately and shows up as in-flight
+    /// (not queued) until its permit is dropped.
+    #[tokio::test]
+    async fn admits_up_to_in_flight_and_reports_load() {
+        let ctrl = AdmissionController::new(&cfg(1, 4, 30));
+        assert_eq!(ctrl.in_flight(), 0);
+        let p = ctrl.enter().await.expect("first admits");
+        assert_eq!(ctrl.in_flight(), 1);
+        assert_eq!(ctrl.queue_depth(), 0);
+        drop(p);
+        assert_eq!(ctrl.in_flight(), 0);
+    }
+
+    #[tokio::test]
+    async fn rejects_when_queue_full() {
+        // 1 in-flight + 1 queue slot = capacity 2; the 3rd is refused fast.
+        let ctrl = Arc::new(AdmissionController::new(&cfg(1, 1, 30)));
+        let running = ctrl.enter().await.expect("admit running");
+
+        // Fill the single queue slot with a waiter that parks on the semaphore.
+        let ctrl2 = Arc::clone(&ctrl);
+        let waiter = tokio::spawn(async move { ctrl2.enter().await.map(drop) });
+        // Wait until the waiter has actually taken the queue slot. Polling
+        // the observable state (bounded by a generous timeout) avoids the
+        // flakiness and fixed delay of a blind sleep.
+        tokio::time::timeout(Duration::from_secs(5), async {
+            while ctrl.queue_depth() != 1 {
+                tokio::task::yield_now().await;
+            }
+        })
+        .await
+        .expect("waiter never occupied the queue slot");
+        assert_eq!(ctrl.queue_depth(), 1);
+
+        // Queue full → immediate QueueFull with a Retry-After hint.
+        match ctrl.enter().await {
+            Err(AdmissionRejection::QueueFull { retry_after_secs }) => {
+                assert!(retry_after_secs >= 1)
+            }
+            other => panic!("expected QueueFull, got {other:?}"),
+        }
+
+        // Release the runner so the parked waiter can proceed and finish.
+        drop(running);
+        waiter.await.unwrap().unwrap();
+    }
+
+    #[tokio::test]
+    async fn rejects_on_wait_timeout() {
+        // With zero queue depth and a runner holding the only slot, a second
+        // request can't even queue, so it would be QueueFull, not Timeout.
+        // Use a queue of 1 and a zero max_wait to exercise the timeout path.
+        let ctrl = Arc::new(AdmissionController::new(&cfg(1, 1, 0)));
+        let _running = ctrl.enter().await.expect("admit running");
+        // max_wait 0 → the queued request times out almost immediately.
+        match ctrl.enter().await {
+            Err(AdmissionRejection::Timeout { .. }) => {}
+            other => panic!("expected Timeout, got {other:?}"),
+        }
+        // The timed-out request released its queue slot.
+        assert_eq!(ctrl.queue_depth(), 0);
+    }
+}
--- a/Show More
+++ b/Show More