Merge fix/cortex-poll-debounce-retryable: poll debounce + retryable 503 for feasible-but-unhealthy node

fix(cortex): poll-failure debounce + retryable 503 for feasible-but-unhealthy node
Defense-in-depth for the agent0 NoFeasibleNeuron storm (root cause fixed in neuron). Two cortex resilience gaps this incident exposed:

1. Brittle health flip: the poller marked a node unhealthy on a SINGLE missed /models poll, instantly yanking the node and all its models from routing. A busy neuron briefly slow to answer shouldn't be declared dead. Now debounced: NodeState.consecutive_poll_failures must reach POLL_FAILURE_THRESHOLD (3) before the node flips unhealthy (~20s after the first miss at the 10s poll interval); any successful poll resets it. A never-healthy node stays unhealthy (the counter only protects an already-healthy node from blips).

2. Transient surfaced as permanent: when a catalogued model's only feasible neuron is momentarily unhealthy, the router returned 404 NoFeasibleNeuron — which litellm/clients treat as non-retryable, so agent0 hard-failed. pick_feasible_neuron now distinguishes "a feasible node exists but is unhealthy right now" → new RouteError::FeasibleNodeUnhealthy (503 + Retry-After: 3, retryable) from "no node could ever satisfy the topology" → 404 NoFeasibleNeuron (permanent). Mirrors the beast case exactly: healthy 1-GPU nodes + an unhealthy 2-GPU node → retry, don't fail.

Tests: poller test updated to assert debounce (1 miss keeps healthy, 3 flip); new feasibility_routing tests cover transient-503 vs permanent-404. Local fmt/clippy/test green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-18 12:46:30 +03:00 · 2026-06-18 12:39:18 +03:00 · 2026-06-18 12:32:15 +03:00 · 2026-06-17 21:40:34 +03:00 · 2026-06-17 20:57:55 +03:00 · 2026-06-17 20:51:26 +03:00
21 changed files with 1859 additions and 62 deletions
--- a/crates/cortex-core/src/discovery.rs
+++ b/crates/cortex-core/src/discovery.rs
@@ -68,6 +68,57 @@ pub struct HealthResponse {
    pub devices: Vec<DeviceHealth>,
    #[serde(default)]
    pub activation: ActivationStatus,
+    /// Per-model admission load (#53): how many requests are running vs.
+    /// queued on each loaded model right now. Cortex's load-aware router
+    /// (#55) reads this to spread traffic across replicas and to propagate
+    /// honest backpressure. `#[serde(default)]` keeps older gateways/neurons
+    /// interoperable (absent → empty → treated as no load info).
+    #[serde(default)]
+    pub models: Vec<ModelLoad>,
+}
+
+/// Live admission load for one loaded model (#53).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelLoad {
+    pub id: String,
+    /// Requests currently running (batch-1 → 0 or 1).
+    pub in_flight: usize,
+    /// Requests waiting in the bounded admission queue.
+    pub queue_depth: usize,
+}
+
+#[cfg(test)]
+mod health_load_tests {
+    use super::*;
+
+    #[test]
+    fn health_response_without_models_field_still_deserializes() {
+        // A pre-#53 neuron's /health payload omits `models`; the gateway
+        // must still parse it (serde default → empty).
+        let json = r#"{"uptime_secs":42,"devices":[]}"#;
+        let resp: HealthResponse = serde_json::from_str(json).expect("back-compat parse");
+        assert_eq!(resp.uptime_secs, 42);
+        assert!(resp.models.is_empty());
+    }
+
+    #[test]
+    fn health_response_round_trips_model_load() {
+        let resp = HealthResponse {
+            uptime_secs: 1,
+            devices: vec![],
+            activation: ActivationStatus::default(),
+            models: vec![ModelLoad {
+                id: "Qwen/Qwen3.6-27B".into(),
+                in_flight: 1,
+                queue_depth: 3,
+            }],
+        };
+        let s = serde_json::to_string(&resp).unwrap();
+        let back: HealthResponse = serde_json::from_str(&s).unwrap();
+        assert_eq!(back.models.len(), 1);
+        assert_eq!(back.models[0].in_flight, 1);
+        assert_eq!(back.models[0].queue_depth, 3);
+    }
 }

 /// High-level activation state of the neuron daemon. The HTTP listener
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -1,4 +1,4 @@
-use crate::discovery::{ActivationStatus, DiscoveryResponse};
+use crate::discovery::{ActivationStatus, DiscoveryResponse, ModelLoad};
 use crate::harness::{ModelCost, ModelLimit};
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
@@ -27,6 +27,17 @@ pub struct NodeState {
    /// to synthesize `Loading` locations so clients see a catalogued
    /// model that's mid-prewarm as "loading", not "missing".
    pub activation: Option<ActivationStatus>,
+    /// Last-seen per-model admission load from this neuron's `/health`
+    /// (#53), keyed by model id. The router (#55) reads it to pick the
+    /// least-busy replica when a model is loaded on more than one neuron.
+    /// Empty until the first /health poll reports load.
+    pub model_load: HashMap<String, ModelLoad>,
+    /// Consecutive failed `/models` polls. The poller marks a node
+    /// unhealthy only once this crosses a threshold, so a single transient
+    /// miss (e.g. a neuron momentarily slow to answer while busy) doesn't
+    /// yank the node — and all its models — out of routing. Reset to 0 on
+    /// any successful poll.
+    pub consecutive_poll_failures: u32,
 }

 /// A model registered on a node, with its runtime status.
--- a/crates/cortex-gateway/src/auth.rs
+++ b/crates/cortex-gateway/src/auth.rs
@@ -83,9 +83,23 @@ pub async fn require_principal(
                req.extensions_mut().insert(principal);
                next.run(req).await
            }
-            // A present-but-invalid credential is always an error, even when
-            // anonymous access is otherwise allowed.
-            Err(_) => unauthorized("invalid API key"),
+            // An unrecognized key only hard-fails when auth is *required*.
+            // In allow-anonymous mode (the default) we must IGNORE it and
+            // serve the request unauthenticated — otherwise the placeholder
+            // keys that OpenAI-compatible clients send by default (opencode,
+            // Open WebUI, Agent Zero, litellm) would all break, even though
+            // the operator never opted into auth. Pre-#49 the bearer was
+            // never inspected at all; this preserves that for require_auth=false.
+            Err(_) => {
+                if fleet.require_auth {
+                    unauthorized("invalid API key")
+                } else {
+                    tracing::debug!(
+                        "ignoring unrecognized bearer token (require_auth=false): serving anonymously"
+                    );
+                    next.run(req).await
+                }
+            }
        },
        None => {
            if fleet.require_auth {
--- a/crates/cortex-gateway/src/handlers.rs
+++ b/crates/cortex-gateway/src/handlers.rs
@@ -306,19 +306,25 @@ async fn anthropic_messages(
    }
    let start = Instant::now();

-    // Per-request metering (#51), same lifecycle as the OpenAI paths:
-    // reserve (0 tokens this phase) and build the completion sink. Consumed
-    // by whichever branch runs below; dropping it unused releases the
-    // reservation.
+    // Per-request metering + budget enforcement (#51/#52), same lifecycle as
+    // the OpenAI paths. Estimate from the translated OpenAI body (what neuron
+    // sees). Refuse over-cap before dispatch via the #63 envelope; otherwise
+    // build the sink consumed by whichever branch runs below.
    let usage_sink = match crate::metering::principal_from_headers(&headers) {
        Some(principal) => {
-            let guard = crate::metering::ReservationGuard::reserve(
+            let advertised =
+                advertised_output_limit(&fleet, &route.node_name, &route.resolved_model_id).await;
+            let max_tokens = crate::metering::reservation_estimate(&openai_body, advertised);
+            match crate::metering::reserve_or_reject(
                Arc::clone(&fleet.entitlements),
                &principal,
-                0,
+                max_tokens,
            )
-            .await;
-            Some(crate::metering::usage_sink(principal, guard))
+            .await
+            {
+                Ok(guard) => Some(crate::metering::usage_sink(principal, guard)),
+                Err(env) => return crate::error::envelope_response(env),
+            }
        }
        None => None,
    };
@@ -755,6 +761,19 @@ async fn proxy_with_metrics(
    body: Bytes,
    model_id: &str,
 ) -> Response {
+    // Fail-fast prompt pre-validation (#56): refuse a prompt that already
+    // exceeds the model's advertised context window *before* dispatching to
+    // neuron — the same `400 context_length_exceeded` neuron would emit on
+    // overflow, just earlier and without burning a cold-load/queue slot.
+    // cortex has no tokenizer, so the estimate under-counts and neuron stays
+    // the exact wall; we only catch gross overages (the A0 failure mode).
+    if let Some(context) = advertised_context(fleet, &route.node_name, model_id).await {
+        let est = estimate_prompt_tokens(&body);
+        if est > context {
+            return context_length_exceeded_response(context, est, &headers);
+        }
+    }
+
    let labels = [
        ("model", model_id.to_string()),
        ("node", route.node_name.clone()),
@@ -765,20 +784,27 @@ async fn proxy_with_metrics(
        metrics::counter!("cortex_cold_starts_total", &labels).increment(1);
    }

-    // Per-request metering (#51): reconstruct the principal from the
-    // middleware-stamped headers, reserve (0 tokens this phase — metering
-    // only; #52 makes it the real cap), and build the completion sink that
-    // settles spend when the response finishes. Anonymous requests get no
-    // sink. Must happen before `headers`/`body` are moved into the proxy.
+    // Per-request metering + budget enforcement (#51/#52): reconstruct the
+    // principal from the middleware-stamped headers, reserve the request's
+    // upper-bound cost (prompt estimate + max output), and build the
+    // completion sink that settles actual spend when the response finishes.
+    // A reservation over the hard cap is refused *before* dispatch with the
+    // #63 envelope. Anonymous requests skip all of this. Must happen before
+    // `headers`/`body` are moved into the proxy.
    let usage_sink = match crate::metering::principal_from_headers(&headers) {
        Some(principal) => {
-            let guard = crate::metering::ReservationGuard::reserve(
+            let advertised = advertised_output_limit(fleet, &route.node_name, model_id).await;
+            let max_tokens = crate::metering::reservation_estimate(&body, advertised);
+            match crate::metering::reserve_or_reject(
                Arc::clone(&fleet.entitlements),
                &principal,
-                0,
+                max_tokens,
            )
-            .await;
-            Some(crate::metering::usage_sink(principal, guard))
+            .await
+            {
+                Ok(guard) => Some(crate::metering::usage_sink(principal, guard)),
+                Err(env) => return crate::error::envelope_response(env),
+            }
        }
        None => None,
    };
@@ -812,6 +838,117 @@ async fn proxy_with_metrics(
    }
 }

+/// The model's advertised `limit.output` (#62) on a given node, used as the
+/// default output budget for budget reservations (#52) when the request
+/// omits `max_(completion_)tokens`. `None` when the node/model/limit is
+/// unknown — callers fall back to [`crate::metering::FALLBACK_MAX_OUTPUT`].
+async fn advertised_output_limit(
+    fleet: &CortexState,
+    node_name: &str,
+    model_id: &str,
+) -> Option<u64> {
+    let nodes = fleet.nodes.read().await;
+    nodes
+        .get(node_name)?
+        .models
+        .get(model_id)?
+        .limit
+        .as_ref()
+        .map(|l| l.output as u64)
+}
+
+/// The model's advertised hard context window (`limit.context`, #62/#67) on a
+/// node, used for fail-fast prompt pre-validation (#56). `None` when no limit
+/// is known — pre-validation is then skipped and neuron remains the wall.
+async fn advertised_context(fleet: &CortexState, node_name: &str, model_id: &str) -> Option<u64> {
+    let nodes = fleet.nodes.read().await;
+    nodes
+        .get(node_name)?
+        .models
+        .get(model_id)?
+        .limit
+        .as_ref()
+        .map(|l| l.context as u64)
+}
+
+/// Conservative prompt-token estimate (~4 chars/token over message text).
+/// cortex has no tokenizer; under-counting is the safe direction — we only
+/// pre-reject gross overages (#56), and neuron enforces the exact wall.
+fn estimate_prompt_tokens(body: &[u8]) -> u64 {
+    let Ok(v) = serde_json::from_slice::<Value>(body) else {
+        return (body.len() as u64 / 4).max(1);
+    };
+    let mut chars = 0usize;
+    if let Some(messages) = v.get("messages").and_then(Value::as_array) {
+        for m in messages {
+            match m.get("content") {
+                Some(Value::String(s)) => chars += s.len(),
+                Some(Value::Array(parts)) => {
+                    for p in parts {
+                        if let Some(t) = p.get("text").and_then(Value::as_str) {
+                            chars += t.len();
+                        }
+                    }
+                }
+                _ => {}
+            }
+            chars += 8; // rough per-message role/formatting overhead
+        }
+    } else if let Some(prompt) = v.get("prompt").and_then(Value::as_str) {
+        chars += prompt.len(); // legacy /v1/completions
+    } else {
+        return (body.len() as u64 / 4).max(1);
+    }
+    (chars as u64 / 4).max(1)
+}
+
+/// Client-specific, advisory guidance for an over-long prompt (#56),
+/// fingerprinted from `User-Agent`. Strictly advisory: it rides the
+/// `X-Helexa-Advice` header only, never the error envelope, and behaviour
+/// never depends on it. Unknown clients get nothing.
+fn client_advice(headers: &HeaderMap) -> Option<&'static str> {
+    let ua = headers
+        .get(axum::http::header::USER_AGENT)?
+        .to_str()
+        .ok()?
+        .to_ascii_lowercase();
+    if ua.contains("litellm") {
+        Some(
+            "litellm forwards the full context; lower the configured context window or enable client-side compaction",
+        )
+    } else if ua.contains("agent-zero") || ua.contains("agent zero") {
+        Some("reduce the conversation/context size or summarize earlier turns before resending")
+    } else if ua.contains("zed") {
+        Some("reduce the assistant context window in Zed's settings")
+    } else {
+        None
+    }
+}
+
+/// `400 context_length_exceeded` for an over-long prompt caught at the edge
+/// (#56), in the #60 envelope — the same shape neuron emits on overflow, so
+/// clients (opencode auto-compacts) handle it identically. Attaches the
+/// advisory `X-Helexa-Advice` header for fingerprinted clients.
+fn context_length_exceeded_response(
+    context: u64,
+    prompt_est: u64,
+    headers: &HeaderMap,
+) -> Response {
+    let env = OpenAiError::context_length_exceeded(format!(
+        "This model's maximum context length is {context} tokens. Your request is \
+         estimated at ~{prompt_est} tokens. Please reduce the length of the messages."
+    ))
+    .with_extra("max", json!(context))
+    .with_extra("estimated_prompt_tokens", json!(prompt_est));
+    let mut response = crate::error::envelope_response(env);
+    if let Some(advice) = client_advice(headers)
+        && let Ok(value) = axum::http::HeaderValue::from_str(advice)
+    {
+        response.headers_mut().insert("x-helexa-advice", value);
+    }
+    response
+}
+
 /// Update `last_accessed` timestamp for a model on a node (drives LRU eviction).
 async fn touch_model(fleet: &CortexState, node_name: &str, model_id: &str) {
    let mut nodes = fleet.nodes.write().await;
--- a/crates/cortex-gateway/src/metering.rs
+++ b/crates/cortex-gateway/src/metering.rs
@@ -19,9 +19,17 @@
 //! or dropped stream can't strand a reservation.

 use axum::http::HeaderMap;
-use cortex_core::entitlements::{EntitlementProvider, HEADER_ACCOUNT_ID, HEADER_KEY_ID, Principal};
+use cortex_core::entitlements::{
+    BudgetError, EntitlementProvider, HEADER_ACCOUNT_ID, HEADER_KEY_ID, Principal,
+};
+use cortex_core::error_envelope::OpenAiError;
 use std::sync::Arc;

+/// Fallback output-token budget when neither the request nor the model's
+/// advertised limit gives one. Bounds the reservation so a capped key is
+/// still gated even on under-specified requests (#52).
+pub const FALLBACK_MAX_OUTPUT: u64 = 4096;
+
 /// Invoked exactly once at request completion with best-effort
 /// `(prompt_tokens, completion_tokens)`. When no usage could be observed
 /// (e.g. a pre-dispatch failure or a dropped stream) it is dropped unused —
@@ -70,18 +78,14 @@ impl ReservationGuard {
        }
    }

-    /// Reserve `max_tokens` for the principal, returning a guard. In this
-    /// phase callers pass `0` (metering only); #52 passes the real cap and
-    /// surfaces the [`cortex_core::entitlements::BudgetError`] instead.
-    pub async fn reserve(
+    /// Wrap an already-acquired reservation.
+    fn held(
        provider: Arc<dyn EntitlementProvider>,
-        principal: &Principal,
-        max_tokens: u64,
+        reservation: cortex_core::entitlements::Reservation,
    ) -> Self {
-        let reservation = provider.reserve(principal, max_tokens).await.ok();
        Self {
            provider,
-            reservation,
+            reservation: Some(reservation),
        }
    }

@@ -119,3 +123,97 @@ pub fn usage_sink(principal: Principal, guard: ReservationGuard) -> UsageSink {
        guard.settle(prompt + completion);
    })
 }
+
+/// Reserve the request's upper-bound token cost for the principal, refusing
+/// *before* dispatch if it would exceed the hard cap (#52). On success
+/// returns a guard the caller settles with actual usage; on refusal returns
+/// the #63 envelope (`rate_limit_exceeded` + `Retry-After` for a resetting
+/// window, `insufficient_quota` for a hard balance — never `402`).
+pub async fn reserve_or_reject(
+    provider: Arc<dyn EntitlementProvider>,
+    principal: &Principal,
+    max_tokens: u64,
+) -> Result<ReservationGuard, OpenAiError> {
+    match provider.reserve(principal, max_tokens).await {
+        Ok(reservation) => Ok(ReservationGuard::held(provider, reservation)),
+        Err(err) => Err(budget_error_to_envelope(err)),
+    }
+}
+
+/// Map a [`BudgetError`] to the #63 envelope. The provider chose the window
+/// semantics; this only translates them to HTTP.
+fn budget_error_to_envelope(err: BudgetError) -> OpenAiError {
+    match err {
+        BudgetError::RateLimited {
+            retry_after_secs, ..
+        } => OpenAiError::rate_limit_exceeded(err.to_string(), retry_after_secs),
+        BudgetError::InsufficientQuota { .. } => OpenAiError::insufficient_quota(err.to_string()),
+    }
+}
+
+/// Upper-bound tokens to reserve for a request (#52): an over-estimate of
+/// the prompt plus the maximum output. `advertised_output` is the model's
+/// `limit.output` (#62), used when the request omits `max_(completion_)tokens`.
+/// Over-reserving is safe — settle corrects spend to the actual usage.
+pub fn reservation_estimate(body: &[u8], advertised_output: Option<u64>) -> u64 {
+    let max_output = requested_max_output(body)
+        .or(advertised_output)
+        .unwrap_or(FALLBACK_MAX_OUTPUT);
+    estimate_prompt_tokens(body).saturating_add(max_output)
+}
+
+/// The client's requested output cap, from `max_completion_tokens` (or the
+/// legacy `max_tokens`). `None` when unspecified.
+fn requested_max_output(body: &[u8]) -> Option<u64> {
+    let v: serde_json::Value = serde_json::from_slice(body).ok()?;
+    v.get("max_completion_tokens")
+        .or_else(|| v.get("max_tokens"))
+        .and_then(serde_json::Value::as_u64)
+}
+
+/// Rough prompt-token estimate at ~4 chars/token over the whole body. cortex
+/// has no tokenizer; JSON overhead makes this a conservative over-estimate,
+/// and neuron remains the exact context wall (#56/#60). Settle reconciles to
+/// the real usage afterward.
+fn estimate_prompt_tokens(body: &[u8]) -> u64 {
+    (body.len() as u64 / 4).max(1)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn requested_max_output_prefers_max_completion_tokens() {
+        let body = br#"{"model":"m","max_completion_tokens":256,"max_tokens":99}"#;
+        assert_eq!(requested_max_output(body), Some(256));
+    }
+
+    #[test]
+    fn requested_max_output_falls_back_to_legacy_max_tokens() {
+        let body = br#"{"model":"m","max_tokens":128}"#;
+        assert_eq!(requested_max_output(body), Some(128));
+    }
+
+    #[test]
+    fn estimate_uses_requested_output_when_present() {
+        // Requested output dominates; prompt estimate is small for a tiny body.
+        let body = br#"{"model":"m","max_tokens":1000}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 1000 && est < 1100, "est was {est}");
+    }
+
+    #[test]
+    fn estimate_uses_advertised_output_when_request_omits_it() {
+        let body = br#"{"model":"m","messages":[]}"#;
+        let est = reservation_estimate(body, Some(8192));
+        assert!(est >= 8192, "est was {est}");
+    }
+
+    #[test]
+    fn estimate_falls_back_when_nothing_advertised() {
+        let body = br#"{"model":"m"}"#;
+        let est = reservation_estimate(body, None);
+        assert!(est >= FALLBACK_MAX_OUTPUT, "est was {est}");
+    }
+}
--- a/crates/cortex-gateway/src/poller.rs
+++ b/crates/cortex-gateway/src/poller.rs
@@ -5,12 +5,29 @@ use crate::state::CortexState;
 use chrono::Utc;
 use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
 use cortex_core::harness::ModelInfo;
-use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_core::node::{ModelEntry, ModelStatus, NodeState};
 use std::sync::Arc;
 use std::time::Duration;

 const POLL_INTERVAL: Duration = Duration::from_secs(10);

+/// Consecutive failed `/models` polls before a node is marked unhealthy.
+/// Debounces transient misses (a busy neuron briefly slow to answer) so a
+/// single blip can't yank a node — and its models — out of routing. At the
+/// 10s poll interval this tolerates ~20s of flapping before evicting.
+const POLL_FAILURE_THRESHOLD: u32 = 3;
+
+/// Record a failed poll for `node`, marking it unhealthy only once failures
+/// reach [`POLL_FAILURE_THRESHOLD`]. Below the threshold the node keeps its
+/// last-known health, riding over transient misses. A successful poll resets
+/// the counter (see the success arm in `poll_once`).
+fn record_poll_failure(node: &mut NodeState) {
+    node.consecutive_poll_failures = node.consecutive_poll_failures.saturating_add(1);
+    if node.consecutive_poll_failures >= POLL_FAILURE_THRESHOLD {
+        node.healthy = false;
+    }
+}
+
 /// Runs forever, polling all neurons on a fixed interval.
 pub async fn poll_loop(fleet: Arc<CortexState>) {
    loop {
@@ -138,13 +155,14 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
                    // Remove models no longer reported by the neuron.
                    node.models.retain(|id, _| seen.contains(id));

+                    node.consecutive_poll_failures = 0;
                    node.healthy = true;
                    node.last_poll = Some(Utc::now());
                    tracing::debug!(node = name, models = models.len(), "poll ok");
                }
                Err(e) => {
                    tracing::warn!(node = name, error = %e, "failed to parse /models response");
-                    node.healthy = false;
+                    record_poll_failure(node);
                }
            }
        }
@@ -154,11 +172,11 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
                status = %resp.status(),
                "neuron returned non-success status"
            );
-            node.healthy = false;
+            record_poll_failure(node);
        }
        Err(e) => {
            tracing::warn!(node = name, error = %e, "failed to reach neuron");
-            node.healthy = false;
+            record_poll_failure(node);
        }
    }

@@ -200,6 +218,9 @@ async fn poll_health(fleet: &CortexState, name: &str, endpoint: &str) {
            let mut nodes = fleet.nodes.write().await;
            if let Some(node) = nodes.get_mut(name) {
                node.activation = Some(h.activation);
+                // Per-model admission load (#53) → keyed by id for the
+                // load-aware router (#55).
+                node.model_load = h.models.into_iter().map(|m| (m.id.clone(), m)).collect();
            }
        }
        Err(e) => {
--- a/crates/cortex-gateway/src/router.rs
+++ b/crates/cortex-gateway/src/router.rs
@@ -50,6 +50,10 @@ pub enum RouteError {
        "model '{model_id}' is in the catalogue but no healthy neuron's topology satisfies its constraints"
    )]
    NoFeasibleNeuron { model_id: String },
+    #[error(
+        "model '{model_id}' is feasible on a neuron that is currently unhealthy — retry shortly"
+    )]
+    FeasibleNodeUnhealthy { model_id: String },
    #[error("cold-load of '{model_id}' on '{node}' failed: {message}")]
    ColdLoadFailed {
        model_id: String,
@@ -68,7 +72,9 @@ impl RouteError {
    /// safe to retry the same request); everything else is 404.
    pub fn http_status(&self) -> u16 {
        match self {
-            RouteError::NoHealthyNodes | RouteError::ModelRecovering { .. } => 503,
+            RouteError::NoHealthyNodes
+            | RouteError::ModelRecovering { .. }
+            | RouteError::FeasibleNodeUnhealthy { .. } => 503,
            _ => 404,
        }
    }
@@ -81,7 +87,8 @@ impl RouteError {
            | RouteError::EndpointResolveFailed(_, _)
            | RouteError::NoFeasibleNeuron { .. }
            | RouteError::ColdLoadFailed { .. }
-            | RouteError::ModelRecovering { .. } => "api_error",
+            | RouteError::ModelRecovering { .. }
+            | RouteError::FeasibleNodeUnhealthy { .. } => "api_error",
        }
    }

@@ -94,6 +101,7 @@ impl RouteError {
            RouteError::NoFeasibleNeuron { .. } => "service_unavailable",
            RouteError::ColdLoadFailed { .. } => "service_unavailable",
            RouteError::ModelRecovering { .. } => "service_unavailable",
+            RouteError::FeasibleNodeUnhealthy { .. } => "service_unavailable",
        }
    }

@@ -105,6 +113,7 @@ impl RouteError {
    pub fn retry_after_secs(&self) -> Option<u64> {
        match self {
            RouteError::ModelRecovering { .. } => Some(2),
+            RouteError::FeasibleNodeUnhealthy { .. } => Some(3),
            RouteError::NoHealthyNodes => Some(5),
            _ => None,
        }
@@ -132,7 +141,9 @@ pub async fn resolve(
    // Snapshot loaded / unloaded / recovering state from the poller cache.
    let (loaded_route, unloaded_route, recovering_node, any_healthy) = {
        let nodes = fleet.nodes.read().await;
-        let mut loaded_route = None;
+        // All healthy nodes with the model loaded, each with its current
+        // admission load (#53) so we can pick the least-busy replica (#55).
+        let mut loaded_candidates: Vec<(String, String, usize)> = Vec::new();
        let mut unloaded_route = None;
        let mut recovering_node = None;
        let mut any_healthy = false;
@@ -144,8 +155,15 @@ pub async fn resolve(
            if let Some(entry) = node.models.get(model_id) {
                match entry.status {
                    ModelStatus::Loaded | ModelStatus::Reloading => {
-                        loaded_route = Some((node.name.clone(), node.endpoint.clone(), false));
-                        break;
+                        // Least-busy score: in-flight + queued from the
+                        // neuron's last /health (#53). Unknown load (no poll
+                        // yet) scores 0 so the replica stays eligible.
+                        let score = node
+                            .model_load
+                            .get(model_id)
+                            .map(|l| l.in_flight + l.queue_depth)
+                            .unwrap_or(0);
+                        loaded_candidates.push((node.name.clone(), node.endpoint.clone(), score));
                    }
                    ModelStatus::Unloaded => {
                        if unloaded_route.is_none() {
@@ -175,6 +193,12 @@ pub async fn resolve(
                }
            }
        }
+        // Pick the least-busy loaded replica; ties break by node name for
+        // deterministic routing. `false` = not a cold start.
+        let loaded_route = loaded_candidates
+            .into_iter()
+            .min_by(|a, b| a.2.cmp(&b.2).then_with(|| a.0.cmp(&b.0)))
+            .map(|(name, endpoint, _score)| (name, endpoint, false));
        (loaded_route, unloaded_route, recovering_node, any_healthy)
    };

@@ -237,11 +261,32 @@ async fn pick_feasible_neuron(
        b.2.cmp(&a.2) // pinned first (true > false)
            .then(a.0.cmp(&b.0))
    });
-    let pick = candidates.into_iter().next();
-    pick.map(|(n, e, _)| (n, e))
-        .ok_or_else(|| RouteError::NoFeasibleNeuron {
+    if let Some((n, e, _)) = candidates.into_iter().next() {
+        return Ok((n, e));
+    }
+
+    // No *healthy* feasible neuron. Distinguish a transient outage from a
+    // permanent misconfiguration: if some neuron is topologically feasible
+    // but currently unhealthy (e.g. it briefly missed polls while busy),
+    // this is retryable — return 503 + Retry-After so the client backs off
+    // and retries instead of treating a 404 as a hard failure. Only when no
+    // neuron could *ever* satisfy the topology is it a permanent 404.
+    let feasible_but_unhealthy = nodes.values().any(|node| {
+        !node.healthy
+            && node
+                .discovery
+                .as_ref()
+                .is_some_and(|disc| profile.is_feasible_on(&node.name, &disc.devices))
+    });
+    if feasible_but_unhealthy {
+        Err(RouteError::FeasibleNodeUnhealthy {
            model_id: profile.id.clone(),
        })
+    } else {
+        Err(RouteError::NoFeasibleNeuron {
+            model_id: profile.id.clone(),
+        })
+    }
 }

 /// Issue `POST {endpoint}/models/load` for this profile on this neuron,
--- a/crates/cortex-gateway/src/state.rs
+++ b/crates/cortex-gateway/src/state.rs
@@ -37,6 +37,8 @@ impl CortexState {
                    last_poll: None,
                    discovery: None,
                    activation: None,
+                    model_load: HashMap::new(),
+                    consecutive_poll_failures: 0,
                },
            );
        }
--- a/crates/cortex-gateway/tests/auth.rs
+++ b/crates/cortex-gateway/tests/auth.rs
@@ -175,11 +175,33 @@ async fn missing_key_when_required_is_401_invalid_api_key() {
 }

 #[tokio::test]
-async fn invalid_key_is_401_even_when_auth_not_required() {
+async fn unrecognized_key_is_ignored_when_auth_not_required() {
    let (neuron, seen) = spawn_capturing_neuron().await;
-    // A present-but-wrong credential is always an error.
+    // allow-anonymous mode: a placeholder/unknown bearer (as opencode,
+    // Open WebUI, Agent Zero, litellm all send by default) must NOT be
+    // rejected — it's ignored and the request is served anonymously.
    let gateway = spawn_gateway(&neuron, one_key_config(false)).await;

+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-dummy-placeholder")
+        .json(&chat_body())
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    // Served, but anonymous — no principal stamped from the bogus key.
+    assert!(seen.lock().unwrap().account_id.is_none());
+}
+
+#[tokio::test]
+async fn invalid_key_is_401_when_auth_required() {
+    let (neuron, seen) = spawn_capturing_neuron().await;
+    // With auth required, a present-but-wrong credential is rejected.
+    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;
+
    let resp = reqwest::Client::new()
        .post(format!("{gateway}/v1/chat/completions"))
        .bearer_auth("sk-wrong")
--- a/crates/cortex-gateway/tests/budget_enforcement.rs
+++ b/crates/cortex-gateway/tests/budget_enforcement.rs
@@ -0,0 +1,253 @@
+//! Integration tests for budget enforcement (#52) — the A0 seatbelt.
+//!
+//! A reservation over the key's hard cap is refused *before* neuron is hit,
+//! with the #63 code matching the cap-window semantics (rate_limit_exceeded
+//! + Retry-After for a resetting window, insufficient_quota for a hard
+//! balance). Spend never exceeds the cap. No 402, ever.
+
+use axum::Json;
+use axum::extract::Path;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
+    GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::entitlements::{CapWindow, Principal};
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use tokio::net::TcpListener;
+
+/// Mock neuron with a hit counter on the inference path, so a test can prove
+/// a request was (or wasn't) dispatched.
+async fn spawn_counting_neuron() -> (String, Arc<AtomicU64>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let hits = Arc::new(AtomicU64::new(0));
+    let sink = Arc::clone(&hits);
+
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move |Json(body): Json<Value>| {
+                let sink = Arc::clone(&sink);
+                async move {
+                    sink.fetch_add(1, Ordering::SeqCst);
+                    let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
+                    Json(json!({
+                        "id": "chatcmpl-budget",
+                        "object": "chat.completion",
+                        "created": 1700000000_u64,
+                        "model": model,
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}],
+                        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+                    }))
+                }
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (base_url, hits)
+}
+
+async fn spawn_gateway(neuron_url: &str, key: ApiKeyConfig) -> (Arc<CortexState>, String) {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron_url.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: EntitlementsConfig {
+            require_auth: true,
+            keys: vec![key],
+        },
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let node = nodes.get_mut("mock-node").unwrap();
+        node.healthy = true;
+        node.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: None,
+            },
+        );
+    }
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (fleet, format!("http://{addr}"))
+}
+
+fn key(window: CapWindow, hard_cap: u64) -> ApiKeyConfig {
+    ApiKeyConfig {
+        key: "sk-cap".into(),
+        account_id: "acct-cap".into(),
+        key_id: Some("key-cap".into()),
+        hard_cap: Some(hard_cap),
+        window,
+    }
+}
+
+fn chat(max_tokens: u64) -> Value {
+    json!({
+        "model": "test-model",
+        "max_tokens": max_tokens,
+        "messages": [{"role": "user", "content": "hi"}]
+    })
+}
+
+#[tokio::test]
+async fn balance_over_cap_is_429_insufficient_quota_before_dispatch() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    // Cap far below a single request's reservation (max_tokens 1000).
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 10)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+    // Hard balance → no Retry-After.
+    assert!(resp.headers().get(reqwest::header::RETRY_AFTER).is_none());
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "insufficient_quota");
+    // Refused before dispatch — neuron never saw it.
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn rolling_over_cap_is_429_rate_limited_with_retry_after() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (_fleet, gateway) =
+        spawn_gateway(&neuron, key(CapWindow::Rolling { seconds: 3600 }, 10)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(1000))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::TOO_MANY_REQUESTS);
+    let retry = resp
+        .headers()
+        .get(reqwest::header::RETRY_AFTER)
+        .expect("rolling-window rejection must carry Retry-After");
+    assert!(retry.to_str().unwrap().parse::<u64>().unwrap() >= 1);
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "rate_limit_exceeded");
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn within_cap_is_served() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (_fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 1_000_000)).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .bearer_auth("sk-cap")
+        .json(&chat(50))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    assert_eq!(hits.load(Ordering::SeqCst), 1);
+}
+
+#[tokio::test]
+async fn a0_seatbelt_caps_a_runaway_fan_out() {
+    // An Agent-Zero-style key with a modest cap: a burst of requests drains
+    // it, then further requests are refused — the account stops draining and
+    // spend never exceeds the cap.
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let (fleet, gateway) = spawn_gateway(&neuron, key(CapWindow::Balance, 100)).await;
+    let client = reqwest::Client::new();
+
+    let mut ok = 0;
+    let mut refused = 0;
+    for _ in 0..20 {
+        let resp = client
+            .post(format!("{gateway}/v1/chat/completions"))
+            .bearer_auth("sk-cap")
+            .json(&chat(20))
+            .send()
+            .await
+            .unwrap();
+        match resp.status() {
+            reqwest::StatusCode::OK => {
+                ok += 1;
+                let _ = resp.bytes().await.unwrap();
+            }
+            reqwest::StatusCode::TOO_MANY_REQUESTS => {
+                refused += 1;
+                let body: Value = resp.json().await.unwrap();
+                assert_eq!(body["error"]["code"], "insufficient_quota");
+            }
+            other => panic!("unexpected status {other}"),
+        }
+    }
+
+    assert!(ok >= 1, "some requests should be served");
+    assert!(refused >= 1, "the cap must eventually refuse the fan-out");
+    assert_eq!(
+        hits.load(Ordering::SeqCst),
+        ok,
+        "refused requests never dispatched"
+    );
+
+    // Spend never exceeded the hard cap (reservation prevents overshoot).
+    // Poll briefly for in-flight settles to land.
+    let principal = Principal {
+        account_id: "acct-cap".into(),
+        key_id: "key-cap".into(),
+    };
+    for _ in 0..50 {
+        let snap = fleet.entitlements.snapshot(&principal).await.unwrap();
+        if snap.reserved == 0 {
+            break;
+        }
+        tokio::time::sleep(std::time::Duration::from_millis(20)).await;
+    }
+    let snap = fleet.entitlements.snapshot(&principal).await.unwrap();
+    assert!(snap.spent <= 100, "spent {} exceeded cap", snap.spent);
+}
--- a/crates/cortex-gateway/tests/feasibility_routing.rs
+++ b/crates/cortex-gateway/tests/feasibility_routing.rs
@@ -0,0 +1,124 @@
+//! Router: a catalogued model whose only topologically-feasible neuron is
+//! currently unhealthy is a *transient* condition (retryable 503), not a
+//! permanent 404. This is the exact shape of the beast incident: benjy/
+//! quadbrat (1 GPU, healthy) can't host the 27B, and beast (2 GPU) — the
+//! sole feasible node — briefly drops out → clients must back off and retry,
+//! not hard-fail.
+
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::discovery::{DeviceInfo, DiscoveryResponse};
+use cortex_gateway::router::{self, RouteError};
+use cortex_gateway::state::CortexState;
+use std::sync::Arc;
+
+fn devices(n: usize) -> Vec<DeviceInfo> {
+    (0..n)
+        .map(|i| DeviceInfo {
+            index: i as u32,
+            name: "RTX 5090".into(),
+            vram_total_mb: 32_768,
+            compute_capability: "9.0".into(),
+        })
+        .collect()
+}
+
+fn discovery(host: &str, n_devices: usize) -> DiscoveryResponse {
+    DiscoveryResponse {
+        hostname: host.into(),
+        os: "Linux".into(),
+        kernel: "7.0".into(),
+        cuda_version: Some("13.0".into()),
+        driver_version: Some("999".into()),
+        devices: devices(n_devices),
+        harnesses: vec!["candle".into()],
+        cuda_unavailable_reason: None,
+        max_prompt_tokens: 49_152,
+    }
+}
+
+/// Writes a one-model catalogue (`big-model`, min 2 devices) to a per-test temp path.
+fn write_catalogue() -> std::path::PathBuf {
+    let toml = "\n[[models]]\nid = \"big-model\"\nharness = \"candle\"\nmin_devices = 2\n";
+    // Unique per test thread: parallel tests must not truncate a shared catalogue file.
+    let path = std::env::temp_dir().join(format!(
+        "cortex_test_feasibility_models_{}_{:?}.toml",
+        std::process::id(),
+        std::thread::current().id()
+    ));
+    std::fs::write(&path, toml).unwrap();
+    path
+}
+
+async fn fleet_with(big_healthy: bool, big_devices: usize) -> Arc<CortexState> {
+    let cat = write_catalogue();
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![
+            NeuronEndpoint {
+                name: "small".into(),
+                endpoint: "http://127.0.0.1:1".into(),
+            },
+            NeuronEndpoint {
+                name: "big".into(),
+                endpoint: "http://127.0.0.1:2".into(),
+            },
+        ],
+        models_config: cat.to_string_lossy().into_owned(),
+        entitlements: Default::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        // "small" is healthy but only has 1 GPU → not feasible for the model.
+        let small = nodes.get_mut("small").unwrap();
+        small.healthy = true;
+        small.discovery = Some(discovery("small", 1));
+        // "big" has enough GPUs but its health is the variable under test.
+        let big = nodes.get_mut("big").unwrap();
+        big.healthy = big_healthy;
+        big.discovery = Some(discovery("big", big_devices));
+    }
+    fleet
+}
+
+#[tokio::test]
+async fn feasible_node_unhealthy_is_transient_503() {
+    // big (2 GPU, the only feasible node) is unhealthy; small (1 GPU) is
+    // healthy but can't host the model → retryable, not a permanent 404.
+    let fleet = fleet_with(false, 2).await;
+    let err = router::resolve(&fleet, "big-model")
+        .await
+        .expect_err("model can't be served right now");
+    assert!(
+        matches!(err, RouteError::FeasibleNodeUnhealthy { .. }),
+        "expected FeasibleNodeUnhealthy, got {err:?}"
+    );
+    assert_eq!(err.http_status(), 503);
+    assert_eq!(err.retry_after_secs(), Some(3));
+    assert_eq!(err.code(), "service_unavailable");
+}
+
+#[tokio::test]
+async fn no_node_can_ever_satisfy_is_permanent_404() {
+    // big is healthy but only has 1 GPU now (e.g. topology genuinely can't
+    // satisfy min_devices=2 anywhere) → permanent, non-retryable 404.
+    let fleet = fleet_with(true, 1).await;
+    let err = router::resolve(&fleet, "big-model")
+        .await
+        .expect_err("no feasible topology");
+    assert!(
+        matches!(err, RouteError::NoFeasibleNeuron { .. }),
+        "expected NoFeasibleNeuron, got {err:?}"
+    );
+    assert_eq!(err.http_status(), 404);
+    assert_eq!(err.retry_after_secs(), None);
+}
--- a/crates/cortex-gateway/tests/load_routing.rs
+++ b/crates/cortex-gateway/tests/load_routing.rs
@@ -0,0 +1,189 @@
+//! Load-aware routing across replicas (#55).
+//!
+//! When a model is loaded on more than one healthy neuron, the router picks
+//! the least-busy replica using the per-model admission load each neuron
+//! reports on `GET /health` (#53), rather than always taking the first.
+
+mod common;
+
+use axum::Json;
+use axum::extract::Path;
+use axum::http::{StatusCode, header};
+use axum::response::IntoResponse;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::discovery::ModelLoad;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// Seed a node as healthy with `test-model` loaded and a given admission load.
+async fn seed_loaded(fleet: &CortexState, node: &str, in_flight: usize, queue_depth: usize) {
+    let mut nodes = fleet.nodes.write().await;
+    let n = nodes.get_mut(node).expect("node exists");
+    n.healthy = true;
+    n.models.insert(
+        "test-model".into(),
+        ModelEntry {
+            id: "test-model".into(),
+            status: ModelStatus::Loaded,
+            last_accessed: None,
+            vram_estimate_mb: Some(8000),
+            capabilities: Vec::new(),
+            tool_call: false,
+            reasoning: false,
+            limit: None,
+        },
+    );
+    n.model_load.insert(
+        "test-model".into(),
+        ModelLoad {
+            id: "test-model".into(),
+            in_flight,
+            queue_depth,
+        },
+    );
+}
+
+/// Build a gateway state over two mock neurons (no poller; we seed state).
+async fn two_neuron_fleet(endpoint_a: &str, endpoint_b: &str) -> Arc<CortexState> {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![
+            NeuronEndpoint {
+                name: "node-a".into(),
+                endpoint: endpoint_a.to_string(),
+            },
+            NeuronEndpoint {
+                name: "node-b".into(),
+                endpoint: endpoint_b.to_string(),
+            },
+        ],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+    Arc::new(CortexState::from_config(&config))
+}
+
+#[tokio::test]
+async fn routes_to_least_busy_replica() {
+    let neuron_a = common::spawn_mock_neuron().await;
+    let neuron_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&neuron_a, &neuron_b).await;
+
+    // A is busy (1 running + 3 queued), B is idle.
+    seed_loaded(&fleet, "node-a", 1, 3).await;
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("model is loaded on both nodes");
+    assert_eq!(route.node_name, "node-b", "should pick the idle replica");
+
+    // Flip the load: now B is the busy one.
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    seed_loaded(&fleet, "node-b", 1, 5).await;
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("still loaded");
+    assert_eq!(route.node_name, "node-a", "should follow the lighter load");
+}
+
+/// Mock neuron whose inference endpoint always returns a #63 backpressure
+/// envelope (503 + Retry-After) — simulating a saturated neuron.
+async fn spawn_busy_neuron() -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(|| async {
+                let body = json!({"error": {
+                    "message": "model is busy (admission queue full); retry shortly",
+                    "type": "rate_limit_error",
+                    "code": "rate_limit_exceeded",
+                    "param": null
+                }});
+                (
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    [(header::RETRY_AFTER, "6")],
+                    Json(body),
+                )
+                    .into_response()
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    base_url
+}
+
+#[tokio::test]
+async fn neuron_backpressure_is_propagated_intact() {
+    // A saturated neuron's 503 + Retry-After + envelope must reach the client
+    // verbatim — not unwrapped, remapped, or stripped (#55 / #63).
+    let neuron = spawn_busy_neuron().await;
+    let fleet = two_neuron_fleet(&neuron, &neuron).await;
+    seed_loaded(&fleet, "node-a", 1, 8).await;
+
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+
+    let resp = reqwest::Client::new()
+        .post(format!("http://{addr}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::SERVICE_UNAVAILABLE);
+    assert_eq!(
+        resp.headers()
+            .get(reqwest::header::RETRY_AFTER)
+            .and_then(|v| v.to_str().ok()),
+        Some("6"),
+        "Retry-After must survive the proxy"
+    );
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "rate_limit_exceeded");
+}
+
+#[tokio::test]
+async fn ties_break_deterministically_by_name() {
+    let neuron_a = common::spawn_mock_neuron().await;
+    let neuron_b = common::spawn_mock_neuron().await;
+    let fleet = two_neuron_fleet(&neuron_a, &neuron_b).await;
+
+    // Equal load on both → stable pick (lowest node name).
+    seed_loaded(&fleet, "node-a", 0, 0).await;
+    seed_loaded(&fleet, "node-b", 0, 0).await;
+
+    let route = cortex_gateway::router::resolve(&fleet, "test-model")
+        .await
+        .expect("loaded");
+    assert_eq!(route.node_name, "node-a", "ties break by name");
+}
--- a/crates/cortex-gateway/tests/poller.rs
+++ b/crates/cortex-gateway/tests/poller.rs
@@ -228,10 +228,26 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
        nodes.get_mut("dead-node").unwrap().healthy = true;
    }

+    // Debounce (#53 follow-up): a single missed poll must NOT evict a
+    // previously-healthy node — a busy neuron briefly slow to answer
+    // shouldn't yank its models out of routing.
    cortex_gateway::poller::poll_once(&fleet).await;
+    assert!(
+        fleet.nodes.read().await.get("dead-node").unwrap().healthy,
+        "one failed poll should not mark a healthy node unhealthy"
+    );

-    let nodes = fleet.nodes.read().await;
-    assert!(!nodes.get("dead-node").unwrap().healthy);
+    // It flips unhealthy only after POLL_FAILURE_THRESHOLD (3) consecutive
+    // failures.
+    cortex_gateway::poller::poll_once(&fleet).await;
+    cortex_gateway::poller::poll_once(&fleet).await;
+    assert!(
+        !fleet.nodes.read().await.get("dead-node").unwrap().healthy,
+        "three consecutive failed polls should mark the node unhealthy"
+    );
+
+    // A subsequent successful poll would reset the counter and restore
+    // health; covered implicitly by the discovery tests above.
 }

 #[tokio::test]
--- a/crates/cortex-gateway/tests/prompt_prevalidation.rs
+++ b/crates/cortex-gateway/tests/prompt_prevalidation.rs
@@ -0,0 +1,174 @@
+//! Fail-fast prompt pre-validation + advisory client hints (#56).
+//!
+//! cortex refuses a prompt that already exceeds the model's advertised
+//! context window before dispatching to neuron — the same #60
+//! `context_length_exceeded` envelope neuron would emit, just earlier — and
+//! attaches an advisory `X-Helexa-Advice` header for fingerprinted clients.
+
+use axum::Json;
+use axum::extract::Path;
+use axum::routing::{get, post};
+use cortex_core::config::{
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
+};
+use cortex_core::harness::ModelLimit;
+use cortex_core::node::{ModelEntry, ModelStatus};
+use cortex_gateway::state::CortexState;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use tokio::net::TcpListener;
+
+/// Mock neuron with a hit counter, so a test can prove a request was (or
+/// wasn't) dispatched past the gateway's pre-validation.
+async fn spawn_counting_neuron() -> (String, Arc<AtomicU64>) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+    let hits = Arc::new(AtomicU64::new(0));
+    let sink = Arc::clone(&hits);
+    let app = axum::Router::new()
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({ "url": url })) }
+            }),
+        )
+        .route(
+            "/v1/chat/completions",
+            post(move || {
+                let sink = Arc::clone(&sink);
+                async move {
+                    sink.fetch_add(1, Ordering::SeqCst);
+                    Json(json!({
+                        "id": "c", "object": "chat.completion", "created": 1_700_000_000_u64,
+                        "model": "test-model",
+                        "choices": [{"index": 0, "message": {"role": "assistant", "content": "ok"}, "finish_reason": "stop"}],
+                        "usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}
+                    }))
+                }
+            }),
+        );
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    (base_url, hits)
+}
+
+/// Gateway over one neuron with `test-model` loaded and an advertised
+/// context window of `context` tokens (tests pass a tiny one to overflow it).
+async fn spawn_gateway(neuron: &str, context: usize) -> String {
+    let config = GatewayConfig {
+        gateway: GatewaySettings {
+            listen: "127.0.0.1:0".into(),
+            metrics_listen: "127.0.0.1:0".into(),
+        },
+        eviction: EvictionSettings {
+            strategy: EvictionStrategy::Lru,
+            defrag_after_cycles: 0,
+        },
+        neurons: vec![NeuronEndpoint {
+            name: "mock-node".into(),
+            endpoint: neuron.to_string(),
+        }],
+        models_config: "/dev/null".into(),
+        entitlements: Default::default(),
+    };
+    let fleet = Arc::new(CortexState::from_config(&config));
+    {
+        let mut nodes = fleet.nodes.write().await;
+        let n = nodes.get_mut("mock-node").unwrap();
+        n.healthy = true;
+        n.models.insert(
+            "test-model".into(),
+            ModelEntry {
+                id: "test-model".into(),
+                status: ModelStatus::Loaded,
+                last_accessed: None,
+                vram_estimate_mb: Some(8000),
+                capabilities: Vec::new(),
+                tool_call: false,
+                reasoning: false,
+                limit: Some(ModelLimit {
+                    context,
+                    input: None,
+                    output: 16,
+                }),
+            },
+        );
+    }
+    let app = cortex_gateway::build_app(Arc::clone(&fleet));
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    format!("http://{addr}")
+}
+
+#[tokio::test]
+async fn over_long_prompt_is_rejected_before_dispatch() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 50).await; // tiny 50-token window
+
+    // ~1200 chars → ~300 est tokens, well over 50.
+    let big = "word ".repeat(240);
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .header("user-agent", "litellm/1.0")
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": big}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::BAD_REQUEST);
+    // Advisory hint for the fingerprinted client (header only, never body).
+    assert!(
+        resp.headers().get("x-helexa-advice").is_some(),
+        "litellm should get advice"
+    );
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "context_length_exceeded");
+    assert_eq!(body["error"]["max"], 50);
+    // Refused at the edge — neuron never saw it.
+    assert_eq!(hits.load(Ordering::SeqCst), 0);
+}
+
+#[tokio::test]
+async fn within_context_passes_through() {
+    let (neuron, hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 4096).await;
+
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::OK);
+    let _ = resp.bytes().await.unwrap();
+    assert_eq!(hits.load(Ordering::SeqCst), 1, "served by neuron");
+}
+
+#[tokio::test]
+async fn unknown_client_gets_no_advice_header() {
+    let (neuron, _hits) = spawn_counting_neuron().await;
+    let gateway = spawn_gateway(&neuron, 50).await;
+
+    let big = "word ".repeat(240);
+    let resp = reqwest::Client::new()
+        .post(format!("{gateway}/v1/chat/completions"))
+        // no/unknown User-Agent → no advice, but still a clean 400
+        .json(&json!({"model": "test-model", "messages": [{"role": "user", "content": big}]}))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), reqwest::StatusCode::BAD_REQUEST);
+    assert!(resp.headers().get("x-helexa-advice").is_none());
+    let body: Value = resp.json().await.unwrap();
+    assert_eq!(body["error"]["code"], "context_length_exceeded");
+}
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -13,6 +13,7 @@ use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Json};
 use axum::routing::{get, post};
 use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
+use cortex_core::entitlements::{HEADER_ACCOUNT_ID, HEADER_KEY_ID};
 use cortex_core::harness::ModelSpec;
 use cortex_core::openai::{ChatCompletionRequest, MessageContent};
 use cortex_core::responses::{ResponsesRequest, ResponsesUsage};
@@ -71,6 +72,12 @@ async fn health_handler(State(state): State<Arc<NeuronState>>) -> Json<HealthRes
    // know about activation lifecycle.
    let mut snapshot = state.health_cache.snapshot().await;
    snapshot.activation = state.activation.snapshot().await;
+    // Per-model admission load (#53) — read live from the candle harness so
+    // cortex's load-aware router (#55) can spread traffic and propagate
+    // backpressure. Absent when no candle harness is present.
+    if let Some(candle) = &state.candle {
+        snapshot.models = candle.load_snapshot().await;
+    }
    Json(snapshot)
 }

@@ -228,6 +235,17 @@ fn default_enable_thinking(req: &mut ChatCompletionRequest, include_thinking: bo
    }
 }

+/// The request's principal for fair-share admission (#54), rebuilt from the
+/// internal headers cortex stamps (#49). cortex strips any client-supplied
+/// copy and asserts the authoritative value, so over the trusted WireGuard
+/// link these are safe to key fair-share on. `None` if either header is
+/// missing or unreadable (unauthenticated/direct): exempt from the per-principal cap.
+fn principal_key(headers: &axum::http::HeaderMap) -> Option<String> {
+    let account = headers.get(HEADER_ACCOUNT_ID)?.to_str().ok()?;
+    let key = headers.get(HEADER_KEY_ID)?.to_str().ok()?;
+    Some(format!("{account}/{key}"))
+}
+
 /// OpenAI-compatible chat completions. Dispatches to streaming SSE when
 /// `stream: true` is set on the request; otherwise returns a single
 /// `ChatCompletionResponse`.
@@ -271,8 +289,14 @@ async fn chat_completions(
    // true`) keep reasoning on.
    default_enable_thinking(&mut req, include_thinking);

+    // Fair-share admission principal (#54), from cortex's stamped headers.
+    let principal = principal_key(&headers);
+
    if req.stream.unwrap_or(false) {
-        match candle.chat_completion_stream_with(req, chat_config).await {
+        match candle
+            .chat_completion_stream_with(req, chat_config, principal)
+            .await
+        {
            Ok(rx) => {
                // Each chunk → one SSE `data: {json}` line. After the
                // channel closes, append the OpenAI [DONE] terminator.
@@ -289,7 +313,7 @@ async fn chat_completions(
            Err(e) => inference_error_response(e),
        }
    } else {
-        match candle.chat_completion(req).await {
+        match candle.chat_completion(req, principal).await {
            Ok(resp) => Json(resp).into_response(),
            Err(e) => inference_error_response(e),
        }
@@ -302,6 +326,7 @@ async fn chat_completions(
 /// event stream into the Responses event family.
 async fn responses(
    State(state): State<Arc<NeuronState>>,
+    headers: axum::http::HeaderMap,
    Json(req): Json<ResponsesRequest>,
 ) -> impl IntoResponse {
    let Some(candle) = state.candle.as_ref().map(Arc::clone) else {
@@ -336,9 +361,12 @@ async fn responses(
    };
    chat_req.stream = Some(stream_requested);

+    // Fair-share admission principal (#54), from cortex's stamped headers.
+    let principal = principal_key(&headers);
+
    if stream_requested {
        match candle
-            .responses_stream(chat_req, response_id, message_item_id)
+            .responses_stream(chat_req, response_id, message_item_id, principal)
            .await
        {
            Ok(rx) => {
@@ -362,7 +390,7 @@ async fn responses(
        // and translate the result. We don't currently re-tokenise
        // to compute usage; the harness returns it via the chat
        // response and we pass it through.
-        match candle.chat_completion(chat_req).await {
+        match candle.chat_completion(chat_req, principal).await {
            Ok(chat_resp) => {
                // Extract the assistant text (chat completions
                // always emits one choice on the candle path).
@@ -486,6 +514,24 @@ fn inference_error_response(err: InferenceError) -> axum::response::Response {
            "template_render_failed",
            format!("chat template could not render this request: {detail}"),
        ),
+        // Admission control refused on load (#53): a fast, retryable "busy"
+        // signal. 503 (service busy) + Retry-After; opencode/AI SDK back off.
+        InferenceError::Overloaded { retry_after_secs } => OpenAiError::new(
+            503,
+            "rate_limit_error",
+            "rate_limit_exceeded",
+            "model is busy (admission queue full); retry shortly",
+        )
+        .with_retry_after(retry_after_secs),
+        // Per-principal fair-share cap (#54): 429 rate_limit_exceeded +
+        // Retry-After — the caller is sending too many concurrent requests.
+        InferenceError::PerPrincipalLimit { retry_after_secs } => OpenAiError::new(
+            429,
+            "rate_limit_error",
+            "rate_limit_exceeded",
+            "too many concurrent requests for this key; retry shortly",
+        )
+        .with_retry_after(retry_after_secs),
        InferenceError::Other(e) => OpenAiError::without_code(500, "api_error", format!("{e:#}")),
    };
    envelope_response(env)
@@ -660,6 +706,26 @@ mod error_envelope_tests {
        assert_eq!(error["required_mb"], 8_192);
    }

+    #[tokio::test]
+    async fn overloaded_is_503_rate_limited_with_retry_after() {
+        // An admission rejection (#53) must surface as fast, retryable
+        // backpressure: 503, a Retry-After header, and the rate-limit code.
+        let rejection = InferenceError::Overloaded {
+            retry_after_secs: 7,
+        };
+        let resp = inference_error_response(rejection);
+        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
+
+        let retry_after = resp
+            .headers()
+            .get(axum::http::header::RETRY_AFTER)
+            .expect("admission rejection must advertise Retry-After")
+            .to_str()
+            .unwrap();
+        assert_eq!(retry_after, "7");
+
+        // The envelope body carries the machine-readable error code.
+        let raw = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let envelope: Value = serde_json::from_slice(&raw).unwrap();
+        assert_eq!(envelope["error"]["code"], "rate_limit_exceeded");
+    }
+
    #[tokio::test]
    async fn insufficient_vram_carries_retry_after() {
        // Transient 503 — VRAM frees as in-flight requests finish, so the
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -85,6 +85,68 @@ pub struct CandleHarnessConfig {
    /// `/models`, and enforces it. These knobs tune that derivation.
    #[serde(default)]
    pub context_limit: ContextLimitConfig,
+
+    /// Admission control (#53): bounds the per-model wait queue so a busy
+    /// model returns a fast, retryable `429`/`503` instead of stalling new
+    /// requests until their client times out.
+    #[serde(default)]
+    pub admission: AdmissionConfig,
+}
+
+/// `[harness.candle.admission]` settings (#53).
+///
+/// Inference is batch-1, so `max_in_flight` is 1 in practice; the queue
+/// (`max_queue_depth`) absorbs short bursts, and `max_wait_secs` caps how
+/// long a queued request waits before it's refused with backpressure.
+///
+/// Every field has a serde default and the enclosing `admission` field is
+/// itself `#[serde(default)]`, so a partial or absent table is valid.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AdmissionConfig {
+    /// Concurrent running requests per model. Batch-1 inference → 1.
+    /// `AdmissionController::new` raises a configured `0` to `1`.
+    #[serde(default = "default_admission_max_in_flight")]
+    pub max_in_flight: usize,
+    /// Queued (waiting) requests allowed beyond the in-flight one. The
+    /// `(max_in_flight + max_queue_depth + 1)`-th request is refused
+    /// immediately with `429`/`503` + `Retry-After`. `0` means a request
+    /// that cannot run at once is refused rather than queued.
+    #[serde(default = "default_admission_max_queue_depth")]
+    pub max_queue_depth: usize,
+    /// Maximum seconds a queued request waits for the in-flight slot before
+    /// it is refused (turns the old ~300s client-side hang into a fast,
+    /// honest signal). `0` makes a request that has to queue time out
+    /// almost immediately.
+    #[serde(default = "default_admission_max_wait_secs")]
+    pub max_wait_secs: u64,
+    /// Per-principal fair-share cap (#54): max in-flight + queued requests
+    /// for any single principal (resolved from the `x-helexa-*` headers
+    /// cortex stamps), so one client can't monopolize the queue while others
+    /// wait. Over-cap → `429 rate_limit_exceeded` + `Retry-After`. `0`
+    /// disables the cap; anonymous requests are always exempt.
+    #[serde(default = "default_admission_max_per_principal")]
+    pub max_per_principal: usize,
+}
+
+/// Uses the same per-field default functions serde falls back to, so
+/// `AdmissionConfig::default()` matches an empty `[harness.candle.admission]`
+/// table.
+impl Default for AdmissionConfig {
+    fn default() -> Self {
+        Self {
+            max_in_flight: default_admission_max_in_flight(),
+            max_queue_depth: default_admission_max_queue_depth(),
+            max_wait_secs: default_admission_max_wait_secs(),
+            max_per_principal: default_admission_max_per_principal(),
+        }
+    }
+}
+
+/// Default `max_in_flight`: one running request per model (batch-1).
+fn default_admission_max_in_flight() -> usize {
+    1
+}
+
+/// Default `max_queue_depth`: up to eight requests may wait behind the
+/// running one before further arrivals are refused.
+fn default_admission_max_queue_depth() -> usize {
+    8
+}
+
+/// Default `max_wait_secs`: a queued request is refused after 30 seconds.
+fn default_admission_max_wait_secs() -> u64 {
+    30
+}
+
+/// Default `max_per_principal`: a single principal may hold at most two
+/// in-flight + queued requests per model.
+fn default_admission_max_per_principal() -> usize {
+    2
 }

 /// `[harness.candle.prefix_cache]` settings.
--- a/crates/neuron/src/harness/admission.rs
+++ b/crates/neuron/src/harness/admission.rs
@@ -0,0 +1,298 @@
+//! Per-model admission control (#53).
+//!
+//! Inference against a loaded model is batch-1: one request runs at a time,
+//! serialized by the model's `inference_lock` (single-GPU) / `pool` mutex
+//! (TP). Before this, the wait for that lock was an **unbounded FIFO of
+//! mutex waiters with no timeout** — a busy model made every new request
+//! hang until its client gave up (~300s) with an opaque error.
+//!
+//! [`AdmissionController`] replaces that implicit unbounded wait with an
+//! explicit bounded scheduler: at most `max_in_flight` running (1, batch-1)
+//! plus a bounded queue of `max_queue_depth` waiters, each waiting at most
+//! `max_wait`. When the queue is full or the wait elapses, the request is
+//! rejected *immediately* — an honest, fast, retryable "busy" signal
+//! (`429`/`503` + `Retry-After` per #63) instead of a silent stall.
+//!
+//! The controller is pure async (no CUDA), so the inference paths just call
+//! [`AdmissionController::enter`] before taking the inference lock and hold
+//! the returned [`AdmissionPermit`] for the request's lifetime. Its counters
+//! ([`in_flight`](AdmissionController::in_flight) /
+//! [`queue_depth`](AdmissionController::queue_depth)) never touch the
+//! inference lock — `in_flight` reads the semaphore and `queue_depth` takes
+//! only the brief accounting mutex — so `/health` can read live load without
+//! contending with inference.
+
+use crate::config::AdmissionConfig;
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore};
+
+/// Why admission was refused. All map to the #63 backpressure envelope
+/// (`rate_limit_exceeded` + `Retry-After`); they differ in cause (and HTTP
+/// status — load → `503`, per-principal → `429`).
+///
+/// Every variant carries `retry_after_secs`, the backoff hint computed by
+/// the controller's `retry_hint` at the moment of rejection.
+#[derive(Debug, Clone, Copy)]
+pub enum AdmissionRejection {
+    /// The bounded wait queue was already full (server-side load). Returned
+    /// without waiting.
+    QueueFull { retry_after_secs: u64 },
+    /// A queue slot was taken but the in-flight slot didn't free within
+    /// `max_wait` (server-side load).
+    Timeout { retry_after_secs: u64 },
+    /// This principal already has `max_per_principal` requests in flight or
+    /// queued (#54 fair-share) — one principal can't monopolize the model.
+    /// Returned without waiting.
+    PrincipalCap { retry_after_secs: u64 },
+}
+
+impl AdmissionRejection {
+    /// The backoff hint (seconds) carried by this rejection, whichever
+    /// variant it is.
+    pub fn retry_after_secs(&self) -> u64 {
+        // Every variant has the same single field, so one irrefutable
+        // or-pattern extracts it.
+        let (Self::QueueFull { retry_after_secs }
+        | Self::Timeout { retry_after_secs }
+        | Self::PrincipalCap { retry_after_secs }) = *self;
+        retry_after_secs
+    }
+}
+
+/// Admission accounting, mutated under a brief lock (never held across an
+/// await). `pending` is queued + in-flight overall; `per_principal` is the
+/// same count keyed by principal for fair-share (#54).
+#[derive(Default, Debug)]
+struct AdmissionState {
+    /// Requests that hold a reservation: waiting for a slot or running.
+    pending: usize,
+    /// Outstanding reservations per principal. Entries are removed when
+    /// their count reaches zero, and anonymous requests are never recorded.
+    per_principal: HashMap<String, usize>,
+}
+
+/// Bounded batch-1 scheduler for one loaded model, with per-principal
+/// fair-share.
+pub struct AdmissionController {
+    /// In-flight slots — `max_in_flight` permits (1 for batch-1).
+    slots: Arc<Semaphore>,
+    /// Queued + in-flight accounting (overall + per principal). Shared with
+    /// every issued `AdmissionPermit` so its drop can give the slot back.
+    state: Arc<Mutex<AdmissionState>>,
+    /// `max_in_flight + max_queue_depth` — the overall rejection threshold.
+    max_pending: usize,
+    /// Max in-flight + queued for any single principal (#54). `0` disables.
+    max_per_principal: usize,
+    /// In-flight capacity after clamping (never `0`). `in_flight()` derives
+    /// the running count from it and the semaphore's free permits.
+    max_in_flight: usize,
+    /// Longest a reserved request waits for an in-flight slot before it is
+    /// refused with `AdmissionRejection::Timeout`.
+    max_wait: Duration,
+}
+
+impl AdmissionController {
+    /// Build a controller from the `[harness.candle.admission]` settings.
+    ///
+    /// Out-of-range values are clamped rather than allowed to panic or wrap:
+    /// `max_in_flight` is held to `1..=Semaphore::MAX_PERMITS` (zero slots
+    /// would never admit anything, and `Semaphore::new` panics above
+    /// `MAX_PERMITS`), and the overall threshold saturates instead of
+    /// overflowing when `max_queue_depth` is huge.
+    pub fn new(cfg: &AdmissionConfig) -> Self {
+        // A controller with zero in-flight slots would deadlock; clamp.
+        let max_in_flight = cfg.max_in_flight.clamp(1, Semaphore::MAX_PERMITS);
+        Self {
+            slots: Arc::new(Semaphore::new(max_in_flight)),
+            state: Arc::new(Mutex::new(AdmissionState::default())),
+            max_pending: max_in_flight.saturating_add(cfg.max_queue_depth),
+            max_per_principal: cfg.max_per_principal,
+            max_in_flight,
+            max_wait: Duration::from_secs(cfg.max_wait_secs),
+        }
+    }
+
+    /// Admit a request for `principal` (`None` = anonymous, exempt from the
+    /// per-principal cap).
+    ///
+    /// Reserves a queue slot — fast-rejecting with
+    /// [`AdmissionRejection::QueueFull`] if the overall queue is full, or
+    /// [`AdmissionRejection::PrincipalCap`] if the principal is over its
+    /// fair-share cap — then waits up to `max_wait` for an in-flight slot,
+    /// returning [`AdmissionRejection::Timeout`] if none frees in time. The
+    /// returned permit must be held for the request's lifetime; dropping it
+    /// frees the slots.
+    ///
+    /// # Cancel safety
+    ///
+    /// The reservation is rolled back if this future is dropped while it is
+    /// still waiting (for example when the calling task is cancelled), so an
+    /// abandoned waiter cannot leak a queue slot or a principal's fair-share
+    /// slot.
+    pub async fn enter(
+        &self,
+        principal: Option<&str>,
+    ) -> Result<AdmissionPermit, AdmissionRejection> {
+        /// Undoes a reservation on drop unless it was handed to a permit.
+        struct Reservation<'a> {
+            controller: &'a AdmissionController,
+            principal: Option<&'a str>,
+        }
+
+        impl Drop for Reservation<'_> {
+            fn drop(&mut self) {
+                self.controller.release(self.principal);
+            }
+        }
+
+        // Decision + reservation under one brief lock so concurrent callers
+        // can't both slip past the thresholds. No await is held here.
+        {
+            let mut st = self.state.lock().expect("admission state poisoned");
+            if st.pending >= self.max_pending {
+                return Err(AdmissionRejection::QueueFull {
+                    retry_after_secs: self.retry_hint(st.pending),
+                });
+            }
+            if let Some(p) = principal
+                && self.max_per_principal > 0
+                && st.per_principal.get(p).copied().unwrap_or(0) >= self.max_per_principal
+            {
+                return Err(AdmissionRejection::PrincipalCap {
+                    retry_after_secs: self.retry_hint(st.pending),
+                });
+            }
+            st.pending += 1;
+            if let Some(p) = principal {
+                *st.per_principal.entry(p.to_string()).or_insert(0) += 1;
+            }
+        }
+        // From here on a guard owns the reservation, so every exit that does
+        // not produce a permit — timeout, closed semaphore, or this future
+        // being dropped mid-wait — gives the slot back.
+        let reservation = Reservation {
+            controller: self,
+            principal,
+        };
+
+        match tokio::time::timeout(self.max_wait, Arc::clone(&self.slots).acquire_owned()).await {
+            Ok(Ok(permit)) => {
+                // Admitted: `AdmissionPermit`'s own drop now owns the
+                // accounting, so the guard must not roll it back as well.
+                std::mem::forget(reservation);
+                Ok(AdmissionPermit {
+                    _permit: permit,
+                    state: Arc::clone(&self.state),
+                    principal: principal.map(str::to_string),
+                })
+            }
+            // Semaphore is never closed; treat a closed/elapsed wait the same.
+            // `reservation` drops on the way out and releases the slot.
+            Ok(Err(_)) | Err(_) => Err(AdmissionRejection::Timeout {
+                retry_after_secs: self.retry_hint(self.max_pending),
+            }),
+        }
+    }
+
+    /// Gives back a reservation that never became a permit: one fewer
+    /// pending request overall and, for a named principal, one fewer against
+    /// its fair-share count.
+    fn release(&self, principal: Option<&str>) {
+        let mut guard = self.state.lock().expect("admission state poisoned");
+        let accounting = &mut *guard;
+        accounting.pending = accounting.pending.saturating_sub(1);
+        decrement_principal(&mut accounting.per_principal, principal);
+    }
+
+    /// Number of requests currently holding an in-flight slot, derived from
+    /// the semaphore's free permits (no mutex involved).
+    pub fn in_flight(&self) -> usize {
+        let idle_slots = self.slots.available_permits();
+        self.max_in_flight.saturating_sub(idle_slots)
+    }
+
+    /// Number of requests that hold a reservation but are still waiting for
+    /// an in-flight slot. Takes the accounting mutex just long enough to read
+    /// `pending`; the two reads are not atomic together, so treat the result
+    /// as a snapshot.
+    pub fn queue_depth(&self) -> usize {
+        let outstanding = {
+            let st = self.state.lock().expect("admission state poisoned");
+            st.pending
+        };
+        let running = self.in_flight();
+        outstanding.saturating_sub(running)
+    }
+
+    /// Heuristic `Retry-After` in seconds: two seconds per request ahead in
+    /// the queue (plus the running one), clamped to `1..=120`. There is no
+    /// per-request timing behind it — it only gives well-behaved clients
+    /// (opencode/AI SDK) a backoff that grows with the backlog.
+    fn retry_hint(&self, pending: usize) -> u64 {
+        let queued = pending.saturating_sub(self.max_in_flight) as u64;
+        let secs = (queued + 1) * 2;
+        secs.clamp(1, 120)
+    }
+}
+
+/// Lowers a principal's outstanding count by one and drops the entry once it
+/// reaches zero. Does nothing for an anonymous request (`None`) or a
+/// principal that has no entry.
+fn decrement_principal(map: &mut HashMap<String, usize>, principal: Option<&str>) {
+    let Some(name) = principal else {
+        return;
+    };
+    let Some(outstanding) = map.get_mut(name) else {
+        return;
+    };
+    *outstanding -= 1;
+    if *outstanding == 0 {
+        map.remove(name);
+    }
+}
+
+/// Held for a request's lifetime; frees the in-flight + queue slot (and the
+/// principal's fair-share slot) on drop.
+#[derive(Debug)]
+pub struct AdmissionPermit {
+    /// The in-flight slot. Never read — held only so that dropping the
+    /// permit returns it to the controller's semaphore.
+    _permit: OwnedSemaphorePermit,
+    /// The issuing controller's accounting, updated when the permit drops.
+    state: Arc<Mutex<AdmissionState>>,
+    /// Principal this request was counted against; `None` for anonymous.
+    principal: Option<String>,
+}
+
+impl Drop for AdmissionPermit {
+    /// Returns this request's overall and per-principal reservation. The
+    /// in-flight slot itself is freed when the `_permit` field drops.
+    fn drop(&mut self) {
+        let mut guard = self.state.lock().expect("admission state poisoned");
+        let AdmissionState {
+            pending,
+            per_principal,
+        } = &mut *guard;
+        *pending = pending.saturating_sub(1);
+        decrement_principal(per_principal, self.principal.as_deref());
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Test config with fair-share switched off (`max_per_principal: 0`):
+    /// most tests drive the overall queue with anonymous (`None`) callers.
+    fn cfg(max_in_flight: usize, max_queue_depth: usize, max_wait_secs: u64) -> AdmissionConfig {
+        AdmissionConfig {
+            max_per_principal: 0,
+            max_in_flight,
+            max_queue_depth,
+            max_wait_secs,
+        }
+    }
+
+    #[tokio::test]
+    async fn admits_up_to_in_flight_and_reports_load() {
+        let controller = AdmissionController::new(&cfg(1, 4, 30));
+        assert_eq!(controller.in_flight(), 0);
+
+        // Holding a permit occupies the single slot without queueing anyone.
+        let permit = controller.enter(None).await.expect("first admits");
+        assert_eq!(controller.in_flight(), 1);
+        assert_eq!(controller.queue_depth(), 0);
+
+        // Dropping it hands the slot straight back.
+        drop(permit);
+        assert_eq!(controller.in_flight(), 0);
+    }
+
+    #[tokio::test]
+    async fn rejects_when_queue_full() {
+        // Capacity is 1 running + 1 queued, so a third caller is refused fast.
+        let ctrl = Arc::new(AdmissionController::new(&cfg(1, 1, 30)));
+        let running = ctrl.enter(None).await.expect("admit running");
+
+        // Park a second caller on the semaphore so it holds the queue slot.
+        let parked_ctrl = Arc::clone(&ctrl);
+        let waiter = tokio::spawn(async move { parked_ctrl.enter(None).await.map(drop) });
+        // Let the spawned task run far enough to take its reservation.
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert_eq!(ctrl.queue_depth(), 1);
+
+        // With the queue full the next caller gets QueueFull and a hint.
+        let overflow = ctrl.enter(None).await;
+        match overflow {
+            Err(AdmissionRejection::QueueFull { retry_after_secs }) => {
+                assert!(retry_after_secs >= 1)
+            }
+            other => panic!("expected QueueFull, got {other:?}"),
+        }
+
+        // Free the slot so the parked waiter is admitted and completes.
+        drop(running);
+        waiter.await.unwrap().unwrap();
+    }
+
+    #[tokio::test]
+    async fn rejects_on_wait_timeout() {
+        // With zero queue depth and a runner holding the only slot, a second
+        // request can't even queue, so it would be QueueFull, not Timeout.
+        // Use a queue of 1 and a zero max_wait to exercise the timeout path.
+        let ctrl = Arc::new(AdmissionController::new(&cfg(1, 1, 0)));
+        let _running = ctrl.enter(None).await.expect("admit running");
+        // max_wait 0 → the request reserves the queue slot, finds no free
+        // in-flight slot, and times out almost immediately.
+        match ctrl.enter(None).await {
+            Err(AdmissionRejection::Timeout { .. }) => {}
+            other => panic!("expected Timeout, got {other:?}"),
+        }
+        // The timed-out request released its queue slot.
+        assert_eq!(ctrl.queue_depth(), 0);
+    }
+
+    #[tokio::test]
+    async fn per_principal_cap_protects_other_principals() {
+        // Generous overall queue, but each principal is capped at 1 in-flight
+        // + queued request. Principal A holds the running slot; A's second
+        // request is refused (PrincipalCap) rather than occupying the queue,
+        // so B's single request still gets a queue slot and proceeds.
+        let cfg = AdmissionConfig {
+            max_in_flight: 1,
+            max_queue_depth: 8,
+            max_wait_secs: 30,
+            max_per_principal: 1,
+        };
+        let ctrl = Arc::new(AdmissionController::new(&cfg));
+
+        let _a1 = ctrl.enter(Some("acct-a/key-a")).await.expect("A admits");
+
+        // A is at its fair-share cap → fast PrincipalCap, no queue slot taken.
+        match ctrl.enter(Some("acct-a/key-a")).await {
+            Err(AdmissionRejection::PrincipalCap { retry_after_secs }) => {
+                assert!(retry_after_secs >= 1)
+            }
+            other => panic!("expected PrincipalCap, got {other:?}"),
+        }
+
+        // B (a different principal) is admitted to the queue and proceeds
+        // once A releases — it was never stuck behind A's backlog. The sleep
+        // gives the spawned task time to take its reservation before the
+        // queue depth is checked.
+        let ctrl2 = Arc::clone(&ctrl);
+        let b = tokio::spawn(async move { ctrl2.enter(Some("acct-b/key-b")).await.map(drop) });
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert_eq!(ctrl.queue_depth(), 1, "B is queued, not rejected");
+        drop(_a1);
+        b.await.unwrap().expect("B is served after A releases");
+    }
+}
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -33,7 +33,7 @@ use crate::wire::{
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
 #[cfg(feature = "cuda")]
 use std::time::Duration;
 use std::time::{SystemTime, UNIX_EPOCH};
@@ -81,6 +81,9 @@ pub struct CandleHarness {
    /// Context-limit derivation settings (#67), read in `list_models`
    /// to compute each model's advertised `limit{context,input,output}`.
    context_limit_cfg: crate::config::ContextLimitConfig,
+    /// Admission-control settings (#53), used to build each loaded model's
+    /// [`super::admission::AdmissionController`] at load time.
+    admission_cfg: crate::config::AdmissionConfig,
 }

 /// Devices/capabilities snapshot of a model entering auto-recovery
@@ -146,6 +149,16 @@ impl LoadedHandle {
        }
    }

+    /// Current admission load (#53) as `(in_flight, queue_depth)`, read from
+    /// the model's admission controller. Neither read waits on the inference
+    /// lock (`queue_depth` only takes the controller's brief accounting
+    /// mutex), so `/health` can call this while the model is busy.
+    pub fn load(&self) -> (usize, usize) {
+        let admission = match self {
+            LoadedHandle::Single(m) => &m.admission,
+            #[cfg(feature = "cuda")]
+            LoadedHandle::Tp(m) => &m.admission,
+        };
+        (admission.in_flight(), admission.queue_depth())
+    }
+
    /// Modalities the loaded model supports. Stage B7 (single-GPU) +
    /// TP-vision (#12) — both single-GPU and TP loads advertise
    /// `"vision"` when a replicated vision tower materialised.
@@ -192,23 +205,50 @@ impl LoadedHandle {
    /// `NEURON_MAX_PROMPT_TOKENS`, when explicitly set, is applied as a
    /// clamp-only upper bound on the derived `context` — a backstop, not
    /// the authority. Unset → no clamp; the derivation stands alone.
-    pub async fn derived_limit(
+    // NOTE(review): the `///` lines directly above are the tail of
+    // `derived_limit`'s doc comment. With this method inserted here, rustdoc
+    // attaches them to `refresh_free_mb` and `derived_limit` (below) is left
+    // undocumented — move this method above that comment block.
+    //
+    /// Re-reads free VRAM and stores it in `last_free_mb`, the cached value
+    /// [`Self::derived_limit`] consumes (#53).
+    ///
+    /// This queries the device worker, and that query queues behind
+    /// inference, so it MUST run off the request path (background refresher
+    /// or load-time seed) and never from a control endpoint. Single-GPU
+    /// caches the device's free VRAM; TP caches the tightest free value
+    /// across ranks (the same value `derived_limit` used pre-cache).
+    pub async fn refresh_free_mb(&self) {
+        let (free, cache) = match self {
+            LoadedHandle::Single(m) => (m.query_vram().await.0, &m.last_free_mb),
+            #[cfg(feature = "cuda")]
+            LoadedHandle::Tp(m) => (m.query_vram_tightest_free_mb().await, &m.last_free_mb),
+        };
+        // A reading of `0` is the worker gone/poisoned sentinel: keep the
+        // last good value rather than overwrite it with a transient zero.
+        if free > 0 {
+            cache.store(free, Ordering::Release);
+        }
+    }
+
+    pub fn derived_limit(
        &self,
        cfg: &crate::config::ContextLimitConfig,
    ) -> Option<cortex_core::harness::ModelLimit> {
        if !cfg.enabled {
            return None;
        }
+        // Read the *cached* free VRAM — never query the device worker here.
+        // This runs on `GET /models`; a live query would queue behind
+        // inference on the worker thread and stall the control plane (#53).
+        // The cache is refreshed off the request path (load + background task).
        let (profile, free_mb, rate) = match self {
            LoadedHandle::Single(m) => (
                m.context_profile?,
-                m.query_vram().await.0,
+                m.last_free_mb.load(Ordering::Acquire),
                m.prefill_rate.get(),
            ),
            #[cfg(feature = "cuda")]
            LoadedHandle::Tp(m) => (
                m.context_profile?,
-                m.query_vram_tightest_free_mb().await,
+                m.last_free_mb.load(Ordering::Acquire),
                m.prefill_rate.get(),
            ),
        };
@@ -305,6 +345,10 @@ pub struct LoadedModel {
    /// for the TP path (which already had this invariant by accident
    /// because the pool lock covered the same window).
    pub inference_lock: tokio::sync::Mutex<()>,
+    /// Bounded admission scheduler (#53). Gated *before* `inference_lock`
+    /// so a busy model refuses overflow fast instead of growing an
+    /// unbounded, untimed queue of lock waiters.
+    pub admission: super::admission::AdmissionController,
    /// Open/close token IDs for the reasoning marker this model
    /// emits, populated once at load time by probing the tokenizer's
    /// added-tokens table. `None` for non-reasoning models or
@@ -374,6 +418,13 @@ pub struct LoadedModel {
    /// request-path enforcement reads this — `0` means "not derived yet"
    /// → fall back to the static `NEURON_MAX_PROMPT_TOKENS`.
    pub derived_input_cap: AtomicUsize,
+    /// Cached free VRAM (MiB) for the control plane (#53). `derived_limit`
+    /// (served by `GET /models`) reads this instead of querying the device
+    /// worker, which during inference is saturated processing forward jobs —
+    /// a live query would queue behind them and stall `/models`, tripping
+    /// cortex's health poller into marking the node unhealthy. Refreshed off
+    /// the request path: seeded at load, then by a background task.
+    pub last_free_mb: AtomicU64,
 }

 impl LoadedModel {
@@ -422,6 +473,10 @@ pub struct TpLoadedModel {
    /// serialises subprocess RPC traffic on the pool's
    /// `Vec<Worker>` channels.
    pub pool: tokio::sync::Mutex<super::tp::WorkerPool>,
+    /// Bounded admission scheduler (#53), mirroring the single-GPU path.
+    /// Gated before the pool lock so an overloaded TP model returns fast
+    /// backpressure instead of an unbounded, untimed wait.
+    pub admission: super::admission::AdmissionController,
    /// Handle into the leader device worker's TP slab. The boxed
    /// `TpLeaderModel` (with its embedded `Arc<Comm>` clones and
    /// per-rank CUDA tensors) lives on the worker thread; we hold an
@@ -482,6 +537,10 @@ pub struct TpLoadedModel {
    /// Mint for pool-wide snapshot ids. Plain counter; uniqueness only
    /// needs to hold per model lifetime (snapshots die with the model).
    pub next_snapshot_id: std::sync::atomic::AtomicU64,
+    /// Cached tightest free VRAM (MiB) for the control plane (#53) — see
+    /// [`LoadedModel::last_free_mb`]. Read by `derived_limit` so `GET /models`
+    /// never fans a VRAM query out to the (inference-saturated) TP workers.
+    pub last_free_mb: AtomicU64,
 }

 #[cfg(feature = "cuda")]
@@ -1088,6 +1147,32 @@ fn debug_poison_armed(model_id: &str) -> bool {
    armed && !FIRED.swap(true, Ordering::Relaxed)
 }

+/// Background refresher for the control-plane VRAM cache (#53).
+///
+/// Every `REFRESH_INTERVAL` it refreshes `last_free_mb` on each loaded
+/// model, so `derived_limit` (served by `GET /models`) can read a cached
+/// value instead of querying the device worker on the request path. A live
+/// query there would queue behind inference forward jobs, stall `/models`
+/// for seconds, and trip cortex's health poller into evicting the node from
+/// routing. The refresh query may itself queue behind inference, but that
+/// only delays this task — no request-path caller is blocked.
+///
+/// Only a `Weak` reference is held, so the task ends once the harness has
+/// been dropped.
+async fn vram_cache_refresh_loop(weak: std::sync::Weak<CandleHarness>) {
+    const REFRESH_INTERVAL: std::time::Duration = std::time::Duration::from_secs(5);
+    loop {
+        tokio::time::sleep(REFRESH_INTERVAL).await;
+        // Copy the handles out while briefly holding the read lock; both the
+        // lock and the strong harness reference are gone before the
+        // (possibly slow) worker queries below are awaited.
+        let handles: Vec<LoadedHandle> = match weak.upgrade() {
+            // Harness dropped — exit.
+            None => return,
+            Some(harness) => {
+                let models = harness.models.read().await;
+                models.values().cloned().collect()
+            }
+        };
+        for handle in handles {
+            handle.refresh_free_mb().await;
+        }
+    }
+}
+
 /// Background auto-recovery task (#17). Drains poisoned model ids and
 /// rebuilds each via [`CandleHarness::recover_one`]. Holds a `Weak` so a
 /// shutting-down harness lets the task exit; processes one id at a time,
@@ -1565,6 +1650,7 @@ impl CandleHarness {
            recovery_tx,
            prefix_cache_cfg: config.prefix_cache.clone(),
            context_limit_cfg: config.context_limit.clone(),
+            admission_cfg: config.admission.clone(),
        });
        // Background auto-recovery task (#17). Holds a `Weak` so it can't
        // keep the harness alive. Spawned only when a tokio runtime is
@@ -1573,6 +1659,11 @@ impl CandleHarness {
        if tokio::runtime::Handle::try_current().is_ok() {
            let weak = Arc::downgrade(&this);
            tokio::spawn(recovery_loop(weak, recovery_rx));
+            // Control-plane VRAM cache refresher (#53): keeps each loaded
+            // model's `last_free_mb` current off the request path, so
+            // `derived_limit` / `GET /models` never query the device worker
+            // (which is saturated during inference) and never stall.
+            tokio::spawn(vram_cache_refresh_loop(Arc::downgrade(&this)));
        }
        this
    }
@@ -2006,6 +2097,7 @@ impl CandleHarness {
    pub async fn chat_completion(
        &self,
        request: ChatCompletionRequest,
+        principal: Option<String>,
    ) -> Result<ChatCompletionResponse, InferenceError> {
        let handle = {
            let models = self.models.read().await;
@@ -2030,7 +2122,7 @@ impl CandleHarness {
            LoadedHandle::Single(m) => m,
            #[cfg(feature = "cuda")]
            LoadedHandle::Tp(m) => {
-                return self.chat_completion_tp(m, request).await;
+                return self.chat_completion_tp(m, request, principal).await;
            }
        };

@@ -2059,6 +2151,15 @@ impl CandleHarness {
            return Err(self.trigger_recovery(&model_id).await);
        }

+        // Admission control (#53): refuse fast if the bounded queue is full
+        // or the wait elapses, rather than joining an unbounded lock-wait.
+        // The permit is held for the whole request (released on drop).
+        let _admit = loaded
+            .admission
+            .enter(principal.as_deref())
+            .await
+            .map_err(InferenceError::from)?;
+
        // Serialise concurrent requests against this model. Holds for
        // the duration of clear_kv_cache → prefill → decode so two
        // requests' chunked-prefill sequences can't interleave on the
@@ -2378,9 +2479,14 @@ impl CandleHarness {
    pub async fn chat_completion_stream(
        &self,
        request: ChatCompletionRequest,
+        principal: Option<String>,
    ) -> Result<mpsc::Receiver<ChatCompletionChunk>, InferenceError> {
-        self.chat_completion_stream_with(request, wire_chat::ChatProjectionConfig::default())
-            .await
+        self.chat_completion_stream_with(
+            request,
+            wire_chat::ChatProjectionConfig::default(),
+            principal,
+        )
+        .await
    }

    /// Same as [`Self::chat_completion_stream`] but lets the caller
@@ -2391,8 +2497,9 @@ impl CandleHarness {
        &self,
        request: ChatCompletionRequest,
        mut config: wire_chat::ChatProjectionConfig,
+        principal: Option<String>,
    ) -> Result<mpsc::Receiver<ChatCompletionChunk>, InferenceError> {
-        let stream = self.inference_stream(request).await?;
+        let stream = self.inference_stream(request, principal).await?;
        // Fill in the model's reasoning markers if the caller
        // didn't pre-populate them — they're a property of the
        // loaded model (which the HTTP handler doesn't reach into
@@ -2419,9 +2526,10 @@ impl CandleHarness {
        request: ChatCompletionRequest,
        response_id: String,
        message_item_id: String,
+        principal: Option<String>,
    ) -> Result<mpsc::Receiver<crate::wire::openai_responses::ResponseStreamFrame>, InferenceError>
    {
-        let stream = self.inference_stream(request).await?;
+        let stream = self.inference_stream(request, principal).await?;
        let meta = crate::wire::openai_responses::ResponseMeta {
            response_id,
            created_at: stream.created,
@@ -2442,6 +2550,7 @@ impl CandleHarness {
    async fn inference_stream(
        &self,
        request: ChatCompletionRequest,
+        principal: Option<String>,
    ) -> Result<InferenceStream, InferenceError> {
        let handle = {
            let models = self.models.read().await;
@@ -2466,7 +2575,7 @@ impl CandleHarness {
            LoadedHandle::Single(m) => m,
            #[cfg(feature = "cuda")]
            LoadedHandle::Tp(m) => {
-                return self.inference_tp_stream(m, request).await;
+                return self.inference_tp_stream(m, request, principal).await;
            }
        };

@@ -2610,6 +2719,15 @@ impl CandleHarness {
        // role chunk was already sent above, so the client sees
        // immediate "stream open" feedback even when this request
        // queues behind another for the lock.
+        // Admission control (#53), independent of the lock wait above: refuse before the
+        // stream is returned if the model's bounded queue is full or the wait elapses. The
+        // permit moves into the inference task and is held until it completes.
+        let admit = loaded
+            .admission
+            .enter(principal.as_deref())
+            .await
+            .map_err(InferenceError::from)?;
+
        let tool_schemas = build_tool_schemas(&request);
        if let (Some(worker), Some(handle)) = (loaded.worker.clone(), loaded.arch_handle) {
            #[cfg(feature = "cuda")]
@@ -2620,6 +2738,7 @@ impl CandleHarness {
                let tool_schemas_inner = tool_schemas.clone();
                tokio::spawn(
                    async move {
+                        let _admit = admit;
                        let _inference_guard = loaded_for_task.inference_lock.lock().await;
                        match stream_inference_via_worker(
                            worker,
@@ -2680,6 +2799,7 @@ impl CandleHarness {
            let tool_call_tokens_inner = loaded.tool_call_tokens.clone();
            let tool_schemas_inner = tool_schemas.clone();
            tokio::task::spawn_blocking(move || {
+                let _admit = admit;
                let _g = span_for_task.enter();
                // `blocking_lock` is safe here: spawn_blocking runs on
                // a dedicated thread, not on the async runtime, so
@@ -2779,6 +2899,24 @@ pub struct InferenceStream {
 /// Auto-recovery (#17) — rebuild a poisoned model's device context
 /// automatically instead of leaving it bricked until a human reloads.
 impl CandleHarness {
+    /// Snapshot of per-model admission load for `GET /health` (#53): one
+    /// entry per resident model carrying its in-flight and queued counts.
+    /// The registry read lock is held only while the handles are walked.
+    pub async fn load_snapshot(&self) -> Vec<cortex_core::discovery::ModelLoad> {
+        use cortex_core::discovery::ModelLoad;
+        let registry = self.models.read().await;
+        let mut loads = Vec::with_capacity(registry.len());
+        for handle in registry.values() {
+            let (in_flight, queue_depth) = handle.load();
+            loads.push(ModelLoad {
+                id: handle.model_id().to_string(),
+                in_flight,
+                queue_depth,
+            });
+        }
+        loads
+    }
+
    /// True while `model_id` is being auto-recovered (its slot is briefly
    /// absent from the registry during the reload).
    pub async fn is_recovering(&self, model_id: &str) -> bool {
@@ -2890,7 +3028,7 @@ impl Harness for CandleHarness {
            // physics + live free VRAM + measured prefill rate. `None`
            // for arches without a context profile. `cost` stays
            // operator-set in the catalogue, filled by the gateway.
-            let limit = h.derived_limit(&self.context_limit_cfg).await;
+            let limit = h.derived_limit(&self.context_limit_cfg);
            out.push(ModelInfo {
                id: h.model_id().into(),
                harness: "candle".into(),
@@ -3128,6 +3266,7 @@ impl Harness for CandleHarness {
            worker,
            arch_handle,
            inference_lock: tokio::sync::Mutex::new(()),
+            admission: super::admission::AdmissionController::new(&self.admission_cfg),
            reasoning_tokens,
            tool_call_tokens,
            chat_template,
@@ -3139,6 +3278,7 @@ impl Harness for CandleHarness {
            context_profile,
            prefill_rate: super::context_limit::PrefillRateEma::new(),
            derived_input_cap: AtomicUsize::new(0),
+            last_free_mb: AtomicU64::new(0),
        });
        if loaded.prefix_cache.is_some() {
            tracing::info!(
@@ -3149,6 +3289,14 @@ impl Harness for CandleHarness {
            );
        }

+        // Seed the control-plane VRAM cache (#53) while the worker is idle
+        // (load just finished), so `/models` has a value before the
+        // background refresher's first tick and never queries the worker.
+        let (free_mb, _) = loaded.query_vram().await;
+        if free_mb > 0 {
+            loaded.last_free_mb.store(free_mb, Ordering::Release);
+        }
+
        let mut models = self.models.write().await;
        models.insert(spec.model_id.clone(), LoadedHandle::Single(loaded));
        tracing::info!(model = %spec.model_id, "model loaded");
@@ -3372,6 +3520,7 @@ impl CandleHarness {
            tokenizer,
            devices: devices.clone(),
            pool: TMutex::new(pool),
+            admission: super::admission::AdmissionController::new(&self.admission_cfg),
            leader_handle,
            leader_device: leader_device.clone(),
            poisoned: AtomicBool::new(false),
@@ -3398,6 +3547,7 @@ impl CandleHarness {
            ),
            prefill_rate: super::context_limit::PrefillRateEma::new(),
            derived_input_cap: AtomicUsize::new(0),
+            last_free_mb: AtomicU64::new(0),
            next_snapshot_id: std::sync::atomic::AtomicU64::new(1),
        });
        if tp_loaded.prefix_cache.is_some() {
@@ -3409,6 +3559,14 @@ impl CandleHarness {
            );
        }

+        // Seed the control-plane VRAM cache (#53) — tightest free across
+        // ranks, while the workers are idle post-load — so `/models` never
+        // fans a query out to the inference-busy TP workers.
+        let free_mb = tp_loaded.query_vram_tightest_free_mb().await;
+        if free_mb > 0 {
+            tp_loaded.last_free_mb.store(free_mb, Ordering::Release);
+        }
+
        let mut models = self.models.write().await;
        models.insert(spec.model_id.clone(), LoadedHandle::Tp(tp_loaded));
        tracing::info!(
@@ -3438,6 +3596,7 @@ impl CandleHarness {
        &self,
        tp: Arc<TpLoadedModel>,
        request: ChatCompletionRequest,
+        principal: Option<String>,
    ) -> Result<ChatCompletionResponse, InferenceError> {
        // Tag every line of this request with a short req_id so a
        // grep over journalctl reconstructs one request even when
@@ -3474,7 +3633,8 @@ impl CandleHarness {
        }

        let tp_for_marker = Arc::clone(&tp);
-        let handle = tokio::spawn(chat_completion_tp_inner(tp, request).instrument(span.clone()));
+        let handle =
+            tokio::spawn(chat_completion_tp_inner(tp, request, principal).instrument(span.clone()));
        match handle.await {
            Ok(Ok(resp)) => Ok(resp),
            Ok(Err(e)) => {
@@ -3545,6 +3705,7 @@ impl CandleHarness {
        &self,
        tp: Arc<TpLoadedModel>,
        request: ChatCompletionRequest,
+        principal: Option<String>,
    ) -> Result<InferenceStream, InferenceError> {
        if tp.poisoned.load(Ordering::Acquire) {
            return Err(self.trigger_recovery(&request.model).await);
@@ -3690,10 +3851,19 @@ impl CandleHarness {
            validate_vision_prefill(prompt_len, vram_free_mb)?;
        }

+        // Admission control (#53): refuse before opening the stream; the
+        // permit moves into the orchestration task and is held for its life.
+        let admit = tp
+            .admission
+            .enter(principal.as_deref())
+            .await
+            .map_err(InferenceError::from)?;
+
        let tool_schemas = build_tool_schemas(&request);
        let tp_for_task = Arc::clone(&tp);
        tokio::spawn(
            async move {
+                let _admit = admit;
                let mut failure: Option<String> = None;
                let mut pool = acquire_pool_lock(&tp_for_task.pool, &model_id).await;
                let leader_handle = tp_for_task.leader_handle;
@@ -4196,6 +4366,7 @@ impl CandleHarness {
 async fn chat_completion_tp_inner(
    tp: Arc<TpLoadedModel>,
    request: ChatCompletionRequest,
+    principal: Option<String>,
 ) -> Result<ChatCompletionResponse, InferenceError> {
    let req_start = std::time::Instant::now();
    let model_id = request.model.clone();
@@ -4284,6 +4455,14 @@ async fn chat_completion_tp_inner(
        validate_vision_prefill(prompt_len, vram_free_mb)?;
    }

+    // Admission control (#53): bounded queue + fast reject before joining
+    // the pool-lock wait. Held for the whole request (released on drop).
+    let _admit = tp
+        .admission
+        .enter(principal.as_deref())
+        .await
+        .map_err(InferenceError::from)?;
+
    // Acquire the pool lock for the duration of the request. After
    // Phase 3 the leader's TpLeaderModel lives in the device worker
    // thread, so the pool lock now serialises only subprocess RPC
@@ -4826,10 +5005,35 @@ pub enum InferenceError {
    /// failure mode that hid several client-compat bugs. Maps to 422.
    #[error("chat template could not render this request: {detail}")]
    TemplateRenderFailed { detail: String },
+    /// Admission control (#53) refused on load: the model's bounded queue is
+    /// full or the wait elapsed. Maps to `503 rate_limit_exceeded` +
+    /// `Retry-After` — a fast, retryable "busy" signal, not a stall.
+    #[error("model is busy; retry after {retry_after_secs}s")]
+    Overloaded { retry_after_secs: u64 },
+    /// Per-principal fair-share cap (#54) exceeded: this principal already
+    /// has its max requests in flight/queued. Maps to `429
+    /// rate_limit_exceeded` + `Retry-After`; a well-behaved client self-paces.
+    #[error("per-principal in-flight limit reached; retry after {retry_after_secs}s")]
+    PerPrincipalLimit { retry_after_secs: u64 },
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

+/// Load refusals become `Overloaded`; a principal-cap refusal becomes `PerPrincipalLimit`.
+impl From<super::admission::AdmissionRejection> for InferenceError {
+    fn from(refusal: super::admission::AdmissionRejection) -> Self {
+        use super::admission::AdmissionRejection as Refusal;
+        match refusal {
+            Refusal::PrincipalCap { retry_after_secs } => {
+                Self::PerPrincipalLimit { retry_after_secs }
+            }
+            Refusal::QueueFull { retry_after_secs } | Refusal::Timeout { retry_after_secs } => {
+                Self::Overloaded { retry_after_secs }
+            }
+        }
+    }
+}
+
 /// Build the model's prompt from a [`ChatCompletionRequest`].
 ///
 /// Prefers the model's own `chat_template` when one was loaded
--- a/crates/neuron/src/harness/mod.rs
+++ b/crates/neuron/src/harness/mod.rs
@@ -1,5 +1,6 @@
 //! Harness registry — maps harness names to trait implementations.

+pub mod admission;
 pub mod arch;
 pub mod candle;
 pub mod chat_template;
--- a/crates/neuron/src/health.rs
+++ b/crates/neuron/src/health.rs
@@ -30,6 +30,9 @@ impl HealthCache {
                // direct read from the cache stays a well-typed
                // HealthResponse on the wire.
                activation: Default::default(),
+                // Per-model admission load is overlaid by the api handler
+                // from the candle harness (#53); the cache doesn't own it.
+                models: Vec::new(),
            }),
            has_gpus: RwLock::new(false),
        }
--- a/crates/neuron/tests/api.rs
+++ b/crates/neuron/tests/api.rs
@@ -114,6 +114,12 @@ async fn test_health_endpoint() {

    let body: serde_json::Value = resp.json().await.unwrap();
    assert_eq!(body["uptime_secs"], 0);
+    // Per-model admission load (#53) is always present, even with no models
+    // loaded (empty array) — cortex's load-aware router (#55) relies on it.
+    assert!(
+        body["models"].is_array(),
+        "/health must expose a models load array"
+    );
 }

 #[tokio::test]