From abc6e605b81aa385cb8f22c41cbb183723635736 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 8 Jun 2026 09:08:40 +0300 Subject: [PATCH] test(neuron): NEURON_DEBUG_POISON hook to verify auto-recovery (#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One-shot, env-gated fault injector for beast verification: when NEURON_DEBUG_POISON names a model, the first request for it triggers the auto-recovery path as if a device fault had occurred — exercising unload→reload→healthy without corrupting the GPU. Latched so it fires exactly once (no recovery loop). No-op unless the env var is set; wired into both the single-GPU and TP chat poison gates. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/neuron/src/harness/candle.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs index d92ab20..d70cfba 100644 --- a/crates/neuron/src/harness/candle.rs +++ b/crates/neuron/src/harness/candle.rs @@ -830,6 +830,18 @@ fn recovering_error(model_id: &str) -> InferenceError { )) } +/// Verification hook for #17 auto-recovery. When `NEURON_DEBUG_POISON` +/// names a model, the **first** request for it (process-wide) returns +/// true, so the request path can trigger recovery as if a device fault +/// had occurred — exercising the unload→reload→healthy cycle without +/// corrupting the GPU. One-shot (a `swap` latch) so it can't loop the +/// model through endless recoveries. No-op unless the env var is set. +fn debug_poison_armed(model_id: &str) -> bool { + static FIRED: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); + let armed = std::env::var("NEURON_DEBUG_POISON").ok().as_deref() == Some(model_id); + armed && !FIRED.swap(true, Ordering::Relaxed) +} + /// Background auto-recovery task (#17). Drains poisoned model ids and /// rebuilds each via [`CandleHarness::recover_one`]. Holds a `Weak` so a /// shutting-down harness lets the task exit; processes one id at a time, @@ -1736,6 +1748,11 @@ impl CandleHarness { tracing::warn!("chat_completion: refusing request, model poisoned"); return Err(self.trigger_recovery(&model_id).await); } + if debug_poison_armed(&model_id) { + let _g = span.enter(); + tracing::warn!("NEURON_DEBUG_POISON: forcing auto-recovery (#17 verification)"); + return Err(self.trigger_recovery(&model_id).await); + } // Serialise concurrent requests against this model. Holds for // the duration of clear_kv_cache → prefill → decode so two @@ -2988,6 +3005,11 @@ impl CandleHarness { tracing::warn!("TP chat_completion: refusing request, model poisoned"); return Err(self.trigger_recovery(&model_id).await); } + if debug_poison_armed(&model_id) { + let _g = span.enter(); + tracing::warn!("NEURON_DEBUG_POISON: forcing auto-recovery (#17 verification)"); + return Err(self.trigger_recovery(&model_id).await); + } // Reject image-bearing requests against a TP model with no // vision tower, cleanly (`vision_unsupported`) rather than