Compare commits
7 Commits
feat/neuro
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
60f5598542
|
|||
|
7945240646
|
|||
|
0c74d89d15
|
|||
|
c94a2ae755
|
|||
|
99920dd322
|
|||
|
c4f239ceb9
|
|||
|
ac445c1569
|
3
Cargo.lock
generated
3
Cargo.lock
generated
@@ -905,8 +905,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "cudarc"
|
||||
version = "0.19.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cea5f10a99e025c1b44ae2354c2d8326b25ddbd0baf76bde8e55cfd4018a2cc"
|
||||
source = "git+https://github.com/grenade/cudarc?rev=63327a256059f8252641ae46c6bb9eefe707f382#63327a256059f8252641ae46c6bb9eefe707f382"
|
||||
dependencies = [
|
||||
"float8",
|
||||
"half",
|
||||
|
||||
@@ -61,3 +61,12 @@ eventsource-stream = "0.2"
|
||||
# workspace crates
|
||||
cortex-core = { path = "crates/cortex-core" }
|
||||
cortex-gateway = { path = "crates/cortex-gateway" }
|
||||
|
||||
# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
|
||||
# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
|
||||
# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
|
||||
# hang-recovery (abort a wedged collective from another thread, then
|
||||
# rebuild the comm). Pinned to a fork revision pending upstream review
|
||||
# (grenade/cudarc @ nccl-comm-abort).
|
||||
[patch.crates-io]
|
||||
cudarc = { git = "https://github.com/grenade/cudarc", rev = "63327a256059f8252641ae46c6bb9eefe707f382" }
|
||||
|
||||
@@ -201,6 +201,16 @@ pub(crate) fn run(device_index: u32, rx: Receiver<Job>, poisoned: Arc<AtomicBool
|
||||
let _ = reply.send(resp);
|
||||
}
|
||||
#[cfg(feature = "cuda")]
|
||||
Job::GetLeaderComm { reply } => {
|
||||
// Clone the leader's Arc<Comm> out for the async-side
|
||||
// watchdog. `None` before NcclInit. (#17 Stage 2)
|
||||
let comm = state
|
||||
.nccl
|
||||
.comm()
|
||||
.map(crate::harness::tp::nccl_state::SendComm);
|
||||
let _ = reply.send(comm);
|
||||
}
|
||||
#[cfg(feature = "cuda")]
|
||||
Job::TpLoadShard {
|
||||
model_id,
|
||||
config_json,
|
||||
@@ -1004,6 +1014,10 @@ fn drain_poisoned(job: Job, device_index: u32) {
|
||||
message: format!("device worker {device_index} poisoned"),
|
||||
});
|
||||
}
|
||||
#[cfg(feature = "cuda")]
|
||||
Job::GetLeaderComm { reply } => {
|
||||
let _ = reply.send(None);
|
||||
}
|
||||
Job::NcclSanity { reply } => {
|
||||
let _ = reply.send(crate::harness::tp::rpc::WorkerResponse::Error {
|
||||
kind: "device_worker_poisoned".into(),
|
||||
|
||||
@@ -192,6 +192,17 @@ pub enum Job {
|
||||
NcclSanity {
|
||||
reply: oneshot::Sender<crate::harness::tp::rpc::WorkerResponse>,
|
||||
},
|
||||
/// Hand a clonable handle to the leader's NCCL `Comm` back to the
|
||||
/// async side, so the TP step watchdog can call `ncclCommAbort` on
|
||||
/// it from a *different* thread to unblock a wedged collective
|
||||
/// (#17 Stage 2). Fetched once at init while the worker thread is
|
||||
/// still responsive — a thread already wedged in a collective can't
|
||||
/// service this job, which is exactly why the handle is cached
|
||||
/// up front. Replies `None` before `NcclInit` has run.
|
||||
#[cfg(feature = "cuda")]
|
||||
GetLeaderComm {
|
||||
reply: oneshot::Sender<Option<crate::harness::tp::nccl_state::SendComm>>,
|
||||
},
|
||||
/// Load the leader's TP shard on the worker thread. The dispatch
|
||||
/// handler reads `state.nccl.comm()` directly (no cross-thread
|
||||
/// `Arc<Comm>` transfer, no `SendComm` wrapper) and builds the
|
||||
|
||||
@@ -161,6 +161,27 @@ impl DeviceWorkerHandle {
|
||||
}
|
||||
}
|
||||
|
||||
/// Ask the worker thread for the leader's NCCL `Comm` handle (#17 Stage 2).
///
/// Sends a `Job::GetLeaderComm` to the worker and awaits its oneshot
/// reply. The TP step watchdog caches the result at init so that it can
/// later call `ncclCommAbort` from the async side to unblock a wedged
/// collective.
///
/// Returns `None` when the worker is flagged as poisoned, when the job
/// cannot be queued, when the reply channel is dropped without an answer,
/// or when the worker itself answers `None`. Callers treat a missing
/// handle as "can't abort" and log it.
#[cfg(feature = "cuda")]
pub async fn get_leader_comm(&self) -> Option<crate::harness::tp::nccl_state::SendComm> {
    // Checked first so a poisoned worker costs no channel round trip.
    if self.poisoned.load(Ordering::Acquire) {
        return None;
    }

    let (reply_tx, reply_rx) = oneshot::channel();

    // A failed send means the job was never queued; report "no handle".
    self.tx.send(Job::GetLeaderComm { reply: reply_tx }).ok()?;

    // A dropped reply sender and an explicit `None` answer both collapse
    // to `None` for the caller.
    match reply_rx.await {
        Ok(comm) => comm,
        Err(_) => None,
    }
}
|
||||
|
||||
/// Load a GGUF (pre-quantized) single-GPU model on the worker
|
||||
/// thread. The hf-hub resolution happens on the async caller; the
|
||||
/// resolved local `gguf_path` plus the spec's model_id are sent
|
||||
|
||||
@@ -245,9 +245,67 @@ pub struct WorkerPool {
|
||||
/// Phase 4 the load itself moves onto the worker and that bridge
|
||||
/// goes away.
|
||||
pub(crate) leader_worker: std::sync::Arc<super::device_worker::DeviceWorkerHandle>,
|
||||
/// Cached handle to the leader's NCCL `Comm`, fetched at `init_nccl`
|
||||
/// while the worker thread is responsive. The TP step watchdog uses
|
||||
/// it to `ncclCommAbort` a wedged collective from the async thread —
|
||||
/// the one NCCL op allowed concurrently with an in-flight collective,
|
||||
/// and the only way to unblock the in-process leader thread so
|
||||
/// recovery's `unload` doesn't itself hang (#17 Stage 2). `None` if
|
||||
/// init couldn't cache it; the watchdog then logs that it can't abort.
|
||||
#[cfg(feature = "cuda")]
|
||||
leader_comm: Option<nccl_state::SendComm>,
|
||||
}
|
||||
|
||||
/// Per-step deadline for a TP forward (#17 Stage 2). A healthy decode
|
||||
/// step or chunked prefill completes in well under a second; a wedged
|
||||
/// NCCL collective never returns. Generous default so no legitimate step
|
||||
/// trips it; overridable via `NEURON_TP_STEP_TIMEOUT_S` (seconds).
|
||||
#[cfg(feature = "cuda")]
|
||||
fn tp_step_timeout() -> std::time::Duration {
    // Deadline, in seconds, used when the override is unset or unusable.
    const DEFAULT_SECS: u64 = 120;

    // Unset and non-UTF-8 values both surface as `Err`; surrounding
    // whitespace is tolerated, anything that is not a `u64` is discarded.
    let configured = match std::env::var("NEURON_TP_STEP_TIMEOUT_S") {
        Ok(raw) => raw.trim().parse::<u64>().ok(),
        Err(_) => None,
    };

    // Zero is rejected as an override and falls back to the default.
    let secs = match configured {
        Some(s) if s > 0 => s,
        _ => DEFAULT_SECS,
    };

    std::time::Duration::from_secs(secs)
}
|
||||
|
||||
impl WorkerPool {
|
||||
/// Watchdog action for a wedged TP step (#17 Stage 2): abort the cached
/// leader NCCL comm so the collective the leader is stuck in is
/// unblocked.
///
/// `model_id` and `secs` (the deadline that was exceeded) are only used
/// as log fields. Every stage is logged at ERROR with a `tp watchdog:`
/// prefix so that a real-world hang leaves a greppable forensic trail
/// (`tp watchdog:` / `ncclCommAbort`): first the decision to abort, then
/// exactly one of "succeeded", "failed", or "no cached handle".
///
/// The abort is issued from this async thread while the worker thread is
/// blocked inside the collective. That is the one concurrent NCCL op the
/// library sanctions, and it is how a stuck or failed collective is
/// unblocked.
#[cfg(feature = "cuda")]
fn watchdog_abort_leader_comm(&self, model_id: &str, secs: u64) {
    tracing::error!(
        model = %model_id,
        timeout_s = secs,
        "tp watchdog: leader forward exceeded deadline — NCCL collective wedged; \
         aborting comm to unblock the leader thread for auto-recovery"
    );

    // Try the abort only when a handle was cached at init; the outer
    // `Option` records whether an attempt was possible at all.
    let attempt = self.leader_comm.as_ref().map(|comm| comm.0.abort());

    match attempt {
        Some(Ok(())) => tracing::error!(
            model = %model_id,
            "tp watchdog: ncclCommAbort succeeded — wedged collective unblocked; \
             failing the step so the model auto-recovers (unload+reload)"
        ),
        Some(Err(e)) => tracing::error!(
            model = %model_id, error = ?e,
            "tp watchdog: ncclCommAbort failed — recovery may stall until a process restart"
        ),
        None => tracing::error!(
            model = %model_id,
            "tp watchdog: no cached leader comm handle — cannot abort; recovery will rely \
             on a process restart"
        ),
    }
}
|
||||
|
||||
/// Spawn `world_size - 1` worker subprocesses. Rank 0 is the
|
||||
/// leader (in-process) and is *not* spawned here — the leader
|
||||
/// holds rank 0's NCCL Comm and shard in its own address space.
|
||||
@@ -324,6 +382,8 @@ impl WorkerPool {
|
||||
workers,
|
||||
exe,
|
||||
leader_worker,
|
||||
#[cfg(feature = "cuda")]
|
||||
leader_comm: None,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -404,6 +464,23 @@ impl WorkerPool {
|
||||
world_size = self.world_size,
|
||||
"NCCL communicator established across all ranks"
|
||||
);
|
||||
|
||||
// Cache the leader's Comm handle now, while the worker thread is
|
||||
// responsive, so the TP step watchdog can abort a wedged
|
||||
// collective later (it can't fetch it then — the thread is stuck).
|
||||
// (#17 Stage 2.)
|
||||
#[cfg(feature = "cuda")]
|
||||
{
|
||||
self.leader_comm = self.leader_worker.get_leader_comm().await;
|
||||
if self.leader_comm.is_some() {
|
||||
tracing::debug!("cached leader NCCL comm handle for the TP step watchdog");
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"could not cache leader NCCL comm handle; the TP step watchdog will be \
|
||||
unable to abort a wedged collective (a hang would need a process restart)"
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -628,10 +705,27 @@ impl WorkerPool {
|
||||
// that's the invariant the whole refactor exists to
|
||||
// preserve.
|
||||
let leader_start = std::time::Instant::now();
|
||||
let leader_result = self
|
||||
let timeout = tp_step_timeout();
|
||||
let leader_fut = self
|
||||
.leader_worker
|
||||
.tp_forward_logits(leader_handle, tokens, offset)
|
||||
.await;
|
||||
.tp_forward_logits(leader_handle, tokens, offset);
|
||||
let leader_result = match tokio::time::timeout(timeout, leader_fut).await {
|
||||
Ok(r) => r,
|
||||
Err(_elapsed) => {
|
||||
// Watchdog (#17 Stage 2): the NCCL collective is wedged.
|
||||
// Abort the leader comm to unblock its thread, then fail
|
||||
// the step WITHOUT draining (the subprocess workers are
|
||||
// wedged too; recovery's unload kills them). The error
|
||||
// poisons the model → auto-recovery, which no longer hangs
|
||||
// because the leader thread is now responsive.
|
||||
self.watchdog_abort_leader_comm(model_id, timeout.as_secs());
|
||||
anyhow::bail!(
|
||||
"tp watchdog: leader forward exceeded {}s deadline; aborted wedged NCCL \
|
||||
comm — model will auto-recover",
|
||||
timeout.as_secs()
|
||||
);
|
||||
}
|
||||
};
|
||||
let leader_ok = leader_result.is_ok();
|
||||
let leader_ms = leader_start.elapsed().as_millis();
|
||||
// Surface the leader's own error at WARN before draining
|
||||
@@ -767,17 +861,29 @@ impl WorkerPool {
|
||||
// matching collective; CPU-side logits keep the device tensor
|
||||
// from escaping the worker thread.
|
||||
let leader_start = std::time::Instant::now();
|
||||
let leader_result = self
|
||||
.leader_worker
|
||||
.tp_forward_logits_with_images(
|
||||
let timeout = tp_step_timeout();
|
||||
let leader_fut = self.leader_worker.tp_forward_logits_with_images(
|
||||
leader_handle,
|
||||
tokens,
|
||||
offset,
|
||||
image_token_id,
|
||||
image_data_uris,
|
||||
chunk_size,
|
||||
)
|
||||
.await;
|
||||
);
|
||||
let leader_result = match tokio::time::timeout(timeout, leader_fut).await {
|
||||
Ok(r) => r,
|
||||
Err(_elapsed) => {
|
||||
// Watchdog (#17 Stage 2) — see generate_step. Vision
|
||||
// prefill is still well under the deadline on healthy
|
||||
// hardware; a timeout means a wedged collective.
|
||||
self.watchdog_abort_leader_comm(model_id, timeout.as_secs());
|
||||
anyhow::bail!(
|
||||
"tp watchdog: leader image forward exceeded {}s deadline; aborted wedged \
|
||||
NCCL comm — model will auto-recover",
|
||||
timeout.as_secs()
|
||||
);
|
||||
}
|
||||
};
|
||||
let leader_ok = leader_result.is_ok();
|
||||
let leader_ms = leader_start.elapsed().as_millis();
|
||||
if !leader_ok {
|
||||
|
||||
@@ -119,40 +119,25 @@ mod cuda_impl {
|
||||
}
|
||||
}
|
||||
|
||||
/// `Arc<Comm>` doesn't impl `Send` because `Comm` wraps a raw
|
||||
/// `ncclComm_t` pointer. The NCCL contract is "operations against a
|
||||
/// given comm must be serialised", not "the handle must stay on the
|
||||
/// thread that created it" — so it's safe to move an `Arc<Comm>`
|
||||
/// across threads as long as no concurrent ops are issued. The
|
||||
/// pool's outer Mutex serialises us into `spawn_blocking`, so this
|
||||
/// wrapper at the move boundary is the only thing missing.
|
||||
/// Thin newtype over `Arc<Comm>`, kept for call-site clarity — it marks
|
||||
/// the points where a comm handle is intentionally moved across threads
|
||||
/// (e.g. cached async-side for the TP step watchdog's `ncclCommAbort`).
|
||||
///
|
||||
/// `Sync` is also marked safe because the `Arc<Comm>` clones held
|
||||
/// by the row-parallel layers are only used from the
|
||||
/// `spawn_blocking` thread driving the forward pass; concurrent
|
||||
/// access from another thread would still be a bug.
|
||||
/// `Send`/`Sync` are provided upstream by `cudarc`'s `Comm` (which
|
||||
/// asserts the NCCL thread-safety invariant, including aborting from a
|
||||
/// different thread than one inside a collective), so this type derives
|
||||
/// them automatically — no manual `unsafe impl` here.
|
||||
pub struct SendComm(pub Arc<Comm>);
|
||||
|
||||
// SAFETY: see the doc-comment above; the invariant is enforced at
|
||||
// the call site (pool Mutex + single spawn_blocking thread), not at
|
||||
// the type level.
|
||||
unsafe impl Send for SendComm {}
|
||||
unsafe impl Sync for SendComm {}
|
||||
|
||||
impl SendComm {
|
||||
pub fn into_inner(self) -> Arc<Comm> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: `cudarc::nccl::Comm` contains a raw `ncclComm_t` pointer
|
||||
// (libnccl-allocated state). NCCL requires that operations against
|
||||
// one Comm be issued one at a time; we serialise access by storing
|
||||
// NcclState behind a Mutex in `WorkerPool`. The Comm itself is
|
||||
// move-safe — NCCL doesn't track the calling OS thread, only the
|
||||
// stream the operations are dispatched against.
|
||||
unsafe impl Send for NcclState {}
|
||||
unsafe impl Sync for NcclState {}
|
||||
// `NcclState`'s `Send`/`Sync` are auto-derived: its `Arc<Comm>` and
|
||||
// `Arc<CudaContext>` fields are now `Send`/`Sync` (cudarc asserts the
|
||||
// comm thread-safety invariant), so no manual `unsafe impl` is needed.
|
||||
|
||||
/// Generate a fresh NCCL `Id` and return it hex-encoded. Used by
|
||||
/// the leader to mint the shared communicator id which is then
|
||||
|
||||
Reference in New Issue
Block a user