From c4f239ceb9430cc48ec359c6086bcf2f9489cb57 Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 8 Jun 2026 13:49:59 +0300 Subject: [PATCH] build(neuron): patch cudarc to expose Comm::abort/get_async_error (#17 Stage 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #17 Stage 2 (TP hang-recovery) needs to call ncclCommAbort on a LIVE communicator from another thread — to unblock a collective wedged on a dead/hung peer so the ranks can resync. No cudarc release (incl. main) exposes this: the safe Comm only aborts in Drop, which can't fire while a stuck thread holds an Arc clone. Pin neuron's cudarc 0.19.7 to a fork (grenade/cudarc @ nccl-comm-abort, rev 4dff0be) adding three thin methods — Comm::abort, get_async_error, and a raw comm() accessor — to be submitted upstream. The patch targets 0.19.x only; candle's transitive cudarc 0.17.8 stays on crates.io. Foundation only; the watchdog + abort + comm-rebuild that consume these land in follow-up commits (cuda-gated → validated by the blackwell build). Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 3 +-- Cargo.toml | 9 +++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e6cd5a4..4690de1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -905,8 +905,7 @@ dependencies = [ [[package]] name = "cudarc" version = "0.19.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cea5f10a99e025c1b44ae2354c2d8326b25ddbd0baf76bde8e55cfd4018a2cc" +source = "git+https://github.com/grenade/cudarc?rev=4dff0be72d8a685d6691a6a53d4c95e1fe932277#4dff0be72d8a685d6691a6a53d4c95e1fe932277" dependencies = [ "float8", "half", diff --git a/Cargo.toml b/Cargo.toml index 2ad8471..b69d9d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,3 +61,12 @@ eventsource-stream = "0.2" # workspace crates cortex-core = { path = "crates/cortex-core" } cortex-gateway = { path = "crates/cortex-gateway" } + +# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is +# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds +# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP +# hang-recovery (abort a wedged collective from another thread, then +# rebuild the comm). Pinned to a fork revision pending upstream review +# (grenade/cudarc @ nccl-comm-abort). +[patch.crates-io] +cudarc = { git = "https://github.com/grenade/cudarc", rev = "4dff0be72d8a685d6691a6a53d4c95e1fe932277" }