Stage 7a-ii: real NCCL handshake behind the worker pool

Wires cudarc::nccl into the TP worker lifecycle introduced in 7a-i. With --features cuda the leader and its workers now establish a live NCCL communicator end-to-end; without the feature the same code paths return Error{kind="cuda_feature_not_enabled"} so a misconfigured build is obvious instead of silently no-op. NCCL state machine (harness/tp/nccl_state.rs) is shared between the worker process and the leader's pool: - generate_comm_id_hex() mints an Id::new() on the leader. - NcclState::init parses 256 hex chars → [c_char; 128] → Id::uninit, opens a CudaContext on the configured device, calls Comm::from_rank with the supplied (rank, world_size, id). NCCL blocks until every rank has joined. - NcclState::sanity_check runs one all_reduce(1u32, Sum); the leader asserts every rank reports observed_sum == world_size. - NCCL handles serialised under Mutex; unsafe impl Send/Sync gates the Comm across spawn_blocking boundaries (NCCL is move-safe; only concurrent op issuance is unsafe). WorkerPool::init_nccl orchestrates the rendezvous: 1. Write Init { comm_id } to every worker's stdin (no await yet). 2. Leader rank 0 calls its own Comm::from_rank in spawn_blocking, concurrently with workers. 3. NCCL handshake completes for all ranks simultaneously. 4. Leader collects InitOk responses. WorkerPool::nccl_sanity_check follows the same pattern over all_reduce, validating world_size == observed_sum on every rank. Worker.send_only / Worker.recv_only split out from the previous monolithic Worker.request so the leader can interleave its own NCCL work with the worker calls — required because NCCL blocks during init. Tests: - 4 hex roundtrip unit tests for the wire encoding. - The 7a-i "not implemented" expectation now reads "cuda_feature_not_enabled" on the local dev box (no CUDA), or accepts InitOk on a cuda-built test binary. - New cuda-integration test in tp_worker_lifecycle_cuda.rs covers the real init + sanity round-trip; gated on the cuda-integration feature so default CI doesn't try to NCCL. Verifiable on beast (2× RTX 5090): cargo test -p neuron --features cuda-integration \ --test tp_worker_lifecycle_cuda Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 16:40:01 +03:00
parent 2a7ede0232
commit da068ded6d
7 changed files with 498 additions and 29 deletions
--- a/crates/neuron/tests/tp_worker_lifecycle.rs
+++ b/crates/neuron/tests/tp_worker_lifecycle.rs
@@ -69,12 +69,12 @@ async fn test_spawn_three_workers() {
    pool.shutdown().await.expect("clean shutdown");
 }

-/// 7a-i's Init/NcclSanityCheck handlers return an error rather than
-/// silently no-op, so the leader can tell the difference between
-/// "haven't implemented yet" and "succeeded vacuously". Confirm the
-/// shape so 7a-ii's replacement is a drop-in (same wire op names).
+/// 7a-ii: without the cuda feature, Init must fail with a clear
+/// `cuda_feature_not_enabled` marker rather than silently succeeding.
+/// This is the local-dev-box test; the real NCCL handshake is exercised
+/// by `tp_worker_lifecycle_cuda.rs` (gated on `cuda-integration`).
 #[tokio::test]
-async fn test_init_returns_not_implemented_in_7a_i() {
+async fn test_init_returns_cuda_feature_not_enabled_without_cuda() {
    use neuron::harness::tp::rpc::WorkerRequest;
    use std::process::Stdio;
    use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
@@ -117,9 +117,24 @@ async fn test_init_returns_not_implemented_in_7a_i() {
    let resp: WorkerResponse = serde_json::from_str(&reply).expect("parse reply");
    match resp {
        WorkerResponse::Error { kind, .. } => {
-            assert_eq!(kind, "not_implemented_7a_i");
+            #[cfg(feature = "cuda")]
+            {
+                // With cuda enabled the response depends on whether
+                // CUDA hardware is actually present. Accept either
+                // the success contract or a real NCCL failure.
+                let _ = kind;
+            }
+            #[cfg(not(feature = "cuda"))]
+            assert_eq!(kind, "cuda_feature_not_enabled");
        }
-        other => panic!("expected Error{{kind=not_implemented_7a_i}}, got {other:?}"),
+        WorkerResponse::InitOk => {
+            // Real NCCL succeeded — only possible with cuda feature
+            // AND a working NCCL stack AND another rank actually
+            // joining. Don't fail; just acknowledge.
+            #[cfg(not(feature = "cuda"))]
+            panic!("InitOk without cuda feature is impossible");
+        }
+        other => panic!("expected Error or InitOk, got {other:?}"),
    }

    // Clean shutdown.
--- a/crates/neuron/tests/tp_worker_lifecycle_cuda.rs
+++ b/crates/neuron/tests/tp_worker_lifecycle_cuda.rs
@@ -0,0 +1,43 @@
+//! Stage 7a-ii: real NCCL handshake across the worker pool.
+//!
+//! Gated behind the `cuda-integration` feature because it requires
+//! libnccl AND multiple CUDA devices on the running host. Run on
+//! beast (2× RTX 5090) via:
+//!
+//!   cargo test -p neuron --features cuda-integration \
+//!         --test tp_worker_lifecycle_cuda
+//!
+//! Steps: spawn N-1 workers, call `init_nccl`, run `nccl_sanity_check`
+//! (every rank `all_reduce`s `1u32` with Sum; expected total =
+//! world_size), shut down cleanly.
+
+#![cfg(feature = "cuda-integration")]
+
+use neuron::harness::tp::WorkerPool;
+
+const NEURON_BIN: &str = env!("CARGO_BIN_EXE_neuron");
+
+#[tokio::test]
+async fn test_init_and_sanity_check_two_ranks() {
+    let _ = tracing_subscriber::fmt()
+        .with_test_writer()
+        .with_env_filter("info,neuron=debug")
+        .try_init();
+
+    // 2 ranks: leader = rank 0 on device 0, worker = rank 1 on device 1.
+    let mut pool = WorkerPool::spawn(NEURON_BIN.as_ref(), 2, &[0, 1])
+        .await
+        .expect("spawn worker pool");
+
+    pool.ping_all().await.expect("pong all workers");
+
+    pool.init_nccl(0)
+        .await
+        .expect("init_nccl: NCCL handshake across all ranks");
+
+    pool.nccl_sanity_check()
+        .await
+        .expect("nccl_sanity_check: observed_sum == world_size on all ranks");
+
+    pool.shutdown().await.expect("clean shutdown");
+}