feat(tp): --tp-smoke CLI subcommand + remote validation script

Adds a one-shot diagnostic that exercises the lower half of the TP stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation from model load and inference. Runs N-1 worker subprocesses (rank 0 stays in this process), joins them in an NCCL communicator on the specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies the observed_sum equals world_size on every rank, then shuts down. Output is `status=ok` on stdout (plus key=value lines for tp_size and cuda_devices) when every check passes, non-zero exit + tracing on stderr otherwise. The smoke command is diagnostic-only and not exposed through the daemon HTTP API. script/tp-smoke.sh wraps it with an ssh invocation against a fleet host (default beast — the only host with 2 GPUs) and asserts the status line, mirroring the validate-neuron.sh ergonomics. This is step 1 of the TP test plan. A failure here means TP cannot work on the host at all; step 2 (Stage 7b-iv) wires real model load and inference through the same WorkerPool primitives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 19:40:25 +03:00
parent 96d8755245
commit 9b8bd146f6
2 changed files with 138 additions and 5 deletions
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Context, Result};
 use clap::Parser;
 use neuron::{
    api,
@@ -13,8 +13,9 @@ use tokio::sync::RwLock;
 use tracing_subscriber::EnvFilter;

 /// Top-level CLI. The same binary runs as either the public neuron
-/// daemon (default) or a tensor-parallel worker subprocess (when
-/// `--worker` is set) spawned by the leader on the same host.
+/// daemon (default), a tensor-parallel worker subprocess (when
+/// `--worker` is set, spawned by the leader on the same host), or a
+/// one-shot TP NCCL handshake check (when `--tp-smoke` is set).
 #[derive(Parser)]
 #[command(name = "neuron")]
 #[command(about = "Per-node daemon for cortex inference clusters")]
@@ -28,12 +29,20 @@ struct Args {
    #[arg(long, default_value_t = false)]
    worker: bool,

+    /// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker
+    /// subprocesses on `--cuda-devices`, build the NCCL communicator,
+    /// run an `AllReduce` sanity check across every rank, and exit.
+    /// Used to validate the TP plumbing in isolation from model load
+    /// and inference. Diagnostic-only — not exposed through the daemon
+    /// HTTP API.
+    #[arg(long, default_value_t = false)]
+    tp_smoke: bool,
+
    /// NCCL rank for worker mode. Ignored when `--worker` is not set.
    #[arg(long, default_value_t = 0)]
    rank: u32,

-    /// Total NCCL world size for worker mode. Ignored when `--worker`
-    /// is not set.
+    /// Total NCCL world size for worker mode or TP smoke mode.
    #[arg(long, default_value_t = 1)]
    tp_size: u32,

@@ -42,6 +51,11 @@ struct Args {
    #[arg(long, default_value_t = 0)]
    cuda_device: u32,

+    /// Comma-separated CUDA device indices for TP smoke mode (one per
+    /// rank, starting with rank 0). Must have `tp_size` entries.
+    #[arg(long, value_delimiter = ',')]
+    cuda_devices: Vec<u32>,
+
    /// Port to listen on (overrides config file). Daemon mode only.
    #[arg(short, long)]
    port: Option<u16>,
@@ -72,9 +86,68 @@ async fn main() -> Result<()> {
        .await;
    }

+    if args.tp_smoke {
+        return tp_smoke(args.tp_size, args.cuda_devices).await;
+    }
+
    daemon(args).await
 }

+/// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses
+/// (rank 0 stays in this process), builds the NCCL communicator across
+/// the full world, runs an AllReduce sanity check, and shuts everyone
+/// down. Output is plain log lines on stderr + a final summary on
+/// stdout in `key=value` form so an outer script can parse it.
+async fn tp_smoke(tp_size: u32, cuda_devices: Vec<u32>) -> Result<()> {
+    if tp_size < 2 {
+        anyhow::bail!("--tp-size must be at least 2 (got {tp_size})");
+    }
+    if cuda_devices.len() as u32 != tp_size {
+        anyhow::bail!(
+            "--cuda-devices must list exactly {tp_size} entries (got {})",
+            cuda_devices.len()
+        );
+    }
+
+    let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?;
+    let leader_device = cuda_devices[0];
+
+    tracing::info!(
+        tp_size,
+        ?cuda_devices,
+        binary = %exe.display(),
+        "tp-smoke: spawning worker pool"
+    );
+    let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?;
+
+    tracing::info!("tp-smoke: pinging every worker");
+    let pongs = pool.ping_all().await?;
+    for p in &pongs {
+        tracing::info!(?p, "tp-smoke: pong");
+    }
+
+    tracing::info!(leader_device, "tp-smoke: initialising NCCL");
+    pool.init_nccl(leader_device).await?;
+
+    tracing::info!("tp-smoke: running AllReduce sanity check");
+    pool.nccl_sanity_check().await?;
+
+    tracing::info!("tp-smoke: shutting down pool");
+    pool.shutdown().await?;
+
+    println!("status=ok");
+    println!("tp_size={tp_size}");
+    println!(
+        "cuda_devices={}",
+        cuda_devices
+            .iter()
+            .map(|d| d.to_string())
+            .collect::<Vec<_>>()
+            .join(",")
+    );
+    Ok(())
+}
+
 async fn daemon(args: Args) -> Result<()> {
    let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
        tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");