feat(tp): --tp-smoke CLI subcommand + remote validation script

Adds a one-shot diagnostic that exercises the lower half of the TP stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation from model load and inference. Runs N-1 worker subprocesses (rank 0 stays in this process), joins them in an NCCL communicator on the specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies the observed_sum equals world_size on every rank, then shuts down. Output is `status=ok` on stdout (plus key=value lines for tp_size and cuda_devices) when every check passes, non-zero exit + tracing on stderr otherwise. The smoke command is diagnostic-only and not exposed through the daemon HTTP API. script/tp-smoke.sh wraps it with an ssh invocation against a fleet host (default beast — the only host with 2 GPUs) and asserts the status line, mirroring the validate-neuron.sh ergonomics. This is step 1 of the TP test plan. A failure here means TP cannot work on the host at all; step 2 (Stage 7b-iv) wires real model load and inference through the same WorkerPool primitives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 19:40:25 +03:00
parent 96d8755245
commit 9b8bd146f6
2 changed files with 138 additions and 5 deletions
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Context, Result};
 use clap::Parser;
 use neuron::{
    api,
@@ -13,8 +13,9 @@ use tokio::sync::RwLock;
 use tracing_subscriber::EnvFilter;

 /// Top-level CLI. The same binary runs as either the public neuron
-/// daemon (default) or a tensor-parallel worker subprocess (when
-/// `--worker` is set) spawned by the leader on the same host.
+/// daemon (default), a tensor-parallel worker subprocess (when
+/// `--worker` is set, spawned by the leader on the same host), or a
+/// one-shot TP NCCL handshake check (when `--tp-smoke` is set).
 #[derive(Parser)]
 #[command(name = "neuron")]
 #[command(about = "Per-node daemon for cortex inference clusters")]
@@ -28,12 +29,20 @@ struct Args {
    #[arg(long, default_value_t = false)]
    worker: bool,

+    /// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker
+    /// subprocesses on `--cuda-devices`, build the NCCL communicator,
+    /// run an `AllReduce` sanity check across every rank, and exit.
+    /// Used to validate the TP plumbing in isolation from model load
+    /// and inference. Diagnostic-only — not exposed through the daemon
+    /// HTTP API.
+    #[arg(long, default_value_t = false)]
+    tp_smoke: bool,
+
    /// NCCL rank for worker mode. Ignored when `--worker` is not set.
    #[arg(long, default_value_t = 0)]
    rank: u32,

-    /// Total NCCL world size for worker mode. Ignored when `--worker`
-    /// is not set.
+    /// Total NCCL world size for worker mode or TP smoke mode.
    #[arg(long, default_value_t = 1)]
    tp_size: u32,

@@ -42,6 +51,11 @@ struct Args {
    #[arg(long, default_value_t = 0)]
    cuda_device: u32,

+    /// Comma-separated CUDA device indices for TP smoke mode (one per
+    /// rank, starting with rank 0). Must have `tp_size` entries.
+    #[arg(long, value_delimiter = ',')]
+    cuda_devices: Vec<u32>,
+
    /// Port to listen on (overrides config file). Daemon mode only.
    #[arg(short, long)]
    port: Option<u16>,
@@ -72,9 +86,68 @@ async fn main() -> Result<()> {
        .await;
    }

+    if args.tp_smoke {
+        return tp_smoke(args.tp_size, args.cuda_devices).await;
+    }
+
    daemon(args).await
 }

+/// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses
+/// (rank 0 stays in this process), builds the NCCL communicator across
+/// the full world, runs an AllReduce sanity check, and shuts everyone
+/// down. Output is plain log lines on stderr + a final summary on
+/// stdout in `key=value` form so an outer script can parse it.
+async fn tp_smoke(tp_size: u32, cuda_devices: Vec<u32>) -> Result<()> {
+    if tp_size < 2 {
+        anyhow::bail!("--tp-size must be at least 2 (got {tp_size})");
+    }
+    if cuda_devices.len() as u32 != tp_size {
+        anyhow::bail!(
+            "--cuda-devices must list exactly {tp_size} entries (got {})",
+            cuda_devices.len()
+        );
+    }
+
+    let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?;
+    let leader_device = cuda_devices[0];
+
+    tracing::info!(
+        tp_size,
+        ?cuda_devices,
+        binary = %exe.display(),
+        "tp-smoke: spawning worker pool"
+    );
+    let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?;
+
+    tracing::info!("tp-smoke: pinging every worker");
+    let pongs = pool.ping_all().await?;
+    for p in &pongs {
+        tracing::info!(?p, "tp-smoke: pong");
+    }
+
+    tracing::info!(leader_device, "tp-smoke: initialising NCCL");
+    pool.init_nccl(leader_device).await?;
+
+    tracing::info!("tp-smoke: running AllReduce sanity check");
+    pool.nccl_sanity_check().await?;
+
+    tracing::info!("tp-smoke: shutting down pool");
+    pool.shutdown().await?;
+
+    println!("status=ok");
+    println!("tp_size={tp_size}");
+    println!(
+        "cuda_devices={}",
+        cuda_devices
+            .iter()
+            .map(|d| d.to_string())
+            .collect::<Vec<_>>()
+            .join(",")
+    );
+    Ok(())
+}
+
 async fn daemon(args: Args) -> Result<()> {
    let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
        tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");
--- a/script/tp-smoke.sh
+++ b/script/tp-smoke.sh
@@ -0,0 +1,60 @@
+#!/bin/env bash
+#
+# TP smoke test against a deployed neuron host.
+#
+# SSHes into the target host and runs `neuron --tp-smoke --tp-size N
+# --cuda-devices ...` directly — no HTTP API involved. The smoke
+# subcommand spawns N-1 worker subprocesses, joins them in an NCCL
+# communicator, runs one AllReduce(Sum) of `1u32` across every rank, and
+# verifies the observed sum equals world_size on every rank.
+#
+# This validates the lower-half of the TP stack (NCCL + IPC topology +
+# subprocess lifecycle) without touching model load, inference, or HTTP.
+# A failure here means the host cannot run any TP model and there is no
+# point debugging the higher layers.
+#
+# Usage:
+#   script/tp-smoke.sh [host] [tp_size] [cuda_devices]
+#
+# Defaults:
+#   host         = beast.hanzalova.internal  (only fleet host with 2 GPUs)
+#   tp_size      = 2
+#   cuda_devices = 0,1
+
+set -euo pipefail
+
+HOST="${1:-beast.hanzalova.internal}"
+TP_SIZE="${2:-2}"
+CUDA_DEVICES="${3:-0,1}"
+
+say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; }
+die() { say "FAIL: $*"; exit 1; }
+
+say "running neuron --tp-smoke --tp-size ${TP_SIZE} --cuda-devices ${CUDA_DEVICES}"
+
+# Run as root via sudo because:
+#   - cuda contexts under a user account require either the nvidia
+#     uvm/peer devices to be world-readable or the user to be in a
+#     priviliged group (neither is true on stock fc43);
+#   - the installed binary lives at /usr/bin/neuron with no setuid;
+# Running through root is the simplest path that matches how
+# systemd-managed neuron sees the GPUs in production.
+#
+# The smoke command is read-only — it allocates a transient NCCL comm
+# and a 1u32 buffer per rank, then tears it all down.
+if ! ssh -o BatchMode=yes "${HOST}" \
+    sudo /usr/bin/neuron \
+        --tp-smoke \
+        --tp-size "${TP_SIZE}" \
+        --cuda-devices "${CUDA_DEVICES}" 2>&1 | tee /tmp/tp-smoke-"${HOST}".log
+then
+    die "tp-smoke exited non-zero (see /tmp/tp-smoke-${HOST}.log)"
+fi
+
+# Final stdout line is `status=ok` on success.
+if grep -q '^status=ok$' /tmp/tp-smoke-"${HOST}".log; then
+    say "PASS — NCCL handshake + AllReduce sanity check OK across ${TP_SIZE} ranks"
+    exit 0
+else
+    die "no status=ok line in output"
+fi