feat(tp): --tp-smoke CLI subcommand + remote validation script

Adds a one-shot diagnostic that exercises the lower half of the TP stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation from model load and inference. Runs N-1 worker subprocesses (rank 0 stays in this process), joins them in an NCCL communicator on the specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies the observed_sum equals world_size on every rank, then shuts down. Output is `status=ok` on stdout (plus key=value lines for tp_size and cuda_devices) when every check passes, non-zero exit + tracing on stderr otherwise. The smoke command is diagnostic-only and not exposed through the daemon HTTP API. script/tp-smoke.sh wraps it with an ssh invocation against a fleet host (default beast — the only host with 2 GPUs) and asserts the status line, mirroring the validate-neuron.sh ergonomics. This is step 1 of the TP test plan. A failure here means TP cannot work on the host at all; step 2 (Stage 7b-iv) wires real model load and inference through the same WorkerPool primitives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 19:40:25 +03:00
parent 96d8755245
commit 9b8bd146f6
2 changed files with 138 additions and 5 deletions
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Context, Result};
 use clap::Parser;
 use neuron::{
    api,
@@ -13,8 +13,9 @@ use tokio::sync::RwLock;
 use tracing_subscriber::EnvFilter;
 /// Top-level CLI. The same binary runs as either the public neuron
-/// daemon (default) or a tensor-parallel worker subprocess (when
+/// daemon (default), a tensor-parallel worker subprocess (when
-/// `--worker` is set) spawned by the leader on the same host.
+/// `--worker` is set, spawned by the leader on the same host), or a
 /// one-shot TP NCCL handshake check (when `--tp-smoke` is set).
 #[derive(Parser)]
 #[command(name = "neuron")]
 #[command(about = "Per-node daemon for cortex inference clusters")]
@@ -28,12 +29,20 @@ struct Args {
    #[arg(long, default_value_t = false)]
    worker: bool,
    /// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker
    /// subprocesses on `--cuda-devices`, build the NCCL communicator,
    /// run an `AllReduce` sanity check across every rank, and exit.
    /// Used to validate the TP plumbing in isolation from model load
    /// and inference. Diagnostic-only — not exposed through the daemon
    /// HTTP API.
    #[arg(long, default_value_t = false)]
    tp_smoke: bool,
    /// NCCL rank for worker mode. Ignored when `--worker` is not set.
    #[arg(long, default_value_t = 0)]
    rank: u32,
-    /// Total NCCL world size for worker mode. Ignored when `--worker`
+    /// Total NCCL world size for worker mode or TP smoke mode.
    /// is not set.
    #[arg(long, default_value_t = 1)]
    tp_size: u32,
@@ -42,6 +51,11 @@ struct Args {
    #[arg(long, default_value_t = 0)]
    cuda_device: u32,
    /// Comma-separated CUDA device indices for TP smoke mode (one per
    /// rank, starting with rank 0). Must have `tp_size` entries.
    #[arg(long, value_delimiter = ',')]
    cuda_devices: Vec<u32>,
    /// Port to listen on (overrides config file). Daemon mode only.
    #[arg(short, long)]
    port: Option<u16>,
@@ -72,9 +86,68 @@ async fn main() -> Result<()> {
        .await;
    }
    if args.tp_smoke {
        return tp_smoke(args.tp_size, args.cuda_devices).await;
    }
    daemon(args).await
 }
 /// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses
 /// (rank 0 stays in this process), builds the NCCL communicator across
 /// the full world, runs an AllReduce sanity check, and shuts everyone
 /// down. Output is plain log lines on stderr + a final summary on
 /// stdout in `key=value` form so an outer script can parse it.
 async fn tp_smoke(tp_size: u32, cuda_devices: Vec<u32>) -> Result<()> {
    if tp_size < 2 {
        anyhow::bail!("--tp-size must be at least 2 (got {tp_size})");
    }
    if cuda_devices.len() as u32 != tp_size {
        anyhow::bail!(
            "--cuda-devices must list exactly {tp_size} entries (got {})",
            cuda_devices.len()
        );
    }
    let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?;
    let leader_device = cuda_devices[0];
    tracing::info!(
        tp_size,
        ?cuda_devices,
        binary = %exe.display(),
        "tp-smoke: spawning worker pool"
    );
    let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?;
    tracing::info!("tp-smoke: pinging every worker");
    let pongs = pool.ping_all().await?;
    for p in &pongs {
        tracing::info!(?p, "tp-smoke: pong");
    }
    tracing::info!(leader_device, "tp-smoke: initialising NCCL");
    pool.init_nccl(leader_device).await?;
    tracing::info!("tp-smoke: running AllReduce sanity check");
    pool.nccl_sanity_check().await?;
    tracing::info!("tp-smoke: shutting down pool");
    pool.shutdown().await?;
    println!("status=ok");
    println!("tp_size={tp_size}");
    println!(
        "cuda_devices={}",
        cuda_devices
            .iter()
            .map(|d| d.to_string())
            .collect::<Vec<_>>()
            .join(",")
    );
    Ok(())
 }
 async fn daemon(args: Args) -> Result<()> {
    let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
        tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");
--- a/script/tp-smoke.sh
+++ b/script/tp-smoke.sh
@@ -0,0 +1,60 @@
 #!/bin/env bash
 #
 # TP smoke test against a deployed neuron host.
 #
 # SSHes into the target host and runs `neuron --tp-smoke --tp-size N
 # --cuda-devices ...` directly — no HTTP API involved. The smoke
 # subcommand spawns N-1 worker subprocesses, joins them in an NCCL
 # communicator, runs one AllReduce(Sum) of `1u32` across every rank, and
 # verifies the observed sum equals world_size on every rank.
 #
 # This validates the lower-half of the TP stack (NCCL + IPC topology +
 # subprocess lifecycle) without touching model load, inference, or HTTP.
 # A failure here means the host cannot run any TP model and there is no
 # point debugging the higher layers.
 #
 # Usage:
 #   script/tp-smoke.sh [host] [tp_size] [cuda_devices]
 #
 # Defaults:
 #   host         = beast.hanzalova.internal  (only fleet host with 2 GPUs)
 #   tp_size      = 2
 #   cuda_devices = 0,1
 set -euo pipefail
 HOST="${1:-beast.hanzalova.internal}"
 TP_SIZE="${2:-2}"
 CUDA_DEVICES="${3:-0,1}"
 say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; }
 die() { say "FAIL: $*"; exit 1; }
 say "running neuron --tp-smoke --tp-size ${TP_SIZE} --cuda-devices ${CUDA_DEVICES}"
 # Run as root via sudo because:
 #   - cuda contexts under a user account require either the nvidia
 #     uvm/peer devices to be world-readable or the user to be in a
 #     priviliged group (neither is true on stock fc43);
 #   - the installed binary lives at /usr/bin/neuron with no setuid;
 # Running through root is the simplest path that matches how
 # systemd-managed neuron sees the GPUs in production.
 #
 # The smoke command is read-only — it allocates a transient NCCL comm
 # and a 1u32 buffer per rank, then tears it all down.
 if ! ssh -o BatchMode=yes "${HOST}" \
    sudo /usr/bin/neuron \
        --tp-smoke \
        --tp-size "${TP_SIZE}" \
        --cuda-devices "${CUDA_DEVICES}" 2>&1 | tee /tmp/tp-smoke-"${HOST}".log
 then
    die "tp-smoke exited non-zero (see /tmp/tp-smoke-${HOST}.log)"
 fi
 # Final stdout line is `status=ok` on success.
 if grep -q '^status=ok$' /tmp/tp-smoke-"${HOST}".log; then
    say "PASS — NCCL handshake + AllReduce sanity check OK across ${TP_SIZE} ranks"
    exit 0
 else
    die "no status=ok line in output"
 fi