From 9b8bd146f6f4634e746b4eed9f609fccccf9b3bc Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Tue, 19 May 2026 19:40:25 +0300
Subject: [PATCH] feat(tp): --tp-smoke CLI subcommand + remote validation
 script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a one-shot diagnostic that exercises the lower half of the TP
stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation
from model load and inference. Runs N-1 worker subprocesses (rank 0
stays in this process), joins them in an NCCL communicator on the
specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies
the observed_sum equals world_size on every rank, then shuts down.

Output is `status=ok` on stdout (plus key=value lines for tp_size and
cuda_devices) when every check passes, non-zero exit + tracing on
stderr otherwise. The smoke command is diagnostic-only and not exposed
through the daemon HTTP API.

script/tp-smoke.sh wraps it with an ssh invocation against a fleet
host (default beast — the only host with 2 GPUs) and asserts the
status line, mirroring the validate-neuron.sh ergonomics.

This is step 1 of the TP test plan. A failure here means TP cannot
work on the host at all; step 2 (Stage 7b-iv) wires real model load
and inference through the same WorkerPool primitives.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/neuron/src/main.rs | 83 ++++++++++++++++++++++++++++++++++++---
 script/tp-smoke.sh        | 60 ++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 5 deletions(-)
 create mode 100755 script/tp-smoke.sh
diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs
index d3b2ed1..d194b5a 100644
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Context, Result};
 use clap::Parser;
 use neuron::{
     api,
@@ -13,8 +13,9 @@ use tokio::sync::RwLock;
 use tracing_subscriber::EnvFilter;
 
 /// Top-level CLI. The same binary runs as either the public neuron
-/// daemon (default) or a tensor-parallel worker subprocess (when
-/// `--worker` is set) spawned by the leader on the same host.
+/// daemon (default), a tensor-parallel worker subprocess (when
+/// `--worker` is set, spawned by the leader on the same host), or a
+/// one-shot TP NCCL handshake check (when `--tp-smoke` is set).
 #[derive(Parser)]
 #[command(name = "neuron")]
 #[command(about = "Per-node daemon for cortex inference clusters")]
@@ -28,12 +29,20 @@ struct Args {
     #[arg(long, default_value_t = false)]
     worker: bool,
 
+    /// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker
+    /// subprocesses on `--cuda-devices`, build the NCCL communicator,
+    /// run an `AllReduce` sanity check across every rank, and exit.
+    /// Used to validate the TP plumbing in isolation from model load
+    /// and inference. Diagnostic-only — not exposed through the daemon
+    /// HTTP API.
+    #[arg(long, default_value_t = false)]
+    tp_smoke: bool,
+
     /// NCCL rank for worker mode. Ignored when `--worker` is not set.
     #[arg(long, default_value_t = 0)]
     rank: u32,
 
-    /// Total NCCL world size for worker mode. Ignored when `--worker`
-    /// is not set.
+    /// Total NCCL world size for worker mode or TP smoke mode.
     #[arg(long, default_value_t = 1)]
     tp_size: u32,
 
@@ -42,6 +51,11 @@ struct Args {
     #[arg(long, default_value_t = 0)]
     cuda_device: u32,
 
+    /// Comma-separated CUDA device indices for TP smoke mode (one per
+    /// rank, starting with rank 0). Must have `tp_size` entries.
+    #[arg(long, value_delimiter = ',')]
+    cuda_devices: Vec<u32>,
+
     /// Port to listen on (overrides config file). Daemon mode only.
     #[arg(short, long)]
     port: Option<u16>,
@@ -72,9 +86,68 @@ async fn main() -> Result<()> {
         .await;
     }
 
+    if args.tp_smoke {
+        return tp_smoke(args.tp_size, args.cuda_devices).await;
+    }
+
     daemon(args).await
 }
 
+/// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses
+/// (rank 0 stays in this process), builds the NCCL communicator across
+/// the full world, runs an AllReduce sanity check, and shuts everyone
+/// down. Output is plain log lines on stderr + a final summary on
+/// stdout in `key=value` form so an outer script can parse it.
+async fn tp_smoke(tp_size: u32, cuda_devices: Vec<u32>) -> Result<()> {
+    if tp_size < 2 {
+        anyhow::bail!("--tp-size must be at least 2 (got {tp_size})");
+    }
+    if cuda_devices.len() as u32 != tp_size {
+        anyhow::bail!(
+            "--cuda-devices must list exactly {tp_size} entries (got {})",
+            cuda_devices.len()
+        );
+    }
+
+    let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?;
+    let leader_device = cuda_devices[0];
+
+    tracing::info!(
+        tp_size,
+        ?cuda_devices,
+        binary = %exe.display(),
+        "tp-smoke: spawning worker pool"
+    );
+    let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?;
+
+    tracing::info!("tp-smoke: pinging every worker");
+    let pongs = pool.ping_all().await?;
+    for p in &pongs {
+        tracing::info!(?p, "tp-smoke: pong");
+    }
+
+    tracing::info!(leader_device, "tp-smoke: initialising NCCL");
+    pool.init_nccl(leader_device).await?;
+
+    tracing::info!("tp-smoke: running AllReduce sanity check");
+    pool.nccl_sanity_check().await?;
+
+    tracing::info!("tp-smoke: shutting down pool");
+    pool.shutdown().await?;
+
+    println!("status=ok");
+    println!("tp_size={tp_size}");
+    println!(
+        "cuda_devices={}",
+        cuda_devices
+            .iter()
+            .map(|d| d.to_string())
+            .collect::<Vec<_>>()
+            .join(",")
+    );
+    Ok(())
+}
+
 async fn daemon(args: Args) -> Result<()> {
     let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
         tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");
diff --git a/script/tp-smoke.sh b/script/tp-smoke.sh
new file mode 100755
index 0000000..6677095
--- /dev/null
+++ b/script/tp-smoke.sh
@@ -0,0 +1,60 @@
+#!/bin/env bash
+#
+# TP smoke test against a deployed neuron host.
+#
+# SSHes into the target host and runs `neuron --tp-smoke --tp-size N
+# --cuda-devices ...` directly — no HTTP API involved. The smoke
+# subcommand spawns N-1 worker subprocesses, joins them in an NCCL
+# communicator, runs one AllReduce(Sum) of `1u32` across every rank, and
+# verifies the observed sum equals world_size on every rank.
+#
+# This validates the lower-half of the TP stack (NCCL + IPC topology +
+# subprocess lifecycle) without touching model load, inference, or HTTP.
+# A failure here means the host cannot run any TP model and there is no
+# point debugging the higher layers.
+#
+# Usage:
+#   script/tp-smoke.sh [host] [tp_size] [cuda_devices]
+#
+# Defaults:
+#   host         = beast.hanzalova.internal  (only fleet host with 2 GPUs)
+#   tp_size      = 2
+#   cuda_devices = 0,1
+
+set -euo pipefail
+
+HOST="${1:-beast.hanzalova.internal}"
+TP_SIZE="${2:-2}"
+CUDA_DEVICES="${3:-0,1}"
+
+say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; }
+die() { say "FAIL: $*"; exit 1; }
+
+say "running neuron --tp-smoke --tp-size ${TP_SIZE} --cuda-devices ${CUDA_DEVICES}"
+
+# Run as root via sudo because:
+#   - cuda contexts under a user account require either the nvidia
+#     uvm/peer devices to be world-readable or the user to be in a
+#     priviliged group (neither is true on stock fc43);
+#   - the installed binary lives at /usr/bin/neuron with no setuid;
+# Running through root is the simplest path that matches how
+# systemd-managed neuron sees the GPUs in production.
+#
+# The smoke command is read-only — it allocates a transient NCCL comm
+# and a 1u32 buffer per rank, then tears it all down.
+if ! ssh -o BatchMode=yes "${HOST}" \
+    sudo /usr/bin/neuron \
+        --tp-smoke \
+        --tp-size "${TP_SIZE}" \
+        --cuda-devices "${CUDA_DEVICES}" 2>&1 | tee /tmp/tp-smoke-"${HOST}".log
+then
+    die "tp-smoke exited non-zero (see /tmp/tp-smoke-${HOST}.log)"
+fi
+
+# Final stdout line is `status=ok` on success.
+if grep -q '^status=ok$' /tmp/tp-smoke-"${HOST}".log; then
+    say "PASS — NCCL handshake + AllReduce sanity check OK across ${TP_SIZE} ranks"
+    exit 0
+else
+    die "no status=ok line in output"
+fi