From 9b8bd146f6f4634e746b4eed9f609fccccf9b3bc Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 19 May 2026 19:40:25 +0300 Subject: [PATCH] feat(tp): --tp-smoke CLI subcommand + remote validation script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a one-shot diagnostic that exercises the lower half of the TP stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation from model load and inference. Runs N-1 worker subprocesses (rank 0 stays in this process), joins them in an NCCL communicator on the specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies the observed_sum equals world_size on every rank, then shuts down. Output is `status=ok` on stdout (plus key=value lines for tp_size and cuda_devices) when every check passes, non-zero exit + tracing on stderr otherwise. The smoke command is diagnostic-only and not exposed through the daemon HTTP API. script/tp-smoke.sh wraps it with an ssh invocation against a fleet host (default beast — the only host with 2 GPUs) and asserts the status line, mirroring the validate-neuron.sh ergonomics. This is step 1 of the TP test plan. A failure here means TP cannot work on the host at all; step 2 (Stage 7b-iv) wires real model load and inference through the same WorkerPool primitives. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/neuron/src/main.rs | 83 ++++++++++++++++++++++++++++++++++++--- script/tp-smoke.sh | 60 ++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 5 deletions(-) create mode 100755 script/tp-smoke.sh diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs index d3b2ed1..d194b5a 100644 --- a/crates/neuron/src/main.rs +++ b/crates/neuron/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::Result; +use anyhow::{Context, Result}; use clap::Parser; use neuron::{ api, @@ -13,8 +13,9 @@ use tokio::sync::RwLock; use tracing_subscriber::EnvFilter; /// Top-level CLI. The same binary runs as either the public neuron -/// daemon (default) or a tensor-parallel worker subprocess (when -/// `--worker` is set) spawned by the leader on the same host. +/// daemon (default), a tensor-parallel worker subprocess (when +/// `--worker` is set, spawned by the leader on the same host), or a +/// one-shot TP NCCL handshake check (when `--tp-smoke` is set). #[derive(Parser)] #[command(name = "neuron")] #[command(about = "Per-node daemon for cortex inference clusters")] @@ -28,12 +29,20 @@ struct Args { #[arg(long, default_value_t = false)] worker: bool, + /// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker + /// subprocesses on `--cuda-devices`, build the NCCL communicator, + /// run an `AllReduce` sanity check across every rank, and exit. + /// Used to validate the TP plumbing in isolation from model load + /// and inference. Diagnostic-only — not exposed through the daemon + /// HTTP API. + #[arg(long, default_value_t = false)] + tp_smoke: bool, + /// NCCL rank for worker mode. Ignored when `--worker` is not set. #[arg(long, default_value_t = 0)] rank: u32, - /// Total NCCL world size for worker mode. Ignored when `--worker` - /// is not set. + /// Total NCCL world size for worker mode or TP smoke mode. #[arg(long, default_value_t = 1)] tp_size: u32, @@ -42,6 +51,11 @@ struct Args { #[arg(long, default_value_t = 0)] cuda_device: u32, + /// Comma-separated CUDA device indices for TP smoke mode (one per + /// rank, starting with rank 0). Must have `tp_size` entries. + #[arg(long, value_delimiter = ',')] + cuda_devices: Vec, + /// Port to listen on (overrides config file). Daemon mode only. #[arg(short, long)] port: Option, @@ -72,9 +86,68 @@ async fn main() -> Result<()> { .await; } + if args.tp_smoke { + return tp_smoke(args.tp_size, args.cuda_devices).await; + } + daemon(args).await } +/// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses +/// (rank 0 stays in this process), builds the NCCL communicator across +/// the full world, runs an AllReduce sanity check, and shuts everyone +/// down. Output is plain log lines on stderr + a final summary on +/// stdout in `key=value` form so an outer script can parse it. +async fn tp_smoke(tp_size: u32, cuda_devices: Vec) -> Result<()> { + if tp_size < 2 { + anyhow::bail!("--tp-size must be at least 2 (got {tp_size})"); + } + if cuda_devices.len() as u32 != tp_size { + anyhow::bail!( + "--cuda-devices must list exactly {tp_size} entries (got {})", + cuda_devices.len() + ); + } + + let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?; + let leader_device = cuda_devices[0]; + + tracing::info!( + tp_size, + ?cuda_devices, + binary = %exe.display(), + "tp-smoke: spawning worker pool" + ); + let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?; + + tracing::info!("tp-smoke: pinging every worker"); + let pongs = pool.ping_all().await?; + for p in &pongs { + tracing::info!(?p, "tp-smoke: pong"); + } + + tracing::info!(leader_device, "tp-smoke: initialising NCCL"); + pool.init_nccl(leader_device).await?; + + tracing::info!("tp-smoke: running AllReduce sanity check"); + pool.nccl_sanity_check().await?; + + tracing::info!("tp-smoke: shutting down pool"); + pool.shutdown().await?; + + println!("status=ok"); + println!("tp_size={tp_size}"); + println!( + "cuda_devices={}", + cuda_devices + .iter() + .map(|d| d.to_string()) + .collect::>() + .join(",") + ); + Ok(()) +} + async fn daemon(args: Args) -> Result<()> { let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| { tracing::warn!(path = %args.config, error = %e, "config not found, using defaults"); diff --git a/script/tp-smoke.sh b/script/tp-smoke.sh new file mode 100755 index 0000000..6677095 --- /dev/null +++ b/script/tp-smoke.sh @@ -0,0 +1,60 @@ +#!/bin/env bash +# +# TP smoke test against a deployed neuron host. +# +# SSHes into the target host and runs `neuron --tp-smoke --tp-size N +# --cuda-devices ...` directly — no HTTP API involved. The smoke +# subcommand spawns N-1 worker subprocesses, joins them in an NCCL +# communicator, runs one AllReduce(Sum) of `1u32` across every rank, and +# verifies the observed sum equals world_size on every rank. +# +# This validates the lower-half of the TP stack (NCCL + IPC topology + +# subprocess lifecycle) without touching model load, inference, or HTTP. +# A failure here means the host cannot run any TP model and there is no +# point debugging the higher layers. +# +# Usage: +# script/tp-smoke.sh [host] [tp_size] [cuda_devices] +# +# Defaults: +# host = beast.hanzalova.internal (only fleet host with 2 GPUs) +# tp_size = 2 +# cuda_devices = 0,1 + +set -euo pipefail + +HOST="${1:-beast.hanzalova.internal}" +TP_SIZE="${2:-2}" +CUDA_DEVICES="${3:-0,1}" + +say() { printf '[%s] %s\n' "${HOST}" "$*" >&2; } +die() { say "FAIL: $*"; exit 1; } + +say "running neuron --tp-smoke --tp-size ${TP_SIZE} --cuda-devices ${CUDA_DEVICES}" + +# Run as root via sudo because: +# - cuda contexts under a user account require either the nvidia +# uvm/peer devices to be world-readable or the user to be in a +# priviliged group (neither is true on stock fc43); +# - the installed binary lives at /usr/bin/neuron with no setuid; +# Running through root is the simplest path that matches how +# systemd-managed neuron sees the GPUs in production. +# +# The smoke command is read-only — it allocates a transient NCCL comm +# and a 1u32 buffer per rank, then tears it all down. +if ! ssh -o BatchMode=yes "${HOST}" \ + sudo /usr/bin/neuron \ + --tp-smoke \ + --tp-size "${TP_SIZE}" \ + --cuda-devices "${CUDA_DEVICES}" 2>&1 | tee /tmp/tp-smoke-"${HOST}".log +then + die "tp-smoke exited non-zero (see /tmp/tp-smoke-${HOST}.log)" +fi + +# Final stdout line is `status=ok` on success. +if grep -q '^status=ok$' /tmp/tp-smoke-"${HOST}".log; then + say "PASS — NCCL handshake + AllReduce sanity check OK across ${TP_SIZE} ranks" + exit 0 +else + die "no status=ok line in output" +fi