feat(tp): --tp-smoke CLI subcommand + remote validation script
All checks were successful
CI / Format (push) Successful in 36s
build-prerelease / Resolve version stamps (push) Successful in 38s
CI / Clippy (push) Successful in 2m19s
CI / Test (push) Successful in 4m32s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Successful in 3m43s
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m16s
build-prerelease / Package cortex RPM (push) Successful in 1m23s
build-prerelease / Build neuron-ampere (push) Successful in 4m56s
build-prerelease / Build neuron-ada (push) Successful in 5m1s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m51s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m0s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m39s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 59s
All checks were successful
CI / Format (push) Successful in 36s
build-prerelease / Resolve version stamps (push) Successful in 38s
CI / Clippy (push) Successful in 2m19s
CI / Test (push) Successful in 4m32s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Successful in 3m43s
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m16s
build-prerelease / Package cortex RPM (push) Successful in 1m23s
build-prerelease / Build neuron-ampere (push) Successful in 4m56s
build-prerelease / Build neuron-ada (push) Successful in 5m1s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 2m51s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m0s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m39s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 59s
Adds a one-shot diagnostic that exercises the lower half of the TP stack — WorkerPool::spawn, init_nccl, nccl_sanity_check — in isolation from model load and inference. Runs N-1 worker subprocesses (rank 0 stays in this process), joins them in an NCCL communicator on the specified CUDA devices, all_reduces a sentinel 1u32 per rank, verifies the observed_sum equals world_size on every rank, then shuts down. Output is `status=ok` on stdout (plus key=value lines for tp_size and cuda_devices) when every check passes, non-zero exit + tracing on stderr otherwise. The smoke command is diagnostic-only and not exposed through the daemon HTTP API. script/tp-smoke.sh wraps it with an ssh invocation against a fleet host (default beast — the only host with 2 GPUs) and asserts the status line, mirroring the validate-neuron.sh ergonomics. This is step 1 of the TP test plan. A failure here means TP cannot work on the host at all; step 2 (Stage 7b-iv) wires real model load and inference through the same WorkerPool primitives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Parser;
|
||||
use neuron::{
|
||||
api,
|
||||
@@ -13,8 +13,9 @@ use tokio::sync::RwLock;
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
/// Top-level CLI. The same binary runs as either the public neuron
|
||||
/// daemon (default) or a tensor-parallel worker subprocess (when
|
||||
/// `--worker` is set) spawned by the leader on the same host.
|
||||
/// daemon (default), a tensor-parallel worker subprocess (when
|
||||
/// `--worker` is set, spawned by the leader on the same host), or a
|
||||
/// one-shot TP NCCL handshake check (when `--tp-smoke` is set).
|
||||
#[derive(Parser)]
|
||||
#[command(name = "neuron")]
|
||||
#[command(about = "Per-node daemon for cortex inference clusters")]
|
||||
@@ -28,12 +29,20 @@ struct Args {
|
||||
#[arg(long, default_value_t = false)]
|
||||
worker: bool,
|
||||
|
||||
/// Run a one-shot TP smoke test: spawn `--tp-size - 1` worker
|
||||
/// subprocesses on `--cuda-devices`, build the NCCL communicator,
|
||||
/// run an `AllReduce` sanity check across every rank, and exit.
|
||||
/// Used to validate the TP plumbing in isolation from model load
|
||||
/// and inference. Diagnostic-only — not exposed through the daemon
|
||||
/// HTTP API.
|
||||
#[arg(long, default_value_t = false)]
|
||||
tp_smoke: bool,
|
||||
|
||||
/// NCCL rank for worker mode. Ignored when `--worker` is not set.
|
||||
#[arg(long, default_value_t = 0)]
|
||||
rank: u32,
|
||||
|
||||
/// Total NCCL world size for worker mode. Ignored when `--worker`
|
||||
/// is not set.
|
||||
/// Total NCCL world size for worker mode or TP smoke mode.
|
||||
#[arg(long, default_value_t = 1)]
|
||||
tp_size: u32,
|
||||
|
||||
@@ -42,6 +51,11 @@ struct Args {
|
||||
#[arg(long, default_value_t = 0)]
|
||||
cuda_device: u32,
|
||||
|
||||
/// Comma-separated CUDA device indices for TP smoke mode (one per
|
||||
/// rank, starting with rank 0). Must have `tp_size` entries.
|
||||
#[arg(long, value_delimiter = ',')]
|
||||
cuda_devices: Vec<u32>,
|
||||
|
||||
/// Port to listen on (overrides config file). Daemon mode only.
|
||||
#[arg(short, long)]
|
||||
port: Option<u16>,
|
||||
@@ -72,9 +86,68 @@ async fn main() -> Result<()> {
|
||||
.await;
|
||||
}
|
||||
|
||||
if args.tp_smoke {
|
||||
return tp_smoke(args.tp_size, args.cuda_devices).await;
|
||||
}
|
||||
|
||||
daemon(args).await
|
||||
}
|
||||
|
||||
/// One-shot tensor-parallel handshake. Spawns N-1 worker subprocesses
|
||||
/// (rank 0 stays in this process), builds the NCCL communicator across
|
||||
/// the full world, runs an AllReduce sanity check, and shuts everyone
|
||||
/// down. Output is plain log lines on stderr + a final summary on
|
||||
/// stdout in `key=value` form so an outer script can parse it.
|
||||
async fn tp_smoke(tp_size: u32, cuda_devices: Vec<u32>) -> Result<()> {
|
||||
if tp_size < 2 {
|
||||
anyhow::bail!("--tp-size must be at least 2 (got {tp_size})");
|
||||
}
|
||||
if cuda_devices.len() as u32 != tp_size {
|
||||
anyhow::bail!(
|
||||
"--cuda-devices must list exactly {tp_size} entries (got {})",
|
||||
cuda_devices.len()
|
||||
);
|
||||
}
|
||||
|
||||
let exe = std::env::current_exe().context("resolve current_exe for worker spawn")?;
|
||||
let leader_device = cuda_devices[0];
|
||||
|
||||
tracing::info!(
|
||||
tp_size,
|
||||
?cuda_devices,
|
||||
binary = %exe.display(),
|
||||
"tp-smoke: spawning worker pool"
|
||||
);
|
||||
let mut pool = tp::WorkerPool::spawn(&exe, tp_size, &cuda_devices).await?;
|
||||
|
||||
tracing::info!("tp-smoke: pinging every worker");
|
||||
let pongs = pool.ping_all().await?;
|
||||
for p in &pongs {
|
||||
tracing::info!(?p, "tp-smoke: pong");
|
||||
}
|
||||
|
||||
tracing::info!(leader_device, "tp-smoke: initialising NCCL");
|
||||
pool.init_nccl(leader_device).await?;
|
||||
|
||||
tracing::info!("tp-smoke: running AllReduce sanity check");
|
||||
pool.nccl_sanity_check().await?;
|
||||
|
||||
tracing::info!("tp-smoke: shutting down pool");
|
||||
pool.shutdown().await?;
|
||||
|
||||
println!("status=ok");
|
||||
println!("tp_size={tp_size}");
|
||||
println!(
|
||||
"cuda_devices={}",
|
||||
cuda_devices
|
||||
.iter()
|
||||
.map(|d| d.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn daemon(args: Args) -> Result<()> {
|
||||
let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
|
||||
tracing::warn!(path = %args.config, error = %e, "config not found, using defaults");
|
||||
|
||||
Reference in New Issue
Block a user