Some checks failed
CI / CUDA type-check (push) Failing after 47s
CI / Format (push) Successful in 31s
CI / Test (push) Failing after 1m3s
CI / Clippy (push) Successful in 2m44s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
Make a hung NCCL collective recoverable instead of a permanent brick.

Today a wedged collective hangs the in-process leader thread forever, and even Stage 1's recovery can't help — its unload's DropTp queues behind the stuck thread and hangs too.

- Cache the leader's NCCL Comm handle async-side at init (new cuda-gated Job::GetLeaderComm → DeviceWorkerHandle::get_leader_comm → stored on WorkerPool.leader_comm). Fetched while the thread is responsive — a wedged thread can't service the fetch, which is why it's cached up front.
- Wrap the leader forward in both generate_step and generate_step_with_images in tokio::time::timeout (default 120s, NEURON_TP_STEP_TIMEOUT_S). On expiry the watchdog calls Comm::abort() (ncclCommAbort) on the cached handle from the async thread — the one NCCL op sanctioned concurrently with an in-flight collective — which unblocks the leader thread, then fails the step WITHOUT draining (workers are wedged too; recovery's unload kills them). The error is a device fault → poison → Stage 1 auto-recovery, which now completes because the leader thread is responsive again.
- Bumps the cudarc patch to dbc425a (adds the Drop-must-not-panic fix so the post-abort comm teardown during recovery doesn't double-abort-panic).

Logs the whole sequence at ERROR with greppable `tp watchdog:` / `ncclCommAbort` markers so a real-world hang leaves a forensic trail — verification is by inspecting journals after real hangs, not a synthetic harness. cuda-gated → validated by the blackwell build.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
73 lines
1.9 KiB
TOML
73 lines
1.9 KiB
TOML
# Workspace root: selects the dependency resolver and lists the member
# crates that are built together.
[workspace]
|
|
# NOTE(review): [workspace.package] sets edition = "2024", but a workspace
# root does not take its resolver from the edition, so "2" stays in effect
# until changed here. Confirm "2" (rather than "3") is intentional.
resolver = "2"
|
|
# Paths are relative to this manifest's directory.
members = [
|
|
"crates/cortex-core",
|
|
"crates/cortex-gateway",
|
|
"crates/cortex-cli",
|
|
"crates/neuron",
|
|
"crates/helexa-acp",
|
|
]
|
|
|
|
# Package metadata shared across the workspace. A member crate only picks
# up a field from here if its own manifest opts in with
# `<field>.workspace = true`; nothing is inherited implicitly.
[workspace.package]
|
|
version = "0.1.16"
|
|
edition = "2024"
|
|
license = "GPL-3.0-or-later"
|
|
repository = "https://git.lair.cafe/helexa/cortex"
|
|
|
|
# Single source of truth for dependency versions and default feature sets.
# Member crates reference an entry with `<dep>.workspace = true` (or
# `<dep> = { workspace = true, ... }`); an entry listed here is not pulled
# into any crate until that crate references it.
[workspace.dependencies]
|
|
# async runtime
|
|
tokio = { version = "1", features = ["full"] }
|
|
|
|
# web framework
|
|
axum = { version = "0.8", features = ["macros"] }
|
|
tower = "0.5"
|
|
tower-http = { version = "0.6", features = ["cors", "trace", "timeout"] }
|
|
|
|
# serialization
|
|
serde = { version = "1", features = ["derive"] }
|
|
serde_json = "1"
|
|
toml = "0.8"
|
|
|
|
# http client (for proxying to neuron backends)
|
|
reqwest = { version = "0.12", features = ["json", "stream"] }
|
|
|
|
# observability
|
|
tracing = "0.1"
|
|
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
|
metrics = "0.24"
|
|
metrics-exporter-prometheus = "0.16"
|
|
|
|
# time
|
|
chrono = { version = "0.4", features = ["serde"] }
|
|
|
|
# config
|
|
figment = { version = "0.10", features = ["toml", "env"] }
|
|
|
|
# error handling
|
|
anyhow = "1"
|
|
thiserror = "2"
|
|
|
|
# async traits
|
|
async-trait = "0.1"
|
|
|
|
# CLI
|
|
clap = { version = "4", features = ["derive"] }
|
|
|
|
# futures / streams (for SSE proxying)
|
|
futures = "0.3"
|
|
tokio-stream = "0.1"
|
|
eventsource-stream = "0.2"
|
|
|
|
# workspace crates
# Only cortex-core and cortex-gateway are exposed as workspace dependencies;
# the other members listed under [workspace] (cortex-cli, neuron,
# helexa-acp) have no entry here.
|
|
cortex-core = { path = "crates/cortex-core" }
|
|
cortex-gateway = { path = "crates/cortex-gateway" }
|
|
|
|
# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
|
|
# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
|
|
# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
|
|
# hang-recovery (abort a wedged collective from another thread, then
|
|
# rebuild the comm). Pinned to a fork revision pending upstream review
|
|
# (grenade/cudarc @ nccl-comm-abort).
|
|
# Per the commit that bumped the pin to dbc425a, this revision also carries
# a "Drop must not panic" fix so tearing down the comm after an abort does
# not panic a second time during recovery.
# A [patch] entry only replaces crates.io versions that the patched crate's
# own version satisfies; `rev` is a full commit SHA, so the source cannot
# move unless this line is edited.
[patch.crates-io]
|
|
cudarc = { git = "https://github.com/grenade/cudarc", rev = "dbc425aa865c178f38a3ec838f1f7a4da3146358" }
|