Files
cortex/Cargo.toml
rob thijssen c4f239ceb9
All checks were successful
CI / CUDA type-check (push) Successful in 33s
CI / Format (push) Successful in 35s
CI / Clippy (push) Successful in 2m34s
CI / Test (push) Successful in 6m1s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build(neuron): patch cudarc to expose Comm::abort/get_async_error (#17 Stage 2)
#17 Stage 2 (TP hang-recovery) needs to call ncclCommAbort on a LIVE
communicator from another thread — to unblock a collective wedged on a
dead/hung peer so the ranks can resync. No cudarc release (incl. main)
exposes this: the safe Comm only aborts in Drop, which can't fire while a
stuck thread holds an Arc<Comm> clone.

Pin neuron's cudarc 0.19.7 to a fork (grenade/cudarc @ nccl-comm-abort,
rev 4dff0be) adding three thin methods — Comm::abort, get_async_error,
and a raw comm() accessor — to be submitted upstream. The patch targets
0.19.x only; candle's transitive cudarc 0.17.8 stays on crates.io.

Foundation only; the watchdog + abort + comm-rebuild that consume these
land in follow-up commits (cuda-gated → validated by the blackwell build).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 13:49:59 +03:00

73 lines
1.9 KiB
TOML

# Workspace root: selects Cargo's feature resolver version and lists the
# member crates that are built together.
[workspace]
resolver = "2"
members = [
    "crates/cortex-core",
    "crates/cortex-gateway",
    "crates/cortex-cli",
    "crates/neuron",
    "crates/helexa-acp",
]
# Package metadata shared across the workspace. Cargo applies a value to a
# member crate only when that crate opts in with `<key>.workspace = true`
# in its own manifest.
[workspace.package]
version = "0.1.16"
edition = "2024"
# SPDX license expression.
license = "GPL-3.0-or-later"
repository = "https://git.lair.cafe/helexa/cortex"
# Dependency requirements declared once for the whole workspace. A member
# crate pulls one in by writing `<name> = { workspace = true }` (or
# `<name>.workspace = true`) in its own manifest; nothing listed here is
# linked into a crate that does not ask for it.
[workspace.dependencies]
# async runtime
tokio = { version = "1", features = ["full"] }
# web framework
axum = { version = "0.8", features = ["macros"] }
tower = "0.5"
tower-http = { version = "0.6", features = ["cors", "trace", "timeout"] }
# serialization
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
# http client (for proxying to neuron backends)
reqwest = { version = "0.12", features = ["json", "stream"] }
# observability
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
metrics = "0.24"
metrics-exporter-prometheus = "0.16"
# time
chrono = { version = "0.4", features = ["serde"] }
# config
figment = { version = "0.10", features = ["toml", "env"] }
# error handling
anyhow = "1"
thiserror = "2"
# async traits
async-trait = "0.1"
# CLI
clap = { version = "4", features = ["derive"] }
# futures / streams (for SSE proxying)
futures = "0.3"
tokio-stream = "0.1"
eventsource-stream = "0.2"
# workspace crates (path dependencies on sibling members of this workspace)
cortex-core = { path = "crates/cortex-core" }
cortex-gateway = { path = "crates/cortex-gateway" }
# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
# hang-recovery (abort a wedged collective from another thread, then
# rebuild the comm). Pinned to a fork revision pending upstream review
# (grenade/cudarc @ nccl-comm-abort).
#
# `rev` is a full commit hash rather than a branch name, so later pushes
# to the fork branch do not change what gets built.
# TODO: remove this override once a crates.io cudarc release exposes the
# same methods.
[patch.crates-io]
cudarc = { git = "https://github.com/grenade/cudarc", rev = "4dff0be72d8a685d6691a6a53d4c95e1fe932277" }