Compare commits
4 Commits
6779b7526a
...
0e9671dd7d
| Author | SHA1 | Date | |
|---|---|---|---|
|
0e9671dd7d
|
|||
|
e29c9e35f0
|
|||
|
8a2334eacb
|
|||
|
aad314cdfa
|
@@ -108,14 +108,25 @@ jobs:
|
||||
build_jobs: 8
|
||||
nvcc_threads: 4
|
||||
cargo_features: "cuda cudnn flash-attn"
|
||||
# runner-cuda-13.0 extends runner-rust, so rust/cargo are already
|
||||
# present via dnf — no rustup install step needed.
|
||||
# runner-cuda-13.0 inherits from runner-rust in gongfoo, so rust
|
||||
# *should* be available via dnf. The currently-published image is
|
||||
# missing it though (likely a stale build), so we run a defensive
|
||||
# `dnf install` at the top of the step. When the runner image is
|
||||
# rebuilt with the proper layers this becomes a fast no-op.
|
||||
runs-on: ${{ matrix.runner }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ inputs.ref }}
|
||||
|
||||
- name: Ensure rust toolchain present
|
||||
run: |
|
||||
set -eux
|
||||
if ! command -v cargo >/dev/null 2>&1; then
|
||||
dnf install -y --setopt=install_weak_deps=False rust cargo clippy
|
||||
fi
|
||||
cargo --version
|
||||
|
||||
- name: Build neuron with CUDA (${{ matrix.flavour }})
|
||||
run: |
|
||||
set -eux
|
||||
|
||||
@@ -78,11 +78,21 @@ async fn main() -> Result<()> {
|
||||
candle,
|
||||
});
|
||||
|
||||
let app = api::neuron_routes().with_state(state);
|
||||
let app = api::neuron_routes().with_state(Arc::clone(&state));
|
||||
let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?;
|
||||
tracing::info!("neuron listening on {addr}");
|
||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||
axum::serve(listener, app).await?;
|
||||
axum::serve(listener, app)
|
||||
.with_graceful_shutdown(startup::shutdown_signal())
|
||||
.await?;
|
||||
|
||||
// Deactivation: serve has returned (graceful shutdown signal
|
||||
// received and connections drained). Release CUDA contexts / VRAM
|
||||
// by unloading every model before exiting; systemd's TimeoutStopSec
|
||||
// bounds how long this phase may take.
|
||||
let registry = state.registry.read().await;
|
||||
startup::unload_all_models(&registry).await;
|
||||
tracing::info!("shutdown complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
//! Activation-time orchestration.
|
||||
//! Activation- and deactivation-time orchestration.
|
||||
//!
|
||||
//! Wired from `main.rs` after the harness registry is built and before
|
||||
//! the HTTP listener binds. Kept in its own module so the logic is
|
||||
//! Wired from `main.rs` around the HTTP listener — activation runs
|
||||
//! before bind, deactivation runs after axum returns from its
|
||||
//! graceful-shutdown future. Kept in its own module so the logic is
|
||||
//! unit-testable without spinning up a full neuron process.
|
||||
|
||||
use crate::harness::HarnessRegistry;
|
||||
use cortex_core::harness::ModelSpec;
|
||||
use std::time::Instant;
|
||||
use tokio::signal;
|
||||
|
||||
/// Load each spec sequentially against the registry, treating
|
||||
/// individual failures as warnings rather than fatal errors.
|
||||
@@ -36,3 +38,60 @@ pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Future that resolves on SIGINT (Ctrl-C) or SIGTERM (systemd stop).
|
||||
///
|
||||
/// Wired into `axum::serve(...).with_graceful_shutdown(shutdown_signal())`
|
||||
/// so the HTTP listener stops accepting new connections, lets in-flight
|
||||
/// requests drain, and then yields control back to main for cleanup.
|
||||
pub async fn shutdown_signal() {
|
||||
let ctrl_c = async {
|
||||
signal::ctrl_c().await.ok();
|
||||
};
|
||||
let terminate = async {
|
||||
signal::unix::signal(signal::unix::SignalKind::terminate())
|
||||
.expect("install SIGTERM handler")
|
||||
.recv()
|
||||
.await;
|
||||
};
|
||||
tokio::select! {
|
||||
_ = ctrl_c => tracing::info!("received SIGINT, shutting down"),
|
||||
_ = terminate => tracing::info!("received SIGTERM, shutting down"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Unload every model currently registered. Called from `main.rs` after
|
||||
/// axum's graceful shutdown future resolves, so CUDA contexts and VRAM
|
||||
/// are released before the process exits rather than left to the OS to
|
||||
/// reclaim. Per-model failures are logged and skipped — keep cleanup
|
||||
/// going even when one harness is unhealthy.
|
||||
pub async fn unload_all_models(registry: &HarnessRegistry) {
|
||||
let listed = match registry.list_all_models().await {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "failed to list models during shutdown");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if listed.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
tracing::info!(count = listed.len(), "unloading models for shutdown");
|
||||
for model in listed {
|
||||
let start = Instant::now();
|
||||
match registry.unload_model(&model.id).await {
|
||||
Ok(()) => tracing::info!(
|
||||
model = %model.id,
|
||||
elapsed_ms = start.elapsed().as_millis() as u64,
|
||||
"unloaded"
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
model = %model.id,
|
||||
error = %e,
|
||||
"unload failed during shutdown"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
32
crates/neuron/tests/shutdown.rs
Normal file
32
crates/neuron/tests/shutdown.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//! Deactivation behaviour: unload_all_models tolerates an empty
|
||||
//! registry and a harness that has no models loaded.
|
||||
|
||||
use cortex_core::harness::HarnessConfig;
|
||||
use neuron::config::HarnessSettings;
|
||||
use neuron::harness::HarnessRegistry;
|
||||
use neuron::startup;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_unload_all_models_empty_registry_is_noop() {
|
||||
let registry = HarnessRegistry::new();
|
||||
startup::unload_all_models(®istry).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_unload_all_models_with_no_loaded_models() {
|
||||
let registry = HarnessRegistry::from_configs(
|
||||
&[HarnessConfig {
|
||||
name: "candle".into(),
|
||||
}],
|
||||
"http://localhost:0",
|
||||
&HarnessSettings::default(),
|
||||
);
|
||||
|
||||
startup::unload_all_models(®istry).await;
|
||||
|
||||
let listed = registry
|
||||
.list_all_models()
|
||||
.await
|
||||
.expect("list_all_models should still succeed after shutdown cleanup");
|
||||
assert!(listed.is_empty());
|
||||
}
|
||||
@@ -15,6 +15,11 @@ Group=neuron
|
||||
# materialise on first activation. systemd's default TimeoutStartSec
|
||||
# (90s) is far too short; allow 30 minutes.
|
||||
TimeoutStartSec=1800s
|
||||
# On stop, neuron drains in-flight requests then unloads every model
|
||||
# to release CUDA contexts cleanly. Allow generous time for big-model
|
||||
# unloads; systemd will SIGKILL after this bound.
|
||||
TimeoutStopSec=120s
|
||||
KillSignal=SIGTERM
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
164
script/deploy.sh
164
script/deploy.sh
@@ -27,16 +27,105 @@ mapfile -t neuron_entries < <(
|
||||
yq -r '.neurons[] | .host + "\t" + .flavour' "${MANIFEST}"
|
||||
)
|
||||
|
||||
latest_helexa_version=$(git -C "${REPO_DIR}" describe --tags --abbrev=0 | sed 's/^v//')
|
||||
# Print the "version-release" string of <pkg> as installed on <host>,
# or the literal "(not installed)" when the remote rpm query fails.
#
# rpm prints its "package X is not installed" message on stdout (not
# stderr) when -q fails, so the output is captured into a variable and
# only echoed when the query succeeded; on failure the placeholder
# replaces it.
#
#   $1  ssh host
#   $2  package name
installed_nvr() {
  local host="$1" pkg="$2"
  local nvr
  # `local` is declared separately so the exit status tested by `||`
  # is that of the ssh command substitution, not of `local`.
  nvr=$(ssh "${host}" "rpm -q --qf '%{version}-%{release}' ${pkg} 2>/dev/null") \
    || nvr="(not installed)"
  echo "${nvr}"
}
|
||||
|
||||
# Make sure the rpm.lair.cafe unstable repo is both configured and
# enabled on <host>.
#
# The upstream repo file (https://rpm.lair.cafe/lair-cafe-unstable.repo)
# ships with `enabled=0`, so fetching it alone does not make a host pull
# unstable packages. Enabling is a separate, explicit
# `dnf config-manager setopt` step. Both addrepo and setopt are
# idempotent, so this can run on every deploy.
#
# Best-effort by design: failures only print a WARNING and the function
# still returns 0, leaving the later `dnf install` to report the real
# problem.
#
#   $1  ssh host
ensure_lair_repo() {
  local host="$1"

  if ssh "${host}" "test -f /etc/yum.repos.d/lair-cafe-unstable.repo" 2>/dev/null; then
    : # repo file already present; only the enable step below is needed
  else
    echo "[${host}] adding rpm.lair.cafe unstable repo"
    ssh "${host}" sudo dnf config-manager addrepo \
      --from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo \
      >/dev/null 2>&1 || {
      echo "[${host}] WARNING: failed to add lair.cafe repo file (proceeding anyway)"
      return 0
    }
  fi

  # The repo file ships disabled; switch it on.
  ssh "${host}" sudo dnf config-manager setopt \
    lair-cafe-unstable.enabled=1 >/dev/null 2>&1 \
    || echo "[${host}] WARNING: failed to enable lair-cafe-unstable (proceeding anyway)"
}
|
||||
|
||||
# Decide whether <pkg> on <host> needs an install or upgrade.
#
# Exit status 0 ("yes") when the package is absent, or when
# `dnf check-update` exits non-zero for it. Exit status 1 ("no") only
# when the package is installed and check-update exits 0.
#
# The rpm -q probe comes first because `dnf check-update <pkg>` returns
# 0 for a package that isn't installed at all (nothing to update), which
# would otherwise look the same as "current". Any other dnf or ssh
# failure deliberately counts as "needs update" so that the install step
# that follows reports the real diagnostic.
#
#   $1  ssh host
#   $2  package name
needs_update() {
  local host="$1" pkg="$2"

  # Absent package: rpm -q fails, so there is work to do.
  ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1 || return 0

  # Installed: a non-zero check-update means something newer, or an error.
  ssh "${host}" sudo dnf check-update --refresh -q "${pkg}" >/dev/null 2>&1 || return 0

  return 1
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# cortex (gateway)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
observed_cortex_version=$(ssh "${cortex_host}" cortex --version | sed 's/^cortex //')
|
||||
if [[ "${latest_helexa_version}" = "${observed_cortex_version}" ]]; then
|
||||
echo "[${cortex_host}] cortex is up to date (${observed_cortex_version})"
|
||||
if ssh "${cortex_host}" sudo systemctl stop cortex.service && rsync \
|
||||
ensure_lair_repo "${cortex_host}"
|
||||
cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
|
||||
if needs_update "${cortex_host}" cortex; then
|
||||
echo "[${cortex_host}] cortex update available (current: ${cortex_nvr})"
|
||||
# Stop the service only if the unit file exists — fresh installs
|
||||
# don't have it, and `systemctl stop` on a missing unit returns
|
||||
# non-zero, which would otherwise short-circuit the install branch
|
||||
# under set -e.
|
||||
if ssh "${cortex_host}" "[ ! -f /usr/lib/systemd/system/cortex.service ] || sudo systemctl stop cortex.service"; then
|
||||
echo "[${cortex_host}] stopped cortex service"
|
||||
if dnf_output=$(ssh "${cortex_host}" sudo dnf install --refresh --allowerasing -y cortex 2>&1); then
|
||||
cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
|
||||
echo "[${cortex_host}] installed/upgraded cortex to ${cortex_nvr}"
|
||||
else
|
||||
echo "[${cortex_host}] failed to install/upgrade cortex:"
|
||||
echo "${dnf_output}" | sed "s/^/[${cortex_host}] /"
|
||||
fi
|
||||
else
|
||||
echo "[${cortex_host}] failed to stop cortex service"
|
||||
fi
|
||||
else
|
||||
echo "[${cortex_host}] cortex is up to date (${cortex_nvr})"
|
||||
ssh "${cortex_host}" sudo systemctl stop cortex.service || true
|
||||
fi
|
||||
|
||||
# Sync cortex.toml whether the package was upgraded or not — the config
|
||||
# can change without a package bump.
|
||||
if rsync \
|
||||
--archive \
|
||||
--compress \
|
||||
--rsync-path 'sudo rsync' \
|
||||
@@ -45,11 +134,11 @@ if [[ "${latest_helexa_version}" = "${observed_cortex_version}" ]]; then
|
||||
"${REPO_DIR}/cortex.toml" \
|
||||
"${cortex_host}:/etc/cortex/cortex.toml"; then
|
||||
echo "[${cortex_host}] sync'd cortex.toml"
|
||||
ssh "${cortex_host}" sudo systemctl daemon-reload
|
||||
ssh "${cortex_host}" sudo systemctl start cortex.service
|
||||
else
|
||||
echo "[${cortex_host}] failed to sync cortex.toml"
|
||||
fi
|
||||
|
||||
ssh "${cortex_host}" sudo systemctl daemon-reload
|
||||
if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then
|
||||
echo "[${cortex_host}] cortex service is active"
|
||||
elif ssh "${cortex_host}" sudo systemctl start cortex.service; then
|
||||
@@ -57,34 +146,6 @@ if [[ "${latest_helexa_version}" = "${observed_cortex_version}" ]]; then
|
||||
else
|
||||
echo "[${cortex_host}] failed to start cortex service"
|
||||
fi
|
||||
else
|
||||
echo "[${cortex_host}] cortex is out of date (${observed_cortex_version} != ${latest_helexa_version})"
|
||||
if ssh "${cortex_host}" sudo systemctl stop cortex.service; then
|
||||
echo "[${cortex_host}] stopped cortex service"
|
||||
if ssh "${cortex_host}" sudo dnf upgrade --refresh -y cortex; then
|
||||
echo "[${cortex_host}] upgraded cortex"
|
||||
if rsync \
|
||||
--archive \
|
||||
--compress \
|
||||
--verbose \
|
||||
--rsync-path 'sudo rsync' \
|
||||
--chown root:root \
|
||||
--chmod 644 \
|
||||
"${REPO_DIR}/cortex.toml" \
|
||||
"${cortex_host}:/etc/cortex/cortex.toml"; then
|
||||
echo "[${cortex_host}] sync'd cortex.toml"
|
||||
ssh "${cortex_host}" sudo systemctl daemon-reload
|
||||
ssh "${cortex_host}" sudo systemctl start cortex.service
|
||||
else
|
||||
echo "[${cortex_host}] failed to sync cortex.toml"
|
||||
fi
|
||||
else
|
||||
echo "[${cortex_host}] failed to upgrade cortex"
|
||||
fi
|
||||
else
|
||||
echo "[${cortex_host}] failed to stop cortex service"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# neuron (per-host, flavour from manifest)
|
||||
@@ -94,26 +155,19 @@ for entry in "${neuron_entries[@]}"; do
|
||||
IFS=$'\t' read -r neuron_host neuron_flavour <<< "${entry}"
|
||||
package="helexa-neuron-${neuron_flavour}"
|
||||
|
||||
observed_neuron_version=$(ssh "${neuron_host}" neuron --version 2> /dev/null | sed 's/^neuron //' || true)
|
||||
if [[ "${latest_helexa_version}" = "${observed_neuron_version}" ]]; then
|
||||
echo "[${neuron_host}] neuron is up to date (${observed_neuron_version}, ${package})"
|
||||
if ssh "${neuron_host}" systemctl is-active --quiet neuron.service; then
|
||||
echo "[${neuron_host}] neuron service is active"
|
||||
elif ssh "${neuron_host}" sudo systemctl start neuron.service; then
|
||||
echo "[${neuron_host}] started neuron service"
|
||||
else
|
||||
echo "[${neuron_host}] failed to start neuron service"
|
||||
fi
|
||||
else
|
||||
echo "[${neuron_host}] upgrading neuron from ${observed_neuron_version:-(absent)} to ${latest_helexa_version} (${package})"
|
||||
ensure_lair_repo "${neuron_host}"
|
||||
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
|
||||
if needs_update "${neuron_host}" "${package}"; then
|
||||
echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
|
||||
if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then
|
||||
echo "[${neuron_host}] stopped neuron service"
|
||||
# --allowerasing lets dnf swap out a previously-installed
|
||||
# bare helexa-neuron or a different flavour without manual
|
||||
# intervention. The Conflicts: clauses in the spec ensure
|
||||
# only one flavour is ever resident.
|
||||
if ssh "${neuron_host}" sudo dnf install --refresh --allowerasing -y "${package}" &> /dev/null; then
|
||||
echo "[${neuron_host}] installed/upgraded ${package}"
|
||||
if dnf_output=$(ssh "${neuron_host}" sudo dnf install --refresh --allowerasing -y "${package}" 2>&1); then
|
||||
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
|
||||
echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
|
||||
# Ensure firewalld allows neuron port
|
||||
ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload" 2>/dev/null || true
|
||||
if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then
|
||||
@@ -122,10 +176,20 @@ for entry in "${neuron_entries[@]}"; do
|
||||
echo "[${neuron_host}] failed to start neuron service"
|
||||
fi
|
||||
else
|
||||
echo "[${neuron_host}] failed to install ${package}"
|
||||
echo "[${neuron_host}] failed to install ${package}:"
|
||||
echo "${dnf_output}" | sed "s/^/[${neuron_host}] /"
|
||||
fi
|
||||
else
|
||||
echo "[${neuron_host}] failed to stop neuron service"
|
||||
fi
|
||||
else
|
||||
echo "[${neuron_host}] ${package} is up to date (${neuron_nvr})"
|
||||
if ssh "${neuron_host}" systemctl is-active --quiet neuron.service; then
|
||||
echo "[${neuron_host}] neuron service is active"
|
||||
elif ssh "${neuron_host}" sudo systemctl start neuron.service; then
|
||||
echo "[${neuron_host}] started neuron service"
|
||||
else
|
||||
echo "[${neuron_host}] failed to start neuron service"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user