From ea1fdf8aa62b423b03fe952fc1d009783c03a6cc Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Tue, 2 Jun 2026 16:41:04 +0300 Subject: [PATCH] chore(deploy): drop deploy.sh and manifest.yml now that workflow runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First end-to-end run of the deploy workflow succeeded (gitea run #289), so the operator-run rolling-deploy script and its YAML manifest are no longer the source of truth — fleet topology lives in .gitea/workflows/deploy.yml and per-host config in script/infra-setup.sh. Per-host neuron config comments updated to point at the new sync path. Co-Authored-By: Claude Opus 4.7 --- asset/manifest.yml | 30 ---- asset/neuron/beast.toml | 6 +- asset/neuron/benjy.toml | 2 +- asset/neuron/quadbrat.toml | 2 +- script/deploy.sh | 303 ------------------------------------- 5 files changed, 5 insertions(+), 338 deletions(-) delete mode 100644 asset/manifest.yml delete mode 100755 script/deploy.sh diff --git a/asset/manifest.yml b/asset/manifest.yml deleted file mode 100644 index 5d8b36c..0000000 --- a/asset/manifest.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Helexa fleet manifest. -# -# Drives rolling deploys via script/deploy.sh and serves as the source -# of truth for which hosts run cortex vs neuron, and which CUDA -# compute-capability flavour each neuron host needs. -# -# Flavour ↔ NVIDIA generation ↔ compute cap: -# ampere sm_86 (RTX 30 series — e.g. 3060) -# ada sm_89 (RTX 40 series — e.g. 4090) -# blackwell sm_120 (RTX 50 series — e.g. 5090) -# -# The flavour determines which RPM is installed on a given neuron host: -# helexa-neuron-. Only one flavour may be installed at a time -# (the packages Conflict: with each other). - -cortex: - host: hanzalova.internal - -neurons: - - host: beast.hanzalova.internal - flavour: blackwell - gpu: "2x RTX 5090" - - - host: benjy.hanzalova.internal - flavour: ada - gpu: "RTX 4090" - - - host: quadbrat.hanzalova.internal - flavour: ampere - gpu: "RTX 3060" diff --git a/asset/neuron/beast.toml b/asset/neuron/beast.toml index 5def13e..32861d7 100644 --- a/asset/neuron/beast.toml +++ b/asset/neuron/beast.toml @@ -5,9 +5,9 @@ # invocation: `validate-neuron.sh beast.hanzalova.internal # Qwen/Qwen3.6-27B q5k 2`. # -# Synced by script/deploy.sh from asset/neuron/.toml. Edits -# take effect on the next deploy.sh run (which stops + restarts the -# service so default_models is re-read at activation). +# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. Edits +# take effect after the next deploy workflow run restarts the service +# (default_models is read at activation). port = 13131 diff --git a/asset/neuron/benjy.toml b/asset/neuron/benjy.toml index 793bd06..1adb3b7 100644 --- a/asset/neuron/benjy.toml +++ b/asset/neuron/benjy.toml @@ -4,7 +4,7 @@ # Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on # moderate-length contexts. # -# Synced by script/deploy.sh from asset/neuron/.toml. +# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. port = 13131 diff --git a/asset/neuron/quadbrat.toml b/asset/neuron/quadbrat.toml index 4135557..fb58a30 100644 --- a/asset/neuron/quadbrat.toml +++ b/asset/neuron/quadbrat.toml @@ -4,7 +4,7 @@ # (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small # model still have plenty of room. # -# Synced by script/deploy.sh from asset/neuron/.toml. +# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. port = 13131 diff --git a/script/deploy.sh b/script/deploy.sh deleted file mode 100755 index 1f3aa9f..0000000 --- a/script/deploy.sh +++ /dev/null @@ -1,303 +0,0 @@ -#!/bin/env bash -# -# Rolling deploy across the helexa fleet, driven by asset/manifest.yml. -# Installs / upgrades cortex on the gateway host and the appropriate -# helexa-neuron- package on each neuron host, then restarts -# their services. - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)" -MANIFEST="${REPO_DIR}/asset/manifest.yml" - -if [[ ! -f "${MANIFEST}" ]]; then - echo "fatal: manifest not found at ${MANIFEST}" >&2 - exit 1 -fi - -# Parse the manifest with yq. NOTE: this expects the pip-installed yq -# (a jq wrapper using jq syntax) — `pip install yq`. The Fedora rpm -# `yq` is mikefarah/yq and uses different (yaml-native) syntax; if a -# host has that one instead these queries will fail. -cortex_host=$(yq -r '.cortex.host' "${MANIFEST}") - -# Emit one TAB-separated 'host\tflavour' line per neuron. -mapfile -t neuron_entries < <( - yq -r '.neurons[] | .host + "\t" + .flavour' "${MANIFEST}" -) - -# Return the installed package's "version-release" string, or -# "(not installed)" when rpm reports the package as absent. Capture -# rpm's output into a variable so its "package X is not installed" -# stdout message (rpm writes that to stdout, not stderr, when -q fails) -# doesn't leak into the result. -installed_nvr() { - local host="$1" pkg="$2" - local nvr - if nvr=$(ssh "${host}" "rpm -q --qf '%{version}-%{release}' ${pkg} 2>/dev/null"); then - echo "${nvr}" - else - echo "(not installed)" - fi -} - -# Ensure the rpm.lair.cafe unstable repo is configured AND enabled on -# the remote host. -# -# The upstream .repo file at https://rpm.lair.cafe/lair-cafe-unstable.repo -# ships with `enabled=0` so a host that just fetched it won't start -# pulling unstable packages by accident. We have to explicitly flip -# enabled=1 via `dnf config-manager setopt`. Both addrepo and setopt -# are idempotent. -# -# Non-fatal — if either step fails the subsequent `dnf install` will -# surface a clearer diagnostic on its own. -ensure_lair_repo() { - local host="$1" - if ! ssh "${host}" "test -f /etc/yum.repos.d/lair-cafe-unstable.repo" 2>/dev/null; then - echo "[${host}] adding rpm.lair.cafe unstable repo" - if ! ssh "${host}" sudo dnf config-manager addrepo \ - --from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo \ - >/dev/null 2>&1; then - echo "[${host}] WARNING: failed to add lair.cafe repo file (proceeding anyway)" - return 0 - fi - fi - # The .repo file ships enabled=0; flip it on. Cheap, idempotent. - if ! ssh "${host}" sudo dnf config-manager setopt \ - lair-cafe-unstable.enabled=1 >/dev/null 2>&1; then - echo "[${host}] WARNING: failed to enable lair-cafe-unstable (proceeding anyway)" - fi -} - -# Ensure libcudnn.so.9 is resolvable on the remote host so the -# neuron binary (built with --features cudnn) doesn't fail at startup -# with "cannot open shared object file: No such file or directory". -# -# Probes ldconfig first — if cuDNN was installed manually (.tar/.run -# install), it'll be cached by ldconfig and we don't touch it. -# Otherwise adds NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo -# doesn't ship cuDNN packages — only the RHEL9 one does) and installs -# libcudnn9-cuda-13. -ensure_cudnn_runtime() { - local host="$1" - if ssh "${host}" "ldconfig -p | grep -q libcudnn.so.9" 2>/dev/null; then - return 0 - fi - echo "[${host}] installing cuDNN runtime" - if ! ssh "${host}" "test -f /etc/yum.repos.d/cuda-rhel9-x86_64.repo" 2>/dev/null; then - if ! ssh "${host}" sudo dnf config-manager addrepo \ - --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \ - >/dev/null 2>&1; then - echo "[${host}] WARNING: failed to add rhel9 CUDA repo (proceeding anyway)" - fi - fi - if ! ssh "${host}" sudo dnf install -y libcudnn9-cuda-13 >/dev/null 2>&1; then - echo "[${host}] WARNING: failed to install libcudnn9-cuda-13" - echo "[${host}] neuron may fail to start; install cuDNN manually if so" - fi -} - -# True when the named package needs to be installed or upgraded on the -# remote host — either it's not present, or a newer version exists in -# the repo. False only when the installed version is current. -# -# `dnf check-update ` returns 0 when the package isn't installed -# at all (there's nothing to update), so we have to probe with rpm -q -# first to distinguish "absent" from "current". Other dnf failures -# collapse into "needs update" so the subsequent install step surfaces -# the real diagnostic rather than this check swallowing it. -needs_update() { - local host="$1" pkg="$2" - # Not installed → needs work. - if ! ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1; then - return 0 - fi - # Installed; ask dnf whether the repo has something newer. - if ssh "${host}" sudo dnf check-update --refresh -q "${pkg}" >/dev/null 2>&1; then - return 1 - else - return 0 - fi -} - -# True if the named package is currently installed on the remote host. -# Used to decide between `dnf install` (fresh) and `dnf upgrade` (stale): -# dnf5's `install` is a no-op when the package is already present at -# any version — it does NOT auto-upgrade to the latest available — so -# the wrong command silently leaves the host on an old build. -is_installed() { - local host="$1" pkg="$2" - ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1 -} - -# Install or upgrade the named package on the remote, picking the -# right dnf verb based on the installed-or-not state. Returns 0 with -# dnf's combined stdout/stderr captured in __DNF_OUTPUT__ on success, -# and 1 with the same captured output on failure. -__DNF_OUTPUT__="" -install_or_upgrade() { - local host="$1" pkg="$2" - local cmd - if is_installed "${host}" "${pkg}"; then - cmd="upgrade" - else - cmd="install" - fi - if __DNF_OUTPUT__=$( - ssh "${host}" sudo dnf "${cmd}" --refresh --allowerasing -y "${pkg}" 2>&1 - ); then - return 0 - else - return 1 - fi -} - -# --------------------------------------------------------------------------- -# cortex (gateway) -# --------------------------------------------------------------------------- - -ensure_lair_repo "${cortex_host}" -cortex_nvr=$(installed_nvr "${cortex_host}" cortex) -if needs_update "${cortex_host}" cortex; then - echo "[${cortex_host}] cortex update available (current: ${cortex_nvr})" - # Stop the service only if the unit file exists — fresh installs - # don't have it, and `systemctl stop` on a missing unit returns - # non-zero, which would otherwise short-circuit the install branch - # under set -e. - if ssh "${cortex_host}" "[ ! -f /usr/lib/systemd/system/cortex.service ] || sudo systemctl stop cortex.service"; then - echo "[${cortex_host}] stopped cortex service" - if install_or_upgrade "${cortex_host}" cortex; then - cortex_nvr=$(installed_nvr "${cortex_host}" cortex) - echo "[${cortex_host}] installed/upgraded cortex to ${cortex_nvr}" - else - echo "[${cortex_host}] failed to install/upgrade cortex:" - echo "${__DNF_OUTPUT__}" | sed "s/^/[${cortex_host}] /" - fi - else - echo "[${cortex_host}] failed to stop cortex service" - fi -else - echo "[${cortex_host}] cortex is up to date (${cortex_nvr})" - ssh "${cortex_host}" sudo systemctl stop cortex.service || true -fi - -# Sync cortex.toml whether the package was upgraded or not — the config -# can change without a package bump. -if rsync \ - --archive \ - --compress \ - --rsync-path 'sudo rsync' \ - --chown root:root \ - --chmod 644 \ - "${REPO_DIR}/cortex.toml" \ - "${cortex_host}:/etc/cortex/cortex.toml"; then - echo "[${cortex_host}] sync'd cortex.toml" -else - echo "[${cortex_host}] failed to sync cortex.toml" -fi - -# Sync models.toml on the same lifecycle as cortex.toml — operator-owned, -# gitignored, drives /v1/models catalogue × topology resolution. -if [[ -f "${REPO_DIR}/models.toml" ]]; then - if rsync \ - --archive \ - --compress \ - --rsync-path 'sudo rsync' \ - --chown root:root \ - --chmod 644 \ - "${REPO_DIR}/models.toml" \ - "${cortex_host}:/etc/cortex/models.toml"; then - echo "[${cortex_host}] sync'd models.toml" - else - echo "[${cortex_host}] failed to sync models.toml" - fi -else - echo "[${cortex_host}] no local models.toml — leaving /etc/cortex/models.toml untouched" -fi - -ssh "${cortex_host}" sudo systemctl daemon-reload -if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then - echo "[${cortex_host}] cortex service is active" -elif ssh "${cortex_host}" sudo systemctl start cortex.service; then - echo "[${cortex_host}] started cortex service" -else - echo "[${cortex_host}] failed to start cortex service" -fi - -# --------------------------------------------------------------------------- -# neuron (per-host, flavour from manifest) -# --------------------------------------------------------------------------- - -for entry in "${neuron_entries[@]}"; do - IFS=$'\t' read -r neuron_host neuron_flavour <<< "${entry}" - package="helexa-neuron-${neuron_flavour}" - # First dot-component of the host keys the per-host config file - # under asset/neuron/.toml. A host listed in the manifest - # without a corresponding config still deploys (the package's - # default /etc/neuron/neuron.toml stays in place; no pre-warm). - short_host="${neuron_host%%.*}" - host_config="${REPO_DIR}/asset/neuron/${short_host}.toml" - - ensure_lair_repo "${neuron_host}" - ensure_cudnn_runtime "${neuron_host}" - neuron_nvr=$(installed_nvr "${neuron_host}" "${package}") - - # Stop the service unconditionally before any reconfig step. - # `default_models` is read at activation, so a config change without - # a bounce silently leaves the host on the previous pre-warm set. - # Same shape as the cortex flow above. The `[ ! -f … ]` guard skips - # the stop on a fresh install where the unit file isn't there yet. - if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then - echo "[${neuron_host}] stopped neuron service" - else - echo "[${neuron_host}] failed to stop neuron service (continuing)" - fi - - if needs_update "${neuron_host}" "${package}"; then - echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})" - # --allowerasing lets dnf swap out a previously-installed - # bare helexa-neuron or a different flavour without manual - # intervention. The Conflicts: clauses in the spec ensure - # only one flavour is ever resident. - if install_or_upgrade "${neuron_host}" "${package}"; then - neuron_nvr=$(installed_nvr "${neuron_host}" "${package}") - echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}" - # Ensure firewalld allows neuron port - ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload" 2>/dev/null || true - else - echo "[${neuron_host}] failed to install ${package}:" - echo "${__DNF_OUTPUT__}" | sed "s/^/[${neuron_host}] /" - fi - else - echo "[${neuron_host}] ${package} is up to date (${neuron_nvr})" - fi - - # Sync per-host neuron.toml — drives default_models pre-warm so - # `/v1/models` on the gateway exposes the host's headline model - # immediately after the service comes back up. Missing per-host - # config leaves the package's installed neuron.toml untouched. - if [[ -f "${host_config}" ]]; then - if rsync \ - --archive \ - --compress \ - --rsync-path 'sudo rsync' \ - --chown root:root \ - --chmod 644 \ - "${host_config}" \ - "${neuron_host}:/etc/neuron/neuron.toml"; then - echo "[${neuron_host}] sync'd asset/neuron/${short_host}.toml" - else - echo "[${neuron_host}] failed to sync neuron.toml" - fi - else - echo "[${neuron_host}] no asset/neuron/${short_host}.toml — leaving /etc/neuron/neuron.toml untouched" - fi - - if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then - echo "[${neuron_host}] started neuron service" - else - echo "[${neuron_host}] failed to start neuron service" - fi -done