chore(deploy): drop deploy.sh and manifest.yml now that workflow runs

First end-to-end run of the deploy workflow succeeded (gitea run #289), so the operator-run rolling-deploy script and its YAML manifest are no longer the source of truth — fleet topology lives in .gitea/workflows/deploy.yml and per-host config in script/infra-setup.sh. Per-host neuron config comments updated to point at the new sync path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-02 16:41:04 +03:00
parent 577781de8d
commit ea1fdf8aa6
5 changed files with 5 additions and 338 deletions
--- a/asset/manifest.yml
+++ b/asset/manifest.yml
@@ -1,30 +0,0 @@
-# Helexa fleet manifest.
-#
-# Drives rolling deploys via script/deploy.sh and serves as the source
-# of truth for which hosts run cortex vs neuron, and which CUDA
-# compute-capability flavour each neuron host needs.
-#
-# Flavour ↔ NVIDIA generation ↔ compute cap:
-#   ampere    sm_86   (RTX 30 series — e.g. 3060)
-#   ada       sm_89   (RTX 40 series — e.g. 4090)
-#   blackwell sm_120  (RTX 50 series — e.g. 5090)
-#
-# The flavour determines which RPM is installed on a given neuron host:
-# helexa-neuron-<flavour>. Only one flavour may be installed at a time
-# (the packages Conflict: with each other).
-
-cortex:
-  host: hanzalova.internal
-
-neurons:
-  - host: beast.hanzalova.internal
-    flavour: blackwell
-    gpu: "2x RTX 5090"
-
-  - host: benjy.hanzalova.internal
-    flavour: ada
-    gpu: "RTX 4090"
-
-  - host: quadbrat.hanzalova.internal
-    flavour: ampere
-    gpu: "RTX 3060"
--- a/asset/neuron/beast.toml
+++ b/asset/neuron/beast.toml
@@ -5,9 +5,9 @@
 # invocation: `validate-neuron.sh beast.hanzalova.internal
 # Qwen/Qwen3.6-27B q5k 2`.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml. Edits
-# take effect on the next deploy.sh run (which stops + restarts the
-# service so default_models is re-read at activation).
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. Edits
+# take effect after the next deploy workflow run restarts the service
+# (default_models is read at activation).

 port = 13131

--- a/asset/neuron/benjy.toml
+++ b/asset/neuron/benjy.toml
@@ -4,7 +4,7 @@
 # Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on
 # moderate-length contexts.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.

 port = 13131

--- a/asset/neuron/quadbrat.toml
+++ b/asset/neuron/quadbrat.toml
@@ -4,7 +4,7 @@
 # (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small
 # model still have plenty of room.
 #
-# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.

 port = 13131

--- a/script/deploy.sh
+++ b/script/deploy.sh
@@ -1,303 +0,0 @@
-#!/bin/env bash
-#
-# Rolling deploy across the helexa fleet, driven by asset/manifest.yml.
-# Installs / upgrades cortex on the gateway host and the appropriate
-# helexa-neuron-<flavour> package on each neuron host, then restarts
-# their services.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
-MANIFEST="${REPO_DIR}/asset/manifest.yml"
-
-if [[ ! -f "${MANIFEST}" ]]; then
-    echo "fatal: manifest not found at ${MANIFEST}" >&2
-    exit 1
-fi
-
-# Parse the manifest with yq. NOTE: this expects the pip-installed yq
-# (a jq wrapper using jq syntax) — `pip install yq`. The Fedora rpm
-# `yq` is mikefarah/yq and uses different (yaml-native) syntax; if a
-# host has that one instead these queries will fail.
-cortex_host=$(yq -r '.cortex.host' "${MANIFEST}")
-
-# Emit one TAB-separated 'host\tflavour' line per neuron.
-mapfile -t neuron_entries < <(
-    yq -r '.neurons[] | .host + "\t" + .flavour' "${MANIFEST}"
-)
-
-# Return the installed package's "version-release" string, or
-# "(not installed)" when rpm reports the package as absent. Capture
-# rpm's output into a variable so its "package X is not installed"
-# stdout message (rpm writes that to stdout, not stderr, when -q fails)
-# doesn't leak into the result.
-installed_nvr() {
-    local host="$1" pkg="$2"
-    local nvr
-    if nvr=$(ssh "${host}" "rpm -q --qf '%{version}-%{release}' ${pkg} 2>/dev/null"); then
-        echo "${nvr}"
-    else
-        echo "(not installed)"
-    fi
-}
-
-# Ensure the rpm.lair.cafe unstable repo is configured AND enabled on
-# the remote host.
-#
-# The upstream .repo file at https://rpm.lair.cafe/lair-cafe-unstable.repo
-# ships with `enabled=0` so a host that just fetched it won't start
-# pulling unstable packages by accident. We have to explicitly flip
-# enabled=1 via `dnf config-manager setopt`. Both addrepo and setopt
-# are idempotent.
-#
-# Non-fatal — if either step fails the subsequent `dnf install` will
-# surface a clearer diagnostic on its own.
-ensure_lair_repo() {
-    local host="$1"
-    if ! ssh "${host}" "test -f /etc/yum.repos.d/lair-cafe-unstable.repo" 2>/dev/null; then
-        echo "[${host}] adding rpm.lair.cafe unstable repo"
-        if ! ssh "${host}" sudo dnf config-manager addrepo \
-            --from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo \
-            >/dev/null 2>&1; then
-            echo "[${host}] WARNING: failed to add lair.cafe repo file (proceeding anyway)"
-            return 0
-        fi
-    fi
-    # The .repo file ships enabled=0; flip it on. Cheap, idempotent.
-    if ! ssh "${host}" sudo dnf config-manager setopt \
-        lair-cafe-unstable.enabled=1 >/dev/null 2>&1; then
-        echo "[${host}] WARNING: failed to enable lair-cafe-unstable (proceeding anyway)"
-    fi
-}
-
-# Ensure libcudnn.so.9 is resolvable on the remote host so the
-# neuron binary (built with --features cudnn) doesn't fail at startup
-# with "cannot open shared object file: No such file or directory".
-#
-# Probes ldconfig first — if cuDNN was installed manually (.tar/.run
-# install), it'll be cached by ldconfig and we don't touch it.
-# Otherwise adds NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo
-# doesn't ship cuDNN packages — only the RHEL9 one does) and installs
-# libcudnn9-cuda-13.
-ensure_cudnn_runtime() {
-    local host="$1"
-    if ssh "${host}" "ldconfig -p | grep -q libcudnn.so.9" 2>/dev/null; then
-        return 0
-    fi
-    echo "[${host}] installing cuDNN runtime"
-    if ! ssh "${host}" "test -f /etc/yum.repos.d/cuda-rhel9-x86_64.repo" 2>/dev/null; then
-        if ! ssh "${host}" sudo dnf config-manager addrepo \
-            --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
-            >/dev/null 2>&1; then
-            echo "[${host}] WARNING: failed to add rhel9 CUDA repo (proceeding anyway)"
-        fi
-    fi
-    if ! ssh "${host}" sudo dnf install -y libcudnn9-cuda-13 >/dev/null 2>&1; then
-        echo "[${host}] WARNING: failed to install libcudnn9-cuda-13"
-        echo "[${host}]   neuron may fail to start; install cuDNN manually if so"
-    fi
-}
-
-# True when the named package needs to be installed or upgraded on the
-# remote host — either it's not present, or a newer version exists in
-# the repo. False only when the installed version is current.
-#
-# `dnf check-update <pkg>` returns 0 when the package isn't installed
-# at all (there's nothing to update), so we have to probe with rpm -q
-# first to distinguish "absent" from "current". Other dnf failures
-# collapse into "needs update" so the subsequent install step surfaces
-# the real diagnostic rather than this check swallowing it.
-needs_update() {
-    local host="$1" pkg="$2"
-    # Not installed → needs work.
-    if ! ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1; then
-        return 0
-    fi
-    # Installed; ask dnf whether the repo has something newer.
-    if ssh "${host}" sudo dnf check-update --refresh -q "${pkg}" >/dev/null 2>&1; then
-        return 1
-    else
-        return 0
-    fi
-}
-
-# True if the named package is currently installed on the remote host.
-# Used to decide between `dnf install` (fresh) and `dnf upgrade` (stale):
-# dnf5's `install` is a no-op when the package is already present at
-# any version — it does NOT auto-upgrade to the latest available — so
-# the wrong command silently leaves the host on an old build.
-is_installed() {
-    local host="$1" pkg="$2"
-    ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1
-}
-
-# Install or upgrade the named package on the remote, picking the
-# right dnf verb based on the installed-or-not state. Returns 0 with
-# dnf's combined stdout/stderr captured in __DNF_OUTPUT__ on success,
-# and 1 with the same captured output on failure.
-__DNF_OUTPUT__=""
-install_or_upgrade() {
-    local host="$1" pkg="$2"
-    local cmd
-    if is_installed "${host}" "${pkg}"; then
-        cmd="upgrade"
-    else
-        cmd="install"
-    fi
-    if __DNF_OUTPUT__=$(
-        ssh "${host}" sudo dnf "${cmd}" --refresh --allowerasing -y "${pkg}" 2>&1
-    ); then
-        return 0
-    else
-        return 1
-    fi
-}
-
-# ---------------------------------------------------------------------------
-# cortex (gateway)
-# ---------------------------------------------------------------------------
-
-ensure_lair_repo "${cortex_host}"
-cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
-if needs_update "${cortex_host}" cortex; then
-    echo "[${cortex_host}] cortex update available (current: ${cortex_nvr})"
-    # Stop the service only if the unit file exists — fresh installs
-    # don't have it, and `systemctl stop` on a missing unit returns
-    # non-zero, which would otherwise short-circuit the install branch
-    # under set -e.
-    if ssh "${cortex_host}" "[ ! -f /usr/lib/systemd/system/cortex.service ] || sudo systemctl stop cortex.service"; then
-        echo "[${cortex_host}] stopped cortex service"
-        if install_or_upgrade "${cortex_host}" cortex; then
-            cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
-            echo "[${cortex_host}] installed/upgraded cortex to ${cortex_nvr}"
-        else
-            echo "[${cortex_host}] failed to install/upgrade cortex:"
-            echo "${__DNF_OUTPUT__}" | sed "s/^/[${cortex_host}]   /"
-        fi
-    else
-        echo "[${cortex_host}] failed to stop cortex service"
-    fi
-else
-    echo "[${cortex_host}] cortex is up to date (${cortex_nvr})"
-    ssh "${cortex_host}" sudo systemctl stop cortex.service || true
-fi
-
-# Sync cortex.toml whether the package was upgraded or not — the config
-# can change without a package bump.
-if rsync \
-    --archive \
-    --compress \
-    --rsync-path 'sudo rsync' \
-    --chown root:root \
-    --chmod 644 \
-    "${REPO_DIR}/cortex.toml" \
-    "${cortex_host}:/etc/cortex/cortex.toml"; then
-    echo "[${cortex_host}] sync'd cortex.toml"
-else
-    echo "[${cortex_host}] failed to sync cortex.toml"
-fi
-
-# Sync models.toml on the same lifecycle as cortex.toml — operator-owned,
-# gitignored, drives /v1/models catalogue × topology resolution.
-if [[ -f "${REPO_DIR}/models.toml" ]]; then
-    if rsync \
-        --archive \
-        --compress \
-        --rsync-path 'sudo rsync' \
-        --chown root:root \
-        --chmod 644 \
-        "${REPO_DIR}/models.toml" \
-        "${cortex_host}:/etc/cortex/models.toml"; then
-        echo "[${cortex_host}] sync'd models.toml"
-    else
-        echo "[${cortex_host}] failed to sync models.toml"
-    fi
-else
-    echo "[${cortex_host}] no local models.toml — leaving /etc/cortex/models.toml untouched"
-fi
-
-ssh "${cortex_host}" sudo systemctl daemon-reload
-if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then
-    echo "[${cortex_host}] cortex service is active"
-elif ssh "${cortex_host}" sudo systemctl start cortex.service; then
-    echo "[${cortex_host}] started cortex service"
-else
-    echo "[${cortex_host}] failed to start cortex service"
-fi
-
-# ---------------------------------------------------------------------------
-# neuron (per-host, flavour from manifest)
-# ---------------------------------------------------------------------------
-
-for entry in "${neuron_entries[@]}"; do
-    IFS=$'\t' read -r neuron_host neuron_flavour <<< "${entry}"
-    package="helexa-neuron-${neuron_flavour}"
-    # First dot-component of the host keys the per-host config file
-    # under asset/neuron/<short>.toml. A host listed in the manifest
-    # without a corresponding config still deploys (the package's
-    # default /etc/neuron/neuron.toml stays in place; no pre-warm).
-    short_host="${neuron_host%%.*}"
-    host_config="${REPO_DIR}/asset/neuron/${short_host}.toml"
-
-    ensure_lair_repo "${neuron_host}"
-    ensure_cudnn_runtime "${neuron_host}"
-    neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
-
-    # Stop the service unconditionally before any reconfig step.
-    # `default_models` is read at activation, so a config change without
-    # a bounce silently leaves the host on the previous pre-warm set.
-    # Same shape as the cortex flow above. The `[ ! -f … ]` guard skips
-    # the stop on a fresh install where the unit file isn't there yet.
-    if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then
-        echo "[${neuron_host}] stopped neuron service"
-    else
-        echo "[${neuron_host}] failed to stop neuron service (continuing)"
-    fi
-
-    if needs_update "${neuron_host}" "${package}"; then
-        echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
-        # --allowerasing lets dnf swap out a previously-installed
-        # bare helexa-neuron or a different flavour without manual
-        # intervention. The Conflicts: clauses in the spec ensure
-        # only one flavour is ever resident.
-        if install_or_upgrade "${neuron_host}" "${package}"; then
-            neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
-            echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
-            # Ensure firewalld allows neuron port
-            ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload" 2>/dev/null || true
-        else
-            echo "[${neuron_host}] failed to install ${package}:"
-            echo "${__DNF_OUTPUT__}" | sed "s/^/[${neuron_host}]   /"
-        fi
-    else
-        echo "[${neuron_host}] ${package} is up to date (${neuron_nvr})"
-    fi
-
-    # Sync per-host neuron.toml — drives default_models pre-warm so
-    # `/v1/models` on the gateway exposes the host's headline model
-    # immediately after the service comes back up. Missing per-host
-    # config leaves the package's installed neuron.toml untouched.
-    if [[ -f "${host_config}" ]]; then
-        if rsync \
-            --archive \
-            --compress \
-            --rsync-path 'sudo rsync' \
-            --chown root:root \
-            --chmod 644 \
-            "${host_config}" \
-            "${neuron_host}:/etc/neuron/neuron.toml"; then
-            echo "[${neuron_host}] sync'd asset/neuron/${short_host}.toml"
-        else
-            echo "[${neuron_host}] failed to sync neuron.toml"
-        fi
-    else
-        echo "[${neuron_host}] no asset/neuron/${short_host}.toml — leaving /etc/neuron/neuron.toml untouched"
-    fi
-
-    if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then
-        echo "[${neuron_host}] started neuron service"
-    else
-        echo "[${neuron_host}] failed to start neuron service"
-    fi
-done