From 1a0400131e4cba1bc6703fb533ef753d30599602 Mon Sep 17 00:00:00 2001
From: rob thijssen
Date: Tue, 19 May 2026 14:10:48 +0300
Subject: [PATCH] fix(deploy): use dnf upgrade for stale installs, install only
 when absent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

dnf5's `dnf install <pkg>` is a no-op when the package is already
installed at ANY version — it does NOT auto-upgrade to the latest
available. The deploy script's install branch was therefore silently
leaving hosts on older builds even though needs_update correctly
reported an upgrade was available.

Add an is_installed() probe and an install_or_upgrade() helper that
picks the right verb: `dnf install` when fresh, `dnf upgrade` when
stale. Captured combined-stream output is exposed via __DNF_OUTPUT__
for the existing failure-diagnostic path.

Verified end-to-end against the live fleet: hanzalova/beast/benjy/
quadbrat all upgraded cleanly from prior prerelease NVRs to
0.1.16-0.1.20260519134302.git1866b99.fc43, validation script returned
"Paris" from all three neurons.

Followup (not in this commit): all hosts running helexa-neuron-* need
libcudnn.so.9 available at runtime. Currently:
- quadbrat: libcudnn9-cuda-13 RPM (rhel9 CUDA repo)
- beast: /usr/lib64/libcudnn.so.9 (manual install)
- benjy: needed rhel9 CUDA repo added + libcudnn9-cuda-13 installed
  as part of this validation pass.

The spec currently excludes cuDNN from auto-detected deps. Should add
a Recommends:libcudnn9-cuda-13 (soft) and ensure the rhel9 CUDA repo
is configured on each neuron host, similar to how ensure_lair_repo
handles the unstable channel.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 script/deploy.sh | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/script/deploy.sh b/script/deploy.sh
index 379df6d..187cedb 100755
--- a/script/deploy.sh
+++ b/script/deploy.sh
@@ -94,6 +94,38 @@ needs_update() {
     fi
 }
 
+# True (exit 0) when `rpm -q` finds the package on the remote host; any
+# other outcome, including an ssh failure, counts as "not installed".
+# Picks between `dnf install` (fresh) and `dnf upgrade` (stale): dnf5's
+# `install` is a no-op when the package is present at any version (it
+# does NOT auto-upgrade), so the wrong verb silently keeps an old build.
+is_installed() {
+    local host="$1" pkg="$2"
+    ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1
+}
+
+# Install or upgrade the named package on the remote host, choosing the
+# dnf verb via is_installed(). dnf's combined stdout/stderr is stored in
+# the global __DNF_OUTPUT__ (overwritten on every call) for the caller.
+# Returns 0 on success; any non-zero ssh/dnf status is returned as 1.
+__DNF_OUTPUT__=""
+install_or_upgrade() {
+    local host="$1" pkg="$2"
+    local cmd
+    if is_installed "${host}" "${pkg}"; then
+        cmd="upgrade"
+    else
+        cmd="install"
+    fi
+    if __DNF_OUTPUT__=$(
+        ssh "${host}" sudo dnf "${cmd}" --refresh --allowerasing -y "${pkg}" 2>&1
+    ); then
+        return 0
+    else
+        return 1
+    fi
+}
+
 # ---------------------------------------------------------------------------
 # cortex (gateway)
 # ---------------------------------------------------------------------------
@@ -108,12 +140,12 @@ if needs_update "${cortex_host}" cortex; then
     # under set -e.
     if ssh "${cortex_host}" "[ ! -f /usr/lib/systemd/system/cortex.service ] || sudo systemctl stop cortex.service"; then
         echo "[${cortex_host}] stopped cortex service"
-        if dnf_output=$(ssh "${cortex_host}" sudo dnf install --refresh --allowerasing -y cortex 2>&1); then
+        if install_or_upgrade "${cortex_host}" cortex; then
             cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
             echo "[${cortex_host}] installed/upgraded cortex to ${cortex_nvr}"
         else
             echo "[${cortex_host}] failed to install/upgrade cortex:"
-            echo "${dnf_output}" | sed "s/^/[${cortex_host}] /"
+            echo "${__DNF_OUTPUT__}" | sed "s/^/[${cortex_host}] /"
         fi
     else
         echo "[${cortex_host}] failed to stop cortex service"
@@ -165,7 +197,7 @@ for entry in "${neuron_entries[@]}"; do
             # bare helexa-neuron or a different flavour without manual
             # intervention. The Conflicts: clauses in the spec ensure
             # only one flavour is ever resident.
-            if dnf_output=$(ssh "${neuron_host}" sudo dnf install --refresh --allowerasing -y "${package}" 2>&1); then
+            if install_or_upgrade "${neuron_host}" "${package}"; then
                 neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
                 echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
                 # Ensure firewalld allows neuron port
@@ -177,7 +209,7 @@ for entry in "${neuron_entries[@]}"; do
                 fi
             else
                 echo "[${neuron_host}] failed to install ${package}:"
-                echo "${dnf_output}" | sed "s/^/[${neuron_host}] /"
+                echo "${__DNF_OUTPUT__}" | sed "s/^/[${neuron_host}] /"
             fi
         else
             echo "[${neuron_host}] failed to stop neuron service"