deploy: dnf-native version check + lair.cafe repo bootstrap

Replaces the string compare of 'git describe --tags' vs the binary's
self-reported --version (which lies about prereleases — every
0.1.16-* RPM reports just "0.1.16") with the dnf-native question of
"is the installed package current against what the repo offers".

Mechanism:
- installed_nvr(): rpm -q --qf '%{version}-%{release}' for the
  resident package, falling back to "(not installed)". Capturing rpm's
  output through a variable keeps its "package X is not installed"
  stdout message out of the result on failure.
- needs_update(): probes rpm -q first (treats absent as "needs work"),
  then asks dnf check-update --refresh -q. Other dnf failures collapse
  into "needs update" so the subsequent install surfaces a real error
  rather than this check swallowing one silently.
- ensure_lair_repo(): probes for /etc/yum.repos.d/lair-cafe-unstable.repo
  and adds it with `dnf config-manager addrepo` when missing. The
  upstream .repo file ships enabled=0 (unstable channel doesn't
  auto-engage on fetch), so we then run `dnf config-manager setopt
  lair-cafe-unstable.enabled=1` every run — cheap, idempotent.
- Cortex and neuron install branches now guard `systemctl stop` with
  `[ ! -f /usr/lib/systemd/system/...service ] || sudo systemctl stop`
  so fresh installs (no unit file yet) don't short-circuit the install
  step under set -e.
- dnf output is captured into a variable and only printed (with a
  [host]   prefix per line) on failure, so success stays quiet and
  failures show the actual diagnostic instead of being eaten by
  &> /dev/null.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 18:55:02 +03:00
parent aad314cdfa
commit 8a2334eacb

View File

@@ -27,63 +27,124 @@ mapfile -t neuron_entries < <(
yq -r '.neurons[] | .host + "\t" + .flavour' "${MANIFEST}"
)
latest_helexa_version=$(git -C "${REPO_DIR}" describe --tags --abbrev=0 | sed 's/^v//')
# Return the installed package's "version-release" string, or
# "(not installed)" when rpm reports the package as absent. Capture
# rpm's output into a variable so its "package X is not installed"
# stdout message (rpm writes that to stdout, not stderr, when -q fails)
# doesn't leak into the result.
installed_nvr() {
local host="$1" pkg="$2"
local nvr
if nvr=$(ssh "${host}" "rpm -q --qf '%{version}-%{release}' ${pkg} 2>/dev/null"); then
echo "${nvr}"
else
echo "(not installed)"
fi
}
# Ensure the rpm.lair.cafe unstable repo is configured AND enabled on
# the remote host.
#
# The upstream .repo file at https://rpm.lair.cafe/lair-cafe-unstable.repo
# ships with `enabled=0` so a host that just fetched it won't start
# pulling unstable packages by accident. We have to explicitly flip
# enabled=1 via `dnf config-manager setopt`. Both addrepo and setopt
# are idempotent.
#
# Non-fatal — if either step fails the subsequent `dnf install` will
# surface a clearer diagnostic on its own.
ensure_lair_repo() {
local host="$1"
if ! ssh "${host}" "test -f /etc/yum.repos.d/lair-cafe-unstable.repo" 2>/dev/null; then
echo "[${host}] adding rpm.lair.cafe unstable repo"
if ! ssh "${host}" sudo dnf config-manager addrepo \
--from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo \
>/dev/null 2>&1; then
echo "[${host}] WARNING: failed to add lair.cafe repo file (proceeding anyway)"
return 0
fi
fi
# The .repo file ships enabled=0; flip it on. Cheap, idempotent.
if ! ssh "${host}" sudo dnf config-manager setopt \
lair-cafe-unstable.enabled=1 >/dev/null 2>&1; then
echo "[${host}] WARNING: failed to enable lair-cafe-unstable (proceeding anyway)"
fi
}
# True when the named package needs to be installed or upgraded on the
# remote host — either it's not present, or a newer version exists in
# the repo. False only when the installed version is current.
#
# `dnf check-update <pkg>` returns 0 when the package isn't installed
# at all (there's nothing to update), so we have to probe with rpm -q
# first to distinguish "absent" from "current". Other dnf failures
# collapse into "needs update" so the subsequent install step surfaces
# the real diagnostic rather than this check swallowing it.
needs_update() {
local host="$1" pkg="$2"
# Not installed → needs work.
if ! ssh "${host}" "rpm -q ${pkg}" >/dev/null 2>&1; then
return 0
fi
# Installed; ask dnf whether the repo has something newer.
if ssh "${host}" sudo dnf check-update --refresh -q "${pkg}" >/dev/null 2>&1; then
return 1
else
return 0
fi
}
# ---------------------------------------------------------------------------
# cortex (gateway)
# ---------------------------------------------------------------------------
observed_cortex_version=$(ssh "${cortex_host}" cortex --version | sed 's/^cortex //')
if [[ "${latest_helexa_version}" = "${observed_cortex_version}" ]]; then
echo "[${cortex_host}] cortex is up to date (${observed_cortex_version})"
if ssh "${cortex_host}" sudo systemctl stop cortex.service && rsync \
--archive \
--compress \
--rsync-path 'sudo rsync' \
--chown root:root \
--chmod 644 \
"${REPO_DIR}/cortex.toml" \
"${cortex_host}:/etc/cortex/cortex.toml"; then
echo "[${cortex_host}] sync'd cortex.toml"
ssh "${cortex_host}" sudo systemctl daemon-reload
ssh "${cortex_host}" sudo systemctl start cortex.service
else
echo "[${cortex_host}] failed to sync cortex.toml"
fi
if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then
echo "[${cortex_host}] cortex service is active"
elif ssh "${cortex_host}" sudo systemctl start cortex.service; then
echo "[${cortex_host}] started cortex service"
else
echo "[${cortex_host}] failed to start cortex service"
fi
else
echo "[${cortex_host}] cortex is out of date (${observed_cortex_version} != ${latest_helexa_version})"
if ssh "${cortex_host}" sudo systemctl stop cortex.service; then
ensure_lair_repo "${cortex_host}"
cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
if needs_update "${cortex_host}" cortex; then
echo "[${cortex_host}] cortex update available (current: ${cortex_nvr})"
# Stop the service only if the unit file exists — fresh installs
# don't have it, and `systemctl stop` on a missing unit returns
# non-zero, which would otherwise short-circuit the install branch
# under set -e.
if ssh "${cortex_host}" "[ ! -f /usr/lib/systemd/system/cortex.service ] || sudo systemctl stop cortex.service"; then
echo "[${cortex_host}] stopped cortex service"
if ssh "${cortex_host}" sudo dnf upgrade --refresh -y cortex; then
echo "[${cortex_host}] upgraded cortex"
if rsync \
--archive \
--compress \
--verbose \
--rsync-path 'sudo rsync' \
--chown root:root \
--chmod 644 \
"${REPO_DIR}/cortex.toml" \
"${cortex_host}:/etc/cortex/cortex.toml"; then
echo "[${cortex_host}] sync'd cortex.toml"
ssh "${cortex_host}" sudo systemctl daemon-reload
ssh "${cortex_host}" sudo systemctl start cortex.service
else
echo "[${cortex_host}] failed to sync cortex.toml"
fi
if dnf_output=$(ssh "${cortex_host}" sudo dnf install --refresh --allowerasing -y cortex 2>&1); then
cortex_nvr=$(installed_nvr "${cortex_host}" cortex)
echo "[${cortex_host}] installed/upgraded cortex to ${cortex_nvr}"
else
echo "[${cortex_host}] failed to upgrade cortex"
echo "[${cortex_host}] failed to install/upgrade cortex:"
echo "${dnf_output}" | sed "s/^/[${cortex_host}] /"
fi
else
echo "[${cortex_host}] failed to stop cortex service"
fi
else
echo "[${cortex_host}] cortex is up to date (${cortex_nvr})"
ssh "${cortex_host}" sudo systemctl stop cortex.service || true
fi
# Sync cortex.toml whether the package was upgraded or not — the config
# can change without a package bump.
if rsync \
--archive \
--compress \
--rsync-path 'sudo rsync' \
--chown root:root \
--chmod 644 \
"${REPO_DIR}/cortex.toml" \
"${cortex_host}:/etc/cortex/cortex.toml"; then
echo "[${cortex_host}] sync'd cortex.toml"
else
echo "[${cortex_host}] failed to sync cortex.toml"
fi
ssh "${cortex_host}" sudo systemctl daemon-reload
if ssh "${cortex_host}" systemctl is-active --quiet cortex.service; then
echo "[${cortex_host}] cortex service is active"
elif ssh "${cortex_host}" sudo systemctl start cortex.service; then
echo "[${cortex_host}] started cortex service"
else
echo "[${cortex_host}] failed to start cortex service"
fi
# ---------------------------------------------------------------------------
@@ -94,26 +155,19 @@ for entry in "${neuron_entries[@]}"; do
IFS=$'\t' read -r neuron_host neuron_flavour <<< "${entry}"
package="helexa-neuron-${neuron_flavour}"
observed_neuron_version=$(ssh "${neuron_host}" neuron --version 2> /dev/null | sed 's/^neuron //' || true)
if [[ "${latest_helexa_version}" = "${observed_neuron_version}" ]]; then
echo "[${neuron_host}] neuron is up to date (${observed_neuron_version}, ${package})"
if ssh "${neuron_host}" systemctl is-active --quiet neuron.service; then
echo "[${neuron_host}] neuron service is active"
elif ssh "${neuron_host}" sudo systemctl start neuron.service; then
echo "[${neuron_host}] started neuron service"
else
echo "[${neuron_host}] failed to start neuron service"
fi
else
echo "[${neuron_host}] upgrading neuron from ${observed_neuron_version:-(absent)} to ${latest_helexa_version} (${package})"
ensure_lair_repo "${neuron_host}"
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
if needs_update "${neuron_host}" "${package}"; then
echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then
echo "[${neuron_host}] stopped neuron service"
# --allowerasing lets dnf swap out a previously-installed
# bare helexa-neuron or a different flavour without manual
# intervention. The Conflicts: clauses in the spec ensure
# only one flavour is ever resident.
if ssh "${neuron_host}" sudo dnf install --refresh --allowerasing -y "${package}" &> /dev/null; then
echo "[${neuron_host}] installed/upgraded ${package}"
if dnf_output=$(ssh "${neuron_host}" sudo dnf install --refresh --allowerasing -y "${package}" 2>&1); then
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
# Ensure firewalld allows neuron port
ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload" 2>/dev/null || true
if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then
@@ -122,10 +176,20 @@ for entry in "${neuron_entries[@]}"; do
echo "[${neuron_host}] failed to start neuron service"
fi
else
echo "[${neuron_host}] failed to install ${package}"
echo "[${neuron_host}] failed to install ${package}:"
echo "${dnf_output}" | sed "s/^/[${neuron_host}] /"
fi
else
echo "[${neuron_host}] failed to stop neuron service"
fi
else
echo "[${neuron_host}] ${package} is up to date (${neuron_nvr})"
if ssh "${neuron_host}" systemctl is-active --quiet neuron.service; then
echo "[${neuron_host}] neuron service is active"
elif ssh "${neuron_host}" sudo systemctl start neuron.service; then
echo "[${neuron_host}] started neuron service"
else
echo "[${neuron_host}] failed to start neuron service"
fi
fi
done