feat(deploy): gitea workflow for rolling RPM deploys + host bootstrap
Replace operator-run script/deploy.sh with a CI-driven rolling deploy:
- .gitea/workflows/deploy.yml fires on build-prerelease success (and is
re-runnable via workflow_dispatch). Cortex upgrades first on
hanzalova.internal; the three neuron hosts upgrade in parallel under
fail-fast: false so one failing host doesn't sink the rest.
Concurrency-grouped to serialize overlapping deploys, never cancelling
in-flight runs (a half-applied dnf transaction is worse than a stale
deploy).
- asset/sudoers.d/{cortex,neuron}-host.conf are the canonical source for
the scoped privileges gitea_ci needs on each host kind, installed as
/etc/sudoers.d/helexa_gitea_ci. The `:` in URLs and `=` signs are
backslash-escaped, as sudoers requires inside command arguments.
- script/infra-setup.sh idempotently provisions the gitea_ci user,
installs the runner pubkey, drops in the appropriate sudoers fragment
with visudo verification, and syncs cortex.toml / models.toml /
per-host asset/neuron/<short>.toml — config still ships from operator
workstations rather than CI because the first two are gitignored.
The CI-only secret is RSYNC_SSH_KEY (already configured for the repo);
the matching pubkey is ~/.ssh/id_gitea_ci.pub on the operator's box.
script/deploy.sh and asset/manifest.yml are left in place until the
first end-to-end deploy workflow run succeeds, then removed.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
126
.gitea/workflows/deploy.yml
Normal file
126
.gitea/workflows/deploy.yml
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
name: deploy

# Roll the freshly-published unstable RPMs onto the helexa fleet:
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
#
# Triggered automatically after `build-prerelease` succeeds (by which
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
# re-runnable manually from the Gitea UI.
#
# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
# sudoers drop-in) lives in script/infra-setup.sh — run that once per
# host before this workflow can succeed.

on:
  workflow_run:
    workflows: [build-prerelease]
    types: [completed]
  workflow_dispatch:

# Serialize deploys. Overlapping runs would race on dnf metadata
# refresh and service-restart timing; queueing keeps the fleet
# predictable. Don't cancel an in-flight deploy — a half-applied dnf
# transaction is worse than a slightly stale deploy.
concurrency:
  group: deploy
  cancel-in-progress: false

env:
  # Literal block scalar so the multi-line private key keeps its
  # newlines when exported to the step shells.
  DEPLOY_KEY: |
    ${{ secrets.RSYNC_SSH_KEY }}

jobs:
  deploy-cortex:
    runs-on: fedora-43
    # Two trigger paths: manual dispatch always runs; workflow_run
    # only runs if the upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
        github.event_name == 'workflow_dispatch'
        || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      - name: SSH init
        run: |
          # Tighten the umask first so neither ~/.ssh nor the key file
          # is ever created group/world-readable, even briefly.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          # First contact records the host key (accept-new); the later
          # steps then connect with default strict checking.
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
            gitea_ci@hanzalova.internal 'hostname -f'

      - name: Stop cortex.service
        run: |
          ssh gitea_ci@hanzalova.internal '
            if systemctl is-active --quiet cortex.service; then
              sudo /usr/bin/systemctl stop cortex.service
            fi'

      - name: Install / upgrade cortex from rpm.lair.cafe/unstable
        run: |
          # The dnf command lines must match the sudoers drop-in
          # (asset/sudoers.d/cortex-host.conf) argument for argument.
          ssh gitea_ci@hanzalova.internal '
            if rpm -q cortex >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
            fi'

      - name: Start cortex.service
        # Run even when an earlier step failed: the service was stopped
        # above, and a failed install must not leave the gateway down.
        # The job still reports failure because of the failed step.
        if: ${{ always() }}
        run: |
          ssh gitea_ci@hanzalova.internal '
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start cortex.service'

  deploy-neurons:
    needs: [deploy-cortex]
    runs-on: fedora-43
    strategy:
      # One neuron failing must not cancel the others. Cortex is up
      # already; a partial neuron deploy is strictly better than
      # rolling back to zero.
      fail-fast: false
      matrix:
        include:
          - host: beast.hanzalova.internal
            flavour: blackwell
          - host: benjy.hanzalova.internal
            flavour: ada
          - host: quadbrat.hanzalova.internal
            flavour: ampere
    steps:
      - name: SSH init
        run: |
          # Tighten the umask first so neither ~/.ssh nor the key file
          # is ever created group/world-readable, even briefly.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
            gitea_ci@${{ matrix.host }} 'hostname -f'

      - name: Stop neuron.service
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            if systemctl is-active --quiet neuron.service; then
              sudo /usr/bin/systemctl stop neuron.service
            fi'

      - name: Install / upgrade helexa-neuron-${{ matrix.flavour }}
        run: |
          # The dnf command lines must match the sudoers drop-in
          # (asset/sudoers.d/neuron-host.conf) argument for argument.
          ssh gitea_ci@${{ matrix.host }} "
            if rpm -q helexa-neuron-${{ matrix.flavour }} >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            fi"

      - name: Ensure firewalld allows helexa-neuron
        run: |
          # Only add + reload when the service is not already allowed.
          # Every sudo command here needs a matching sudoers rule on
          # the host; a denied query is indistinguishable from "not
          # enabled" because stderr is discarded.
          ssh gitea_ci@${{ matrix.host }} '
            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
              sudo /usr/bin/firewall-cmd --reload
            fi'

      - name: Start neuron.service
        # Run even when an earlier step failed: the service was stopped
        # above, and a failed install or firewall step must not leave
        # the neuron down. The job still reports failure.
        if: ${{ always() }}
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start neuron.service'
|
||||||
20
asset/sudoers.d/cortex-host.conf
Normal file
20
asset/sudoers.d/cortex-host.conf
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# Sudoers drop-in for the cortex gateway host.
#
# Install as /etc/sudoers.d/helexa_gitea_ci (owner root:root, mode 0440).
# .gitea/workflows/deploy.yml SSHes in as gitea_ci@<gateway> and relies
# on these rules to roll out cortex package upgrades and config changes.
#
# The file is named `helexa_gitea_ci` rather than bare `gitea_ci` so
# that other helexa-org apps can ship their own sudoers files to the
# same host without overwriting this one.
#
# Each rule pins a complete command line: sudo must be invoked with
# exactly these arguments for the rule to match.

# --- service lifecycle ---
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service

# --- package install / upgrade ---
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex

# --- repository enablement ---
# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1

# --- config sync ---
# NOTE(review): a sudoers `*` matches across argument boundaries, so
# these rules appear to admit any rsync option string that ends in the
# target path — including `-e <command>`, which rsync would execute as
# root. If so they are root-equivalent rather than scoped; consider
# pinning the exact argument list or fronting rsync with a root-owned
# wrapper. Confirm which callers depend on the wildcard before changing.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
|
||||||
33
asset/sudoers.d/neuron-host.conf
Normal file
33
asset/sudoers.d/neuron-host.conf
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
# which SSHes as gitea_ci@<neuron-host> to roll out helexa-neuron-<flavour>
# package upgrades and config changes.
#
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
# helexa-org apps can drop their own sudoers files on the same host
# without overwriting this one.
#
# Each rule pins a complete command line: sudo must be invoked with
# exactly these arguments for the rule to match.
#
# All three CUDA flavours are listed because a host's flavour can change
# (e.g. GPU swap) and we don't want the sudoers file to need to change
# in lockstep. Only one flavour can be installed at a time (the packages
# Conflict: with each other), so the attack surface of the dnf rules is
# bounded to "wrong flavour installed" — vandalism, not privilege
# escalation.

# NOTE(review): a sudoers `*` matches across argument boundaries, so
# this rule appears to admit any rsync option string that ends in the
# target path — including `-e <command>`, which rsync would execute as
# root. If so it is root-equivalent rather than scoped; consider pinning
# the exact argument list or fronting rsync with a root-owned wrapper.
# Confirm which callers depend on the wildcard before changing.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml

gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload

gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell

# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13

# firewalld. The deploy workflow probes with --query-service before it
# adds the service; without a rule for the probe, sudo refuses it, the
# workflow reads that as "not enabled", and every deploy re-adds the
# service and reloads the firewall. `=` is escaped here for the same
# reason as in the dnf rules above.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --query-service\=helexa-neuron --quiet
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-neuron --permanent
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
|
||||||
146
script/infra-setup.sh
Executable file
146
script/infra-setup.sh
Executable file
@@ -0,0 +1,146 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Host bootstrap for the gitea_ci deploy user. Run from an operator
# workstation against every host that .gitea/workflows/deploy.yml
# targets. For each host this script:
#   - creates the gitea_ci system user when it does not exist yet
#   - installs the runner's pubkey as ~gitea_ci/.ssh/authorized_keys
#   - installs the matching /etc/sudoers.d/helexa_gitea_ci drop-in
#     (cortex variant on the gateway, neuron variant on neuron hosts)
#   - pushes the gateway and per-host neuron config files
#
# Idempotent — safe to re-run after fleet changes. A host that cannot
# be reached is reported and skipped so one offline node does not
# block the rest.

# Resolve the repository root relative to this script's own location.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
repo_path="$(cd "${script_dir}/.." && pwd)"

# Fleet inventory: one gateway plus the neuron hosts.
cortex_host=hanzalova.internal
neuron_hosts=(beast.hanzalova.internal benjy.hanzalova.internal quadbrat.hanzalova.internal)

# Public half of the key the CI runner deploys with; nothing below can
# work without it, so bail out early with a hint on how to create it.
pubkey="${HOME}/.ssh/id_gitea_ci.pub"
[[ -f "${pubkey}" ]] || {
  echo "fatal: ${pubkey} not found" >&2
  echo " generate with: ssh-keygen -t ed25519 -f ${pubkey%.pub} -C gitea_ci" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# Create the gitea_ci account and install its authorized_keys on the
# gateway and on every neuron host.
#
# The host list is built from two separately quoted expansions on
# purpose: "${cortex_host} ${neuron_hosts[@]}" inside one pair of
# quotes would glue the scalar and the first array element into a
# single space-joined word, which word-splits again when used unquoted
# in `ssh ${host}` — and ssh would then treat the second hostname as
# the remote command.
for host in "${cortex_host}" "${neuron_hosts[@]}"; do
  echo "==> ${host}"

  # Remote half: ensure the user exists and owns a 0700 ~/.ssh. Any
  # failure (including an unreachable host) skips the rest for this
  # host and moves on to the next one.
  ssh "${host}" '
    set -eu
    if id -u gitea_ci >/dev/null 2>&1; then
      echo " gitea_ci user already present"
    else
      sudo useradd --system --create-home \
        --home-dir /var/lib/gitea_ci --shell /bin/bash gitea_ci
      echo " gitea_ci user created"
    fi
    # `sudo install` runs as root (not as gitea_ci), which avoids
    # the "sudo: unknown user gitea_ci" failure seen immediately
    # after useradd — NSS caching lags briefly and `sudo -u` cant
    # resolve the just-created user, but `install -o` does its
    # own fresh lookup.
    sudo install -d -o gitea_ci -g gitea_ci -m 0700 \
      /var/lib/gitea_ci/.ssh
  ' || {
    echo " failed to provision gitea_ci — skipping ${host}"
    continue
  }

  # Local half: push the pubkey as the account's authorized_keys. The
  # receiving rsync runs under sudo so it can set owner and mode.
  if ! rsync \
    --archive \
    --compress \
    --chown gitea_ci:gitea_ci \
    --chmod 0600 \
    --rsync-path 'sudo rsync' \
    "${pubkey}" \
    "${host}:/var/lib/gitea_ci/.ssh/authorized_keys"; then
    echo " failed to sync authorized_keys"
  else
    echo " authorized_keys synced"
  fi
done
|
||||||
|
|
||||||
|
# Install /etc/sudoers.d/helexa_gitea_ci on a host, validating the
# fragment with visudo BEFORE it goes live so a typo can't break sudo
# on that host.
#
# The fragment is first copied next to its final location under a name
# containing a ".", which sudo's includedir processing skips, so the
# staged copy is never parsed as policy. Only after `visudo -cf`
# accepts it is it renamed over the live file (same directory, so the
# rename is atomic and owner/mode carry over). On any failure the
# staged copy is removed and the previously installed file is left
# untouched.
#
# Arguments:
#   $1  host      ssh destination to install on
#   $2  template  local path of the sudoers fragment to install
# Returns:
#   0 when the fragment was validated and installed, 1 otherwise.
#   Failures are reported but never abort the script.
install_sudoers() {
  local host="$1" template="$2"
  local live=/etc/sudoers.d/helexa_gitea_ci
  local staged="${live}.staged"

  echo "==> ${host}: installing ${live}"

  # The receiving rsync runs under sudo so it can write into
  # /etc/sudoers.d and apply the root:root 0440 ownership and mode.
  if ! rsync \
    --archive \
    --compress \
    --chown root:root \
    --chmod 0440 \
    --rsync-path 'sudo rsync' \
    "${template}" \
    "${host}:${staged}"; then
    echo " failed to sync ${template##*/}"
    return 1
  fi

  # Validate the staged copy, then promote it; `&&` guarantees the
  # rename only happens for a fragment visudo accepted.
  if ssh "${host}" \
    "sudo visudo -cf ${staged} >/dev/null && sudo mv -f ${staged} ${live}"; then
    echo " installed and verified"
  else
    echo " WARNING: ${template##*/} failed validation or install — existing sudoers on ${host} left untouched"
    # Best-effort cleanup; the host may be unreachable by now.
    ssh "${host}" "sudo rm -f ${staged}" || true
    return 1
  fi
}
|
||||||
|
|
||||||
|
# Gateway gets the cortex fragment; every neuron host gets the shared
# neuron fragment.
sudoers_src="${repo_path}/asset/sudoers.d"

install_sudoers "${cortex_host}" "${sudoers_src}/cortex-host.conf"

for neuron_host in "${neuron_hosts[@]}"; do
  install_sudoers "${neuron_host}" "${sudoers_src}/neuron-host.conf"
done
|
||||||
|
|
||||||
|
# Copy one application config file to a host as root:root, mode 0644.
#
# Config is pushed by this script rather than by the deploy workflow
# (which only installs packages and restarts services) because:
#   - cortex.toml and models.toml are gitignored (operator-owned, may
#     include secrets), so CI never sees them
#   - asset/neuron/<short>.toml is tracked, but iterating locally is
#     faster than pushing a commit and waiting for build-prerelease
#     to roll over
#
# Arguments:
#   $1  host  ssh destination
#   $2  src   local file to send
#   $3  dst   absolute destination path on the host
#
# A source file that does not exist locally is reported and skipped —
# re-run after creating or editing it. Sync failures are reported but
# do not stop the script.
sync_config() {
  local host="$1" src="$2" dst="$3"
  local label="${src##*/}"

  [[ -f "${src}" ]] || {
    echo " ${label} not present locally — skipping"
    return
  }

  # The receiving rsync runs under sudo so it can write the destination
  # path and apply ownership and mode.
  local -a rsync_opts=(
    --archive
    --compress
    --chown root:root
    --chmod 0644
    --rsync-path 'sudo rsync'
  )

  if ! rsync "${rsync_opts[@]}" "${src}" "${host}:${dst}"; then
    echo " failed to sync ${label} to ${host}"
  else
    echo " ${label} → ${host}:${dst}"
  fi
}
|
||||||
|
|
||||||
|
# Gateway: both config files live at the repo root and land under
# /etc/cortex with the same file name.
echo "==> ${cortex_host}: syncing gateway configs"
for cfg in cortex.toml models.toml; do
  sync_config "${cortex_host}" "${repo_path}/${cfg}" "/etc/cortex/${cfg}"
done

# Neurons: each host's config is keyed by its short hostname (the part
# before the first dot) and always lands as /etc/neuron/neuron.toml.
for neuron_host in "${neuron_hosts[@]}"; do
  echo "==> ${neuron_host}: syncing per-host neuron config"
  sync_config \
    "${neuron_host}" \
    "${repo_path}/asset/neuron/${neuron_host%%.*}.toml" \
    /etc/neuron/neuron.toml
done
|
||||||
Reference in New Issue
Block a user