diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
new file mode 100644
index 0000000..33062e9
--- /dev/null
+++ b/.gitea/workflows/deploy.yml
@@ -0,0 +1,136 @@
+name: deploy
+
+# Roll the freshly-published unstable RPMs onto the helexa fleet:
+# cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
+#
+# Triggered automatically after `build-prerelease` succeeds (by which
+# point the new RPMs are live on rpm.lair.cafe/unstable), and also
+# re-runnable manually from the Gitea UI.
+#
+# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
+# sudoers drop-in) lives in script/infra-setup.sh — run that once per
+# host before this workflow can succeed.
+
+on:
+  workflow_run:
+    workflows: [build-prerelease]
+    types: [completed]
+  workflow_dispatch:
+
+# Serialize deploys. Overlapping runs would race on dnf metadata
+# refresh and service-restart timing; queueing keeps the fleet
+# predictable. Don't cancel an in-flight deploy — a half-applied dnf
+# transaction is worse than a slightly stale deploy.
+concurrency:
+  group: deploy
+  cancel-in-progress: false
+
+env:
+  DEPLOY_KEY: |
+    ${{ secrets.RSYNC_SSH_KEY }}
+
+jobs:
+  deploy-cortex:
+    runs-on: fedora-43
+    # Two trigger paths: manual dispatch always runs; workflow_run
+    # only runs if the upstream `build-prerelease` actually succeeded.
+    if: >-
+      ${{
+        github.event_name == 'workflow_dispatch'
+        || github.event.workflow_run.conclusion == 'success'
+      }}
+    steps:
+      - name: SSH init
+        run: |
+          # Create the key under a restrictive umask so it is never
+          # world-readable, even briefly; chmod covers a pre-existing file.
+          umask 077
+          mkdir -p ~/.ssh
+          printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+            gitea_ci@hanzalova.internal 'hostname -f'
+
+      - name: Stop cortex.service
+        run: |
+          ssh gitea_ci@hanzalova.internal '
+            if systemctl is-active --quiet cortex.service; then
+              sudo /usr/bin/systemctl stop cortex.service
+            fi'
+
+      - name: Install / upgrade cortex from rpm.lair.cafe/unstable
+        run: |
+          ssh gitea_ci@hanzalova.internal '
+            if rpm -q cortex >/dev/null 2>&1; then
+              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+            else
+              sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
+            fi'
+
+      - name: Start cortex.service
+        run: |
+          ssh gitea_ci@hanzalova.internal '
+            sudo /usr/bin/systemctl daemon-reload
+            sudo /usr/bin/systemctl start cortex.service'
+
+  deploy-neurons:
+    needs: [deploy-cortex]
+    runs-on: fedora-43
+    strategy:
+      # One neuron failing must not cancel the others. Cortex is up
+      # already; a partial neuron deploy is strictly better than
+      # rolling back to zero.
+      fail-fast: false
+      matrix:
+        include:
+          - host: beast.hanzalova.internal
+            flavour: blackwell
+          - host: benjy.hanzalova.internal
+            flavour: ada
+          - host: quadbrat.hanzalova.internal
+            flavour: ampere
+    steps:
+      - name: SSH init
+        run: |
+          # Create the key under a restrictive umask so it is never
+          # world-readable, even briefly; chmod covers a pre-existing file.
+          umask 077
+          mkdir -p ~/.ssh
+          printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
+            gitea_ci@${{ matrix.host }} 'hostname -f'
+
+      - name: Stop neuron.service
+        run: |
+          ssh gitea_ci@${{ matrix.host }} '
+            if systemctl is-active --quiet neuron.service; then
+              sudo /usr/bin/systemctl stop neuron.service
+            fi'
+
+      - name: Install / upgrade helexa-neuron-${{ matrix.flavour }}
+        run: |
+          ssh gitea_ci@${{ matrix.host }} "
+            if rpm -q helexa-neuron-${{ matrix.flavour }} >/dev/null 2>&1; then
+              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
+            else
+              sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
+            fi"
+
+      # NOTE(review): `sudo firewall-cmd --query-service` only works if
+      # the host's sudoers drop-in permits that exact argv. When sudo
+      # refuses it, the error is hidden by `2>/dev/null` and the step
+      # falls through to add + reload on every run.
+      - name: Ensure firewalld allows helexa-neuron
+        run: |
+          ssh gitea_ci@${{ matrix.host }} '
+            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
+              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
+              sudo /usr/bin/firewall-cmd --reload
+            fi'
+
+      - name: Start neuron.service
+        run: |
+          ssh gitea_ci@${{ matrix.host }} '
+            sudo /usr/bin/systemctl daemon-reload
+            sudo /usr/bin/systemctl start neuron.service'
diff --git a/asset/sudoers.d/cortex-host.conf b/asset/sudoers.d/cortex-host.conf
new file mode 100644
index 0000000..8ebe93d
--- /dev/null
+++ b/asset/sudoers.d/cortex-host.conf
@@ -0,0 +1,20 @@
+# Install on the cortex gateway host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<host> to roll out cortex package upgrades
+# and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
diff --git a/asset/sudoers.d/neuron-host.conf b/asset/sudoers.d/neuron-host.conf
new file mode 100644
index 0000000..744044e
--- /dev/null
+++ b/asset/sudoers.d/neuron-host.conf
@@ -0,0 +1,41 @@
+# Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
+# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
+# which SSHes as gitea_ci@<host> to roll out helexa-neuron-<flavour>
+# package upgrades and config changes.
+#
+# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
+# helexa-org apps can drop their own sudoers files on the same host
+# without overwriting this one.
+#
+# All three CUDA flavours are listed because a host's flavour can change
+# (e.g. GPU swap) and we don't want the sudoers file to need to change
+# in lockstep. Only one flavour can be installed at a time (the packages
+# Conflict: with each other), so the attack surface is bounded to "wrong
+# flavour installed" — vandalism, not privilege escalation.
+
+# NOTE(review): sudoers matches the argument list as one string, so the
+# `*` below also matches spaces and extra options (e.g. `-e`); this rule
+# is broader than "copy a file to neuron.toml" — consider pinning the
+# exact argv the caller uses.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell
+# sudoers reserves `:` and `=` and requires `\` escaping inside command
+# arguments — without it visudo errors at the first `:` in `https://`.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13
+# `--query-service` is what deploy.yml runs first to decide whether the
+# service still needs adding; it needs its own entry because sudo matches
+# the full argument list.
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --query-service\=helexa-neuron --quiet
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-neuron --permanent
+gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
diff --git a/script/infra-setup.sh b/script/infra-setup.sh
new file mode 100755
index 0000000..a9afda1
--- /dev/null
+++ b/script/infra-setup.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+#
+# One-time setup for the gitea_ci deploy-user on every host that the
+# .gitea/workflows/deploy.yml workflow targets:
+#   - create the gitea_ci system user (if missing)
+#   - install the runner's pubkey into ~gitea_ci/.ssh/authorized_keys
+#   - install the appropriate /etc/sudoers.d/helexa_gitea_ci sudoers
+#     drop-in (cortex flavour on the gateway, neuron flavour on each
+#     neuron host)
+#
+# Idempotent — safe to re-run after fleet changes. Continues past
+# unreachable hosts so a single offline node doesn't block the rest.
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+repo_path="$(cd "${script_dir}/.." && pwd)"
+
+cortex_host=hanzalova.internal
+neuron_hosts=(
+  beast.hanzalova.internal
+  benjy.hanzalova.internal
+  quadbrat.hanzalova.internal
+)
+
+pubkey="${HOME}/.ssh/id_gitea_ci.pub"
+if [[ ! -f "${pubkey}" ]]; then
+  echo "fatal: ${pubkey} not found" >&2
+  echo " generate with: ssh-keygen -t ed25519 -f ${pubkey%.pub} -C gitea_ci" >&2
+  exit 1
+fi
+
+# Provision gitea_ci on every host (cortex + all neurons).
+#
+# Quoting matters here: "${cortex_host} ${neuron_hosts[@]}" inside a
+# single pair of quotes collapses the scalar and the first array
+# element into one space-joined word, which then word-splits when
+# referenced unquoted in `ssh ${host}` — and ssh interprets the second
+# hostname as the remote command. Separate quoting fixes it.
+for host in "${cortex_host}" "${neuron_hosts[@]}"; do
+  echo "==> ${host}"
+  if ! ssh "${host}" '
+    set -eu
+    if id -u gitea_ci >/dev/null 2>&1; then
+      echo " gitea_ci user already present"
+    else
+      sudo useradd --system --create-home \
+        --home-dir /var/lib/gitea_ci --shell /bin/bash gitea_ci
+      echo " gitea_ci user created"
+    fi
+    # `sudo install` runs as root (not as gitea_ci), which avoids
+    # the "sudo: unknown user gitea_ci" failure seen immediately
+    # after useradd — NSS caching lags briefly and `sudo -u` cant
+    # resolve the just-created user, but `install -o` does its
+    # own fresh lookup.
+    sudo install -d -o gitea_ci -g gitea_ci -m 0700 \
+      /var/lib/gitea_ci/.ssh
+  '; then
+    echo " failed to provision gitea_ci — skipping ${host}"
+    continue
+  fi
+
+  if rsync \
+    --archive \
+    --compress \
+    --chown gitea_ci:gitea_ci \
+    --chmod 0600 \
+    --rsync-path 'sudo rsync' \
+    "${pubkey}" \
+    "${host}:/var/lib/gitea_ci/.ssh/authorized_keys"; then
+    echo " authorized_keys synced"
+  else
+    echo " failed to sync authorized_keys"
+  fi
+done
+
+# Install /etc/sudoers.d/helexa_gitea_ci on a host. The template is
+# streamed to a temp file on the host and checked with `visudo -cf`
+# BEFORE anything is written under /etc/sudoers.d, so a typo in the
+# template can't break sudo there (validating after the copy would
+# leave the bad file live, and the `sudo visudo` check itself needs a
+# working sudo).
+#   $1 — host to ssh to
+#   $2 — local path of the sudoers template
+install_sudoers() {
+  local host="$1" template="$2" rc=0
+  echo "==> ${host}: installing /etc/sudoers.d/helexa_gitea_ci"
+  # Remote exit status 3 is reserved for "visudo rejected the file" so
+  # it can be told apart from transport or install failures.
+  ssh "${host}" '
+    set -eu
+    staged="$(mktemp)"
+    trap "rm -f \"${staged}\"" EXIT
+    cat >"${staged}"
+    sudo visudo -cf "${staged}" >/dev/null || exit 3
+    sudo install -o root -g root -m 0440 \
+      "${staged}" /etc/sudoers.d/helexa_gitea_ci
+  ' <"${template}" || rc=$?
+  case "${rc}" in
+    0) echo " installed and verified" ;;
+    3) echo " WARNING: visudo rejected ${template##*/} — nothing installed on ${host}" ;;
+    *) echo " failed to install ${template##*/} on ${host} (exit ${rc})" ;;
+  esac
+}
+
+install_sudoers "${cortex_host}" \
+  "${repo_path}/asset/sudoers.d/cortex-host.conf"
+
+for neuron_host in "${neuron_hosts[@]}"; do
+  install_sudoers "${neuron_host}" \
+    "${repo_path}/asset/sudoers.d/neuron-host.conf"
+done
+
+# Push application config to the fleet. The deploy workflow is
+# scoped to package install + service restart; config changes ride
+# along with this script instead, since:
+#   - cortex.toml and models.toml are gitignored (operator-owned, may
+#     include secrets), so CI never sees them
+#   - asset/neuron/<host>.toml is tracked but iterating locally is
+#     faster than pushing a commit and waiting for build-prerelease
+#     to roll over
+# Missing source files are skipped with a notice — re-run after editing.
+sync_config() {
+  local host="$1" src="$2" dst="$3"
+  if [[ ! -f "${src}" ]]; then
+    echo " ${src##*/} not present locally — skipping"
+    return
+  fi
+  if rsync \
+    --archive \
+    --compress \
+    --chown root:root \
+    --chmod 0644 \
+    --rsync-path 'sudo rsync' \
+    "${src}" \
+    "${host}:${dst}"; then
+    echo " ${src##*/} → ${host}:${dst}"
+  else
+    echo " failed to sync ${src##*/} to ${host}"
+  fi
+}
+
+echo "==> ${cortex_host}: syncing gateway configs"
+sync_config "${cortex_host}" "${repo_path}/cortex.toml" /etc/cortex/cortex.toml
+sync_config "${cortex_host}" "${repo_path}/models.toml" /etc/cortex/models.toml
+
+for neuron_host in "${neuron_hosts[@]}"; do
+  short="${neuron_host%%.*}"
+  echo "==> ${neuron_host}: syncing per-host neuron config"
+  sync_config "${neuron_host}" \
+    "${repo_path}/asset/neuron/${short}.toml" \
+    /etc/neuron/neuron.toml
+done