feat(deploy): gitea workflow for rolling RPM deploys + host bootstrap

Replace operator-run script/deploy.sh with a CI-driven rolling deploy: - .gitea/workflows/deploy.yml fires on build-prerelease success (and is re-runnable via workflow_dispatch). Cortex upgrades first on hanzalova.internal; the three neuron hosts upgrade in parallel under fail-fast: false so one failing host doesn't sink the rest. Concurrency-grouped to serialize overlapping deploys, never cancelling in-flight runs (a half-applied dnf transaction is worse than a stale deploy). - asset/sudoers.d/{cortex,neuron}-host.conf are the canonical source for the scoped privileges gitea_ci needs on each host kind, installed as /etc/sudoers.d/helexa_gitea_ci. URLs and = signs are backslash-escaped per sudoers reserved-character rules. - script/infra-setup.sh idempotently provisions the gitea_ci user, installs the runner pubkey, drops in the appropriate sudoers fragment with visudo verification, and syncs cortex.toml / models.toml / per-host asset/neuron/<short>.toml — config still ships from operator workstations rather than CI because the first two are gitignored. The CI-only secret is RSYNC_SSH_KEY (already configured for the repo); the matching pubkey is ~/.ssh/id_gitea_ci.pub on the operator's box. script/deploy.sh and asset/manifest.yml are left in place until the first end-to-end deploy workflow run succeeds, then removed. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-01 14:58:23 +03:00
parent d0292ed377
commit 5c520c7e90
4 changed files with 325 additions and 0 deletions
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -0,0 +1,126 @@
 name: deploy
 # Roll the freshly-published unstable RPMs onto the helexa fleet:
 # cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
 #
 # Triggered automatically after `build-prerelease` succeeds (by which
 # point the new RPMs are live on rpm.lair.cafe/unstable), and also
 # re-runnable manually from the Gitea UI.
 #
 # Per-host one-time setup (gitea_ci user, authorized_keys, scoped
 # sudoers drop-in) lives in script/infra-setup.sh — run that once per
 # host before this workflow can succeed.
 on:
  # `completed` fires whatever the upstream result was; the job-level
  # `if` on deploy-cortex filters on the run's conclusion.
  workflow_run:
    workflows: [build-prerelease]
    types: [completed]
  workflow_dispatch:
 # Serialize deploys. Overlapping runs would race on dnf metadata
 # refresh and service-restart timing; queueing keeps the fleet
 # predictable. Don't cancel an in-flight deploy — a half-applied dnf
 # transaction is worse than a slightly stale deploy.
 concurrency:
  group: deploy
  cancel-in-progress: false
 # Private key for the gitea_ci account. Each job's "SSH init" step
 # writes it to ~/.ssh/id_ed25519 on the runner before the first ssh
 # call. It is declared at workflow level, so every step sees it.
 env:
  DEPLOY_KEY: |
    ${{ secrets.RSYNC_SSH_KEY }}
 jobs:
  # Upgrade the cortex gateway package on hanzalova.internal:
  # stop the service, install/upgrade the RPM, start the service again.
  deploy-cortex:
    runs-on: fedora-43
    # Two trigger paths: manual dispatch always runs; workflow_run
    # only runs if the upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
        github.event_name == 'workflow_dispatch'
        || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      - name: SSH init
        run: |
          # Owner-only from creation: without this the key file exists
          # with default permissions until the chmod below runs.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          # First contact records the host key (accept-new); the later
          # steps rely on the known_hosts entry this creates.
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@hanzalova.internal 'hostname -f'
      - name: Stop cortex.service
        run: |
          ssh gitea_ci@hanzalova.internal '
            if systemctl is-active --quiet cortex.service; then
              sudo /usr/bin/systemctl stop cortex.service
            fi'
      - name: Install / upgrade cortex from rpm.lair.cafe/unstable
        run: |
          # The dnf argument lists must match the NOPASSWD rules in
          # asset/sudoers.d/cortex-host.conf word for word.
          ssh gitea_ci@hanzalova.internal '
            if rpm -q cortex >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
            fi'
      - name: Start cortex.service
        # Run even when an earlier step failed: the service was stopped
        # above, and a failed upgrade must not leave the gateway down.
        # The job still reports the earlier failure, so deploy-neurons
        # stays gated on a clean cortex deploy.
        if: ${{ always() }}
        run: |
          # set -e: a failed daemon-reload must fail the step instead of
          # being masked by the exit status of the start that follows.
          ssh gitea_ci@hanzalova.internal '
            set -e
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start cortex.service'
  # Upgrade helexa-neuron-<flavour> on each neuron host, one matrix leg
  # per host, once the cortex job has succeeded.
  deploy-neurons:
    needs: [deploy-cortex]
    runs-on: fedora-43
    strategy:
      # One neuron failing must not cancel the others. Cortex is up
      # already; a partial neuron deploy is strictly better than
      # rolling back to zero.
      fail-fast: false
      matrix:
        include:
          - host: beast.hanzalova.internal
            flavour: blackwell
          - host: benjy.hanzalova.internal
            flavour: ada
          - host: quadbrat.hanzalova.internal
            flavour: ampere
    steps:
      - name: SSH init
        run: |
          # Owner-only from creation: without this the key file exists
          # with default permissions until the chmod below runs.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          # First contact records the host key (accept-new); the later
          # steps rely on the known_hosts entry this creates.
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@${{ matrix.host }} 'hostname -f'
      - name: Stop neuron.service
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            if systemctl is-active --quiet neuron.service; then
              sudo /usr/bin/systemctl stop neuron.service
            fi'
      - name: Install / upgrade helexa-neuron-${{ matrix.flavour }}
        run: |
          # The dnf argument lists must match the NOPASSWD rules in
          # asset/sudoers.d/neuron-host.conf word for word.
          ssh gitea_ci@${{ matrix.host }} "
            if rpm -q helexa-neuron-${{ matrix.flavour }} >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            fi"
      - name: Ensure firewalld allows helexa-neuron
        run: |
          # The query runs under sudo, so it only works when the host's
          # sudoers drop-in has a NOPASSWD rule for this exact command;
          # otherwise it fails quietly and the add + reload branch runs
          # on every deploy.
          # set -e: a failed --add-service must fail the step instead of
          # being masked by a successful --reload.
          ssh gitea_ci@${{ matrix.host }} '
            set -e
            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
              sudo /usr/bin/firewall-cmd --reload
            fi'
      - name: Start neuron.service
        # Run even when an earlier step failed: the service was stopped
        # above, and a failed upgrade must not leave the host's neuron
        # down. The matrix leg still reports the earlier failure.
        if: ${{ always() }}
        run: |
          # set -e: a failed daemon-reload must fail the step instead of
          # being masked by the exit status of the start that follows.
          ssh gitea_ci@${{ matrix.host }} '
            set -e
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start neuron.service'
--- a/asset/sudoers.d/cortex-host.conf
+++ b/asset/sudoers.d/cortex-host.conf
@@ -0,0 +1,20 @@
 # Install on the cortex gateway host as /etc/sudoers.d/helexa_gitea_ci
 # (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
 # which SSHes as gitea_ci@<gateway> to roll out cortex package upgrades
 # and config changes.
 #
 # Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
 # helexa-org apps can drop their own sudoers files on the same host
 # without overwriting this one.
 #
 # Every rule below is an exact argument list: sudo only matches when
 # the invoked command line is identical, so the workflow and these
 # rules have to change together.
 #
 # NOTE(review): sudoers wildcards match across argument boundaries, so
 # `rsync * <dest>` admits any rsync option string that ends in that
 # destination path, not just a plain file copy. Consider pinning the
 # exact argument list instead — TODO confirm what the caller sends.
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
 # sudoers reserves `:` and `=` and requires `\` escaping inside command
 # arguments — without it visudo errors at the first `:` in `https://`.
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
--- a/asset/sudoers.d/neuron-host.conf
+++ b/asset/sudoers.d/neuron-host.conf
@@ -0,0 +1,33 @@
 # Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
 # (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
 # which SSHes as gitea_ci@<neuron-host> to roll out helexa-neuron-<flavour>
 # package upgrades and config changes.
 #
 # Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
 # helexa-org apps can drop their own sudoers files on the same host
 # without overwriting this one.
 #
 # Every rule below is an exact argument list: sudo only matches when
 # the invoked command line is identical, so the workflow and these
 # rules have to change together.
 #
 # All three CUDA flavours are listed because a host's flavour can change
 # (e.g. GPU swap) and we don't want the sudoers file to need to change
 # in lockstep. Only one flavour can be installed at a time (the packages
 # Conflict: with each other), so the attack surface is bounded to "wrong
 # flavour installed" — vandalism, not privilege escalation.
 #
 # NOTE(review): sudoers wildcards match across argument boundaries, so
 # `rsync * <dest>` admits any rsync option string that ends in that
 # destination path, not just a plain file copy. Consider pinning the
 # exact argument list instead — TODO confirm what the caller sends.
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell
 # sudoers reserves `:` and `=` and requires `\` escaping inside command
 # arguments — without it visudo errors at the first `:` in `https://`.
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13
 # firewalld. The deploy workflow queries the service before adding it,
 # so the query form needs its own rule: without one, sudo refuses the
 # query and the workflow falls through to add + reload on every deploy.
 # The `=` is escaped here for the same reason as in the dnf rules above.
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --query-service\=helexa-neuron --quiet
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-neuron --permanent
 gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
--- a/script/infra-setup.sh
+++ b/script/infra-setup.sh
@@ -0,0 +1,146 @@
 #!/usr/bin/env bash
 #
 # Operator-run bootstrap for the gitea_ci deploy account on every host
 # that .gitea/workflows/deploy.yml targets. In order, it:
 #   1. creates the gitea_ci system user where it is missing
 #   2. installs the runner's pubkey as ~gitea_ci/.ssh/authorized_keys
 #   3. installs the /etc/sudoers.d/helexa_gitea_ci drop-in (cortex
 #      rules on the gateway, neuron rules on each neuron host)
 #   4. pushes the cortex / neuron config files that exist locally
 #
 # Idempotent — safe to re-run after fleet changes. Continues past
 # unreachable hosts so a single offline node doesn't block the rest.

 # Repository root, resolved from this script's own location so the
 # script can be launched from any working directory.
 script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 repo_path="$(cd "${script_dir}/.." && pwd)"

 # Fleet inventory: one gateway, then the neuron hosts.
 cortex_host=hanzalova.internal
 neuron_hosts=(beast.hanzalova.internal benjy.hanzalova.internal quadbrat.hanzalova.internal)

 # Public half of the CI deploy key; nothing below can work without it,
 # so bail out early with a hint on how to create the pair.
 pubkey="${HOME}/.ssh/id_gitea_ci.pub"
 [[ -f "${pubkey}" ]] || {
    echo "fatal: ${pubkey} not found" >&2
    echo "  generate with: ssh-keygen -t ed25519 -f ${pubkey%.pub} -C gitea_ci" >&2
    exit 1
 }
 # Stage 1: make sure the gitea_ci account, its ~/.ssh directory and its
 # authorized_keys exist on the gateway and on every neuron host.
 #
 # The word list is built from two separately quoted expansions on
 # purpose. Writing "${cortex_host} ${neuron_hosts[@]}" as one quoted
 # word glues the gateway and the first neuron host together; an
 # unquoted `ssh ${host}` would then split them again and ssh would
 # treat the second hostname as the remote command.
 for host in "${cortex_host}" "${neuron_hosts[@]}"; do
    echo "==> ${host}"
    # Account + ~/.ssh directory. If this fails (host down, sudo
    # refused, ...) there is nothing to copy a key into, so move on.
    ssh "${host}" '
        set -eu
        if id -u gitea_ci >/dev/null 2>&1; then
            echo "  gitea_ci user already present"
        else
            sudo useradd --system --create-home \
                --home-dir /var/lib/gitea_ci --shell /bin/bash gitea_ci
            echo "  gitea_ci user created"
        fi
        # `sudo install` runs as root (not as gitea_ci), which avoids
        # the "sudo: unknown user gitea_ci" failure seen immediately
        # after useradd — NSS caching lags briefly and `sudo -u` cant
        # resolve the just-created user, but `install -o` does its
        # own fresh lookup.
        sudo install -d -o gitea_ci -g gitea_ci -m 0700 \
            /var/lib/gitea_ci/.ssh
    ' || {
        echo "  failed to provision gitea_ci — skipping ${host}"
        continue
    }
    # The pubkey becomes the whole authorized_keys file. The remote
    # side runs rsync under sudo so it can write into gitea_ci's home
    # and apply the requested ownership and mode.
    if ! rsync \
        --archive \
        --compress \
        --chown gitea_ci:gitea_ci \
        --chmod 0600 \
        --rsync-path 'sudo rsync' \
        "${pubkey}" \
        "${host}:/var/lib/gitea_ci/.ssh/authorized_keys"; then
        echo "  failed to sync authorized_keys"
    else
        echo "  authorized_keys synced"
    fi
 done
 # Install /etc/sudoers.d/helexa_gitea_ci on a host, validating the file
 # BEFORE it goes live so a typo can't break sudo on that host.
 #
 # The template is first copied to a staging name inside
 # /etc/sudoers.d. sudo's #includedir skips file names that contain a
 # "." so the staged copy is never read as policy. Only when
 # `visudo -cf` accepts it is it moved over the live path (same
 # directory, so the swap is a rename); otherwise it is deleted and
 # whatever was installed before stays in effect.
 #
 # Arguments: $1 - ssh target host
 #            $2 - local path of the sudoers template
 # Outputs:   progress / failure lines on stdout
 # Returns:   0 in every case; failures are reported, not propagated,
 #            so one bad host does not stop the rest of the fleet
 install_sudoers() {
    local host="$1" template="$2"
    local live=/etc/sudoers.d/helexa_gitea_ci
    local staged=/etc/sudoers.d/.helexa_gitea_ci.staged
    echo "==> ${host}: installing ${live}"
    if ! rsync \
        --archive \
        --compress \
        --chown root:root \
        --chmod 0440 \
        --rsync-path 'sudo rsync' \
        "${template}" \
        "${host}:${staged}"; then
        echo "  failed to sync ${template##*/}"
        return
    fi
    # Validate and promote in one remote invocation so a dropped
    # connection between the two can't leave a half-done state.
    if ssh "${host}" "
        if sudo visudo -cf '${staged}' >/dev/null; then
            sudo mv -f '${staged}' '${live}'
        else
            sudo rm -f '${staged}'
            exit 1
        fi"; then
        echo "  installed and verified"
    else
        echo "  WARNING: ${template##*/} failed validation or install — previous sudoers left in place on ${host}"
    fi
 }
 # Stage 2: the gateway gets the cortex rule set; every neuron host
 # shares the single neuron rule set.
 install_sudoers "${cortex_host}" "${repo_path}/asset/sudoers.d/cortex-host.conf"
 for neuron_host in "${neuron_hosts[@]}"; do
    install_sudoers "${neuron_host}" "${repo_path}/asset/sudoers.d/neuron-host.conf"
 done
 # Stage 3 helper: copy one application config file to a host.
 #
 # Config travels with this script rather than with the deploy
 # workflow, which only installs packages and restarts services:
 #   - cortex.toml and models.toml are gitignored (operator-owned, may
 #     include secrets), so CI never sees them
 #   - asset/neuron/<short>.toml is tracked, but iterating locally is
 #     faster than pushing a commit and waiting for build-prerelease
 #     to roll over
 #
 # A source file that does not exist locally is skipped with a notice
 # rather than treated as an error — re-run after creating or editing it.
 #
 # Arguments: $1 - ssh target host
 #            $2 - local source file
 #            $3 - absolute destination path on the host
 # Outputs:   one status line on stdout
 sync_config() {
    local host="$1" src="$2" dst="$3"
    local name="${src##*/}"
    [[ -f "${src}" ]] || {
        echo "  ${name} not present locally — skipping"
        return
    }
    # Remote rsync runs under sudo so it can write the destination and
    # apply the root:root 0644 ownership and mode.
    local -a rsync_opts=(
        --archive
        --compress
        --chown root:root
        --chmod 0644
        --rsync-path 'sudo rsync'
    )
    if ! rsync "${rsync_opts[@]}" "${src}" "${host}:${dst}"; then
        echo "  failed to sync ${name} to ${host}"
        return
    fi
    echo "  ${name} → ${host}:${dst}"
 }
 # Gateway: both cortex config files live at the repo root and land
 # under /etc/cortex with the same file name.
 echo "==> ${cortex_host}: syncing gateway configs"
 for cfg in cortex.toml models.toml; do
    sync_config "${cortex_host}" "${repo_path}/${cfg}" "/etc/cortex/${cfg}"
 done
 # Neurons: each host takes asset/neuron/<short>.toml, where <short> is
 # the host name up to its first dot.
 for neuron_host in "${neuron_hosts[@]}"; do
    short="${neuron_host%%.*}"
    echo "==> ${neuron_host}: syncing per-host neuron config"
    sync_config "${neuron_host}" "${repo_path}/asset/neuron/${short}.toml" /etc/neuron/neuron.toml
 done