After both `Start cortex.service` and `Start neuron.service`, sleep 10s and run `journalctl --unit <unit> -I --no-pager` to record the latest invocation's log in the workflow output. Step is guarded by `if: always()` so a failed start still leaves a usable trace. infra-setup.sh now adds gitea_ci to the systemd-journal group during user provisioning, so `journalctl` works without a sudoers entry. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
147 lines
5.2 KiB
YAML
147 lines
5.2 KiB
YAML
name: deploy

# Deploys the freshly published unstable RPMs to the helexa fleet:
# cortex goes onto the gateway, helexa-neuron-<flavour> onto each
# neuron host.
#
# Fires when `build-prerelease` completes (the deploy job itself checks
# that the upstream run succeeded, by which point the new RPMs are live
# on rpm.lair.cafe/unstable). It can also be re-run by hand from the
# Gitea UI.
#
# One-time per-host preparation (gitea_ci user, authorized_keys, scoped
# sudoers drop-in) is handled by script/infra-setup.sh; run it once on
# every host before expecting this workflow to succeed.

on:
  workflow_run:
    workflows:
      - build-prerelease
    types:
      - completed
  workflow_dispatch:

# Deploys are queued, never overlapped: concurrent runs would race on
# the dnf metadata refresh and on service-restart timing. An in-flight
# deploy is never cancelled, because a half-applied dnf transaction is
# worse than a slightly stale deploy.
concurrency:
  group: deploy
  cancel-in-progress: false

# Private SSH key used by every job to log in as gitea_ci on the
# target hosts.
env:
  DEPLOY_KEY: |
    ${{ secrets.RSYNC_SSH_KEY }}

jobs:
  # Upgrade cortex on the gateway host (hanzalova.internal).
  #
  # NOTE(review): every `sudo` command line below is kept verbatim;
  # presumably the scoped sudoers drop-in matches on them. Verify
  # against script/infra-setup.sh before changing any arguments.
  deploy-cortex:
    runs-on: fedora-43
    # Two trigger paths: manual dispatch always runs; workflow_run
    # only runs if the upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
      github.event_name == 'workflow_dispatch'
      || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      # Install the deploy key and make first contact with the host so
      # its key lands in known_hosts for the later plain `ssh` calls.
      - name: SSH init
        run: |
          # Tighten the umask first so ~/.ssh and the key file are never
          # group/world-readable, not even between creation and chmod.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
            gitea_ci@hanzalova.internal 'hostname -f'

      # Only stop the unit when it is running, so a first-time install
      # (unit absent or inactive) does not fail here.
      - name: Stop cortex.service
        run: |
          ssh gitea_ci@hanzalova.internal '
            if systemctl is-active --quiet cortex.service; then
              sudo /usr/bin/systemctl stop cortex.service
            fi'

      # Upgrade when the package is already present, install otherwise.
      - name: Install / upgrade cortex from rpm.lair.cafe/unstable
        run: |
          ssh gitea_ci@hanzalova.internal '
            if rpm -q cortex >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
            fi'

      # `set -e` makes a failed daemon-reload fail the step instead of
      # being hidden behind the exit status of the following start.
      - name: Start cortex.service
        run: |
          ssh gitea_ci@hanzalova.internal '
            set -e
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start cortex.service'

      # Wait for the service to either come up or wedge, then capture
      # the latest-invocation journal. Runs even on prior failure so a
      # failed start step still leaves a usable record in the deploy log.
      # ConnectTimeout keeps this step from hanging when the earlier
      # failure was the host being unreachable.
      - name: Capture cortex.service startup journal
        if: always()
        run: |
          sleep 10
          ssh -o ConnectTimeout=5 gitea_ci@hanzalova.internal \
            'journalctl --unit cortex.service -I --no-pager'

  # Upgrade the flavour-specific neuron package on each neuron host,
  # once the gateway job has finished.
  deploy-neurons:
    needs: [deploy-cortex]
    runs-on: fedora-43
    strategy:
      # One neuron failing must not cancel the others. Cortex is up
      # already; a partial neuron deploy is strictly better than
      # rolling back to zero.
      fail-fast: false
      matrix:
        include:
          - host: beast.hanzalova.internal
            flavour: blackwell
          - host: benjy.hanzalova.internal
            flavour: ada
          - host: quadbrat.hanzalova.internal
            flavour: ampere
    steps:
      # Install the deploy key and make first contact with the host so
      # its key lands in known_hosts for the later plain `ssh` calls.
      - name: SSH init
        run: |
          # Tighten the umask first so ~/.ssh and the key file are never
          # group/world-readable, not even between creation and chmod.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
            gitea_ci@${{ matrix.host }} 'hostname -f'

      # Only stop the unit when it is running, so a first-time install
      # (unit absent or inactive) does not fail here.
      - name: Stop neuron.service
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            if systemctl is-active --quiet neuron.service; then
              sudo /usr/bin/systemctl stop neuron.service
            fi'

      # Upgrade when the package is already present, install otherwise.
      # The flavour is substituted by the runner before the shell runs.
      - name: Install / upgrade helexa-neuron-${{ matrix.flavour }}
        run: |
          ssh gitea_ci@${{ matrix.host }} "
            if rpm -q helexa-neuron-${{ matrix.flavour }} >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-${{ matrix.flavour }}
            fi"

      # Add the firewalld service only when it is not already enabled.
      # `set -e` makes a failed --add-service fail the step; without it
      # the exit status of the following --reload would mask the error.
      - name: Ensure firewalld allows helexa-neuron
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            set -e
            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
              sudo /usr/bin/firewall-cmd --reload
            fi'

      # `set -e` makes a failed daemon-reload fail the step instead of
      # being hidden behind the exit status of the following start.
      - name: Start neuron.service
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            set -e
            sudo /usr/bin/systemctl daemon-reload
            sudo /usr/bin/systemctl start neuron.service'

      # Wait for the service to either come up or wedge, then capture
      # the latest-invocation journal. Runs even on prior failure so a
      # failed start step still leaves a usable record in the deploy log.
      # ConnectTimeout keeps this step from hanging when the earlier
      # failure was the host being unreachable.
      - name: Capture neuron.service startup journal
        if: always()
        run: |
          sleep 10
          ssh -o ConnectTimeout=5 gitea_ci@${{ matrix.host }} \
            'journalctl --unit neuron.service -I --no-pager'