Merge branch 'feat/deploy-bench-on-bob' into main
All checks were successful
build-prerelease / Resolve version stamps + change detection (push) Successful in 36s
build-prerelease / Lint (fmt + clippy) (push) Has been skipped
build-prerelease / Test (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Has been skipped
build-prerelease / Build neuron-ada (push) Has been skipped
build-prerelease / Build neuron-ampere (push) Has been skipped
build-prerelease / Package helexa-neuron-ada RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-ampere RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-blackwell RPM (push) Has been skipped
build-prerelease / Build cortex binary (push) Has been skipped
build-prerelease / Package cortex RPM (push) Has been skipped
build-prerelease / Build helexa-bench binary (push) Has been skipped
build-prerelease / Package helexa-bench RPM (push) Has been skipped
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Has been skipped

ci(deploy): deploy helexa-bench to bob + enable all fleet services on boot

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-14 09:17:11 +03:00
7 changed files with 183 additions and 5 deletions

View File

@@ -123,7 +123,9 @@ jobs:
# Exact command form required by the sudoers rule in
# asset/sudoers.d/neuron-host.conf — change both together.
sudo /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
sudo /usr/bin/systemctl start neuron.service
# enable --now so a dev deploy also leaves the unit enabled
# for boot, consistent with deploy.yml.
sudo /usr/bin/systemctl enable --now neuron.service
rm -f /var/lib/gitea_ci/neuron-dev'
- name: Capture neuron.service startup journal

View File

@@ -1,7 +1,8 @@
name: deploy
# Roll the freshly-published unstable RPMs onto the helexa fleet:
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
# and helexa-bench on bob (the bench host).
#
# Triggered automatically after `build-prerelease` succeeds (by which
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
@@ -88,7 +89,9 @@ jobs:
sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
fi
sudo /usr/bin/systemctl daemon-reload
sudo /usr/bin/systemctl start cortex.service
# enable --now: start the service AND enable it for boot so the
# fleet self-heals after a host reboot.
sudo /usr/bin/systemctl enable --now cortex.service
DEPLOY
# Wait for the service to either come up or wedge, then capture
@@ -164,7 +167,9 @@ jobs:
sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
fi
sudo /usr/bin/systemctl daemon-reload
sudo /usr/bin/systemctl start neuron.service
# enable --now: start the service AND enable it for boot so the
# fleet self-heals after a host reboot.
sudo /usr/bin/systemctl enable --now neuron.service
# ── Post-deploy validation ────────────────────────────────
# A deploy only goes green if the neuron (a) finishes loading
@@ -250,3 +255,85 @@ jobs:
sleep 10
ssh gitea_ci@${{ matrix.host }} \
'journalctl --unit neuron.service -I --no-pager'
# helexa-bench is a separate package on a separate host (bob), and it
# only consumes the fleet's HTTP APIs, so it has no deploy-ordering
# dependency on cortex or the neurons (the sweep loop is version-aware
# and picks up whatever version each neuron reports, whenever that
# changes). It therefore runs alongside the cortex→neurons chain
# rather than after it.
deploy-bench:
runs-on: fedora-43
if: >-
${{
github.event_name == 'workflow_dispatch'
|| github.event.workflow_run.conclusion == 'success'
}}
steps:
- name: SSH init
  run: |
    # Tighten the umask before anything is created so that ~/.ssh and
    # the private key are never group/world-readable, not even for the
    # instant between the write and the chmod below.
    umask 077
    mkdir -p ~/.ssh
    # DEPLOY_KEY is read from the step environment; this step does not
    # set it itself — NOTE(review): confirm it is supplied at job or
    # workflow level.
    printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
    # Still chmod explicitly: a key file left over from an earlier run
    # keeps its old mode regardless of the umask.
    chmod 600 ~/.ssh/id_ed25519
    # Connectivity probe. accept-new records the host key on first
    # contact, which the later plain `ssh` calls in this job rely on.
    ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
      gitea_ci@bob.hanzalova.internal 'hostname -f'
# See deploy-cortex for why gating uses the publish manifest and
# not unprivileged `dnf check-update`.
#
# Flow of the remote script: compare the installed version-release with
# the newest one listed in the publish manifest; when they match, exit
# without touching the service; otherwise stop the unit, upgrade (or
# install) the package, re-enable/start it and assert it stays active.
- name: Deploy helexa-bench (skips when already current)
  run: |
    ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
    set -eu
    pkg=helexa-bench
    # Installed VERSION-RELEASE, or the sentinel "not-installed".
    # rpm -q writes its "package ... is not installed" message to
    # stdout, so the sentinel is assigned from the exit status rather
    # than appended with `|| echo` — otherwise the variable holds both
    # the rpm message and the sentinel, and that lands in the log below.
    if ! installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null); then
      installed="not-installed"
    fi
    # Newest published VERSION-RELEASE for this package (highest
    # buildTime in the manifest). Any fetch/parse failure leaves
    # `latest` empty, which falls through to a deploy rather than a skip.
    latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
      | python3 -c '
    import json, sys
    name = sys.argv[1]
    cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
    if cands:
        p = max(cands, key=lambda p: p.get("buildTime", 0))
        print(p["version"] + "-" + p["release"])
    ' "${pkg}" 2>/dev/null || true)
    if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
      echo "${pkg}-${installed} already current — leaving service untouched"
      exit 0
    fi
    echo "installed=${installed} published=${latest:-unknown} — deploying"
    if systemctl is-active --quiet helexa-bench.service; then
      sudo /usr/bin/systemctl stop helexa-bench.service
    fi
    # The sudo command lines below are written out literally (not via
    # ${pkg}) so they visibly match the bench host's sudoers rules.
    if rpm -q "${pkg}" >/dev/null 2>&1; then
      sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
    else
      sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
    fi
    sudo /usr/bin/systemctl daemon-reload
    # enable --now: start the service AND enable it for boot so the
    # bench resumes collecting after a host reboot.
    sudo /usr/bin/systemctl enable --now helexa-bench.service
    # ── Post-deploy validation ────────────────────────────────
    # Unlike a neuron, the bench has no listener to probe and no
    # model to load — it's a continuous outbound sweep loop. The
    # meaningful check is simply that it stays up: a bad config or
    # a crash-on-start would leave the unit in activating/failed
    # (Restart=always backoff) rather than active. Give it a beat,
    # then assert active.
    sleep 5
    if ! systemctl is-active --quiet helexa-bench.service; then
      echo "FAIL: helexa-bench.service is not active after start"
      systemctl --no-pager status helexa-bench.service | head -20 || true
      exit 1
    fi
    echo "helexa-bench.service active"
    DEPLOY
# Give the unit time to settle — either up or wedged — and then dump
# the journal of its most recent invocation. `if: always()` keeps this
# step running after an earlier failure, so a failed start still leaves
# a usable record in the deploy log.
- name: Capture helexa-bench.service startup journal
  if: always()
  run: |
    sleep 10
    ssh gitea_ci@bob.hanzalova.internal 'journalctl --unit helexa-bench.service -I --no-pager'

View File

@@ -0,0 +1,32 @@
# helexa-bench config for bob.hanzalova.internal.
#
# Synced to /etc/helexa-bench/helexa-bench.toml by script/infra-setup.sh
# (the helexa-bench RPM ships helexa-bench.example.toml as a
# %config(noreplace) default; this per-host file overrides it).
#
# bob is a client host (it also runs Agent Zero); helexa-bench here hits
# every neuron on the fleet directly and records build-stamped results
# into the local SQLite store.
[bench]
# Timing knobs. Presumably seconds, going by the `_secs` suffix
# (1800 s = 30 min, 600 s = 10 min) — confirm against the helexa-bench
# config schema before relying on the units.
sweep_interval_secs = 1800
samples_per_version = 5
iteration_pause_secs = 2
request_timeout_secs = 600
# Location of the local SQLite store mentioned in the header.
db_path = "/var/lib/helexa-bench/bench.sqlite"
[scenarios]
# NOTE(review): prompt_sizes and max_tokens look like token counts —
# verify against the bench's scenario definitions.
prompt_sizes = [128, 4096]
max_tokens = 256
# One [[targets]] table per neuron the bench queries directly.
# NOTE(review): this list presumably has to track the neuron_hosts
# array in script/infra-setup.sh — confirm when adding or removing a host.
[[targets]]
name = "beast"
endpoint = "http://beast.hanzalova.internal:13131"
[[targets]]
name = "benjy"
endpoint = "http://benjy.hanzalova.internal:13131"
[[targets]]
name = "quadbrat"
endpoint = "http://quadbrat.hanzalova.internal:13131"

View File

@@ -0,0 +1,24 @@
# Install on the bench host (bob) as /etc/sudoers.d/helexa_gitea_ci
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
# which SSHes as gitea_ci@bob to roll out helexa-bench package upgrades
# and config changes.
#
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
# helexa-org apps can drop their own sudoers files on the same host
# without overwriting this one.
#
# helexa-bench is a pure-Rust, outbound-only daemon (it polls the neuron
# fleet) — no listener, so unlike the neuron host there is no
# firewall-cmd entry here.
#
# Each rule below lists a command with its exact arguments; the callers
# (deploy.yml, script/infra-setup.sh) must use the same command form, so
# change a rule and its caller together.
#
# Config sync target for helexa-bench.toml.
# NOTE(review): in sudoers a `*` in the argument list also matches
# whitespace, so this rule permits arbitrary extra rsync options in front
# of the destination path. Consider pinning the exact option string that
# the sync helper passes — verify against sync_config before tightening.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/helexa-bench/helexa-bench.toml
# Service control. The deploy-bench job in deploy.yml uses `stop`,
# `enable --now` and `daemon-reload`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
# Package install/upgrade, limited to the helexa-bench package with this
# exact flag set.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
# Repo bootstrap: the same two dnf config-manager invocations that
# script/infra-setup.sh issues for the lair-cafe-unstable repo.
# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1

View File

@@ -11,6 +11,7 @@ gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex

View File

@@ -16,6 +16,7 @@
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere

View File

@@ -20,6 +20,9 @@ neuron_hosts=(
benjy.hanzalova.internal
quadbrat.hanzalova.internal
)
# Bench host: runs helexa-bench (outbound-only; polls the neuron fleet).
# Also runs Agent Zero — it's a client host, not a GPU node.
bench_host=bob.hanzalova.internal
pubkey="${HOME}/.ssh/id_gitea_ci.pub"
if [[ ! -f "${pubkey}" ]]; then
@@ -35,7 +38,7 @@ fi
# element into one space-joined word, which then word-splits when
# referenced unquoted in `ssh ${host}` — and ssh interprets the second
# hostname as the remote command. Separate quoting fixes it.
for host in "${cortex_host}" "${neuron_hosts[@]}"; do
for host in "${cortex_host}" "${neuron_hosts[@]}" "${bench_host}"; do
echo "==> ${host}"
if ! ssh "${host}" '
set -eu
@@ -109,6 +112,29 @@ for neuron_host in "${neuron_hosts[@]}"; do
"${repo_path}/asset/sudoers.d/neuron-host.conf"
done
# Install the bench-specific sudoers rules on the bench host.
install_sudoers "${bench_host}" \
  "${repo_path}/asset/sudoers.d/bench-host.conf"

# bob doesn't otherwise carry the lair-cafe RPM repo (it's a client
# host, not a cortex/neuron node), so helexa-bench's `dnf install` in
# deploy.yml would have nothing to install from. Enable the unstable
# repo here, and pre-create /etc/helexa-bench so the config sync below
# lands even before the first package install. Idempotent.
#
# The repo file is added only when the repo id is unknown, but
# `enabled=1` is applied on every run: `repolist --all` also lists
# disabled repos, so a repo that is present but disabled (for example
# after a run that died between addrepo and setopt) would otherwise
# never be switched on.
echo "==> ${bench_host}: ensuring lair-cafe-unstable repo + config dir"
if ! ssh "${bench_host}" '
  set -eu
  if dnf repolist --all 2>/dev/null | grep -q "^lair-cafe-unstable"; then
    echo " lair-cafe-unstable already present"
  else
    sudo dnf config-manager addrepo --from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo
  fi
  # Always (re)assert enabled=1, whether or not the repo was just added.
  sudo dnf config-manager setopt lair-cafe-unstable.enabled=1
  echo " lair-cafe-unstable enabled"
  sudo install -d -o root -g root -m 0755 /etc/helexa-bench
'; then
  # Best-effort: report the failure and carry on with the rest of the
  # script rather than aborting the whole run.
  echo " failed to prepare ${bench_host} for helexa-bench"
fi
# Push application config to the fleet. The deploy workflow is
# scoped to package install + service restart; config changes ride
# along with this script instead, since:
@@ -149,3 +175,8 @@ for neuron_host in "${neuron_hosts[@]}"; do
"${repo_path}/asset/neuron/${short}.toml" \
/etc/neuron/neuron.toml
done
# Push the bench host's config. The source file is named after the short
# hostname (everything before the first dot of ${bench_host}) and lands
# at the path helexa-bench reads on the host.
printf '%s\n' "==> ${bench_host}: syncing bench config"
sync_config "${bench_host}" "${repo_path}/asset/helexa-bench/${bench_host%%.*}.toml" /etc/helexa-bench/helexa-bench.toml