Merge branch 'feat/deploy-bench-on-bob' into main
All checks were successful
build-prerelease / Resolve version stamps + change detection (push) Successful in 36s
build-prerelease / Lint (fmt + clippy) (push) Has been skipped
build-prerelease / Test (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Has been skipped
build-prerelease / Build neuron-ada (push) Has been skipped
build-prerelease / Build neuron-ampere (push) Has been skipped
build-prerelease / Package helexa-neuron-ada RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-ampere RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-blackwell RPM (push) Has been skipped
build-prerelease / Build cortex binary (push) Has been skipped
build-prerelease / Package cortex RPM (push) Has been skipped
build-prerelease / Build helexa-bench binary (push) Has been skipped
build-prerelease / Package helexa-bench RPM (push) Has been skipped
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Has been skipped
All checks were successful
build-prerelease / Resolve version stamps + change detection (push) Successful in 36s
build-prerelease / Lint (fmt + clippy) (push) Has been skipped
build-prerelease / Test (push) Has been skipped
build-prerelease / Build neuron-blackwell (push) Has been skipped
build-prerelease / Build neuron-ada (push) Has been skipped
build-prerelease / Build neuron-ampere (push) Has been skipped
build-prerelease / Package helexa-neuron-ada RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-ampere RPM (push) Has been skipped
build-prerelease / Package helexa-neuron-blackwell RPM (push) Has been skipped
build-prerelease / Build cortex binary (push) Has been skipped
build-prerelease / Package cortex RPM (push) Has been skipped
build-prerelease / Build helexa-bench binary (push) Has been skipped
build-prerelease / Package helexa-bench RPM (push) Has been skipped
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Has been skipped
ci(deploy): deploy helexa-bench to bob + enable all fleet services on boot

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -123,7 +123,9 @@ jobs:
|
||||
# Exact command form required by the sudoers rule in
|
||||
# asset/sudoers.d/neuron-host.conf — change both together.
|
||||
sudo /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
|
||||
sudo /usr/bin/systemctl start neuron.service
|
||||
# enable --now so a dev deploy also leaves the unit enabled
|
||||
# for boot, consistent with deploy.yml.
|
||||
sudo /usr/bin/systemctl enable --now neuron.service
|
||||
rm -f /var/lib/gitea_ci/neuron-dev'
|
||||
|
||||
- name: Capture neuron.service startup journal
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
name: deploy
|
||||
|
||||
# Roll the freshly-published unstable RPMs onto the helexa fleet:
|
||||
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
|
||||
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
|
||||
# and helexa-bench on bob (the bench host).
|
||||
#
|
||||
# Triggered automatically after `build-prerelease` succeeds (by which
|
||||
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
|
||||
@@ -88,7 +89,9 @@ jobs:
|
||||
sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
|
||||
fi
|
||||
sudo /usr/bin/systemctl daemon-reload
|
||||
sudo /usr/bin/systemctl start cortex.service
|
||||
# enable --now: start the service AND enable it for boot so the
|
||||
# fleet self-heals after a host reboot.
|
||||
sudo /usr/bin/systemctl enable --now cortex.service
|
||||
DEPLOY
|
||||
|
||||
# Wait for the service to either come up or wedge, then capture
|
||||
@@ -164,7 +167,9 @@ jobs:
|
||||
sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
|
||||
fi
|
||||
sudo /usr/bin/systemctl daemon-reload
|
||||
sudo /usr/bin/systemctl start neuron.service
|
||||
# enable --now: start the service AND enable it for boot so the
|
||||
# fleet self-heals after a host reboot.
|
||||
sudo /usr/bin/systemctl enable --now neuron.service
|
||||
|
||||
# ── Post-deploy validation ────────────────────────────────
|
||||
# A deploy only goes green if the neuron (a) finishes loading
|
||||
@@ -250,3 +255,85 @@ jobs:
|
||||
sleep 10
|
||||
ssh gitea_ci@${{ matrix.host }} \
|
||||
'journalctl --unit neuron.service -I --no-pager'
|
||||
|
||||
# helexa-bench is a separate package on a separate host (bob), and it
|
||||
# only consumes the fleet's HTTP APIs — it has no deploy-ordering
|
||||
# dependency on cortex or the neurons (the sweep loop is version-aware
|
||||
# and picks up whatever each neuron reports at sweep time). So it runs
|
||||
# alongside the cortex→neurons chain rather than after it.
|
||||
deploy-bench:
|
||||
runs-on: fedora-43
|
||||
if: >-
|
||||
${{
|
||||
github.event_name == 'workflow_dispatch'
|
||||
|| github.event.workflow_run.conclusion == 'success'
|
||||
}}
|
||||
steps:
|
||||
- name: SSH init
  run: |
    # Restrict permissions *before* anything is written: with the
    # default umask the private key would be created readable by
    # others and only tightened by the chmod afterwards. umask 077
    # also covers ~/.ssh itself and the known_hosts file that
    # accept-new creates on first contact.
    umask 077
    mkdir -p ~/.ssh
    # printf instead of echo so the key material is written verbatim
    # regardless of how echo treats leading dashes or backslashes.
    # DEPLOY_KEY is expected in the environment (not set in this step).
    printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
    # Still needed if the file already existed with looser permissions.
    chmod 600 ~/.ssh/id_ed25519
    # Smoke-test connectivity and record the host key on first use.
    ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
      gitea_ci@bob.hanzalova.internal 'hostname -f'
|
||||
|
||||
# See deploy-cortex for why gating uses the publish manifest and
|
||||
# not unprivileged `dnf check-update`.
|
||||
- name: Deploy helexa-bench (skips when already current)
|
||||
run: |
|
||||
ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
|
||||
set -eu
|
||||
pkg=helexa-bench
|
||||
installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
|
||||
latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
|
||||
| python3 -c '
|
||||
import json, sys
|
||||
name = sys.argv[1]
|
||||
cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
|
||||
if cands:
|
||||
p = max(cands, key=lambda p: p.get("buildTime", 0))
|
||||
print(p["version"] + "-" + p["release"])
|
||||
' "${pkg}" 2>/dev/null || true)
|
||||
if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
|
||||
echo "${pkg}-${installed} already current — leaving service untouched"
|
||||
exit 0
|
||||
fi
|
||||
echo "installed=${installed} published=${latest:-unknown} — deploying"
|
||||
if systemctl is-active --quiet helexa-bench.service; then
|
||||
sudo /usr/bin/systemctl stop helexa-bench.service
|
||||
fi
|
||||
if rpm -q "${pkg}" >/dev/null 2>&1; then
|
||||
sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
|
||||
else
|
||||
sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
|
||||
fi
|
||||
sudo /usr/bin/systemctl daemon-reload
|
||||
# enable --now: start the service AND enable it for boot so the
|
||||
# bench resumes collecting after a host reboot.
|
||||
sudo /usr/bin/systemctl enable --now helexa-bench.service
|
||||
|
||||
# ── Post-deploy validation ────────────────────────────────
|
||||
# Unlike a neuron, the bench has no listener to probe and no
|
||||
# model to load — it's a continuous outbound sweep loop. The
|
||||
# meaningful check is simply that it stays up: a bad config or
|
||||
# a crash-on-start would leave the unit in activating/failed
|
||||
# (Restart=always backoff) rather than active. Give it a beat,
|
||||
# then assert active.
|
||||
sleep 5
|
||||
if ! systemctl is-active --quiet helexa-bench.service; then
|
||||
echo "FAIL: helexa-bench.service is not active after start"
|
||||
systemctl --no-pager status helexa-bench.service | head -20 || true
|
||||
exit 1
|
||||
fi
|
||||
echo "helexa-bench.service active"
|
||||
DEPLOY
|
||||
|
||||
# Wait for the service to either come up or wedge, then capture
|
||||
# the latest-invocation journal. Runs even on prior failure so a
|
||||
# failed start step still leaves a usable record in the deploy log.
|
||||
- name: Capture helexa-bench.service startup journal
|
||||
if: always()
|
||||
run: |
|
||||
sleep 10
|
||||
ssh gitea_ci@bob.hanzalova.internal \
|
||||
'journalctl --unit helexa-bench.service -I --no-pager'
|
||||
|
||||
32
asset/helexa-bench/bob.toml
Normal file
32
asset/helexa-bench/bob.toml
Normal file
@@ -0,0 +1,32 @@
|
||||
# helexa-bench config for bob.hanzalova.internal.
|
||||
#
|
||||
# Synced to /etc/helexa-bench/helexa-bench.toml by script/infra-setup.sh
|
||||
# (the helexa-bench RPM ships helexa-bench.example.toml as a
|
||||
# %config(noreplace) default; this per-host file overrides it).
|
||||
#
|
||||
# bob is a client host (it also runs Agent Zero); helexa-bench here hits
|
||||
# every neuron on the fleet directly and records build-stamped results
|
||||
# into the local SQLite store.
|
||||
|
||||
[bench]
|
||||
sweep_interval_secs = 1800
|
||||
samples_per_version = 5
|
||||
iteration_pause_secs = 2
|
||||
request_timeout_secs = 600
|
||||
db_path = "/var/lib/helexa-bench/bench.sqlite"
|
||||
|
||||
[scenarios]
|
||||
prompt_sizes = [128, 4096]
|
||||
max_tokens = 256
|
||||
|
||||
[[targets]]
|
||||
name = "beast"
|
||||
endpoint = "http://beast.hanzalova.internal:13131"
|
||||
|
||||
[[targets]]
|
||||
name = "benjy"
|
||||
endpoint = "http://benjy.hanzalova.internal:13131"
|
||||
|
||||
[[targets]]
|
||||
name = "quadbrat"
|
||||
endpoint = "http://quadbrat.hanzalova.internal:13131"
|
||||
24
asset/sudoers.d/bench-host.conf
Normal file
24
asset/sudoers.d/bench-host.conf
Normal file
@@ -0,0 +1,24 @@
|
||||
# Install on the bench host (bob) as /etc/sudoers.d/helexa_gitea_ci
|
||||
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
|
||||
# which SSHes as gitea_ci@bob to roll out helexa-bench package upgrades
|
||||
# and config changes.
|
||||
#
|
||||
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
|
||||
# helexa-org apps can drop their own sudoers files on the same host
|
||||
# without overwriting this one.
|
||||
#
|
||||
# helexa-bench is a pure-Rust, outbound-only daemon (it polls the neuron
|
||||
# fleet) — no listener, so unlike the neuron host there is no
|
||||
# firewall-cmd entry here.
|
||||
|
||||
# NOTE(review): sudoers matches command-line arguments as one concatenated
# string, so the `*` below also matches across spaces, i.e. any number of
# extra rsync options or source paths, not just a single source file.
# Presumably kept this broad because the config sync relies on this exact
# command form; verify against the caller before tightening.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/helexa-bench/helexa-bench.toml
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start helexa-bench.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop helexa-bench.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now helexa-bench.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
|
||||
# sudoers reserves `:` and `=` and requires `\` escaping inside command
|
||||
# arguments — without it visudo errors at the first `:` in `https://`.
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
|
||||
@@ -11,6 +11,7 @@ gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now cortex.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now neuron.service
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
|
||||
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
|
||||
|
||||
@@ -20,6 +20,9 @@ neuron_hosts=(
|
||||
benjy.hanzalova.internal
|
||||
quadbrat.hanzalova.internal
|
||||
)
|
||||
# Bench host: runs helexa-bench (outbound-only; polls the neuron fleet).
|
||||
# Also runs Agent Zero — it's a client host, not a GPU node.
|
||||
bench_host=bob.hanzalova.internal
|
||||
|
||||
pubkey="${HOME}/.ssh/id_gitea_ci.pub"
|
||||
if [[ ! -f "${pubkey}" ]]; then
|
||||
@@ -35,7 +38,7 @@ fi
|
||||
# element into one space-joined word, which then word-splits when
|
||||
# referenced unquoted in `ssh ${host}` — and ssh interprets the second
|
||||
# hostname as the remote command. Separate quoting fixes it.
|
||||
for host in "${cortex_host}" "${neuron_hosts[@]}"; do
|
||||
for host in "${cortex_host}" "${neuron_hosts[@]}" "${bench_host}"; do
|
||||
echo "==> ${host}"
|
||||
if ! ssh "${host}" '
|
||||
set -eu
|
||||
@@ -109,6 +112,29 @@ for neuron_host in "${neuron_hosts[@]}"; do
|
||||
"${repo_path}/asset/sudoers.d/neuron-host.conf"
|
||||
done
|
||||
|
||||
install_sudoers "${bench_host}" \
|
||||
"${repo_path}/asset/sudoers.d/bench-host.conf"
|
||||
|
||||
# bob doesn't otherwise carry the lair-cafe RPM repo (it's a client
|
||||
# host, not a cortex/neuron node), so helexa-bench's `dnf install` in
|
||||
# deploy.yml would have nothing to install from. Enable the unstable
|
||||
# repo here, and pre-create /etc/helexa-bench so the config sync below
|
||||
# lands even before the first package install. Idempotent.
|
||||
# Prepare the bench host for helexa-bench: make sure the
# lair-cafe-unstable repo is defined AND enabled, and that the config
# directory exists. A failure is reported but does not abort the script.
echo "==> ${bench_host}: ensuring lair-cafe-unstable repo + config dir"
if ! ssh "${bench_host}" '
  set -eu
  # Add the repo definition only when it is missing; addrepo is not
  # repeated for a repo that is already defined.
  if dnf repolist --all 2>/dev/null | grep -q "^lair-cafe-unstable"; then
    echo " lair-cafe-unstable already present"
  else
    sudo dnf config-manager addrepo --from-repofile=https://rpm.lair.cafe/lair-cafe-unstable.repo
    echo " lair-cafe-unstable added"
  fi
  # Enable unconditionally. Previously this ran only in the "absent"
  # branch, so a repo that was present but disabled (e.g. addrepo
  # succeeded and setopt failed on an earlier run) was never enabled
  # again. setopt is safe to repeat.
  sudo dnf config-manager setopt lair-cafe-unstable.enabled=1
  echo " lair-cafe-unstable enabled"
  # Pre-create the config dir so the config sync can land before the
  # first package install.
  sudo install -d -o root -g root -m 0755 /etc/helexa-bench
'; then
  echo " failed to prepare ${bench_host} for helexa-bench"
fi
|
||||
|
||||
# Push application config to the fleet. The deploy workflow is
|
||||
# scoped to package install + service restart; config changes ride
|
||||
# along with this script instead, since:
|
||||
@@ -149,3 +175,8 @@ for neuron_host in "${neuron_hosts[@]}"; do
|
||||
"${repo_path}/asset/neuron/${short}.toml" \
|
||||
/etc/neuron/neuron.toml
|
||||
done
|
||||
|
||||
echo "==> ${bench_host}: syncing bench config"
|
||||
sync_config "${bench_host}" \
|
||||
"${repo_path}/asset/helexa-bench/${bench_host%%.*}.toml" \
|
||||
/etc/helexa-bench/helexa-bench.toml
|
||||
|
||||
Reference in New Issue
Block a user