# helexa/.gitea/workflows/deploy.yml

name: deploy

# Roll the freshly-published unstable RPMs onto the helexa fleet:
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host.
#
# Triggered automatically after `build-prerelease` succeeds (by which
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
# re-runnable manually from the Gitea UI.
#
# Each host self-gates: if the installed package already matches the
# newest build listed in the published packages.json manifest, the
# service is left alone — no stop, no restart, no model cold-load.
# (If the manifest cannot be read or parsed the gate fails open and
# the deploy proceeds.) Combined with build-prerelease's change
# detection this means a docs- or gateway-only push never restarts the
# neurons (a neuron restart costs ~5 min of 27B cold-load, see issue #1).
#
# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
# sudoers drop-in) lives in script/infra-setup.sh — run that once per
# host before this workflow can succeed.

# Triggers: every completed `build-prerelease` run (the deploy-cortex
# job then filters on that run's conclusion), plus manual dispatch
# from the UI.
on:
  workflow_run:
    workflows:
      - build-prerelease
    types:
      - completed
  workflow_dispatch:

# Serialize deploys: every run of this workflow shares the single
# `deploy` concurrency group. Overlapping runs would race on dnf
# metadata refresh and service-restart timing; queueing keeps the
# fleet predictable. `cancel-in-progress: false` means a newer run
# never cancels an in-flight deploy — a half-applied dnf transaction
# is worse than a slightly stale deploy.
concurrency:
  group: deploy
  cancel-in-progress: false

# SSH private key used to log in as gitea_ci, taken from the
# RSYNC_SSH_KEY secret and exposed to every step of every job; the
# "SSH init" steps write it to ~/.ssh/id_ed25519. The literal block
# scalar (`|`) keeps a single trailing newline after the secret value.
env:
  DEPLOY_KEY: |
    ${{ secrets.RSYNC_SSH_KEY }}

jobs:
  deploy-cortex:
    runs-on: fedora-43
    # Gate on the trigger: a manual dispatch always deploys, whereas a
    # workflow_run event deploys only when the upstream
    # `build-prerelease` run concluded with success.
    if: >-
      ${{ github.event_name == 'workflow_dispatch'
      || github.event.workflow_run.conclusion == 'success' }}
    steps:
      # Install the deploy key for this job and confirm the gateway is
      # reachable over SSH before any later step tries to change it.
      - name: SSH init
        run: |
          # Restrictive umask so neither ~/.ssh nor the key file is ever
          # group/world-accessible, not even in the window between the
          # write and the chmod below.
          umask 077
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          # printf instead of echo: writes the key verbatim plus one
          # newline, independent of the shell's echo option handling.
          printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          # accept-new: record the host key on first contact, but refuse
          # to connect if a previously recorded key has changed.
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@hanzalova.internal 'hostname -f'

      # Gating compares `rpm -q` against the packages.json manifest the
      # publish job maintains — NOT unprivileged `dnf check-update`,
      # which proved unreliable as the gitea_ci user (hung on metadata
      # locks on one host, silently reported "no updates" on others).
      # An unreadable/unparsable manifest fails open: deploy proceeds.
      #
      # The remote script is streamed to `bash -s` through a quoted
      # heredoc, so the runner's shell expands nothing inside it. The
      # embedded Python must stay at the script's left margin.
      - name: Deploy cortex (skips when already current)
        run: |
          ssh gitea_ci@hanzalova.internal 'bash -s' <<'DEPLOY'
          set -eu
          pkg=cortex
          unit=cortex.service

          # Installed VERSION-RELEASE, or a sentinel when the package is
          # absent (the sentinel never equals a published build).
          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
          # Newest published VERSION-RELEASE for this package (highest
          # buildTime in the manifest). Left empty when the manifest
          # cannot be fetched or parsed, or does not list the package.
          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
            | python3 -c '
          import json, sys
          name = sys.argv[1]
          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
          if cands:
              p = max(cands, key=lambda p: p.get("buildTime", 0))
              print(p["version"] + "-" + p["release"])
          ' "${pkg}" 2>/dev/null || true)
          # Skip only on a positive match; an empty `latest` falls
          # through to the deploy (fail open).
          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
            echo "${pkg}-${installed} already current — leaving service untouched"
            exit 0
          fi
          echo "installed=${installed} published=${latest:-unknown} — deploying"

          # Once the unit has been stopped, any failure before the final
          # start (for example a failed dnf transaction) would otherwise
          # leave the host with the service down. The EXIT trap makes a
          # best-effort attempt to bring the unit back up on whatever
          # build is installed, and still fails the step.
          stopped=0
          recover() {
            rc=$?
            if [ "${rc}" -ne 0 ] && [ "${stopped}" -eq 1 ]; then
              echo "deploy failed (exit ${rc}) — restarting ${unit}" >&2
              sudo /usr/bin/systemctl start "${unit}" || true
            fi
            exit "${rc}"
          }
          trap recover EXIT

          if systemctl is-active --quiet "${unit}"; then
            sudo /usr/bin/systemctl stop "${unit}"
            stopped=1
          fi
          if rpm -q "${pkg}" >/dev/null 2>&1; then
            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
          else
            sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
          fi
          sudo /usr/bin/systemctl daemon-reload
          # The regular start below now owns the unit's state; the trap
          # must not attempt a second start if it fails.
          stopped=0
          sudo /usr/bin/systemctl start "${unit}"
          DEPLOY

      # Give the unit a few seconds to either come up or wedge, then dump
      # the journal of its latest invocation. `always()` keeps this step
      # running after a failed deploy step, so the deploy log still
      # carries a usable record.
      - name: Capture cortex.service startup journal
        if: always()
        run: |
          sleep 10
          ssh gitea_ci@hanzalova.internal 'journalctl --unit cortex.service -I --no-pager'

  deploy-neurons:
    runs-on: fedora-43
    # Neurons are rolled only after the gateway job.
    needs:
      - deploy-cortex
    strategy:
      # Let every host run to completion independently: one failing
      # neuron must not cancel the others. Cortex is already up, and a
      # partial neuron deploy is strictly better than rolling back to
      # zero.
      fail-fast: false
      matrix:
        # load_timeout (seconds): how long to wait for default_models
        # to finish loading after a restart. beast cold-loads
        # Qwen3.6-27B Q6K TP=2 (~5-6 min typical, see #1);
        # benjy/quadbrat load small single-GPU models in well under a
        # minute.
        include:
          - flavour: blackwell
            host: beast.hanzalova.internal
            load_timeout: 900
          - flavour: ada
            host: benjy.hanzalova.internal
            load_timeout: 300
          - flavour: ampere
            host: quadbrat.hanzalova.internal
            load_timeout: 300
    steps:
      # Install the deploy key for this job and confirm the matrix host
      # is reachable over SSH before any later step tries to change it.
      - name: SSH init
        run: |
          # Restrictive umask so neither ~/.ssh nor the key file is ever
          # group/world-accessible, not even in the window between the
          # write and the chmod below.
          umask 077
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          # printf instead of echo: writes the key verbatim plus one
          # newline, independent of the shell's echo option handling.
          printf '%s\n' "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          # accept-new: record the host key on first contact, but refuse
          # to connect if a previously recorded key has changed.
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@${{ matrix.host }} 'hostname -f'

      # See deploy-cortex for why gating uses the publish manifest and
      # not unprivileged `dnf check-update`.
      #
      # The remote script is streamed to `bash -s` through a quoted
      # heredoc, so the runner's shell expands nothing inside it; only
      # the `matrix.*` workflow expressions are substituted beforehand.
      # The embedded Python must stay at the script's left margin.
      - name: Deploy helexa-neuron-${{ matrix.flavour }} (skips when already current)
        run: |
          ssh gitea_ci@${{ matrix.host }} 'bash -s' <<'DEPLOY'
          set -eu
          pkg=helexa-neuron-${{ matrix.flavour }}
          unit=neuron.service

          # Installed VERSION-RELEASE, or a sentinel when the package is
          # absent (the sentinel never equals a published build).
          installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
          # Newest published VERSION-RELEASE for this package (highest
          # buildTime in the manifest). Left empty when the manifest
          # cannot be fetched or parsed, or does not list the package.
          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
            | python3 -c '
          import json, sys
          name = sys.argv[1]
          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
          if cands:
              p = max(cands, key=lambda p: p.get("buildTime", 0))
              print(p["version"] + "-" + p["release"])
          ' "${pkg}" 2>/dev/null || true)
          # Skip only on a positive match; an empty `latest` falls
          # through to the deploy (fail open).
          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
            echo "${pkg}-${installed} already current — leaving service untouched"
            exit 0
          fi
          echo "installed=${installed} published=${latest:-unknown} — deploying"

          # Once the unit has been stopped, any failure before the final
          # start (for example a failed dnf transaction) would otherwise
          # leave the host with the service down. The EXIT trap makes a
          # best-effort attempt to bring the unit back up on whatever
          # build is installed, and still fails the step. Validation
          # failures after the start do not trigger it (stopped is 0).
          stopped=0
          recover() {
            rc=$?
            if [ "${rc}" -ne 0 ] && [ "${stopped}" -eq 1 ]; then
              echo "deploy failed (exit ${rc}) — restarting ${unit}" >&2
              sudo /usr/bin/systemctl start "${unit}" || true
            fi
            exit "${rc}"
          }
          trap recover EXIT

          if systemctl is-active --quiet "${unit}"; then
            sudo /usr/bin/systemctl stop "${unit}"
            stopped=1
          fi
          if rpm -q "${pkg}" >/dev/null 2>&1; then
            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
          else
            sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
          fi
          sudo /usr/bin/systemctl daemon-reload
          # The regular start below now owns the unit's state; the trap
          # must not attempt a second start if it fails.
          stopped=0
          sudo /usr/bin/systemctl start "${unit}"

          # ── Post-deploy validation ────────────────────────────────
          # A deploy only goes green if the neuron (a) finishes loading
          # its default models and (b) answers a trivial prompt like an
          # LLM should. Catches the class of bug where the binary
          # starts fine but model load or inference is broken — which
          # previously surfaced only when a human noticed. The wait
          # polls /health activation (the structured source of the
          # "loaded default model" journal line, plus per-model failure
          # detail); the journal-capture step below still runs for
          # forensics either way.
          load_timeout=${{ matrix.load_timeout }}
          echo "waiting for default models (timeout ${load_timeout}s)"
          deadline=$(( $(date +%s) + load_timeout ))
          health=""
          # Poll every 10s until activation.state is "ready" or the
          # deadline passes. An unreachable endpoint or unparsable body
          # yields an empty state and simply keeps the loop waiting.
          while :; do
            health=$(curl -fsS --max-time 5 http://localhost:13131/health 2>/dev/null || true)
            state=$(printf %s "${health}" | python3 -c '
          import json, sys
          try:
              print(json.load(sys.stdin).get("activation", {}).get("state", ""))
          except Exception:
              print("")
          ')
            if [ "${state}" = "ready" ]; then
              break
            fi
            if [ "$(date +%s)" -ge "${deadline}" ]; then
              echo "FAIL: activation not ready within ${load_timeout}s (last state: ${state:-unreachable})"
              exit 1
            fi
            sleep 10
          done

          # From the last /health body: report every failed model on
          # stderr and exit non-zero (which aborts the script under
          # `set -e`); otherwise print the first completed model id, or
          # nothing when none completed.
          model=$(printf %s "${health}" | python3 -c '
          import json, sys
          a = json.load(sys.stdin).get("activation", {})
          failed = a.get("failed", [])
          if failed:
              for f in failed:
                  msg = "FAILED " + str(f.get("model_id")) + ": " + str(f.get("error", ""))[:400]
                  sys.stderr.write(msg + chr(10))
              sys.exit(1)
          completed = a.get("completed", [])
          print(completed[0] if completed else "")
          ')
          if [ -z "${model}" ]; then
            echo "no default models configured — skipping LLM probe"
            exit 0
          fi

          echo "LLM probe against ${model}"
          # Build the request with json.dumps so a model id containing
          # quotes or backslashes still produces valid JSON.
          probe_body=$(python3 -c '
          import json, sys
          print(json.dumps({
              "model": sys.argv[1],
              "messages": [{"role": "user", "content": "Reply with exactly one word: pineapple"}],
              "max_tokens": 512,
              "temperature": 0,
          }))
          ' "${model}")
          resp=$(curl -fsS --max-time 180 -H "content-type: application/json" \
            -d "${probe_body}" http://localhost:13131/v1/chat/completions) || {
            echo "FAIL: probe request errored"
            exit 1
          }
          # Case-insensitive match anywhere in the raw response body.
          if printf %s "${resp}" | grep -qi pineapple; then
            echo "LLM probe passed"
          else
            echo "FAIL: probe response missing expected token"
            printf %s "${resp}" | head -c 2000
            echo
            exit 1
          fi
          DEPLOY

      # Open the helexa-neuron firewalld service if the query reports it
      # is not enabled: add it to the permanent configuration, then
      # reload firewalld so the rule takes effect. A host that already
      # allows the service is left untouched.
      - name: Ensure firewalld allows helexa-neuron
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || {
              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
              sudo /usr/bin/firewall-cmd --reload
            }'

      # Give the unit a few seconds to either come up or wedge, then dump
      # the journal of its latest invocation. `always()` keeps this step
      # running after a failed earlier step, so the deploy log still
      # carries a usable record.
      - name: Capture neuron.service startup journal
        if: always()
        run: |
          sleep 10
          ssh gitea_ci@${{ matrix.host }} 'journalctl --unit neuron.service -I --no-pager'