# helexa/.gitea/workflows/deploy.yml

name: deploy

# Roll the freshly-published unstable RPMs onto the helexa fleet:
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
# and helexa-bench on bob (the bench host). The bench UI is also
# rebuilt from this repository and rsynced to the gateway's webroot.
#
# Triggered automatically after `build-prerelease` succeeds (by which
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
# re-runnable manually from the Gitea UI.
#
# Each host self-gates: if the published package manifest
# (packages.json) lists the same version that `rpm -q` reports as
# installed, the service is left alone — no stop, no restart, no model
# cold-load. Neurons additionally redeploy when their per-host
# max_prompt_tokens drop-in changes. Combined with build-prerelease's
# change detection this means a docs- or gateway-only push never
# restarts the neurons (a neuron restart costs ~5 min of 27B
# cold-load, see issue #1).
#
# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
# sudoers drop-in) lives in script/infra-setup.sh — run that once per
# host before this workflow can succeed.

on:
  # Automatic path: fires whenever a build-prerelease run completes;
  # the jobs' `if:` guards then require a successful conclusion.
  workflow_run:
    workflows:
      - build-prerelease
    types:
      - completed
  # Manual path: re-run from the Gitea UI.
  workflow_dispatch:

# Serialize deploys. Overlapping runs would race on dnf metadata
# refresh and service-restart timing; queueing keeps the fleet
# predictable. Don't cancel an in-flight deploy — a half-applied dnf
# transaction is worse than a slightly stale deploy.
concurrency:
  group: deploy
  cancel-in-progress: false

# SSH private key for the gitea_ci deploy user, taken from the
# RSYNC_SSH_KEY secret. Each job's "SSH init" step writes it to
# ~/.ssh/id_ed25519 on the runner before connecting to a host.
env:
  DEPLOY_KEY: |
    ${{ secrets.RSYNC_SSH_KEY }}

jobs:
  deploy-cortex:
    runs-on: fedora-43
    # Two trigger paths: manual dispatch always runs; workflow_run
    # only runs if the upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
        github.event_name == 'workflow_dispatch'
        || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      # Install the deploy key and make a first connection so the host
      # key is recorded (accept-new) for the later ssh calls in this job.
      - name: SSH init
        run: |
          # Create ~/.ssh and the key file private from the start rather
          # than relying on the chmod that follows the write.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@hanzalova.internal 'hostname -f'

      # Gating compares `rpm -q` against the packages.json manifest the
      # publish job maintains — NOT unprivileged `dnf check-update`,
      # which proved unreliable as the gitea_ci user (hung on metadata
      # locks on one host, silently reported "no updates" on others).
      # An unreadable/unparsable manifest fails open: deploy proceeds.
      - name: Deploy cortex (skips when already current)
        run: |
          ssh gitea_ci@hanzalova.internal 'bash -s' <<'DEPLOY'
          set -eu
          pkg=cortex
          unit=cortex.service

          # rpm -q prints its "package ... is not installed" message on
          # stdout, so capture and fall back separately; otherwise the
          # message ends up glued in front of the fallback value.
          if ! installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null); then
            installed=not-installed
          fi
          # Newest published build of this package (by buildTime); left
          # empty when the manifest cannot be fetched or parsed.
          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
            | python3 -c '
          import json, sys
          name = sys.argv[1]
          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
          if cands:
              p = max(cands, key=lambda p: p.get("buildTime", 0))
              print(p["version"] + "-" + p["release"])
          ' "${pkg}" 2>/dev/null || true)
          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
            echo "${pkg}-${installed} already current — leaving service untouched"
            exit 0
          fi
          echo "installed=${installed} published=${latest:-unknown} — deploying"

          # With set -e, a failure between the stop and the start below
          # (e.g. dnf erroring out) would abort with the gateway left
          # down. If this script stopped the unit and then fails, bring
          # whatever is installed back up; the exit status is unchanged,
          # so the job still goes red.
          stopped=0
          restart_if_left_stopped() {
            rc=$?
            if [ "${rc}" -ne 0 ] && [ "${stopped}" -eq 1 ]; then
              echo "deploy failed (exit ${rc}) after stopping ${unit}; restarting the installed version"
              sudo /usr/bin/systemctl daemon-reload || true
              sudo /usr/bin/systemctl enable --now "${unit}" || true
            fi
          }
          trap restart_if_left_stopped EXIT

          if systemctl is-active --quiet "${unit}"; then
            sudo /usr/bin/systemctl stop "${unit}"
            stopped=1
          fi
          if rpm -q "${pkg}" >/dev/null 2>&1; then
            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
          else
            sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
          fi
          sudo /usr/bin/systemctl daemon-reload
          # From here on a failed start is reported as-is, not retried.
          stopped=0
          # enable --now: start the service AND enable it for boot so the
          # fleet self-heals after a host reboot.
          sudo /usr/bin/systemctl enable --now "${unit}"
          DEPLOY

      # Wait for the service to either come up or wedge, then capture
      # the latest-invocation journal. Runs even on prior failure so a
      # failed start step still leaves a usable record in the deploy log.
      - name: Capture cortex.service startup journal
        if: always()
        run: |
          sleep 10
          ssh gitea_ci@hanzalova.internal \
              'journalctl --unit cortex.service -I --no-pager'

  deploy-neurons:
    # Runs only after the cortex job has completed successfully.
    needs: [deploy-cortex]
    runs-on: fedora-43
    strategy:
      # One neuron failing must not cancel the others. Cortex is up
      # already; a partial neuron deploy is strictly better than
      # rolling back to zero.
      fail-fast: false
      matrix:
        include:
          # load_timeout: how long to wait for default_models to finish
          # loading after a restart. beast cold-loads Qwen3.6-27B Q6K
          # TP=2 (~5-6 min typical, see #1); benjy/quadbrat load small
          # single-GPU models in well under a minute.
          #
          # max_prompt_tokens: per-model context cap, written to the
          # neuron.service.d/model.conf drop-in (NEURON_MAX_PROMPT_TOKENS).
          # A change here restarts the neuron even with no new RPM. Values
          # are VRAM-safe ceilings derived per model — see
          # doc/context-limits.md. beast (Qwen3.6-27B, hybrid linear, 2x
          # 32GB) has ample KV headroom; benjy (Qwen3-8B dense, ~6GB free)
          # is VRAM-bound and stays at the default; quadbrat (Qwen3-1.7B)
          # likewise conservative.
          - host: beast.hanzalova.internal
            flavour: blackwell
            load_timeout: 900
            max_prompt_tokens: 131072
          - host: benjy.hanzalova.internal
            flavour: ada
            load_timeout: 300
            max_prompt_tokens: 16384
          - host: quadbrat.hanzalova.internal
            flavour: ampere
            load_timeout: 300
            max_prompt_tokens: 16384
    steps:
      # Install the deploy key and make a first connection so the host
      # key is recorded (accept-new) for the later ssh calls in this job.
      - name: SSH init
        run: |
          # Create ~/.ssh and the key file private from the start rather
          # than relying on the chmod that follows the write.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@${{ matrix.host }} 'hostname -f'

      # See deploy-cortex for why gating uses the publish manifest and
      # not unprivileged `dnf check-update`.
      - name: Deploy helexa-neuron-${{ matrix.flavour }} (skips when already current)
        run: |
          ssh gitea_ci@${{ matrix.host }} 'bash -s' <<'DEPLOY'
          set -eu
          pkg=helexa-neuron-${{ matrix.flavour }}
          unit=neuron.service
          max_prompt_tokens="${{ matrix.max_prompt_tokens }}"

          # ── Desired per-model systemd drop-in ─────────────────────────
          # model.conf carries NEURON_MAX_PROMPT_TOKENS so the context cap
          # is deterministic per host and rolled out (with a restart) by
          # this workflow, not hand-edited. It sorts after local.conf, so a
          # deploy-managed value wins over any manual local override of the
          # same variable. See doc/context-limits.md.
          conf=/etc/systemd/system/neuron.service.d/model.conf
          config_changed=0
          if [ -n "${max_prompt_tokens}" ]; then
            desired=$(printf '%s\n%s\n%s\n%s' \
              "# Managed by .gitea/workflows/deploy.yml - do not edit by hand." \
              "# Per-model context cap; see doc/context-limits.md." \
              "[Service]" \
              "Environment=NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}")
            # A missing drop-in reads as empty and therefore counts as changed.
            [ "${desired}" = "$(cat "${conf}" 2>/dev/null || true)" ] || config_changed=1
          fi

          # ── Package version gate (manifest rationale: see deploy-cortex) ──
          # rpm -q prints its "package ... is not installed" message on
          # stdout, so capture and fall back separately; otherwise the
          # message ends up glued in front of the fallback value.
          if ! installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null); then
            installed=not-installed
          fi
          # Newest published build of this package (by buildTime); left
          # empty when the manifest cannot be fetched or parsed.
          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
            | python3 -c '
          import json, sys
          name = sys.argv[1]
          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
          if cands:
              p = max(cands, key=lambda p: p.get("buildTime", 0))
              print(p["version"] + "-" + p["release"])
          ' "${pkg}" 2>/dev/null || true)
          pkg_changed=1
          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
            pkg_changed=0
          fi

          # Skip only when BOTH the package and the drop-in are unchanged —
          # a context-cap change must restart the neuron even with no new RPM.
          if [ "${pkg_changed}" -eq 0 ] && [ "${config_changed}" -eq 0 ]; then
            echo "${pkg}-${installed} current; NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens:-<unset>} unchanged — leaving service untouched"
            exit 0
          fi
          echo "installed=${installed} published=${latest:-unknown} pkg_changed=${pkg_changed} config_changed=${config_changed} — deploying"

          # Write the drop-in (staged in gitea_ci's dir, installed root-owned).
          if [ "${config_changed}" -eq 1 ]; then
            printf '%s\n' "${desired}" > /var/lib/gitea_ci/model.conf
            sudo /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf "${conf}"
            rm -f /var/lib/gitea_ci/model.conf
            echo "applied ${conf}: NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}"
          fi

          # With set -e, a failure between the stop and the start below
          # (e.g. dnf erroring out) would abort with the neuron left
          # down. If this script stopped the unit and then fails, bring
          # whatever is installed back up; the exit status is unchanged,
          # so the job still goes red.
          stopped=0
          restart_if_left_stopped() {
            rc=$?
            if [ "${rc}" -ne 0 ] && [ "${stopped}" -eq 1 ]; then
              echo "deploy failed (exit ${rc}) after stopping ${unit}; restarting the installed version"
              sudo /usr/bin/systemctl daemon-reload || true
              sudo /usr/bin/systemctl enable --now "${unit}" || true
            fi
          }
          trap restart_if_left_stopped EXIT

          if systemctl is-active --quiet "${unit}"; then
            sudo /usr/bin/systemctl stop "${unit}"
            stopped=1
          fi
          if [ "${pkg_changed}" -eq 1 ]; then
            if rpm -q "${pkg}" >/dev/null 2>&1; then
              sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
            else
              sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
            fi
          fi
          # daemon-reload picks up both a new unit (dnf) and the drop-in.
          sudo /usr/bin/systemctl daemon-reload
          # From here on a failed start or failed validation is reported
          # as-is, not retried.
          stopped=0
          # enable --now: start the service AND enable it for boot so the
          # fleet self-heals after a host reboot.
          sudo /usr/bin/systemctl enable --now "${unit}"

          # ── Post-deploy validation ────────────────────────────────
          # A deploy only goes green if the neuron (a) finishes loading
          # its default models and (b) answers a trivial prompt like an
          # LLM should. Catches the class of bug where the binary
          # starts fine but model load or inference is broken — which
          # previously surfaced only when a human noticed. The wait
          # polls /health activation (the structured source of the
          # "loaded default model" journal line, plus per-model failure
          # detail); the journal-capture step below still runs for
          # forensics either way.
          load_timeout=${{ matrix.load_timeout }}
          echo "waiting for default models (timeout ${load_timeout}s)"
          deadline=$(( $(date +%s) + load_timeout ))
          health=""
          while :; do
            # An unreachable or non-JSON /health yields an empty state and
            # simply keeps the loop polling until the deadline.
            health=$(curl -fsS --max-time 5 http://localhost:13131/health 2>/dev/null || true)
            state=$(printf %s "${health}" | python3 -c '
          import json, sys
          try:
              print(json.load(sys.stdin).get("activation", {}).get("state", ""))
          except Exception:
              print("")
          ')
            if [ "${state}" = "ready" ]; then
              break
            fi
            if [ "$(date +%s)" -ge "${deadline}" ]; then
              echo "FAIL: activation not ready within ${load_timeout}s (last state: ${state:-unreachable})"
              exit 1
            fi
            sleep 10
          done

          # Any entry in activation.failed makes python exit 1, which
          # (under set -e) fails the deploy; otherwise the first completed
          # model id is used for the probe.
          model=$(printf %s "${health}" | python3 -c '
          import json, sys
          a = json.load(sys.stdin).get("activation", {})
          failed = a.get("failed", [])
          if failed:
              for f in failed:
                  msg = "FAILED " + str(f.get("model_id")) + ": " + str(f.get("error", ""))[:400]
                  sys.stderr.write(msg + chr(10))
              sys.exit(1)
          completed = a.get("completed", [])
          print(completed[0] if completed else "")
          ')
          if [ -z "${model}" ]; then
            echo "no default models configured — skipping LLM probe"
            exit 0
          fi

          echo "LLM probe against ${model}"
          probe_body=$(printf '{"model":"%s","messages":[{"role":"user","content":"Reply with exactly one word: pineapple"}],"max_tokens":512,"temperature":0}' "${model}")
          resp=$(curl -fsS --max-time 180 -H "content-type: application/json" \
            -d "${probe_body}" http://localhost:13131/v1/chat/completions) || {
            echo "FAIL: probe request errored"
            exit 1
          }
          if printf %s "${resp}" | grep -qi pineapple; then
            echo "LLM probe passed"
          else
            echo "FAIL: probe response missing expected token"
            printf %s "${resp}" | head -c 2000
            echo
            exit 1
          fi
          DEPLOY

      # Adds the helexa-neuron firewalld service permanently (and
      # reloads) only when the query reports it is not already enabled.
      - name: Ensure firewalld allows helexa-neuron
        run: |
          ssh gitea_ci@${{ matrix.host }} '
            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
              sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
              sudo /usr/bin/firewall-cmd --reload
            fi'

      # Wait for the service to either come up or wedge, then capture
      # the latest-invocation journal. Runs even on prior failure so a
      # failed start step still leaves a usable record in the deploy log.
      - name: Capture neuron.service startup journal
        if: always()
        run: |
          sleep 10
          ssh gitea_ci@${{ matrix.host }} \
              'journalctl --unit neuron.service -I --no-pager'

  # helexa-bench is a separate package on a separate host (bob), and it
  # only consumes the fleet's HTTP APIs — it has no deploy-ordering
  # dependency on cortex or the neurons (the sweep loop is version-aware
  # and picks up whatever each neuron reports whenever). So it runs
  # alongside the cortex→neurons chain rather than after it.
  deploy-bench:
    runs-on: fedora-43
    # Manual dispatch always runs; workflow_run only runs if the
    # upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
        github.event_name == 'workflow_dispatch'
        || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      # Install the deploy key and make a first connection so the host
      # key is recorded (accept-new) for the later ssh calls in this job.
      - name: SSH init
        run: |
          # Create ~/.ssh and the key file private from the start rather
          # than relying on the chmod that follows the write.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@bob.hanzalova.internal 'hostname -f'

      # See deploy-cortex for why gating uses the publish manifest and
      # not unprivileged `dnf check-update`.
      - name: Deploy helexa-bench (skips when already current)
        run: |
          ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
          set -eu
          pkg=helexa-bench
          unit=helexa-bench.service

          # rpm -q prints its "package ... is not installed" message on
          # stdout, so capture and fall back separately; otherwise the
          # message ends up glued in front of the fallback value.
          if ! installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null); then
            installed=not-installed
          fi
          # Newest published build of this package (by buildTime); left
          # empty when the manifest cannot be fetched or parsed.
          latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
            | python3 -c '
          import json, sys
          name = sys.argv[1]
          cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
          if cands:
              p = max(cands, key=lambda p: p.get("buildTime", 0))
              print(p["version"] + "-" + p["release"])
          ' "${pkg}" 2>/dev/null || true)
          if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
            echo "${pkg}-${installed} already current — leaving service untouched"
            exit 0
          fi
          echo "installed=${installed} published=${latest:-unknown} — deploying"

          # With set -e, a failure between the stop and the start below
          # (e.g. dnf erroring out) would abort with the bench left down.
          # If this script stopped the unit and then fails, bring whatever
          # is installed back up; the exit status is unchanged, so the job
          # still goes red.
          stopped=0
          restart_if_left_stopped() {
            rc=$?
            if [ "${rc}" -ne 0 ] && [ "${stopped}" -eq 1 ]; then
              echo "deploy failed (exit ${rc}) after stopping ${unit}; restarting the installed version"
              sudo /usr/bin/systemctl daemon-reload || true
              sudo /usr/bin/systemctl enable --now "${unit}" || true
            fi
          }
          trap restart_if_left_stopped EXIT

          if systemctl is-active --quiet "${unit}"; then
            sudo /usr/bin/systemctl stop "${unit}"
            stopped=1
          fi
          if rpm -q "${pkg}" >/dev/null 2>&1; then
            sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
          else
            sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
          fi
          sudo /usr/bin/systemctl daemon-reload
          # From here on a failed start or failed validation is reported
          # as-is, not retried.
          stopped=0
          # enable --now: start the service AND enable it for boot so the
          # bench resumes collecting after a host reboot.
          sudo /usr/bin/systemctl enable --now "${unit}"

          # ── Post-deploy validation ────────────────────────────────
          # The bench serves a read-only API on :13132 alongside the
          # outbound sweep loop. Probe the API over localhost (bypasses
          # firewalld) — catches a crash-on-start or a bad bind. Bail
          # early if the unit drops out of active (Restart backoff).
          echo "waiting for bench API on :13132"
          deadline=$(( $(date +%s) + 30 ))
          while :; do
            if curl -fsS --max-time 5 http://localhost:13132/api/health >/dev/null 2>&1; then
              echo "bench API healthy"
              break
            fi
            if ! systemctl is-active --quiet "${unit}"; then
              echo "FAIL: helexa-bench.service is not active"
              systemctl --no-pager status "${unit}" | head -20 || true
              exit 1
            fi
            if [ "$(date +%s)" -ge "${deadline}" ]; then
              echo "FAIL: bench API not healthy within 30s"
              exit 1
            fi
            sleep 3
          done
          DEPLOY

      # Adds the helexa-bench firewalld service permanently (and
      # reloads) only when the query reports it is not already enabled.
      - name: Ensure firewalld allows helexa-bench
        run: |
          ssh gitea_ci@bob.hanzalova.internal '
            if ! sudo /usr/bin/firewall-cmd --query-service=helexa-bench --quiet 2>/dev/null; then
              sudo /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
              sudo /usr/bin/firewall-cmd --reload
            fi'

      # Wait for the service to either come up or wedge, then capture
      # the latest-invocation journal. Runs even on prior failure so a
      # failed start step still leaves a usable record in the deploy log.
      - name: Capture helexa-bench.service startup journal
        if: always()
        run: |
          sleep 10
          ssh gitea_ci@bob.hanzalova.internal \
              'journalctl --unit helexa-bench.service -I --no-pager'

  # Build the bench UI and publish it to the public nginx vhost on the
  # gateway (https://bench.helexa.ai). The vhost + Let's Encrypt cert are
  # one-time host setup (script/infra-setup.sh); this job just refreshes
  # the static assets. nginx reverse-proxies /api to the bob API, so the
  # SPA is built same-origin (no VITE_API_BASE). Independent of the other
  # deploy jobs.
  deploy-bench-ui:
    runs-on: fedora-43
    # Manual dispatch always runs; workflow_run only runs if the
    # upstream `build-prerelease` actually succeeded.
    if: >-
      ${{
        github.event_name == 'workflow_dispatch'
        || github.event.workflow_run.conclusion == 'success'
      }}
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Clean install from the lockfile, then build the static assets
      # that the rsync step below publishes from bench/dist/.
      - name: Build UI
        run: |
          cd bench
          npm ci
          npm run build

      # Install the deploy key and make a first connection so the host
      # key is recorded (accept-new) for the rsync below.
      - name: SSH init
        run: |
          # Create ~/.ssh and the key file private from the start rather
          # than relying on the chmod that follows the write.
          umask 077
          mkdir -p ~/.ssh
          echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
              gitea_ci@hanzalova.internal 'hostname -f'

      # --delete removes files from the webroot that are no longer in
      # the build output; the remote side runs rsync under sudo.
      - name: Rsync built UI to gateway webroot
        run: |
          rsync --archive --compress --delete \
            --rsync-path 'sudo rsync' \
            bench/dist/ \
            gitea_ci@hanzalova.internal:/var/www/bench.helexa.ai/