From d3f2d5074909624b757269a063a3627fde1f9d27 Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Tue, 26 May 2026 14:05:54 +0300
Subject: [PATCH] feat(deploy): per-host neuron config + pre-warm headline
 models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds asset/neuron/{beast,benjy,quadbrat}.toml — per-host neuron.toml
files keyed by the first dot-separated component of the hostname (e.g.
beast.hanzalova.internal → beast.toml). deploy.sh now stops the service,
rsyncs the matching file to /etc/neuron/neuron.toml on each neuron, and
starts the service again so default_models is re-read.

Headline model per host (drives /v1/models output immediately after a
clean deploy):

  beast     Qwen/Qwen3.6-27B  (q5k, tp=2, devices=[0,1])
  benjy     Qwen/Qwen3-8B     (bf16, devices=[0])
  quadbrat  Qwen/Qwen3-1.7B   (bf16, devices=[0])

Removes the need to follow deploy.sh with `validate-neuron.sh beast
Qwen/Qwen3.6-27B q5k 2` to surface the 27B in the catalogue — the
neuron loads it itself on activation.

The neuron loop now mirrors the cortex flow (stop → install/upgrade →
sync config → start) so config-only changes are picked up on subsequent
deploys; previously a deploy with no package change would silently
leave the host on the old default_models.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 asset/neuron/beast.toml    | 24 ++++++++++++
 asset/neuron/benjy.toml    | 19 +++++++++
 asset/neuron/quadbrat.toml | 19 +++++++++
 script/deploy.sh           | 80 +++++++++++++++++++++++++-------------
 4 files changed, 116 insertions(+), 26 deletions(-)
 create mode 100644 asset/neuron/beast.toml
 create mode 100644 asset/neuron/benjy.toml
 create mode 100644 asset/neuron/quadbrat.toml
diff --git a/asset/neuron/beast.toml b/asset/neuron/beast.toml
new file mode 100644
index 0000000..432a660
--- /dev/null
+++ b/asset/neuron/beast.toml
@@ -0,0 +1,24 @@
+# neuron.toml for beast.hanzalova.internal
+#
+# 2x RTX 5090 (32 GB each) — TP-2 capable. Pre-warms Qwen3.6-27B with
+# q5k ISQ across both GPUs at activation, matching the validate-neuron
+# invocation: `validate-neuron.sh beast.hanzalova.internal
+# Qwen/Qwen3.6-27B q5k 2`.
+#
+# Synced by script/deploy.sh from asset/neuron/<short-host>.toml. Edits
+# take effect on the next deploy.sh run (which stops + restarts the
+# service so default_models is re-read at activation).
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3.6-27B"
+harness = "candle"
+quant = "q5k"
+tensor_parallel = 2
+devices = [0, 1]
diff --git a/asset/neuron/benjy.toml b/asset/neuron/benjy.toml
new file mode 100644
index 0000000..793bd06
--- /dev/null
+++ b/asset/neuron/benjy.toml
@@ -0,0 +1,19 @@
+# neuron.toml for benjy.hanzalova.internal
+#
+# 1x RTX 4090 (24 GB) — largest single-GPU host on the fleet. Pre-warms
+# Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on
+# moderate-length contexts.
+#
+# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3-8B"
+harness = "candle"
+devices = [0]
diff --git a/asset/neuron/quadbrat.toml b/asset/neuron/quadbrat.toml
new file mode 100644
index 0000000..4135557
--- /dev/null
+++ b/asset/neuron/quadbrat.toml
@@ -0,0 +1,19 @@
+# neuron.toml for quadbrat.hanzalova.internal
+#
+# 1x RTX 3060 (12 GB) — small / quantised tier. Pre-warms Qwen3-1.7B
+# (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small
+# model still have plenty of room.
+#
+# Synced by script/deploy.sh from asset/neuron/<short-host>.toml.
+
+port = 13131
+
+[[harnesses]]
+name = "candle"
+
+[harness.candle]
+
+[[default_models]]
+model_id = "Qwen/Qwen3-1.7B"
+harness = "candle"
+devices = [0]
diff --git a/script/deploy.sh b/script/deploy.sh
index 9ce61da..1f3aa9f 100755
--- a/script/deploy.sh
+++ b/script/deploy.sh
@@ -233,43 +233,71 @@ fi
 for entry in "${neuron_entries[@]}"; do
     IFS=$'\t' read -r neuron_host neuron_flavour <<< "${entry}"
     package="helexa-neuron-${neuron_flavour}"
+    # First dot-component of the host keys the per-host config file
+    # under asset/neuron/<short>.toml. A host listed in the manifest
+    # without a corresponding config still deploys (the package's
+    # default /etc/neuron/neuron.toml stays in place; no pre-warm).
+    short_host="${neuron_host%%.*}"
+    host_config="${REPO_DIR}/asset/neuron/${short_host}.toml"
 
     ensure_lair_repo "${neuron_host}"
     ensure_cudnn_runtime "${neuron_host}"
     neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
+
+    # Stop the service unconditionally before any reconfig step.
+    # `default_models` is read at activation, so a config change without
+    # a bounce silently leaves the host on the previous pre-warm set.
+    # Same shape as the cortex flow above. The `[ ! -f … ]` guard skips
+    # the stop on a fresh install where the unit file isn't there yet.
+    if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then
+        echo "[${neuron_host}] stopped neuron service"
+    else
+        echo "[${neuron_host}] failed to stop neuron service (continuing)"
+    fi
+
     if needs_update "${neuron_host}" "${package}"; then
         echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
-        if ssh "${neuron_host}" "[ ! -f /usr/lib/systemd/system/neuron.service ] || sudo systemctl stop neuron.service"; then
-            echo "[${neuron_host}] stopped neuron service"
-            # --allowerasing lets dnf swap out a previously-installed
-            # bare helexa-neuron or a different flavour without manual
-            # intervention. The Conflicts: clauses in the spec ensure
-            # only one flavour is ever resident.
-            if install_or_upgrade "${neuron_host}" "${package}"; then
-                neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
-                echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
-                # Ensure firewalld allows neuron port
-                ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload" 2>/dev/null || true
-                if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then
-                    echo "[${neuron_host}] started neuron service"
-                else
-                    echo "[${neuron_host}] failed to start neuron service"
-                fi
-            else
-                echo "[${neuron_host}] failed to install ${package}:"
-                echo "${__DNF_OUTPUT__}" | sed "s/^/[${neuron_host}]   /"
-            fi
+        # --allowerasing lets dnf swap out a previously-installed
+        # bare helexa-neuron or a different flavour without manual
+        # intervention. The Conflicts: clauses in the spec ensure
+        # only one flavour is ever resident.
+        if install_or_upgrade "${neuron_host}" "${package}"; then
+            neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
+            echo "[${neuron_host}] installed/upgraded ${package} to ${neuron_nvr}"
+            # Ensure firewalld allows the neuron port; the braces keep the reload from running when the service is already enabled
+            ssh "${neuron_host}" "sudo firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null || { sudo firewall-cmd --add-service=helexa-neuron --permanent && sudo firewall-cmd --reload; }" 2>/dev/null || true
         else
-            echo "[${neuron_host}] failed to stop neuron service"
+            echo "[${neuron_host}] failed to install ${package}:"
+            echo "${__DNF_OUTPUT__}" | sed "s/^/[${neuron_host}]   /"
         fi
     else
         echo "[${neuron_host}] ${package} is up to date (${neuron_nvr})"
-        if ssh "${neuron_host}" systemctl is-active --quiet neuron.service; then
-            echo "[${neuron_host}] neuron service is active"
-        elif ssh "${neuron_host}" sudo systemctl start neuron.service; then
-            echo "[${neuron_host}] started neuron service"
+    fi
+
+    # Sync per-host neuron.toml — drives default_models pre-warm so
+    # `/v1/models` on the gateway exposes the host's headline model
+    # immediately after the service comes back up. Missing per-host
+    # config leaves the package's installed neuron.toml untouched.
+    if [[ -f "${host_config}" ]]; then
+        if rsync \
+            --archive \
+            --compress \
+            --rsync-path 'sudo rsync' \
+            --chown root:root \
+            --chmod 644 \
+            "${host_config}" \
+            "${neuron_host}:/etc/neuron/neuron.toml"; then
+            echo "[${neuron_host}] sync'd asset/neuron/${short_host}.toml"
         else
-            echo "[${neuron_host}] failed to start neuron service"
+            echo "[${neuron_host}] failed to sync neuron.toml"
         fi
+    else
+        echo "[${neuron_host}] no asset/neuron/${short_host}.toml — leaving /etc/neuron/neuron.toml untouched"
+    fi
+
+    if ssh "${neuron_host}" "sudo systemctl daemon-reload && sudo systemctl start neuron.service"; then
+        echo "[${neuron_host}] started neuron service"
+    else
+        echo "[${neuron_host}] failed to start neuron service"
     fi
 done