From becf61b9c14de709ed4def2887d21e97a0cba594 Mon Sep 17 00:00:00 2001
From: rob thijssen <grenade@rob.tn>
Date: Tue, 26 May 2026 15:26:21 +0300
Subject: [PATCH] feat(script): validate-neuron.sh waits for /health
 activation=ready

Adds wait_for_ready() that polls /health until activation.state flips
to "ready" (or the NEURON_LOAD_TIMEOUT deadline). Inserted between
probe_health and the is_loaded/trigger_load step.

Before this, running validate-neuron.sh right after deploy.sh raced
the background pre-warm and failed in ~9 ms with "neuron not reachable"
(the pre-2026-05-26 build) or with a partial-load error (the new
build, where the listener binds before default_models finishes).

The poll prints the in_progress model on each tick so an operator
watching the log can see which model is delaying readiness. Backs off
from 2s to 10s after the first few iterations so a long TP load
doesn't spam.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 script/validate-neuron.sh | 57 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
diff --git a/script/validate-neuron.sh b/script/validate-neuron.sh
index 3d39d81..7bd1bbc 100755
--- a/script/validate-neuron.sh
+++ b/script/validate-neuron.sh
@@ -66,6 +66,58 @@ probe_health() {
         || die "neuron not reachable at ${BASE}/health"
 }
 
+# Block until the neuron reports `activation.state == "ready"` on
+# `/health`. Without this, validate-neuron.sh used to race the
+# background pre-warm (the listener binds immediately but big TP
+# loads run for minutes after) and either fail with ECONNREFUSED
+# (pre-2026-05-26 build, where load was synchronous before bind) or
+# get a 404 from /models/load against a partially-loaded model.
+#
+# The poll cap is `NEURON_LOAD_TIMEOUT` since pre-warm and an
+# on-demand load are the same operation under different triggers.
+# Short interval at the start (catches a quick-loading host without
+# extra latency) backs off after the first few iterations to keep
+# log spam down on a slow load.
+wait_for_ready() {
+    local deadline=$(( $(date +%s) + LOAD_TIMEOUT ))
+    local state= attempt=0
+    while (( $(date +%s) < deadline )); do
+        attempt=$(( attempt + 1 ))
+        state=$(
+            curl --silent --max-time 5 "${BASE}/health" \
+                | jq -r '.activation.state // "unknown"'
+        ) || state=unreachable
+        case "${state}" in
+            ready)
+                say "/health activation.state=ready (after ${attempt} probe(s))"
+                return 0
+                ;;
+            pre_warming)
+                local in_progress
+                in_progress=$(
+                    curl --silent --max-time 5 "${BASE}/health" \
+                        | jq -r '.activation.in_progress // "<none>"'
+                ) || in_progress='<unreadable>'
+                say "/health pre_warming (in_progress=${in_progress}); waiting"
+                ;;
+            unreachable)
+                say "/health unreachable; waiting"
+                ;;
+            *)
+                say "/health unexpected activation.state=${state}; waiting"
+                ;;
+        esac
+        # 2s for the first few iterations to catch quick loads, then
+        # 10s to avoid log spam on a multi-minute TP load.
+        if (( attempt < 5 )); then
+            sleep 2
+        else
+            sleep 10
+        fi
+    done
+    die "neuron not ready within ${LOAD_TIMEOUT}s (last state: ${state})"
+}
+
 list_loaded_ids() {
     # The manifest is YAML and uses yq; HTTP responses are JSON and use
     # jq directly. pip-yq parses input as YAML by default, which trips
@@ -157,6 +209,11 @@ run_probe() {
 say "validating neuron at ${BASE}"
 probe_health
 say "/health OK"
+# Background pre-warm from default_models means /health is reachable
+# but `activation.state` can still be `pre_warming` for minutes after
+# service start. Block here so the subsequent is_loaded / trigger_load
+# steps don't race a partially-materialised model.
+wait_for_ready
 
 if is_loaded; then
     say "${MODEL_ID} already loaded"