fix(controller): reap stuck-registered runners after 5min instead of 30min
A runner that registers with Gitea but never picks up a job holds capacity for the full reap window. This happens when the queued jobs that triggered its spawn turn out to be waiting on 'needs:' or gated by 'if:' conditions, or when there is any other claim-side stall. With a 30-minute threshold, a burst of over-eager spawns can block all real work for half an hour. Drop the threshold to 5 minutes. False positives are self-healing: if a job was about to be claimed, the next brew tick (5s) will see it still queued and spawn a fresh runner — the cost is one extra image pull (cached) and a registration token round-trip.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -682,14 +682,16 @@ async fn reap_stale(
|
||||
mark_failed(pool, *runner_id, "reaped: stuck started > 60s").await?;
|
||||
}
|
||||
|
||||
// --- Stuck registered > 30min (registered but never picked up a job) ---
|
||||
// --- Stuck registered > 5min (registered but never picked up a job) ---
|
||||
// False positives are self-healing: if a job was about to be claimed, the
|
||||
// next brew tick will spawn a fresh runner for the still-queued job.
|
||||
let stuck_registered = sqlx::query_as::<_, (Uuid, Option<String>, String)>(
|
||||
r#"
|
||||
SELECT r.id, r.container_id, h.agent_endpoint
|
||||
FROM runners r
|
||||
JOIN hosts h ON h.id = r.host_id
|
||||
WHERE r.state = 'registered'
|
||||
AND COALESCE(r.registered_at, r.updated_at) < now() - interval '30 minutes'
|
||||
AND COALESCE(r.registered_at, r.updated_at) < now() - interval '5 minutes'
|
||||
"#,
|
||||
)
|
||||
.fetch_all(pool)
|
||||
@@ -705,12 +707,12 @@ async fn reap_stale(
|
||||
runner_id: *runner_id,
|
||||
container_id: cid.clone(),
|
||||
force: false,
|
||||
reason: "stuck in registered > 30min".to_owned(),
|
||||
reason: "stuck in registered > 5min".to_owned(),
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
mark_failed(pool, *runner_id, "reaped: stuck registered > 30min").await?;
|
||||
mark_failed(pool, *runner_id, "reaped: stuck registered > 5min").await?;
|
||||
}
|
||||
|
||||
// --- Silent runner: running but no update for 10 minutes ---
|
||||
|
||||
Reference in New Issue
Block a user