diff --git a/crates/neuron/src/main.rs b/crates/neuron/src/main.rs
index b78b9f4..6a7c691 100644
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -78,11 +78,21 @@ async fn main() -> Result<()> {
         candle,
     });
 
-    let app = api::neuron_routes().with_state(state);
+    let app = api::neuron_routes().with_state(Arc::clone(&state));
     let addr: std::net::SocketAddr = format!("0.0.0.0:{port}").parse()?;
     tracing::info!("neuron listening on {addr}");
     let listener = tokio::net::TcpListener::bind(addr).await?;
-    axum::serve(listener, app).await?;
+    axum::serve(listener, app)
+        .with_graceful_shutdown(startup::shutdown_signal())
+        .await?;
+
+    // Deactivation: serve has returned (graceful shutdown signal
+    // received and connections drained). Release CUDA contexts / VRAM
+    // by unloading every model before exiting; systemd's TimeoutStopSec
+    // bounds how long this phase may take.
+    let registry = state.registry.read().await;
+    startup::unload_all_models(&registry).await;
+    tracing::info!("shutdown complete");
 
     Ok(())
 }
diff --git a/crates/neuron/src/startup.rs b/crates/neuron/src/startup.rs
index d4c5296..3d348ff 100644
--- a/crates/neuron/src/startup.rs
+++ b/crates/neuron/src/startup.rs
@@ -1,12 +1,14 @@
-//! Activation-time orchestration.
+//! Activation- and deactivation-time orchestration.
 //!
-//! Wired from `main.rs` after the harness registry is built and before
-//! the HTTP listener binds. Kept in its own module so the logic is
+//! Wired from `main.rs` around the HTTP listener — activation runs
+//! before bind, deactivation runs after axum returns from its
+//! graceful-shutdown future. Kept in its own module so the logic is
 //! unit-testable without spinning up a full neuron process.
 
 use crate::harness::HarnessRegistry;
 use cortex_core::harness::ModelSpec;
 use std::time::Instant;
+use tokio::signal;
 
 /// Load each spec sequentially against the registry, treating
 /// individual failures as warnings rather than fatal errors.
@@ -36,3 +38,73 @@ pub async fn load_default_models(registry: &HarnessRegistry, specs: &[ModelSpec]
         }
     }
 }
+
+/// Future that resolves on SIGINT (Ctrl-C) or SIGTERM (systemd stop).
+///
+/// Wired into `axum::serve(...).with_graceful_shutdown(shutdown_signal())`
+/// so the HTTP listener stops accepting new connections, lets in-flight
+/// requests drain, and then yields control back to main for cleanup.
+///
+/// If listening for SIGINT fails, that branch logs a warning and then
+/// never resolves, so the error is not mistaken for a shutdown request
+/// and only SIGTERM can trigger shutdown.
+///
+/// # Panics
+///
+/// Panics if the SIGTERM handler cannot be installed.
+pub async fn shutdown_signal() {
+    let ctrl_c = async {
+        if let Err(e) = signal::ctrl_c().await {
+            // A listener error is not a signal: park this branch instead
+            // of letting `select!` treat it as a received Ctrl-C.
+            tracing::warn!(error = %e, "failed to listen for SIGINT");
+            std::future::pending::<()>().await;
+        }
+    };
+    let terminate = async {
+        signal::unix::signal(signal::unix::SignalKind::terminate())
+            .expect("install SIGTERM handler")
+            .recv()
+            .await;
+    };
+    tokio::select! {
+        _ = ctrl_c => tracing::info!("received SIGINT, shutting down"),
+        _ = terminate => tracing::info!("received SIGTERM, shutting down"),
+    }
+}
+
+/// Unload every model the registry currently lists.
+///
+/// `main.rs` calls this once axum's graceful-shutdown future has
+/// resolved, so CUDA contexts and VRAM are released before the process
+/// exits instead of being left for the OS to reclaim.
+///
+/// Cleanup is best-effort: a failed listing ends the pass with a
+/// warning, and a failed unload is logged and skipped so the remaining
+/// models are still attempted.
+pub async fn unload_all_models(registry: &HarnessRegistry) {
+    let models = match registry.list_all_models().await {
+        // Nothing to unload: return without logging anything.
+        Ok(models) if models.is_empty() => return,
+        Ok(models) => models,
+        Err(err) => {
+            tracing::warn!(error = %err, "failed to list models during shutdown");
+            return;
+        }
+    };
+
+    tracing::info!(count = models.len(), "unloading models for shutdown");
+    for model in models {
+        // Time each unload separately; models are handled one at a time.
+        let started = Instant::now();
+        let outcome = registry.unload_model(&model.id).await;
+        // `as_millis()` yields u128; narrow it for the log field.
+        let elapsed_ms = started.elapsed().as_millis() as u64;
+        match outcome {
+            Ok(()) => tracing::info!(model = %model.id, elapsed_ms, "unloaded"),
+            Err(err) => {
+                tracing::warn!(model = %model.id, error = %err, "unload failed during shutdown")
+            }
+        }
+    }
+}
diff --git a/crates/neuron/tests/shutdown.rs b/crates/neuron/tests/shutdown.rs
new file mode 100644
index 0000000..3a399cb
--- /dev/null
+++ b/crates/neuron/tests/shutdown.rs
@@ -0,0 +1,32 @@
+//! Deactivation behaviour: unload_all_models is a no-op on an empty
+//! registry and on a registry whose harness has no loaded models.
+
+use cortex_core::harness::HarnessConfig;
+use neuron::config::HarnessSettings;
+use neuron::harness::HarnessRegistry;
+use neuron::startup;
+
+#[tokio::test]
+async fn test_unload_all_models_empty_registry_is_noop() {
+    // Nothing is registered, so the call only has to return cleanly.
+    startup::unload_all_models(&HarnessRegistry::new()).await;
+}
+
+#[tokio::test]
+async fn test_unload_all_models_with_no_loaded_models() {
+    // One configured harness, nothing loaded into it.
+    let harnesses = [HarnessConfig {
+        name: "candle".into(),
+    }];
+    let settings = HarnessSettings::default();
+    let registry = HarnessRegistry::from_configs(&harnesses, "http://localhost:0", &settings);
+
+    startup::unload_all_models(&registry).await;
+
+    // Cleanup must leave the registry usable and still empty.
+    let remaining = registry
+        .list_all_models()
+        .await
+        .expect("list_all_models should still succeed after shutdown cleanup");
+    assert!(remaining.is_empty());
+}
diff --git a/data/neuron.service b/data/neuron.service
index 207b4da..c844da7 100644
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -15,6 +15,11 @@ Group=neuron
 # materialise on first activation. systemd's default TimeoutStartSec
 # (90s) is far too short; allow 30 minutes.
 TimeoutStartSec=1800s
+# On stop, neuron drains in-flight requests, then unloads every model
+# to release CUDA contexts cleanly. The budget covers both phases, so
+# keep it generous for big models; systemd SIGKILLs once it expires.
+TimeoutStopSec=120s
+KillSignal=SIGTERM
 
 [Install]
 WantedBy=multi-user.target