diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs
index b945fbf..e792bb8 100644
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -53,6 +53,34 @@ pub enum ModelArch {
     Qwen3Quantized(QuantizedQwen3Weights),
 }
 
+/// Repetition penalty that `sample_with_penalty` applies to recently-generated
+/// tokens before sampling. 1.0 disables it (the penalty step is skipped); >1.0
+/// makes recently-emitted tokens less likely. 1.1 mirrors the mistral.rs and
+/// llama.cpp defaults (TODO confirm both still ship 1.1) and is enough to stop small
+/// quantized models degenerating into "Wait, no, no..." loops without distorting normal output.
+const REPEAT_PENALTY: f32 = 1.1;
+
+/// Number of most recent tokens from the history that `sample_with_penalty`
+/// penalises. Matches the candle quantized-qwen3 example default.
+const REPEAT_LAST_N: usize = 64;
+
+/// Sample the next token id from `logits`, first penalising whichever of the
+/// last `REPEAT_LAST_N` ids in `history` appear, using `REPEAT_PENALTY`. The
+/// logits are sampled untouched when the penalty is ~1.0 or `history` is empty.
+fn sample_with_penalty(
+    logits: &Tensor,
+    history: &[u32],
+    logits_processor: &mut LogitsProcessor,
+) -> Result<u32> {
+    let penalty_off = (REPEAT_PENALTY - 1.0).abs() < f32::EPSILON;
+    if penalty_off || history.is_empty() {
+        return Ok(logits_processor.sample(logits)?);
+    }
+    let window = &history[history.len().saturating_sub(REPEAT_LAST_N)..];
+    let scaled = candle_transformers::utils::apply_repeat_penalty(logits, REPEAT_PENALTY, window)?;
+    Ok(logits_processor.sample(&scaled)?)
+}
+
 impl CandleHarness {
     pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
         Self {
@@ -521,7 +549,7 @@ fn run_inference(
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
             let logits = model.forward(&input, 0)?;
             let logits = logits.squeeze(0)?;
-            logits_processor.sample(&logits)?
+            sample_with_penalty(&logits, &generated, &mut logits_processor)?
         }
     };
 
@@ -536,7 +564,7 @@ fn run_inference(
                 let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                 let logits = model.forward(&input, prompt_tokens.len() + index)?;
                 let logits = logits.squeeze(0)?;
-                logits_processor.sample(&logits)?
+                sample_with_penalty(&logits, &generated, &mut logits_processor)?
             }
         };
         if Some(next_token) == eos_id {
@@ -592,7 +620,7 @@ fn run_inference_streaming(
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
             let logits = model.forward(&input, 0)?;
             let logits = logits.squeeze(0)?;
-            logits_processor.sample(&logits)?
+            sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
         }
     };
 
@@ -640,7 +668,7 @@ fn run_inference_streaming(
                     let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                     let logits = model.forward(&input, prompt_tokens.len() + index)?;
                     let logits = logits.squeeze(0)?;
-                    logits_processor.sample(&logits)?
+                    sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
                 }
             };
             if Some(next_token) == eos_id {
diff --git a/script/deploy.sh b/script/deploy.sh
index 187cedb..4bebc1c 100755
--- a/script/deploy.sh
+++ b/script/deploy.sh
@@ -71,6 +71,34 @@ ensure_lair_repo() {
     fi
 }
 
+# Ensure libcudnn.so.9 is resolvable on the remote host so the
+# neuron binary (built with --features cudnn) doesn't fail at startup
+# with "cannot open shared object file: No such file or directory".
+#
+# Probes ldconfig first — if cuDNN was installed manually (.tar/.run
+# install), it'll be cached by ldconfig and we don't touch it.
+# Otherwise adds NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo
+# doesn't ship cuDNN packages — only the RHEL9 one does) and installs
+# libcudnn9-cuda-13. Best-effort: failures only print a WARNING.
+ensure_cudnn_runtime() {
+    local host="$1"
+    if ssh "${host}" "ldconfig -p | grep -qF libcudnn.so.9" 2>/dev/null; then
+        return 0
+    fi
+    echo "[${host}] installing cuDNN runtime"
+    # addrepo names the saved file after the URL basename; keep the old name too.
+    if ! ssh "${host}" "cd /etc/yum.repos.d && { test -f cuda-rhel9.repo || test -f cuda-rhel9-x86_64.repo; }" 2>/dev/null &&
+        ! ssh "${host}" sudo dnf config-manager addrepo \
+            --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
+            >/dev/null 2>&1; then
+        echo "[${host}] WARNING: failed to add rhel9 CUDA repo (proceeding anyway)"
+    fi
+    if ! ssh "${host}" sudo dnf install -y libcudnn9-cuda-13 >/dev/null 2>&1; then
+        echo "[${host}] WARNING: failed to install libcudnn9-cuda-13"
+        echo "[${host}]   neuron may fail to start; install cuDNN manually if so"
+    fi
+}
+
 # True when the named package needs to be installed or upgraded on the
 # remote host — either it's not present, or a newer version exists in
 # the repo. False only when the installed version is current.
@@ -188,6 +216,7 @@ for entry in "${neuron_entries[@]}"; do
     package="helexa-neuron-${neuron_flavour}"
 
     ensure_lair_repo "${neuron_host}"
+    ensure_cudnn_runtime "${neuron_host}"
     neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
     if needs_update "${neuron_host}" "${package}"; then
         echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"