diff --git a/crates/neuron/src/harness/candle.rs b/crates/neuron/src/harness/candle.rs
index b945fbf..e792bb8 100644
--- a/crates/neuron/src/harness/candle.rs
+++ b/crates/neuron/src/harness/candle.rs
@@ -53,6 +53,34 @@ pub enum ModelArch {
     Qwen3Quantized(QuantizedQwen3Weights),
 }
 
+/// Repetition penalty applied to recently-generated tokens before sampling.
+/// 1.0 disables it; >1.0 makes recently-emitted tokens less likely. mistral.rs
+/// and llama.cpp default to 1.1, which is enough to stop small quantized models
+/// from degenerating into "Wait, no, no..." loops without distorting normal output.
+const REPEAT_PENALTY: f32 = 1.1;
+
+/// Number of recently-generated tokens to feed into the repetition
+/// penalty. Matches the candle quantized-qwen3 example default.
+const REPEAT_LAST_N: usize = 64;
+
+/// Sample the next token id from `logits`, first applying the repetition
+/// penalty over the last `REPEAT_LAST_N` ids in `history`. The penalty is
+/// skipped when it is disabled (1.0) or `history` is empty. Errors from the
+/// penalty or the sampler are propagated. Shared by all sampling call sites.
+fn sample_with_penalty(
+    logits: &Tensor,
+    history: &[u32],
+    logits_processor: &mut LogitsProcessor,
+) -> Result<u32> {
+    if (REPEAT_PENALTY - 1.0).abs() < f32::EPSILON || history.is_empty() {
+        return Ok(logits_processor.sample(logits)?);
+    }
+    let recent = &history[history.len().saturating_sub(REPEAT_LAST_N)..];
+    let penalised =
+        candle_transformers::utils::apply_repeat_penalty(logits, REPEAT_PENALTY, recent)?;
+    Ok(logits_processor.sample(&penalised)?)
+}
+
 impl CandleHarness {
     pub fn new(bind_url: String, hf_cache: Option) -> Self {
         Self {
@@ -521,7 +549,7 @@ fn run_inference(
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
             let logits = model.forward(&input, 0)?;
             let logits = logits.squeeze(0)?;
-            logits_processor.sample(&logits)?
+            sample_with_penalty(&logits, &generated, &mut logits_processor)?
         }
     };
 
@@ -536,7 +564,7 @@ fn run_inference(
                 let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                 let logits = model.forward(&input, prompt_tokens.len() + index)?;
                 let logits = logits.squeeze(0)?;
-                logits_processor.sample(&logits)?
+                sample_with_penalty(&logits, &generated, &mut logits_processor)?
             }
         };
         if Some(next_token) == eos_id {
@@ -592,7 +620,7 @@ fn run_inference_streaming(
             let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
             let logits = model.forward(&input, 0)?;
             let logits = logits.squeeze(0)?;
-            logits_processor.sample(&logits)?
+            sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
         }
     };
 
@@ -640,7 +668,7 @@ fn run_inference_streaming(
                 let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
                 let logits = model.forward(&input, prompt_tokens.len() + index)?;
                 let logits = logits.squeeze(0)?;
-                logits_processor.sample(&logits)?
+                sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
             }
         };
         if Some(next_token) == eos_id {
diff --git a/script/deploy.sh b/script/deploy.sh
index 187cedb..4bebc1c 100755
--- a/script/deploy.sh
+++ b/script/deploy.sh
@@ -71,6 +71,46 @@ ensure_lair_repo() {
     fi
 }
 
+# Ensure libcudnn.so.9 is resolvable on the remote host so the
+# neuron binary (built with --features cudnn) doesn't fail at startup
+# with "cannot open shared object file: No such file or directory".
+#
+# Probes ldconfig first — if cuDNN was installed manually (.tar/.run
+# install), it'll be cached by ldconfig and we don't touch it.
+# Otherwise adds NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo
+# doesn't ship cuDNN packages — only the RHEL9 one does) and installs
+# libcudnn9-cuda-13.
+#
+# Best-effort: a failed step only prints a WARNING, so the deploy
+# carries on either way.
+#
+# Arguments:
+#   $1 - ssh destination of the neuron host
+ensure_cudnn_runtime() {
+    local host="$1"
+    local repo_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo"
+    # dnf names the saved repo file after the basename of the source
+    # URL, so derive the probe path from the URL instead of guessing it.
+    local repo_file="/etc/yum.repos.d/${repo_url##*/}"
+
+    # -F: match the soname literally; unescaped dots are regex wildcards.
+    if ssh "${host}" "ldconfig -p | grep -qF libcudnn.so.9" 2>/dev/null; then
+        return 0
+    fi
+    echo "[${host}] installing cuDNN runtime"
+    if ! ssh "${host}" "test -f ${repo_file}" 2>/dev/null; then
+        if ! ssh "${host}" sudo dnf config-manager addrepo \
+            "--from-repofile=${repo_url}" \
+            >/dev/null 2>&1; then
+            echo "[${host}] WARNING: failed to add rhel9 CUDA repo (proceeding anyway)"
+        fi
+    fi
+    if ! ssh "${host}" sudo dnf install -y libcudnn9-cuda-13 >/dev/null 2>&1; then
+        echo "[${host}] WARNING: failed to install libcudnn9-cuda-13"
+        echo "[${host}] neuron may fail to start; install cuDNN manually if so"
+    fi
+}
+
 # True when the named package needs to be installed or upgraded on the
 # remote host — either it's not present, or a newer version exists in
 # the repo. False only when the installed version is current.
@@ -188,6 +228,7 @@ for entry in "${neuron_entries[@]}"; do
     package="helexa-neuron-${neuron_flavour}"
 
     ensure_lair_repo "${neuron_host}"
+    ensure_cudnn_runtime "${neuron_host}"
     neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
     if needs_update "${neuron_host}" "${package}"; then
         echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"