post-validation cleanup: cuDNN runtime + repetition penalty
All checks were successful
CI / Format (push) Successful in 34s
build-prerelease / Resolve version stamps (push) Successful in 35s
CI / Clippy (push) Successful in 2m17s
CI / Test (push) Successful in 4m16s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m28s
build-prerelease / Build neuron-blackwell (push) Successful in 3m42s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Build neuron-ampere (push) Successful in 4m27s
build-prerelease / Build neuron-ada (push) Successful in 4m51s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m40s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 6m52s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 2m32s
All checks were successful
CI / Format (push) Successful in 34s
build-prerelease / Resolve version stamps (push) Successful in 35s
CI / Clippy (push) Successful in 2m17s
CI / Test (push) Successful in 4m16s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build cortex binary (push) Successful in 4m28s
build-prerelease / Build neuron-blackwell (push) Successful in 3m42s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Build neuron-ampere (push) Successful in 4m27s
build-prerelease / Build neuron-ada (push) Successful in 4m51s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 2m50s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m40s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 6m52s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 2m32s
Two follow-ups from the live single-GPU validation pass. 1. deploy.sh now ensures libcudnn.so.9 is available on each neuron host before installing/upgrading the package. Probes ldconfig first so hosts with a manual (tar/runfile) cuDNN install are untouched, then adds NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo doesn't ship cuDNN; only the RHEL9 one does) and installs libcudnn9-cuda-13. benjy hit "cannot open shared object file: libcudnn.so.9" during validation; this prevents that from recurring. 2. candle.rs applies a 1.1 repetition penalty over the last 64 generated tokens before sampling, in both the non-streaming chat_completion path and the streaming chat_completion_stream path. Without it small Q4_K_M models degenerate into "Wait, no, no..." loops once they hit a confident-but-wrong path; with it sampling stays coherent. Defaults match mistral.rs and llama.cpp; exposing the value via the OpenAI request (frequency/presence penalty mapping) is Stage 8 territory. Both paths route through a new sample_with_penalty() helper so future sampling tweaks land in one place. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,34 @@ pub enum ModelArch {
|
|||||||
Qwen3Quantized(QuantizedQwen3Weights),
|
Qwen3Quantized(QuantizedQwen3Weights),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Repetition penalty applied to recently-generated tokens before
|
||||||
|
/// sampling. 1.0 disables it; >1.0 makes recently-emitted tokens less
|
||||||
|
/// likely. mistral.rs and llama.cpp default to 1.1, which is enough to
|
||||||
|
/// stop small quantized models from degenerating into "Wait, no, no..."
|
||||||
|
/// loops without distorting normal output.
|
||||||
|
const REPEAT_PENALTY: f32 = 1.1;
|
||||||
|
|
||||||
|
/// Number of recently-generated tokens to feed into the repetition
|
||||||
|
/// penalty. Matches the candle quantized-qwen3 example default.
|
||||||
|
const REPEAT_LAST_N: usize = 64;
|
||||||
|
|
||||||
|
/// Apply the repetition penalty (if any) to the prediction logits and
|
||||||
|
/// then sample. Centralises the prefill / generation-loop call sites
|
||||||
|
/// so they share identical sampling behaviour.
|
||||||
|
fn sample_with_penalty(
|
||||||
|
logits: &Tensor,
|
||||||
|
history: &[u32],
|
||||||
|
logits_processor: &mut LogitsProcessor,
|
||||||
|
) -> Result<u32> {
|
||||||
|
let penalised = if (REPEAT_PENALTY - 1.0).abs() < f32::EPSILON || history.is_empty() {
|
||||||
|
logits.clone()
|
||||||
|
} else {
|
||||||
|
let start = history.len().saturating_sub(REPEAT_LAST_N);
|
||||||
|
candle_transformers::utils::apply_repeat_penalty(logits, REPEAT_PENALTY, &history[start..])?
|
||||||
|
};
|
||||||
|
Ok(logits_processor.sample(&penalised)?)
|
||||||
|
}
|
||||||
|
|
||||||
impl CandleHarness {
|
impl CandleHarness {
|
||||||
pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
|
pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
@@ -521,7 +549,7 @@ fn run_inference(
|
|||||||
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
||||||
let logits = model.forward(&input, 0)?;
|
let logits = model.forward(&input, 0)?;
|
||||||
let logits = logits.squeeze(0)?;
|
let logits = logits.squeeze(0)?;
|
||||||
logits_processor.sample(&logits)?
|
sample_with_penalty(&logits, &generated, &mut logits_processor)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -536,7 +564,7 @@ fn run_inference(
|
|||||||
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
||||||
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
||||||
let logits = logits.squeeze(0)?;
|
let logits = logits.squeeze(0)?;
|
||||||
logits_processor.sample(&logits)?
|
sample_with_penalty(&logits, &generated, &mut logits_processor)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
if Some(next_token) == eos_id {
|
if Some(next_token) == eos_id {
|
||||||
@@ -592,7 +620,7 @@ fn run_inference_streaming(
|
|||||||
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
||||||
let logits = model.forward(&input, 0)?;
|
let logits = model.forward(&input, 0)?;
|
||||||
let logits = logits.squeeze(0)?;
|
let logits = logits.squeeze(0)?;
|
||||||
logits_processor.sample(&logits)?
|
sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -640,7 +668,7 @@ fn run_inference_streaming(
|
|||||||
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
||||||
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
||||||
let logits = logits.squeeze(0)?;
|
let logits = logits.squeeze(0)?;
|
||||||
logits_processor.sample(&logits)?
|
sample_with_penalty(&logits, &all_tokens, &mut logits_processor)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
if Some(next_token) == eos_id {
|
if Some(next_token) == eos_id {
|
||||||
|
|||||||
@@ -71,6 +71,34 @@ ensure_lair_repo() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Ensure libcudnn.so.9 is resolvable on the remote host so the
# neuron binary (built with --features cudnn) doesn't fail at startup
# with "cannot open shared object file: No such file or directory".
#
# Arguments:
#   $1 - ssh destination of the neuron host
#
# Steps:
#   1. Probe ldconfig first — if cuDNN was installed manually
#      (.tar/.run install), it'll be cached by ldconfig and we don't
#      touch it.
#   2. Otherwise add NVIDIA's RHEL9 CUDA repo (the Fedora 43 CUDA repo
#      doesn't ship cuDNN packages — only the RHEL9 one does), unless
#      its repo file is already present.
#   3. Install libcudnn9-cuda-13.
#
# Best-effort: a failed repo add or package install only prints a
# WARNING; the function does not return an error for either.
ensure_cudnn_runtime() {
    local host="$1"

    # -F matches the soname literally; without it the dots are regex
    # wildcards.
    if ssh "${host}" "ldconfig -p | grep -qF libcudnn.so.9" 2>/dev/null; then
        return 0
    fi

    echo "[${host}] installing cuDNN runtime"

    # `dnf config-manager addrepo --from-repofile=` saves the file under
    # the source file's name (cuda-rhel9.repo) unless --save-filename is
    # given, so probe for that name. The repo-id style name is accepted
    # too, for hosts where the file was created under it.
    if ! ssh "${host}" "test -f /etc/yum.repos.d/cuda-rhel9.repo || test -f /etc/yum.repos.d/cuda-rhel9-x86_64.repo" 2>/dev/null; then
        if ! ssh "${host}" sudo dnf config-manager addrepo \
            --from-repofile=https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo \
            >/dev/null 2>&1; then
            echo "[${host}] WARNING: failed to add rhel9 CUDA repo (proceeding anyway)"
        fi
    fi

    if ! ssh "${host}" sudo dnf install -y libcudnn9-cuda-13 >/dev/null 2>&1; then
        echo "[${host}] WARNING: failed to install libcudnn9-cuda-13"
        echo "[${host}] neuron may fail to start; install cuDNN manually if so"
    fi
}
|
||||||
|
|
||||||
# True when the named package needs to be installed or upgraded on the
|
# True when the named package needs to be installed or upgraded on the
|
||||||
# remote host — either it's not present, or a newer version exists in
|
# remote host — either it's not present, or a newer version exists in
|
||||||
# the repo. False only when the installed version is current.
|
# the repo. False only when the installed version is current.
|
||||||
@@ -188,6 +216,7 @@ for entry in "${neuron_entries[@]}"; do
|
|||||||
package="helexa-neuron-${neuron_flavour}"
|
package="helexa-neuron-${neuron_flavour}"
|
||||||
|
|
||||||
ensure_lair_repo "${neuron_host}"
|
ensure_lair_repo "${neuron_host}"
|
||||||
|
ensure_cudnn_runtime "${neuron_host}"
|
||||||
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
|
neuron_nvr=$(installed_nvr "${neuron_host}" "${package}")
|
||||||
if needs_update "${neuron_host}" "${package}"; then
|
if needs_update "${neuron_host}" "${package}"; then
|
||||||
echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
|
echo "[${neuron_host}] ${package} update available (current: ${neuron_nvr})"
|
||||||
|
|||||||
Reference in New Issue
Block a user