fix(neuron): cap vision max_pixels to the pos_embed patch budget (#14)
All checks were successful
CI / CUDA type-check (push) Successful in 31s
build-prerelease / Resolve version stamps (push) Successful in 29s
CI / Format (push) Successful in 30s
CI / Clippy (push) Successful in 2m32s
build-prerelease / Build neuron-blackwell (push) Successful in 6m5s
CI / Test (push) Successful in 5m49s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build neuron-ampere (push) Successful in 8m11s
build-prerelease / Build neuron-ada (push) Successful in 5m40s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m4s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m2s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m57s
build-prerelease / Build cortex binary (push) Successful in 4m21s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m16s
All checks were successful
CI / CUDA type-check (push) Successful in 31s
build-prerelease / Resolve version stamps (push) Successful in 29s
CI / Format (push) Successful in 30s
CI / Clippy (push) Successful in 2m32s
build-prerelease / Build neuron-blackwell (push) Successful in 6m5s
CI / Test (push) Successful in 5m49s
CI / Build cortex SRPM (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Build neuron-ampere (push) Successful in 8m11s
build-prerelease / Build neuron-ada (push) Successful in 5m40s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m4s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m2s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 3m57s
build-prerelease / Build cortex binary (push) Successful in 4m21s
build-prerelease / Package cortex RPM (push) Successful in 1m25s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m16s
Beast testing surfaced a real regression in the dynamic-resolution default: a tall 808×1600 image resized (within the 1024² max_pixels) to a 90×44 patch grid = 3960 patches, exceeding the vision tower's hard `num_position_embeddings = 2304` pos-embed budget. The per-rank `patch count 3960 exceeds pos_embed budget 2304` error fired mid-TP-forward and poisoned the device context, bricking the model until reload. Hard-cap `max_pixels` to `2304 × 16² = 589_824` px (≤ 2304 patches → ≤ 576 LM tokens), clamping even the operator env override. `smart_resize` floors the pixel count under the cap, so no resized image can ever exceed the budget — the tower check never fires, no poison. The pos-embed grid (48×48) is the resolution Qwen3.6 was trained at, so the cap is principled, not just defensive. Still ~3× the old fixed 196 tokens, and the book-cover OCR test (1176 patches) already reads full title+subtitle. Test: a huge/tall/wide/extreme image battery stays within the 2304 patch budget. (Per-rank-error poison robustness itself remains issue #17.) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -55,12 +55,23 @@ pub struct PreprocessProfile {
|
|||||||
pub image_std: [f32; 3],
|
pub image_std: [f32; 3],
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Default pixel budget for Qwen3.6 (`256² … 1024²` → 64 … 1024 LM
|
/// The Qwen3.6 vision tower rejects any image whose **patch** count
|
||||||
/// tokens/image). Generous for documents/OCR, bounded for serving on
|
/// exceeds its learned pos-embed budget (`num_position_embeddings =
|
||||||
/// 2×RTX5090. Operators tune with `NEURON_VISION_MIN_PIXELS` /
|
/// 2304 = 48²`; see `vision.rs`). At `patch_size = 16` that is
|
||||||
/// `NEURON_VISION_MAX_PIXELS` (matching the other `NEURON_VISION_*` knobs).
|
/// `2304 × 16² = 589_824` source pixels. `max_pixels` is hard-capped to
|
||||||
|
/// this so `smart_resize` can never produce an over-budget grid — a
|
||||||
|
/// per-rank "patch count exceeds pos_embed budget" error mid-TP-forward
|
||||||
|
/// would otherwise poison the device context. The pos-embed grid is the
|
||||||
|
/// resolution Qwen3.6 was trained at, so this cap is principled, not just
|
||||||
|
/// defensive.
|
||||||
|
const QWEN3_6_MAX_PIXELS_CAP: u32 = 2304 * 16 * 16; // 589_824 → ≤ 2304 patches → ≤ 576 LM tokens
|
||||||
|
|
||||||
|
/// Default pixel budget for Qwen3.6: `256²` (64 LM tokens) up to the
|
||||||
|
/// pos-embed cap (576 LM tokens). Generous for documents/OCR, bounded
|
||||||
|
/// for serving. Operators lower it with `NEURON_VISION_MIN_PIXELS` /
|
||||||
|
/// `NEURON_VISION_MAX_PIXELS` (the upper bound is still clamped to the
|
||||||
|
/// cap above — raising it past the budget would poison the model).
|
||||||
const QWEN3_6_MIN_PIXELS: u32 = 65_536;
|
const QWEN3_6_MIN_PIXELS: u32 = 65_536;
|
||||||
const QWEN3_6_MAX_PIXELS: u32 = 1_048_576;
|
|
||||||
|
|
||||||
fn env_pixels(name: &str, default: u32) -> u32 {
|
fn env_pixels(name: &str, default: u32) -> u32 {
|
||||||
std::env::var(name)
|
std::env::var(name)
|
||||||
@@ -72,15 +83,19 @@ fn env_pixels(name: &str, default: u32) -> u32 {
|
|||||||
impl PreprocessProfile {
|
impl PreprocessProfile {
|
||||||
/// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32),
|
/// Profile for Qwen3.6. Native-aspect `smart_resize` (factor 32),
|
||||||
/// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults to
|
/// normalise to `[-1, 1]` via mean=std=0.5. Pixel budget defaults to
|
||||||
/// [`QWEN3_6_MIN_PIXELS`]…[`QWEN3_6_MAX_PIXELS`], overridable via the
|
/// [`QWEN3_6_MIN_PIXELS`]…[`QWEN3_6_MAX_PIXELS_CAP`], overridable via
|
||||||
/// `NEURON_VISION_MIN_PIXELS` / `NEURON_VISION_MAX_PIXELS` env vars.
|
/// `NEURON_VISION_MIN_PIXELS` / `NEURON_VISION_MAX_PIXELS`. Clamped
|
||||||
/// The budget is clamped sane: `min ≥ factor²` (at least one LM token)
|
/// sane: `factor² ≤ min ≤ max`, and `max ≤` the pos-embed cap (so the
|
||||||
/// and `max ≥ min`.
|
/// vision tower never rejects a resized image and poisons the context).
|
||||||
pub fn qwen3_6() -> Self {
|
pub fn qwen3_6() -> Self {
|
||||||
let factor = 32u32;
|
let factor = 32u32;
|
||||||
let f2 = factor * factor;
|
let f2 = factor * factor;
|
||||||
let min_pixels = env_pixels("NEURON_VISION_MIN_PIXELS", QWEN3_6_MIN_PIXELS).max(f2);
|
let min_pixels = env_pixels("NEURON_VISION_MIN_PIXELS", QWEN3_6_MIN_PIXELS)
|
||||||
let max_pixels = env_pixels("NEURON_VISION_MAX_PIXELS", QWEN3_6_MAX_PIXELS).max(min_pixels);
|
.max(f2)
|
||||||
|
.min(QWEN3_6_MAX_PIXELS_CAP);
|
||||||
|
let max_pixels = env_pixels("NEURON_VISION_MAX_PIXELS", QWEN3_6_MAX_PIXELS_CAP)
|
||||||
|
.min(QWEN3_6_MAX_PIXELS_CAP)
|
||||||
|
.max(min_pixels);
|
||||||
Self {
|
Self {
|
||||||
factor,
|
factor,
|
||||||
min_pixels,
|
min_pixels,
|
||||||
@@ -388,6 +403,28 @@ mod tests {
|
|||||||
assert!(format!("{err:#}").contains("200:1"));
|
assert!(format!("{err:#}").contains("200:1"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn qwen3_6_never_exceeds_pos_embed_patch_budget() {
|
||||||
|
// The pos-embed cap must hold for huge, tall, wide, and extreme
|
||||||
|
// images — exceeding 2304 patches errors mid-tower and poisons
|
||||||
|
// the device context, so this invariant is load-bearing.
|
||||||
|
let p = PreprocessProfile::qwen3_6();
|
||||||
|
for (sh, sw) in [
|
||||||
|
(8000u32, 6000u32),
|
||||||
|
(808, 1600),
|
||||||
|
(4000, 400),
|
||||||
|
(1, 199),
|
||||||
|
(16, 16),
|
||||||
|
] {
|
||||||
|
let (h, w) = p.resized_dims(sh, sw).unwrap();
|
||||||
|
let patches = (h / 16) * (w / 16);
|
||||||
|
assert!(
|
||||||
|
patches <= 2304,
|
||||||
|
"{sh}x{sw} → {h}x{w} = {patches} patches exceeds the 2304 budget"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn qwen3_6_default_budget_bounds_lm_tokens() {
|
fn qwen3_6_default_budget_bounds_lm_tokens() {
|
||||||
// A huge source image caps at max_pixels → the per-image LM token
|
// A huge source image caps at max_pixels → the per-image LM token
|
||||||
|
|||||||
Reference in New Issue
Block a user