From d0292ed37740bc0bca8d8b5996c36c27bc25de7a Mon Sep 17 00:00:00 2001 From: rob thijssen Date: Mon, 1 Jun 2026 14:53:58 +0300 Subject: [PATCH] feat(cortex): catalogue source field + scheme-qualified /models/load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 of plan-source-aware-loader-preflight. Adds an optional `source` field to `ModelProfile` and threads it through the router's cold-load path so a profile pointing at the helexa registry forwards `helexa:<id>` to neuron's `/models/load` instead of leaving neuron to substitute its `default_source` (typically `huggingface`). Without this, an operator who declares `source = "helexa"` in models.toml would still see neuron fetch from HuggingFace — the catalogue → ModelSpec translation in `profile_to_spec` was dropping the scheme on the floor. What lands: - `cortex-core::catalogue::ModelProfile.source: Option<String>`. None is the default and preserves pre-Phase-3 behaviour. - `cortex-gateway::router::qualified_model_id(profile)` — small pure helper, extracted from `profile_to_spec` so it can be unit-tested. Empty-string `source` is treated as None so operators who blank out a previously-set value don't trip a scheme-with-no-scheme failure mode in neuron. - `models.example.toml` documents the new field with a commented-out helexa-scheme example pointing back at neuron.example.toml's matching sources block. Tests: - 2 new unit tests in `cortex-core::catalogue`: source-absent round-trip and source-present round-trip through TOML. - 3 new unit tests in `cortex-gateway::router`: pass-through when None, prefix when Some, pass-through on empty-string source. - ModelProfile literal in catalogue's existing test updated to carry `source: None`. CI gate: cargo fmt --check, cargo clippy --workspace --all-targets -- -D warnings, cargo test --workspace (24 test groups ok, zero failures). Completes Phase 3. 
With Phases 1+2+3 landed: - neuron parses `scheme:org/name`, routes per-source hf-hub Api with disambiguated cache. - preflight returns structured errors before any device allocation. - cortex catalogue declares per-model source jurisdiction and forwards it to neuron. The registry itself (registry.helexa.ai service, MinIO, nginx, mirror fabric) is the next moving piece — landing under a separate project per the design discussion. Co-Authored-By: Claude Opus 4.7 --- crates/cortex-core/src/catalogue.rs | 35 ++++++++++++++++++ crates/cortex-gateway/src/router.rs | 56 +++++++++++++++++++++++++++-- models.example.toml | 22 +++++++++++- 3 files changed, 110 insertions(+), 3 deletions(-) diff --git a/crates/cortex-core/src/catalogue.rs b/crates/cortex-core/src/catalogue.rs index ac0e028..e606cce 100644 --- a/crates/cortex-core/src/catalogue.rs +++ b/crates/cortex-core/src/catalogue.rs @@ -24,6 +24,17 @@ pub struct ModelProfile { /// Neurons where this model should never be evicted. #[serde(default)] pub pinned_on: Vec, + /// Source scheme this profile's weights come from. When set, the + /// router prefixes `id` with `scheme:` before forwarding the load + /// request to neuron, ensuring the daemon fetches from the right + /// registry regardless of which entry happens to match `id`. + /// + /// `None` lets neuron substitute its own `default_source` (typically + /// `huggingface`). Set to `"helexa"` when the model is hosted in + /// the helexa registry — operator-procurement-grade audit relies + /// on this being explicit per model rather than implicit. 
+ #[serde(default)] + pub source: Option<String>, } fn default_min_devices() -> u32 { @@ -140,6 +151,7 @@ mod tests { min_devices: 2, min_device_vram_mb: Some(24_000), pinned_on: vec![], + source: None, } } @@ -197,6 +209,29 @@ mod tests { assert_eq!(cat.resolve_alias("Qwen/Qwen3-8B"), "Qwen/Qwen3-8B"); } + #[test] + fn source_defaults_to_none_when_absent_from_toml() { + let src = r#" +[[models]] +id = "Qwen/Qwen3-30B" +harness = "candle" +"#; + let cat: ModelCatalogue = toml::from_str(src).expect("parse models table"); + assert!(cat.models[0].source.is_none()); + } + + #[test] + fn source_round_trips_through_toml() { + let src = r#" +[[models]] +id = "Helexa/Qwen3.6-27B-Uncensored" +harness = "candle" +source = "helexa" +"#; + let cat: ModelCatalogue = toml::from_str(src).expect("parse models table"); + assert_eq!(cat.models[0].source.as_deref(), Some("helexa")); + } + #[test] fn aliases_table_round_trips_through_toml() { let src = r#" diff --git a/crates/cortex-gateway/src/router.rs b/crates/cortex-gateway/src/router.rs index 5e59757..eeb20b4 100644 --- a/crates/cortex-gateway/src/router.rs +++ b/crates/cortex-gateway/src/router.rs @@ -292,7 +292,7 @@ async fn profile_to_spec( }; ModelSpec { - model_id: profile.id.clone(), + model_id: qualified_model_id(profile), harness: profile.harness.clone(), quant: profile.quant.clone(), tensor_parallel, @@ -300,6 +300,22 @@ async fn profile_to_spec( } } +/// Prefix the catalogue id with the scheme when one is declared, so +/// neuron resolves the load against the right registry. Without this, +/// a profile pointing at the helexa registry would resolve via +/// neuron's `default_source` (typically `huggingface`) and fetch +/// bytes from the wrong place. Profiles that omit `source` continue +/// to pass the bare id through, preserving the pre-Phase-3 contract. +/// +/// Stays at module scope (not nested in `profile_to_spec`) so the unit +/// tests can exercise it without spinning up CortexState topology. 
+fn qualified_model_id(profile: &ModelProfile) -> String { + match profile.source.as_deref() { + Some(scheme) if !scheme.is_empty() => format!("{scheme}:{}", profile.id), + _ => profile.id.clone(), + } +} + /// Resolve neuron's `/models/{id}/endpoint` to its inference URL and /// build the final `RouteDecision`. Shared by all three priority /// branches above. @@ -375,7 +391,43 @@ fn rewrite_loopback_host(inference_url: &str, neuron_endpoint: &str) -> Option) -> ModelProfile { + ModelProfile { + id: id.into(), + harness: "candle".into(), + quant: None, + vram_mb: None, + min_devices: 1, + min_device_vram_mb: None, + pinned_on: vec![], + source: source.map(String::from), + } + } + + #[test] + fn qualified_id_passes_through_when_source_absent() { + let p = bare_profile("Qwen/Qwen3-30B", None); + assert_eq!(qualified_model_id(&p), "Qwen/Qwen3-30B"); + } + + #[test] + fn qualified_id_prefixes_when_source_set() { + let p = bare_profile("Helexa/Qwen3.6-27B-Uncensored", Some("helexa")); + assert_eq!( + qualified_model_id(&p), + "helexa:Helexa/Qwen3.6-27B-Uncensored" + ); + } + + #[test] + fn qualified_id_passes_through_when_source_is_empty_string() { + // An empty scheme is treated as absent — neuron's default_source + // substitution kicks in. + let p = bare_profile("Qwen/Qwen3-30B", Some("")); + assert_eq!(qualified_model_id(&p), "Qwen/Qwen3-30B"); + } #[test] fn rewrites_localhost_keeps_port_and_path() { diff --git a/models.example.toml b/models.example.toml index 12da8a5..eb2dca1 100644 --- a/models.example.toml +++ b/models.example.toml @@ -7,7 +7,8 @@ # returns and what the router can cold-load on demand. # # Field reference: -# id - HuggingFace model id, exact match. +# id - Repo id in the source registry (e.g. "Qwen/Qwen3.6-27B"). +# Exact match. # harness - which engine handles inference (currently "candle"). # quant - GGUF quantisation tag for the file in the HF repo # (e.g. "Q4_K_M"). 
Omit/empty for the dense @@ -20,6 +21,11 @@ # pinned_on - optional whitelist of neuron names. Non-empty # narrows feasibility to just those neurons and # protects the model from LRU eviction there. +# source - optional source scheme ("huggingface", "helexa", +# operator mirror tag). When set, cortex forwards +# the load to neuron as `scheme:id` so the daemon +# fetches from the right registry. Omit to let +# neuron substitute its own `default_source`. # Tensor-parallel target — needs a neuron with at least 2 large GPUs. # The example pins to a specific neuron name; adjust or remove the @@ -49,6 +55,20 @@ vram_mb = 500 min_devices = 1 min_device_vram_mb = 4000 +# Helexa registry model — `source` pins this entry to the helexa +# scheme so cortex forwards `helexa:Helexa/Qwen3.6-27B-Uncensored` to +# neuron's /models/load. Requires the neuron config to declare a +# matching [harness.candle.sources.helexa] entry pointing at the +# helexa registry endpoint (see neuron.example.toml). +# +# [[models]] +# id = "Helexa/Qwen3.6-27B-Uncensored" +# harness = "candle" +# source = "helexa" +# vram_mb = 54000 +# min_devices = 2 +# min_device_vram_mb = 24000 + # -- Tier aliases ------------------------------------------------------------ # Optional. Clients can request inference against an alias (e.g. # `model: "helexa/small"` in /v1/chat/completions) and cortex