fix: yaml syntax

fix(ci): unset RUSTC_WRAPPER during sccache install
The workflow-level env set RUSTC_WRAPPER=sccache for every step, including the install step itself. cargo install sccache then tried to invoke `sccache rustc -vV` to detect the toolchain before sccache existed on PATH, failing with "No such file or directory". Override RUSTC_WRAPPER to empty on the install step so cargo uses rustc directly; subsequent steps still inherit the wrapper. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 09:25:02 +03:00 · 2026-04-16 08:31:26 +03:00 · 2026-04-15 17:44:21 +03:00 · 2026-04-15 17:38:13 +03:00 · 2026-04-15 16:31:13 +03:00 · 2026-04-15 16:28:31 +03:00
46 changed files with 2335 additions and 522 deletions
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -2,11 +2,20 @@ name: CI

 on:
  push:
-    branches: ['**']
-    tags: ['v*']
+    branches: ["**"]
+    tags: ["v*"]
  pull_request:
    branches: [main]

+env:
+  RUSTC_WRAPPER: sccache
+  SCCACHE_BUCKET: sccache
+  SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
+  SCCACHE_REGION: auto
+  SCCACHE_S3_USE_SSL: "false"
+  AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
+
 jobs:
  check:
    name: Format, lint, build, test
@@ -14,6 +23,16 @@ jobs:
    steps:
      - uses: actions/checkout@v4

+      - name: Ensure sccache with S3 support
+        env:
+          RUSTC_WRAPPER: ""
+        run: |
+          if sccache --version 2>/dev/null && sccache --show-stats 2>/dev/null; then
+            echo "sccache with S3 support already installed"
+          else
+            cargo install sccache --features s3 --locked
+          fi
+
      - name: Check formatting
        run: cargo fmt --check --all

@@ -26,8 +45,11 @@ jobs:
      - name: Test
        run: cargo test --workspace

-  rpm:
-    name: Build SRPM
+      - name: Show sccache stats
+        run: sccache --show-stats
+
+  srpm-cortex:
+    name: Build cortex SRPM
    runs-on: fedora
    needs: check
    if: startsWith(github.ref, 'refs/tags/v')
@@ -39,14 +61,12 @@ jobs:
        run: |
          VERSION="${GITHUB_REF#refs/tags/v}"
          echo "VERSION=${VERSION}" >> "$GITHUB_OUTPUT"
-          echo "Building version: ${VERSION}"

-      - name: Stamp version into spec
+      - name: Stamp version
        run: |
          VERSION="${{ steps.version.outputs.VERSION }}"
          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
          sed -i "s/^Version:.*/Version:        ${VERSION}/" cortex.spec
-          echo "Stamped version ${VERSION}"

      - name: Generate source tarball
        run: |
@@ -77,19 +97,70 @@ jobs:
      - name: Upload SRPM artifact
        uses: actions/upload-artifact@v3
        with:
-          name: srpm
-          path: '*.src.rpm'
+          name: srpm-cortex
+          path: "*.src.rpm"

-  copr:
-    name: Publish to COPR
+  srpm-neuron:
+    name: Build neuron SRPM
    runs-on: fedora
-    needs: rpm
+    needs: check
    if: startsWith(github.ref, 'refs/tags/v')
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Determine version
+        id: version
+        run: |
+          VERSION="${GITHUB_REF#refs/tags/v}"
+          echo "VERSION=${VERSION}" >> "$GITHUB_OUTPUT"
+
+      - name: Stamp version
+        run: |
+          VERSION="${{ steps.version.outputs.VERSION }}"
+          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
+          sed -i "s/^Version:.*/Version:        ${VERSION}/" neuron.spec
+
+      - name: Generate source tarball
+        run: |
+          set -ex
+          VERSION="${{ steps.version.outputs.VERSION }}"
+          tar czf /tmp/neuron-${VERSION}.tar.gz \
+            --transform "s,^\.,neuron-${VERSION}," \
+            --exclude='./target' \
+            --exclude='./.git' \
+            --exclude='*.tar.gz' \
+            --exclude='*.src.rpm' \
+            .
+          mv /tmp/neuron-${VERSION}.tar.gz .
+
+      - name: Vendor Rust dependencies
+        run: |
+          VERSION="${{ steps.version.outputs.VERSION }}"
+          cargo vendor vendor/
+          tar czf neuron-${VERSION}-vendor.tar.gz vendor/
+          rm -rf vendor/
+
+      - name: Build SRPM
+        run: |
+          rpmbuild -bs neuron.spec \
+            --define "_sourcedir $(pwd)" \
+            --define "_srcrpmdir $(pwd)"
+
+      - name: Upload SRPM artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: srpm-neuron
+          path: "*.src.rpm"
+
+  copr-cortex:
+    name: Publish cortex to COPR
+    runs-on: fedora
+    needs: srpm-cortex
    steps:
      - name: Download SRPM
        uses: actions/download-artifact@v3
        with:
-          name: srpm
+          name: srpm-cortex

      - name: Configure copr-cli
        run: |
@@ -97,4 +168,49 @@ jobs:
          echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr

      - name: Submit build to COPR
-        run: copr-cli build cortex *.src.rpm
+        run: copr-cli build helexa/cortex *.src.rpm
+
+  copr-neuron:
+    name: Publish neuron to COPR
+    runs-on: fedora
+    needs: srpm-neuron
+    steps:
+      - name: Download SRPM
+        uses: actions/download-artifact@v3
+        with:
+          name: srpm-neuron
+
+      - name: Configure copr-cli
+        run: |
+          mkdir -p ~/.config
+          echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
+
+      - name: Submit build to COPR
+        run: copr-cli build helexa/neuron *.src.rpm
+
+  bump-version:
+    name: Bump version in source
+    runs-on: fedora
+    needs: [copr-cortex, copr-neuron]  # runs only after both COPR publish jobs
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Stamp version and push
+        env:
+          GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
+        run: |
+          VERSION="${GITHUB_REF#refs/tags/v}"
+          sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
+          sed -i "s/^Version:.*/Version:        ${VERSION}/" cortex.spec neuron.spec
+          # Refresh Cargo.lock without compiling; this job does not install sccache, so bypass the wrapper.
+          RUSTC_WRAPPER= cargo update --workspace || echo "warning: could not refresh Cargo.lock" >&2
+          git config user.name "Gitea Actions"
+          git config user.email "actions@git.lair.cafe"
+          git add Cargo.toml Cargo.lock cortex.spec neuron.spec
+          if git diff --cached --quiet; then
+            echo "Version already at ${VERSION}"
+          else
+            git commit -m "chore: bump version to ${VERSION}"
+            git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
+            git push origin HEAD:main
+          fi
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 .idea/
 .vscode/
 cortex.toml
+doc/plan/*
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -277,15 +277,388 @@ histograms appear after a proxied request.
 Token-level metrics (tok/s, TTFT) deferred — requires parsing the
 response body or final SSE chunk, which is Phase 6b work.

-### Phase 7 (lower priority): Agent sidecar
+## 2026-04-15 addendum

-**Goal:** Per-node binary that handles VRAM defrag restarts and
-reports real VRAM usage via `nvidia-smi`.
+**Phases 1–6 complete.** The gateway proxies requests (streaming and
+non-streaming), routes by model name to the correct node, polls node
+`/v1/models` for live state, evicts LRU models with pinning, translates
+Anthropic ↔ OpenAI envelopes, and emits Prometheus metrics. CI is green.

-This is deferred. The gateway handles the critical path (model
-lifecycle) entirely via the mistral.rs HTTP API. The agent adds
-operational polish: automatic process restart when `lifecycle_cycles`
-exceeds threshold, real VRAM reporting (vs. estimates), and
-potentially GPU temperature/power monitoring.
+**Phase 7 onward** introduces `neuron` — the per-node daemon that replaces
+the placeholder `cortex-agent` crate — along with hardware discovery,
+a harness abstraction (so cortex is not permanently wedded to mistral.rs),
+and a model catalogue for placement decisions.

-**Defer until:** Phases 1-6 are merged and running in production.
+
+### Architecture: cortex + neuron
+
+cortex is the **control plane**. It exposes the unified API, routes
+requests, manages model lifecycle across the fleet, and collects metrics.
+
+neuron is the **node plane**. One instance runs on every GPU host. It:
+- **Discovers** local hardware (GPU count, types, VRAM, CUDA compute
+  capability, driver version) and reports it to cortex.
+- **Manages harnesses** — inference engines like mistral.rs, llama.cpp,
+  or ComfyUI. Each harness is a trait implementation. neuron starts,
+  stops, health-checks, and proxies to whichever harness is serving a
+  given model.
+- **Manages model lifecycle** — load, unload, status — abstracting the
+  differences between harnesses (mistral.rs has HTTP lifecycle endpoints;
+  llama.cpp may need process management).
+- **Reports runtime state** — per-device VRAM usage, GPU utilisation,
+  temperature, loaded models with actual VRAM consumption.
+
+cortex never shells out to `nvidia-smi`, never touches systemd units,
+and never talks directly to a harness. It talks only to neurons.
+
+```
+                    ┌─────────────────────┐
+                    │      cortex         │
+                    │  (cortex-gateway)   │
+                    │  Router · Evictor   │
+                    │  Metrics · Translate│
+                    └──┬──────┬────────┬──┘
+                       │      │        │
+            ┌──────────▼┐  ┌──▼─────┐  ┌▼──────────┐
+            │  neuron   │  │ neuron │  │  neuron   │
+            │  beast    │  │ benjy  │  │ quadbrat  │
+            │           │  │        │  │           │
+            │ harness:  │  │harness:│  │ harness:  │
+            │ mistralrs │  │mistral │  │ mistralrs │
+            │ (+ comfy) │  │rs      │  │           │
+            └───────────┘  └────────┘  └───────────┘
+```
+
+
+## The Harness trait
+
+Defined in `cortex-core` so both cortex and neuron share the type
+definitions. neuron provides the runtime implementations.
+
+```rust
+/// What an inference harness must do, from neuron's perspective.
+#[async_trait]
+pub trait Harness: Send + Sync {
+    /// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
+    fn name(&self) -> &str;
+
+    /// Start the harness process if it is not already running.
+    async fn start(&self, config: &HarnessConfig) -> Result<()>;
+
+    /// Stop the harness process gracefully.
+    async fn stop(&self) -> Result<()>;
+
+    /// Health check. Returns the harness process status.
+    async fn health(&self) -> HarnessHealth;
+
+    /// List models the harness knows about (loaded + unloaded).
+    async fn list_models(&self) -> Result<Vec<ModelInfo>>;
+
+    /// Load a model with the given spec (quant, TP, device assignment).
+    async fn load_model(&self, spec: &ModelSpec) -> Result<()>;
+
+    /// Unload a model, freeing device memory.
+    async fn unload_model(&self, model_id: &str) -> Result<()>;
+
+    /// Return the URL where inference requests for this model should
+    /// be sent. None if the model is not loaded.
+    async fn inference_endpoint(&self, model_id: &str) -> Option<String>;
+}
+```
+
+The mistral.rs implementation wraps the HTTP API:
+- `list_models` → `GET /v1/models`
+- `load_model` → `POST /v1/models/reload`
+- `unload_model` → `POST /v1/models/unload`
+- `inference_endpoint` → returns the base URL (the model name routes
+  internally within mistral.rs)
+- `start`/`stop` → manage the `mistralrs.service` systemd unit
+
+A future llama.cpp implementation would manage per-model `llama-server`
+processes (one process per loaded model, each on its own port).
+
+
+## neuron API
+
+neuron exposes an HTTP API on port 9090 that cortex polls and calls.
+
+```
+GET  /discovery
+     → {
+         hostname, os, kernel,
+         cuda_version, driver_version,
+         devices: [{ index, name, vram_total_mb, compute_capability }],
+         harnesses: ["mistralrs", ...]
+       }
+
+GET  /health
+     → {
+         uptime_secs,
+         devices: [{ index, vram_used_mb, vram_free_mb, utilization_pct, temp_c }]
+       }
+
+GET  /models
+     → [{ id, harness, status, devices: [int], vram_used_mb }]
+
+POST /models/load
+     ← { model_id, harness, quant, tensor_parallel, devices: [int] }
+     → { status: "loaded" | "loading" }
+
+POST /models/unload
+     ← { model_id }
+     → { status: "unloaded" }
+
+GET  /models/{model_id}/endpoint
+     → { url: "http://localhost:8080" }
+```
+
+cortex never constructs a harness-specific URL. It asks neuron for the
+inference endpoint and proxies there.
+
+
+## Discovery replaces static device config
+
+cortex.toml no longer contains device types, VRAM sizes, or CUDA
+architectures. That information comes from neuron's `/discovery`
+endpoint. cortex.toml shrinks to:
+
+```toml
+[gateway]
+listen = "0.0.0.0:8000"
+metrics_listen = "0.0.0.0:9100"
+
+[eviction]
+strategy = "lru"
+defrag_after_cycles = 50
+
+[[neurons]]
+name = "beast"
+endpoint = "http://beast.hanzalova.internal:9090"
+
+[[neurons]]
+name = "benjy"
+endpoint = "http://benjy.kosherinata.internal:9090"
+
+[[neurons]]
+name = "quadbrat"
+endpoint = "http://quadbrat.hanzalova.internal:9090"
+```
+
+On startup and periodically, cortex calls `GET /discovery` and
+`GET /health` on each neuron to build its topology map. The router
+uses this topology — not config — to make placement decisions.
+
+
+## Model catalogue
+
+Model serving profiles live in a separate file (`models.toml`) because
+they describe how to serve a model, not where. cortex matches these
+profiles against the discovered topology to determine valid placements.
+
+```toml
+[[models]]
+id = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
+harness = "mistralrs"
+quant = "Q4_K_M"
+vram_mb = 19000
+min_devices = 2
+min_device_vram_mb = 10000
+pinned_on = ["beast"]       # optional: never evict from these neurons
+
+[[models]]
+id = "Qwen/Qwen3-VL-8B"
+harness = "mistralrs"
+quant = "Q8_0"
+vram_mb = 10000
+min_devices = 1
+
+[[models]]
+id = "Qwen/Qwen2.5-Coder-14B-Instruct"
+harness = "mistralrs"
+quant = "Q6_K"
+vram_mb = 12000
+min_devices = 1
+pinned_on = ["benjy"]
+```
+
+The router consults the catalogue to answer: "model X needs 2 devices
+with ≥10GB each; beast has 2× RTX 5090 at 32GB each; that's a valid
+placement." This replaces the current per-node `pinned` list in config
+and the hardcoded `vram_mb` per node.
+
+
+## Revised repository layout
+
+```
+cortex/
+├── Cargo.toml
+├── cortex.toml                 # gateway config (neurons only)
+├── models.toml                 # model catalogue
+├── README.md
+├── CLAUDE.md
+├── crates/
+│   ├── cortex-core/            # shared types
+│   │   └── src/
+│   │       ├── lib.rs
+│   │       ├── config.rs       # GatewayConfig, NeuronEndpoint
+│   │       ├── catalogue.rs    # ModelProfile, placement matching
+│   │       ├── discovery.rs    # DeviceInfo, DiscoveryResponse
+│   │       ├── harness.rs      # Harness trait, HarnessConfig, HarnessHealth
+│   │       ├── node.rs         # NodeState, ModelEntry, ModelStatus
+│   │       ├── openai.rs       # OpenAI envelope types
+│   │       ├── anthropic.rs    # Anthropic envelope types
+│   │       ├── translate.rs    # OpenAI <-> Anthropic translation
+│   │       └── metrics.rs      # RequestMetrics
+│   ├── cortex-gateway/         # control plane (existing, modified)
+│   │   └── src/
+│   │       ├── lib.rs
+│   │       ├── state.rs        # CortexState (updated: discovery topology)
+│   │       ├── router.rs       # updated: catalogue + discovery placement
+│   │       ├── proxy.rs        # streaming proxy (unchanged)
+│   │       ├── evictor.rs      # updated: talks to neuron, not mistralrs
+│   │       ├── poller.rs       # updated: polls neuron, not mistralrs
+│   │       ├── handlers.rs     # axum handlers (unchanged API surface)
+│   │       └── metrics.rs      # prometheus exporter (unchanged)
+│   ├── neuron/                 # node plane (replaces cortex-agent)
+│   │   └── src/
+│   │       ├── main.rs         # binary entrypoint, axum server on :9090
+│   │       ├── discovery.rs    # nvidia-smi, device enumeration
+│   │       ├── health.rs       # runtime GPU polling
+│   │       ├── api.rs          # HTTP handlers for /discovery, /models, etc.
+│   │       ├── harness/
+│   │       │   ├── mod.rs      # Harness trait re-export, registry
+│   │       │   ├── mistralrs.rs  # mistral.rs HTTP API wrapper
+│   │       │   └── llamacpp.rs   # stub for future llama.cpp support
+│   │       └── models.rs       # local model lifecycle orchestration
+│   └── cortex-cli/             # CLI entrypoint (unchanged)
+│       └── src/
+│           └── main.rs
+└── tests/
+```
+
+The `cortex-agent` crate is deleted. Its replacement is `neuron/`.
+
+
+## Implementation plan (phases 7+)
+
+Phases 1–6 are merged and passing CI. Each subsequent phase is a
+branch → PR. CI (fmt, clippy, test) must pass before merge.
+
+### Phase 7: neuron scaffold and discovery ✅
+
+Completed. Deleted `cortex-agent`, created `crates/neuron/` (binary:
+`neuron`). Added shared types to cortex-core: `discovery.rs`
+(DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse) and
+`harness.rs` (Harness async trait, HarnessConfig, ModelSpec, ModelInfo).
+
+neuron discovers GPUs via nvidia-smi, caches health readings, and
+serves `GET /discovery` and `GET /health`. Pure parsing functions
+separated from command execution for testability. 9 unit tests for
+nvidia-smi CSV parsing, 3 integration tests for the HTTP endpoints.
+
+### Phase 8: neuron harness — mistral.rs implementation ✅
+
+Completed. Full `Harness` trait implementation for mistral.rs in
+`neuron/src/harness/mistralrs.rs`: list_models, load_model, unload_model,
+inference_endpoint, health, start/stop (systemd). `HarnessRegistry` in
+`harness/mod.rs` maps harness name → `Box<dyn Harness>`, built from
+`neuron.toml` config. Four new neuron API endpoints: `GET /models`,
+`POST /models/load`, `POST /models/unload`, `GET /models/{model_id}/endpoint`.
+
+Config via `neuron.toml` (figment + env override). Integration test
+covers full model lifecycle through neuron → mock mistral.rs backend.
+
+### Phase 9: cortex talks to neurons ✅
+
+Completed. Full refactor of cortex-gateway to talk to neurons:
+
+- **Config**: `NodeConfig { endpoint, vram_mb, pinned }` replaced with
+  `NeuronEndpoint { name, endpoint }`. Hardware info comes from neuron
+  discovery, pinning from `models.toml` catalogue.
+- **catalogue.rs**: `ModelProfile` with `pinned_on`, `ModelCatalogue`
+  with `is_pinned()` for eviction decisions.
+- **Poller**: polls neuron's `GET /models` (ModelInfo format) instead
+  of mistralrs `/v1/models`.
+- **Router**: asks neuron `GET /models/{model_id}/endpoint` for the inference
+  URL before proxying. Decouples cortex from knowing harness ports.
+- **Evictor**: calls `POST {neuron}/models/unload` instead of
+  mistralrs directly. Uses catalogue for pinning.
+- **Tests**: all 22 gateway tests updated to mock neuron API instead
+  of raw mistralrs. 36 total tests passing.
+
+Topology-aware placement (min_devices, min_device_vram_mb) deferred —
+the router currently routes based on polled model status. Catalogue
+placement matching can be added incrementally.
+
+### Phase 10: RPM packaging ✅
+
+Completed. Both packages have RPM specs, systemd units, and example configs.
+CI builds parallel SRPMs on tag push and publishes to separate COPR repos.
+
+- `cortex.spec` → `helexa/cortex` COPR: binary, systemd unit, config files
+- `neuron.spec` → `helexa/neuron` COPR: binary, systemd unit, config
+- `data/cortex.service`, `data/neuron.service` — systemd units
+- `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
+- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR publish
+
+Install:
+```sh
+dnf copr enable helexa/cortex && dnf install cortex    # gateway host
+dnf copr enable helexa/neuron && dnf install neuron    # GPU nodes
+```
+
+### Phase 11: llama.cpp harness stub
+
+**Goal:** Prove the harness abstraction works with a second engine.
+
+**Steps:**
+1. `crates/neuron/src/harness/llamacpp.rs` — implement the `Harness`
+   trait for llama.cpp's `llama-server`.
+   - `start()` — launch `llama-server` with the correct model path,
+     `--port`, `--n-gpu-layers`, `--tensor-split` args. Track the
+     child process.
+   - `stop()` — send SIGTERM to the child process.
+   - `list_models()` — llama-server serves one model per process, so
+     return a single-element list.
+   - `load_model()` — start a new llama-server process for this model.
+   - `unload_model()` — stop the process.
+   - `inference_endpoint()` — return `http://localhost:{assigned_port}`.
+2. Port allocation: neuron assigns ports from a range (e.g. 8100-8199)
+   to llama-server instances.
+3. Register in `HarnessRegistry` when configured:
+   ```toml
+   [[harnesses]]
+   name = "llamacpp"
+   binary = "/usr/local/bin/llama-server"
+   port_range = [8100, 8199]
+   ```
+4. Tests: mock llama-server (simple HTTP server returning canned
+   responses), test load/unload/endpoint lifecycle.
+
+**Done when:** A model with `harness = "llamacpp"` in `models.toml` can
+be loaded and served through cortex. Tests pass with mock llama-server.
+
+### Phase 12 (lower priority): mistral.rs COPR packaging
+
+**Goal:** Fedora RPMs for mistral.rs built against specific CUDA versions.
+
+**Steps:**
+1. `mistralrs-cuda.spec` — RPM spec that clones a pinned mistral.rs git
+   tag, builds with `--features cuda`, links against the system CUDA
+   toolkit. Produces `mistralrs-cuda13-server` (CUDA 13.x / sm_120) and
+   `mistralrs-cuda12-server` (CUDA 12.x / sm_89). Install binary to
+   `/usr/local/bin/mistralrs`.
+2. COPR build config: enable the NVIDIA CUDA repo as a build dependency.
+   Pin the CUDA toolkit version in `BuildRequires`.
+3. Gitea Actions or manual workflow: bump the mistral.rs tag in the spec,
+   trigger COPR rebuild.
+4. neuron's mistralrs harness config references which binary/package
+   provides the mistral.rs binary. neuron could warn at startup if the
+   installed mistral.rs CUDA version doesn't match the discovered driver.
+
+**Done when:** `dnf install mistralrs-cuda13-server` on beast provides a
+working `mistralrs` binary built for Blackwell GPUs. `dnf install
+mistralrs-cuda12-server` on benjy provides one built for Ada GPUs.
+
+This is a separate repo/spec — not part of the cortex workspace — but
+tightly coupled operationally. Track it as a sibling project.
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -88,6 +88,17 @@ version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"

+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "atomic"
 version = "0.6.1"
@@ -338,19 +349,6 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"

-[[package]]
-name = "cortex-agent"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "cortex-core",
- "reqwest",
- "serde",
- "serde_json",
- "tokio",
- "tracing",
-]
-
 [[package]]
 name = "cortex-cli"
 version = "0.1.0"
@@ -371,6 +369,7 @@ name = "cortex-core"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "chrono",
 "figment",
 "serde",
@@ -402,6 +401,7 @@ dependencies = [
 "tower",
 "tower-http",
 "tracing",
+ "urlencoding",
 ]

 [[package]]
@@ -1182,6 +1182,25 @@ dependencies = [
 "tempfile",
 ]

+[[package]]
+name = "neuron"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "axum",
+ "clap",
+ "cortex-core",
+ "figment",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "tokio",
+ "toml",
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -2219,6 +2238,12 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "urlencoding"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
+
 [[package]]
 name = "utf8_iter"
 version = "1.0.4"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,8 +3,8 @@ resolver = "2"
 members = [
    "crates/cortex-core",
    "crates/cortex-gateway",
-    "crates/cortex-agent",
    "crates/cortex-cli",
+    "crates/neuron",
 ]

 [workspace.package]
@@ -46,6 +46,12 @@ figment = { version = "0.10", features = ["toml", "env"] }
 anyhow = "1"
 thiserror = "2"

+# async traits
+async-trait = "0.1"
+
+# CLI
+clap = { version = "4", features = ["derive"] }
+
 # futures / streams (for SSE proxying)
 futures = "0.3"
 tokio-stream = "0.1"
@@ -54,4 +60,3 @@ eventsource-stream = "0.2"
 # workspace crates
 cortex-core = { path = "crates/cortex-core" }
 cortex-gateway = { path = "crates/cortex-gateway" }
-cortex-agent = { path = "crates/cortex-agent" }
--- a/cortex.spec
+++ b/cortex.spec
@@ -1,7 +1,7 @@
 Name:           cortex
 Version:        0.1.0
 Release:        1%{?dist}
-Summary:        Inference gateway for multi-node mistral.rs clusters
+Summary:        Inference gateway for multi-node GPU clusters

 License:        GPL-3.0-or-later
 URL:            https://git.lair.cafe/helexa/cortex
@@ -15,11 +15,13 @@ BuildRequires:  cargo
 BuildRequires:  gcc
 BuildRequires:  systemd-rpm-macros

+Requires(pre):  shadow-utils
+
 %description
-Cortex is a Rust reverse-proxy that sits in front of multiple mistral.rs
-inference nodes and presents a unified OpenAI and Anthropic compatible
-API surface. It handles model routing, lifecycle management, request
-translation, and metrics collection.
+Cortex is a Rust reverse-proxy that sits in front of multiple inference
+nodes (via neuron daemons) and presents a unified OpenAI and Anthropic
+compatible API surface. It handles model routing, lifecycle management,
+request translation, and metrics collection.

 %prep
 %autosetup
@@ -38,12 +40,33 @@ cargo build --release -p cortex-cli

 %install
 install -Dm755 target/release/cortex %{buildroot}%{_bindir}/cortex
+install -Dm644 data/cortex.service %{buildroot}%{_unitdir}/cortex.service
+install -dm750 %{buildroot}%{_sysconfdir}/cortex
+install -Dm640 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
+install -Dm640 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
+
+%pre
+getent group cortex >/dev/null || groupadd -r cortex
+getent passwd cortex >/dev/null || useradd -r -g cortex -d /var/lib/cortex -s /sbin/nologin cortex
+
+%post
+%systemd_post cortex.service
+
+%preun
+%systemd_preun cortex.service
+
+%postun
+%systemd_postun_with_restart cortex.service

 %files
 %license LICENSE
 %doc README.md
 %{_bindir}/cortex
+%{_unitdir}/cortex.service
+%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
+%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/cortex.toml
+%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/models.toml

 %changelog
-* Mon Apr 14 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
+* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
 - Initial package
--- a/crates/cortex-agent/Cargo.toml
+++ b/crates/cortex-agent/Cargo.toml
@@ -1,14 +0,0 @@
-[package]
-name = "cortex-agent"
-version.workspace = true
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-cortex-core.workspace = true
-tokio.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-reqwest.workspace = true
-tracing.workspace = true
-anyhow.workspace = true
--- a/crates/cortex-agent/src/agent.rs
+++ b/crates/cortex-agent/src/agent.rs
@@ -1,72 +0,0 @@
-//! Per-node agent sidecar.
-//!
-//! This is a future component that runs on each GPU node alongside mistralrs.
-//! It handles:
-//!   - VRAM defragmentation (restarting the mistralrs systemd unit when the
-//!     gateway signals that lifecycle_cycles has exceeded the threshold)
-//!   - Local nvidia-smi polling for actual VRAM usage reporting
-//!   - Systemd unit management for mistralrs process restarts
-//!
-//! For now this is a stub. The gateway's poller + evictor handle the critical
-//! path (model lifecycle via the mistralrs HTTP API). The agent adds
-//! operational niceties that can be built incrementally.
-
-/// Placeholder for agent configuration.
-#[derive(Debug, Clone)]
-pub struct AgentConfig {
-    /// The local mistralrs endpoint to monitor.
-    pub mistralrs_endpoint: String,
-    /// The systemd unit name for mistralrs (e.g. "mistralrs.service").
-    pub systemd_unit: String,
-}
-
-/// Restart the local mistralrs process via systemd.
-/// This is the nuclear option for VRAM defragmentation.
-pub async fn restart_mistralrs(config: &AgentConfig) -> anyhow::Result<()> {
-    tracing::warn!(
-        unit = %config.systemd_unit,
-        "restarting mistralrs for VRAM defragmentation"
-    );
-
-    let output = tokio::process::Command::new("systemctl")
-        .args(["restart", &config.systemd_unit])
-        .output()
-        .await?;
-
-    if output.status.success() {
-        tracing::info!(unit = %config.systemd_unit, "mistralrs restarted successfully");
-        Ok(())
-    } else {
-        let stderr = String::from_utf8_lossy(&output.stderr);
-        anyhow::bail!("systemctl restart failed: {stderr}");
-    }
-}
-
-/// Query nvidia-smi for current VRAM usage on this node.
-/// Returns (used_mb, total_mb) for each GPU.
-pub async fn query_vram() -> anyhow::Result<Vec<(u64, u64)>> {
-    let output = tokio::process::Command::new("nvidia-smi")
-        .args([
-            "--query-gpu=memory.used,memory.total",
-            "--format=csv,noheader,nounits",
-        ])
-        .output()
-        .await?;
-
-    if !output.status.success() {
-        let stderr = String::from_utf8_lossy(&output.stderr);
-        anyhow::bail!("nvidia-smi failed: {stderr}");
-    }
-
-    let stdout = String::from_utf8_lossy(&output.stdout);
-    let mut gpus = Vec::new();
-    for line in stdout.lines() {
-        let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
-        if parts.len() == 2 {
-            let used: u64 = parts[0].parse().unwrap_or(0);
-            let total: u64 = parts[1].parse().unwrap_or(0);
-            gpus.push((used, total));
-        }
-    }
-    Ok(gpus)
-}
--- a/crates/cortex-agent/src/lib.rs
+++ b/crates/cortex-agent/src/lib.rs
@@ -1 +0,0 @@
-pub mod agent;
--- a/crates/cortex-cli/Cargo.toml
+++ b/crates/cortex-cli/Cargo.toml
@@ -17,4 +17,4 @@ tracing-subscriber.workspace = true
 anyhow.workspace = true
 reqwest.workspace = true
 serde_json.workspace = true
-clap = { version = "4", features = ["derive"] }
+clap.workspace = true
--- a/crates/cortex-cli/src/main.rs
+++ b/crates/cortex-cli/src/main.rs
@@ -46,7 +46,7 @@ async fn main() -> Result<()> {
                .map_err(|e| anyhow::anyhow!("failed to load config from '{config}': {e}"))?;

            tracing::info!(
-                nodes = cfg.nodes.len(),
+                neurons = cfg.neurons.len(),
                listen = %cfg.gateway.listen,
                "starting cortex"
            );
--- a/crates/cortex-core/Cargo.toml
+++ b/crates/cortex-core/Cargo.toml
@@ -13,3 +13,4 @@ chrono.workspace = true
 anyhow.workspace = true
 thiserror.workspace = true
 tracing.workspace = true
+async-trait.workspace = true
--- a/crates/cortex-core/src/catalogue.rs
+++ b/crates/cortex-core/src/catalogue.rs
@@ -0,0 +1,67 @@
+//! Model catalogue — profiles describing how to serve each model.
+
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+/// A model serving profile loaded from models.toml.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelProfile {
+    pub id: String,
+    pub harness: String,
+    #[serde(default)]
+    pub quant: Option<String>,
+    /// Estimated VRAM usage in MB when loaded.
+    #[serde(default)]
+    pub vram_mb: Option<u64>,
+    /// Minimum number of GPU devices required.
+    #[serde(default = "default_min_devices")]
+    pub min_devices: u32,
+    /// Minimum VRAM per device in MB.
+    #[serde(default)]
+    pub min_device_vram_mb: Option<u64>,
+    /// Neurons where this model should never be evicted.
+    #[serde(default)]
+    pub pinned_on: Vec<String>,
+}
+
+fn default_min_devices() -> u32 {
+    1
+}
+
+/// The full model catalogue.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct ModelCatalogue {
+    #[serde(default)]
+    pub models: Vec<ModelProfile>,
+}
+
+impl ModelCatalogue {
+    /// Load the catalogue from a TOML file; falls back to empty if missing, unreadable or invalid.
+    pub fn load(path: impl AsRef<Path>) -> Self {
+        let path = path.as_ref();
+        if !path.exists() {
+            tracing::info!(path = %path.display(), "no model catalogue found, using empty");
+            return Self::default();
+        }
+        match std::fs::read_to_string(path) {
+            Ok(contents) => match toml::from_str(&contents) {
+                Ok(cat) => cat,
+                Err(e) => {
+                    tracing::warn!(path = %path.display(), error = %e, "failed to parse model catalogue");
+                    Self::default()
+                }
+            },
+            Err(e) => {
+                tracing::warn!(path = %path.display(), error = %e, "failed to read model catalogue");
+                Self::default()
+            }
+        }
+    }
+
+    /// Check if a model is pinned on a given neuron (borrowed comparison, no allocation).
+    pub fn is_pinned(&self, model_id: &str, neuron_name: &str) -> bool {
+        self.models
+            .iter()
+            .any(|p| p.id == model_id && p.pinned_on.iter().any(|n| n.as_str() == neuron_name))
+    }
+}
--- a/crates/cortex-core/src/config.rs
+++ b/crates/cortex-core/src/config.rs
@@ -9,7 +9,15 @@ use std::path::Path;
 pub struct GatewayConfig {
    pub gateway: GatewaySettings,
    pub eviction: EvictionSettings,
-    pub nodes: Vec<NodeConfig>,
+    /// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
+    pub neurons: Vec<NeuronEndpoint>,
+    /// Path to the model catalogue file (default: "models.toml").
+    #[serde(default = "default_models_path")]
+    pub models_config: String,
+}
+
+fn default_models_path() -> String {
+    "models.toml".into()
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -24,8 +32,7 @@ pub struct GatewaySettings {
 pub struct EvictionSettings {
    /// Eviction strategy: "lru" or "priority"
    pub strategy: EvictionStrategy,
-    /// Restart the mistralrs process after this many load/unload cycles
-    /// to reclaim fragmented VRAM. 0 = never.
+    /// Number of load/unload cycles before flagging for defrag. 0 = never.
    #[serde(default)]
    pub defrag_after_cycles: u32,
 }
@@ -37,23 +44,19 @@ pub enum EvictionStrategy {
    Priority,
 }

+/// A neuron endpoint in the fleet. Hardware details come from
+/// neuron's /discovery endpoint, not from config.
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeConfig {
-    /// Human-readable node name (e.g. "gpu-large")
+pub struct NeuronEndpoint {
+    /// Human-readable node name (e.g. "beast")
    pub name: String,
-    /// Base URL of the mistralrs HTTP server (e.g. "http://gpu-large.internal:8080")
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
    pub endpoint: String,
-    /// Total VRAM in MB across all GPUs on this node
-    pub vram_mb: u64,
-    /// Model IDs that should never be evicted from this node
-    #[serde(default)]
-    pub pinned: Vec<String>,
 }

 impl GatewayConfig {
    /// Load configuration from a TOML file, with environment variable overrides.
-    /// Env vars are prefixed with `CORTEX_` and use `__` as a separator
-    /// (e.g. `CORTEX_GATEWAY__LISTEN=0.0.0.0:9000`).
+    /// Env vars are prefixed with `CORTEX_` and use `__` as a separator.
    pub fn load(path: impl AsRef<Path>) -> Result<Self, Box<figment::Error>> {
        Figment::new()
            .merge(Toml::file(path))
@@ -74,7 +77,8 @@ impl Default for GatewayConfig {
                strategy: EvictionStrategy::Lru,
                defrag_after_cycles: 50,
            },
-            nodes: vec![],
+            neurons: vec![],
+            models_config: default_models_path(),
        }
    }
 }
--- a/crates/cortex-core/src/discovery.rs
+++ b/crates/cortex-core/src/discovery.rs
@@ -0,0 +1,43 @@
+//! Hardware discovery and health types shared between cortex and neuron.
+
+use serde::{Deserialize, Serialize};
+
+/// Information about a single GPU device discovered on a node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DeviceInfo {
+    pub index: u32,
+    pub name: String,
+    pub vram_total_mb: u64,
+    pub compute_capability: String,
+}
+
+/// Full discovery response from a neuron endpoint.
+/// Returned by `GET /discovery`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DiscoveryResponse {
+    pub hostname: String,
+    pub os: String,
+    pub kernel: String,
+    pub cuda_version: Option<String>,
+    pub driver_version: Option<String>,
+    pub devices: Vec<DeviceInfo>,
+    pub harnesses: Vec<String>,
+}
+
+/// Runtime health metrics for a single GPU device.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DeviceHealth {
+    pub index: u32,
+    pub vram_used_mb: u64,
+    pub vram_free_mb: u64,
+    pub utilization_pct: u32,
+    pub temp_c: u32,
+}
+
+/// Runtime health response from a neuron endpoint.
+/// Returned by `GET /health`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HealthResponse {
+    pub uptime_secs: u64,
+    pub devices: Vec<DeviceHealth>,
+}
--- a/crates/cortex-core/src/harness.rs
+++ b/crates/cortex-core/src/harness.rs
@@ -0,0 +1,76 @@
+//! Harness trait and supporting types for inference engine management.
+//!
+//! Defined in cortex-core so both cortex (control plane) and neuron
+//! (node plane) share the type definitions. neuron provides the
+//! runtime implementations.
+
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+
+/// Configuration for a harness instance on a neuron.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HarnessConfig {
+    pub name: String,
+    /// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
+    pub endpoint: Option<String>,
+    /// Systemd unit name, if the harness is managed via systemd.
+    pub systemd_unit: Option<String>,
+}
+
+/// Health status of a harness process.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HarnessHealth {
+    pub name: String,
+    pub running: bool,
+    pub uptime_secs: Option<u64>,
+}
+
+/// Specification for loading a model through a harness.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelSpec {
+    pub model_id: String,
+    pub harness: String,
+    pub quant: Option<String>,
+    pub tensor_parallel: Option<u32>,
+    pub devices: Option<Vec<u32>>,
+}
+
+/// A model as reported by a harness.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ModelInfo {
+    pub id: String,
+    pub harness: String,
+    pub status: String,
+    pub devices: Vec<u32>,
+    pub vram_used_mb: Option<u64>,
+}
+
+/// What an inference harness must do, from neuron's perspective.
+#[async_trait]
+pub trait Harness: Send + Sync {
+    /// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
+    fn name(&self) -> &str;
+
+    /// Start the harness process if it is not already running.
+    async fn start(&self, config: &HarnessConfig) -> Result<()>;
+
+    /// Stop the harness process gracefully.
+    async fn stop(&self) -> Result<()>;
+
+    /// Health check. Returns the harness process status.
+    async fn health(&self) -> HarnessHealth;
+
+    /// List models the harness knows about (loaded + unloaded).
+    async fn list_models(&self) -> Result<Vec<ModelInfo>>;
+
+    /// Load a model with the given spec (quant, TP, device assignment).
+    async fn load_model(&self, spec: &ModelSpec) -> Result<()>;
+
+    /// Unload a model, freeing device memory.
+    async fn unload_model(&self, model_id: &str) -> Result<()>;
+
+    /// Return the URL where inference requests for this model should
+    /// be sent. None if the model is not loaded.
+    async fn inference_endpoint(&self, model_id: &str) -> Option<String>;
+}
--- a/crates/cortex-core/src/lib.rs
+++ b/crates/cortex-core/src/lib.rs
@@ -1,5 +1,8 @@
 pub mod anthropic;
+pub mod catalogue;
 pub mod config;
+pub mod discovery;
+pub mod harness;
 pub mod metrics;
 pub mod node;
 pub mod openai;
--- a/crates/cortex-core/src/node.rs
+++ b/crates/cortex-core/src/node.rs
@@ -2,13 +2,12 @@ use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;

-/// Runtime state of a single node in the fleet.
+/// Runtime state of a single neuron in the fleet.
 #[derive(Debug, Clone)]
 pub struct NodeState {
    pub name: String,
+    /// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
    pub endpoint: String,
-    pub vram_mb: u64,
-    pub pinned: Vec<String>,
    pub healthy: bool,
    pub models: HashMap<String, ModelEntry>,
    /// Number of load/unload cycles since last process restart.
@@ -27,7 +26,7 @@ pub struct ModelEntry {
    pub vram_estimate_mb: Option<u64>,
 }

-/// Model lifecycle status, matching the mistral.rs API.
+/// Model lifecycle status.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase")]
 pub enum ModelStatus {
@@ -52,23 +51,3 @@ pub struct ModelLocation {
    pub status: ModelStatus,
    pub vram_estimate_mb: Option<u64>,
 }
-
-/// Response from mistral.rs `GET /v1/models`.
-/// This is the upstream format we parse when polling nodes.
-#[derive(Debug, Clone, Deserialize)]
-pub struct MistralModelsResponse {
-    pub data: Vec<MistralModelEntry>,
-}
-
-#[derive(Debug, Clone, Deserialize)]
-pub struct MistralModelEntry {
-    pub id: String,
-    #[serde(default)]
-    pub status: Option<String>,
-}
-
-/// Request body for mistral.rs model lifecycle endpoints.
-#[derive(Debug, Clone, Serialize)]
-pub struct ModelLifecycleRequest {
-    pub model_id: String,
-}
--- a/crates/cortex-gateway/Cargo.toml
+++ b/crates/cortex-gateway/Cargo.toml
@@ -23,6 +23,7 @@ futures.workspace = true
 tokio-stream.workspace = true
 eventsource-stream.workspace = true
 bytes = "1"
+urlencoding = "2"

 [dev-dependencies]
 tokio = { workspace = true, features = ["test-util"] }
--- a/crates/cortex-gateway/src/evictor.rs
+++ b/crates/cortex-gateway/src/evictor.rs
@@ -1,29 +1,19 @@
 //! Model eviction logic.
 //!
-//! The evictor runs as a background task. When the router determines that a
-//! model needs to be loaded on a node but VRAM is tight, it can request
-//! eviction via a channel. The evictor then:
-//!   1. Identifies the LRU model on that node (excluding pinned models)
-//!   2. Calls `POST /v1/models/unload` on the node
-//!   3. Increments the lifecycle cycle counter (for defrag tracking)
+//! The evictor identifies the LRU model on a node (excluding pinned models),
+//! calls neuron's `POST /models/unload` to free the model, and updates
+//! local state.

 use crate::state::CortexState;
-use cortex_core::node::{ModelLifecycleRequest, ModelStatus};
+use cortex_core::node::ModelStatus;
 use std::sync::Arc;
 use std::time::Duration;

-/// Runs forever. Currently a placeholder that periodically checks for
-/// eviction opportunities. In the future, this will be driven by a
-/// channel from the router when VRAM pressure is detected.
+/// Runs forever. Placeholder for future channel-driven eviction.
 pub async fn eviction_loop(fleet: Arc<CortexState>) {
-    // TODO: Replace this polling approach with a channel-driven design
-    // where the router sends eviction requests when it detects that a
-    // model load would exceed available VRAM.
    loop {
        tokio::time::sleep(Duration::from_secs(30)).await;
-        // Placeholder: the actual eviction logic is in `evict_lru_on_node`,
-        // called on demand by the router.
-        let _ = &fleet; // suppress unused warning
+        let _ = &fleet;
    }
 }

@@ -33,18 +23,19 @@ pub async fn evict_lru_on_node(
    fleet: &CortexState,
    node_name: &str,
 ) -> anyhow::Result<Option<String>> {
-    let (endpoint, candidate) = {
+    let (neuron_endpoint, candidate) = {
        let nodes = fleet.nodes.read().await;
        let Some(node) = nodes.get(node_name) else {
            anyhow::bail!("node '{node_name}' not found");
        };

-        // Find the loaded model with the oldest last_accessed, excluding pinned.
+        // Find the loaded model with the oldest last_accessed,
+        // excluding models pinned on this neuron (from catalogue).
        let candidate = node
            .models
            .values()
            .filter(|m| m.status == ModelStatus::Loaded)
-            .filter(|m| !node.pinned.contains(&m.id))
+            .filter(|m| !fleet.catalogue.is_pinned(&m.id, node_name))
            .min_by_key(|m| m.last_accessed)
            .map(|m| m.id.clone());

@@ -58,18 +49,16 @@ pub async fn evict_lru_on_node(

    tracing::info!(node = node_name, model = %model_id, "evicting model");

-    let url = format!("{endpoint}/v1/models/unload");
+    // Call neuron's unload endpoint.
+    let url = format!("{neuron_endpoint}/models/unload");
    let resp = fleet
        .http_client
        .post(&url)
-        .json(&ModelLifecycleRequest {
-            model_id: model_id.clone(),
-        })
+        .json(&serde_json::json!({ "model_id": model_id }))
        .send()
        .await?;

    if resp.status().is_success() {
-        // Update local state.
        let mut nodes = fleet.nodes.write().await;
        if let Some(node) = nodes.get_mut(node_name) {
            if let Some(entry) = node.models.get_mut(&model_id) {
@@ -77,14 +66,13 @@ pub async fn evict_lru_on_node(
            }
            node.lifecycle_cycles += 1;

-            // Check if we should flag for defrag.
            if fleet.eviction.defrag_after_cycles > 0
                && node.lifecycle_cycles >= fleet.eviction.defrag_after_cycles
            {
                tracing::warn!(
                    node = node_name,
                    cycles = node.lifecycle_cycles,
-                    "VRAM fragmentation threshold reached — consider restarting mistralrs"
+                    "VRAM fragmentation threshold reached — consider restarting harness"
                );
            }
        }
--- a/crates/cortex-gateway/src/poller.rs
+++ b/crates/cortex-gateway/src/poller.rs
@@ -1,15 +1,16 @@
-//! Background poller that periodically queries each node's `/v1/models`
-//! endpoint to refresh the fleet state.
+//! Background poller that periodically queries each neuron's API
+//! to refresh the fleet state.

 use crate::state::CortexState;
 use chrono::Utc;
-use cortex_core::node::{MistralModelsResponse, ModelEntry, ModelStatus};
+use cortex_core::harness::ModelInfo;
+use cortex_core::node::{ModelEntry, ModelStatus};
 use std::sync::Arc;
 use std::time::Duration;

 const POLL_INTERVAL: Duration = Duration::from_secs(10);

-/// Runs forever, polling all nodes on a fixed interval.
+/// Runs forever, polling all neurons on a fixed interval.
 pub async fn poll_loop(fleet: Arc<CortexState>) {
    loop {
        poll_once(&fleet).await;
@@ -17,15 +18,15 @@ pub async fn poll_loop(fleet: Arc<CortexState>) {
    }
 }

-/// Poll all nodes once. Used by `poll_loop` and available for testing.
+/// Poll all neurons once. Used by `poll_loop` and available for testing.
 pub async fn poll_once(fleet: &CortexState) {
-    for nc in &fleet.node_configs {
-        poll_node(fleet, &nc.name, &nc.endpoint).await;
+    for nc in &fleet.neuron_configs {
+        poll_neuron(fleet, &nc.name, &nc.endpoint).await;
    }
 }

-async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {
-    let url = format!("{endpoint}/v1/models");
+async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
+    let url = format!("{endpoint}/models");

    let result = fleet
        .http_client
@@ -41,38 +42,36 @@ async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {

    match result {
        Ok(resp) if resp.status().is_success() => {
-            match resp.json::<MistralModelsResponse>().await {
-                Ok(models_resp) => {
-                    // Merge upstream model list into our state, preserving
-                    // our local metadata (last_accessed, vram_estimate).
+            match resp.json::<Vec<ModelInfo>>().await {
+                Ok(models) => {
                    let mut seen = std::collections::HashSet::new();
-                    for upstream in &models_resp.data {
+                    for upstream in &models {
                        seen.insert(upstream.id.clone());
-                        let status = parse_status(upstream.status.as_deref());
+                        let status = parse_status(&upstream.status);

                        node.models
                            .entry(upstream.id.clone())
                            .and_modify(|e| {
                                e.status = status;
+                                e.vram_estimate_mb = upstream.vram_used_mb;
                            })
                            .or_insert_with(|| ModelEntry {
                                id: upstream.id.clone(),
                                status,
                                last_accessed: None,
-                                vram_estimate_mb: None,
+                                vram_estimate_mb: upstream.vram_used_mb,
                            });
                    }

-                    // Remove models that are no longer reported by the node
-                    // (e.g. after a config change / restart).
+                    // Remove models no longer reported by the neuron.
                    node.models.retain(|id, _| seen.contains(id));

                    node.healthy = true;
                    node.last_poll = Some(Utc::now());
-                    tracing::debug!(node = name, models = models_resp.data.len(), "poll ok");
+                    tracing::debug!(node = name, models = models.len(), "poll ok");
                }
                Err(e) => {
-                    tracing::warn!(node = name, error = %e, "failed to parse /v1/models response");
+                    tracing::warn!(node = name, error = %e, "failed to parse /models response");
                    node.healthy = false;
                }
            }
@@ -81,24 +80,22 @@ async fn poll_node(fleet: &CortexState, name: &str, endpoint: &str) {
            tracing::warn!(
                node = name,
                status = %resp.status(),
-                "node returned non-success status"
+                "neuron returned non-success status"
            );
            node.healthy = false;
        }
        Err(e) => {
-            tracing::warn!(node = name, error = %e, "failed to reach node");
+            tracing::warn!(node = name, error = %e, "failed to reach neuron");
            node.healthy = false;
        }
    }
 }

-fn parse_status(s: Option<&str>) -> ModelStatus {
+fn parse_status(s: &str) -> ModelStatus {
    match s {
-        Some("loaded") => ModelStatus::Loaded,
-        Some("unloaded") => ModelStatus::Unloaded,
-        Some("reloading") => ModelStatus::Reloading,
-        // If the status field is absent, assume loaded (older mistral.rs versions
-        // may not include it).
+        "loaded" => ModelStatus::Loaded,
+        "unloaded" => ModelStatus::Unloaded,
+        "reloading" => ModelStatus::Reloading,
        _ => ModelStatus::Loaded,
    }
 }
--- a/crates/cortex-gateway/src/router.rs
+++ b/crates/cortex-gateway/src/router.rs
@@ -14,6 +14,7 @@ use std::sync::Arc;
 #[derive(Debug, Clone)]
 pub struct RouteDecision {
    pub node_name: String,
+    /// The inference endpoint to proxy to (from neuron's /models/{id}/endpoint).
    pub endpoint: String,
    /// Whether the model will need to load (cold start).
    pub cold_start: bool,
@@ -25,16 +26,19 @@ pub enum RouteError {
    ModelNotFound(String),
    #[error("no healthy nodes available")]
    NoHealthyNodes,
+    #[error("failed to resolve inference endpoint for model '{0}' on node '{1}'")]
+    EndpointResolveFailed(String, String),
 }

 /// Resolve which node should serve a request for the given model.
+/// Asks the neuron for the inference endpoint after selecting a node.
 pub async fn resolve(
    fleet: &Arc<CortexState>,
    model_id: &str,
 ) -> Result<RouteDecision, RouteError> {
+    let (node_name, neuron_endpoint, cold_start) = {
        let nodes = fleet.nodes.read().await;

-    // Pass 1: find a node where the model is already loaded.
        let mut loaded_candidate = None;
        let mut unloaded_candidate = None;

@@ -45,20 +49,13 @@ pub async fn resolve(
            if let Some(entry) = node.models.get(model_id) {
                match entry.status {
                    ModelStatus::Loaded | ModelStatus::Reloading => {
-                    loaded_candidate = Some(RouteDecision {
-                        node_name: node.name.clone(),
-                        endpoint: node.endpoint.clone(),
-                        cold_start: false,
-                    });
-                    break; // loaded is best, stop searching
+                        loaded_candidate = Some((node.name.clone(), node.endpoint.clone(), false));
+                        break;
                    }
                    ModelStatus::Unloaded => {
                        if unloaded_candidate.is_none() {
-                        unloaded_candidate = Some(RouteDecision {
-                            node_name: node.name.clone(),
-                            endpoint: node.endpoint.clone(),
-                            cold_start: true,
-                        });
+                            unloaded_candidate =
+                                Some((node.name.clone(), node.endpoint.clone(), true));
                        }
                    }
                }
@@ -71,5 +68,34 @@ pub async fn resolve(
            } else {
                RouteError::NoHealthyNodes
            }
+        })?
+    };
+
+    // Ask the neuron for the inference endpoint for this model.
+    let endpoint_url = format!(
+        "{}/models/{}/endpoint",
+        neuron_endpoint,
+        urlencoding::encode(model_id)
+    );
+
+    let inference_endpoint = match fleet.http_client.get(&endpoint_url).send().await {
+        Ok(resp) if resp.status().is_success() => match resp.json::<serde_json::Value>().await {
+            Ok(body) => body
+                .get("url")
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string()),
+            Err(_) => None,
+        },
+        _ => None,
+    };
+
+    let endpoint = inference_endpoint.ok_or_else(|| {
+        RouteError::EndpointResolveFailed(model_id.to_string(), node_name.clone())
+    })?;
+
+    Ok(RouteDecision {
+        node_name,
+        endpoint,
+        cold_start,
    })
 }
--- a/crates/cortex-gateway/src/state.rs
+++ b/crates/cortex-gateway/src/state.rs
@@ -1,4 +1,5 @@
-use cortex_core::config::{EvictionSettings, GatewayConfig, NodeConfig};
+use cortex_core::catalogue::ModelCatalogue;
+use cortex_core::config::{EvictionSettings, GatewayConfig, NeuronEndpoint};
 use cortex_core::node::NodeState;
 use std::collections::HashMap;
 use tokio::sync::RwLock;
@@ -6,23 +7,22 @@ use tokio::sync::RwLock;
 /// Shared fleet state, protected by a RwLock for concurrent reader access.
 pub struct CortexState {
    pub nodes: RwLock<HashMap<String, NodeState>>,
-    pub node_configs: Vec<NodeConfig>,
+    pub neuron_configs: Vec<NeuronEndpoint>,
    pub eviction: EvictionSettings,
+    pub catalogue: ModelCatalogue,
    pub http_client: reqwest::Client,
 }

 impl CortexState {
    pub fn from_config(config: &GatewayConfig) -> Self {
        let mut nodes = HashMap::new();
-        for nc in &config.nodes {
+        for nc in &config.neurons {
            nodes.insert(
                nc.name.clone(),
                NodeState {
                    name: nc.name.clone(),
                    endpoint: nc.endpoint.clone(),
-                    vram_mb: nc.vram_mb,
-                    pinned: nc.pinned.clone(),
-                    healthy: false, // will be set by first poll
+                    healthy: false,
                    models: HashMap::new(),
                    lifecycle_cycles: 0,
                    last_poll: None,
@@ -30,10 +30,13 @@ impl CortexState {
            );
        }

+        let catalogue = ModelCatalogue::load(&config.models_config);
+
        Self {
            nodes: RwLock::new(nodes),
-            node_configs: config.nodes.clone(),
+            neuron_configs: config.neurons.clone(),
            eviction: config.eviction.clone(),
+            catalogue,
            http_client: reqwest::Client::builder()
                .timeout(std::time::Duration::from_secs(300))
                .build()
--- a/crates/cortex-gateway/tests/anthropic.rs
+++ b/crates/cortex-gateway/tests/anthropic.rs
@@ -4,7 +4,7 @@ use serde_json::json;

 #[tokio::test]
 async fn test_anthropic_to_openai_round_trip() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -14,9 +14,7 @@ async fn test_anthropic_to_openai_round_trip() {
        .json(&json!({
            "model": "test-model",
            "max_tokens": 100,
-            "messages": [
-                {"role": "user", "content": "Hi"}
-            ]
+            "messages": [{"role": "user", "content": "Hi"}]
        }))
        .send()
        .await
@@ -25,29 +23,22 @@ async fn test_anthropic_to_openai_round_trip() {
    assert_eq!(resp.status(), 200);

    let body: serde_json::Value = resp.json().await.expect("valid JSON");
-
-    // Response should be in Anthropic format.
    assert_eq!(body["type"], "message");
    assert_eq!(body["role"], "assistant");
    assert_eq!(body["model"], "test-model");

-    // Content should be an array of content blocks.
    let content = body["content"].as_array().expect("content array");
    assert_eq!(content.len(), 1);
    assert_eq!(content[0]["type"], "text");
    assert_eq!(content[0]["text"], "Hello from mock backend");
-
-    // Stop reason should be translated from "stop" to "end_turn".
    assert_eq!(body["stop_reason"], "end_turn");
-
-    // Usage should have Anthropic field names.
    assert_eq!(body["usage"]["input_tokens"], 10);
    assert_eq!(body["usage"]["output_tokens"], 5);
 }

 #[tokio::test]
 async fn test_anthropic_with_system_prompt() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -58,24 +49,20 @@ async fn test_anthropic_with_system_prompt() {
            "model": "test-model",
            "max_tokens": 100,
            "system": "You are a helpful assistant.",
-            "messages": [
-                {"role": "user", "content": "Hi"}
-            ]
+            "messages": [{"role": "user", "content": "Hi"}]
        }))
        .send()
        .await
        .expect("request should succeed");

    assert_eq!(resp.status(), 200);
-
    let body: serde_json::Value = resp.json().await.expect("valid JSON");
    assert_eq!(body["type"], "message");
-    assert_eq!(body["content"][0]["text"], "Hello from mock backend");
 }

 #[tokio::test]
 async fn test_anthropic_with_content_blocks() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -85,29 +72,23 @@ async fn test_anthropic_with_content_blocks() {
        .json(&json!({
            "model": "test-model",
            "max_tokens": 100,
-            "messages": [
-                {
+            "messages": [{
                "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What is this?"}
-                    ]
-                }
-            ]
+                "content": [{"type": "text", "text": "What is this?"}]
+            }]
        }))
        .send()
        .await
        .expect("request should succeed");

    assert_eq!(resp.status(), 200);
-
    let body: serde_json::Value = resp.json().await.expect("valid JSON");
    assert_eq!(body["type"], "message");
-    assert_eq!(body["content"][0]["text"], "Hello from mock backend");
 }

 #[tokio::test]
 async fn test_anthropic_model_not_found() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -117,9 +98,7 @@ async fn test_anthropic_model_not_found() {
        .json(&json!({
            "model": "nonexistent",
            "max_tokens": 100,
-            "messages": [
-                {"role": "user", "content": "Hi"}
-            ]
+            "messages": [{"role": "user", "content": "Hi"}]
        }))
        .send()
        .await
@@ -130,27 +109,17 @@ async fn test_anthropic_model_not_found() {

 #[tokio::test]
 async fn test_anthropic_invalid_request() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
    let resp = client
        .post(format!("{gw_url}/v1/messages"))
        .header("content-type", "application/json")
-        .json(&json!({
-            "not_a_valid": "request"
-        }))
+        .json(&json!({"not_a_valid": "request"}))
        .send()
        .await
        .expect("request should succeed");

    assert_eq!(resp.status(), 400);
-
-    let body: serde_json::Value = resp.json().await.unwrap();
-    assert!(
-        body["error"]["message"]
-            .as_str()
-            .unwrap()
-            .contains("invalid Anthropic request")
-    );
 }
--- a/crates/cortex-gateway/tests/common/mod.rs
+++ b/crates/cortex-gateway/tests/common/mod.rs
@@ -1,12 +1,13 @@
 #![allow(dead_code)]

 use axum::body::Body;
+use axum::extract::Path;
 use axum::http::header;
 use axum::response::Response;
 use axum::routing::{get, post};
 use axum::{Json, Router};
 use cortex_core::config::{
-    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NodeConfig,
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
 };
 use cortex_core::node::{ModelEntry, ModelStatus};
 use cortex_gateway::state::CortexState;
@@ -16,20 +17,52 @@ use std::sync::Arc;
 use std::time::Duration;
 use tokio::net::TcpListener;

-/// Spawns a mock mistral.rs backend on a random port.
-/// Returns the base URL (e.g. "http://127.0.0.1:12345").
-pub async fn spawn_mock_backend() -> String {
-    let app = Router::new()
-        .route("/v1/chat/completions", post(mock_chat_completions))
-        .route("/v1/models", get(mock_list_models));
-
+/// Spawns a mock neuron and returns its base URL. The mock serves:
+/// - GET /models (returns one loaded "test-model")
+/// - GET /models/:id/endpoint (returns the inference URL)
+/// - POST /models/unload (accepts unload requests)
+/// - POST /v1/chat/completions (inference)
+/// - GET /v1/models (lists "test-model" as loaded)
+pub async fn spawn_mock_neuron() -> String {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+
+    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/models/unload",
+            post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
+        )
+        .route("/v1/chat/completions", post(mock_chat_completions))
+        .route("/v1/models", get(mock_v1_models));
+
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

-    format!("http://{addr}")
+    base_url
+}
+
+async fn mock_neuron_list_models() -> Json<Value> {
+    Json(json!([
+        {"id": "test-model", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
+    ]))
+}
+
+async fn mock_v1_models() -> Json<Value> {
+    Json(json!({
+        "object": "list",
+        "data": [{"id": "test-model", "object": "model", "status": "loaded"}]
+    }))
 }

 async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
@@ -59,21 +92,22 @@ async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
    }))
 }

-async fn mock_list_models() -> Json<Value> {
-    Json(json!({
-        "object": "list",
-        "data": [{
-            "id": "test-model",
-            "object": "model",
-            "status": "loaded"
-        }]
-    }))
-}
+/// Spawns a mock neuron that returns SSE streaming responses for chat completions.
+pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();

-/// Spawns a mock mistral.rs backend that returns SSE streaming responses.
-/// Each chunk is delayed by `chunk_delay` to prove the proxy streams incrementally.
-pub async fn spawn_streaming_mock_backend(chunk_count: usize, chunk_delay: Duration) -> String {
    let app = Router::new()
+        .route("/models", get(mock_neuron_list_models))
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
        .route(
            "/v1/chat/completions",
            post(move |Json(body): Json<Value>| async move {
@@ -118,40 +152,51 @@ pub async fn spawn_streaming_mock_backend(chunk_count: usize, chunk_delay: Durat
                    .body(Body::from_stream(stream))
                    .unwrap()
            }),
-        )
-        .route("/v1/models", get(mock_list_models));
+        );

-    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
-    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

-    format!("http://{addr}")
+    base_url
 }

-/// Spawns a mock backend with a custom `/v1/models` response.
-pub async fn spawn_mock_backend_with_models(models_response: Value) -> String {
+/// Spawns a mock neuron with a custom models list.
+pub async fn spawn_mock_neuron_with_models(models_response: Value) -> String {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+
    let app = Router::new()
-        .route("/v1/chat/completions", post(mock_chat_completions))
        .route(
-            "/v1/models",
+            "/models",
            get(move || {
                let resp = models_response.clone();
                async move { Json(resp) }
            }),
-        );
+        )
+        .route(
+            "/models/{model_id}/endpoint",
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
+            }),
+        )
+        .route(
+            "/models/unload",
+            post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
+        )
+        .route("/v1/chat/completions", post(mock_chat_completions));

-    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
-    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

-    format!("http://{addr}")
+    base_url
 }

-/// Spawns the cortex gateway with a single node pointing at `mock_url`.
+/// Spawns the cortex gateway with a single neuron pointing at `mock_url`.
 /// The node is pre-seeded as healthy with one loaded model ("test-model").
 /// Returns the gateway's base URL.
 pub async fn spawn_gateway(mock_url: &str) -> String {
@@ -159,8 +204,7 @@ pub async fn spawn_gateway(mock_url: &str) -> String {
    url
 }

-/// Like `spawn_gateway` but also returns the shared `CortexState` so tests
-/// can call `poll_once` or inspect state directly.
+/// Like `spawn_gateway` but also returns the shared `CortexState`.
 pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, String) {
    let config = GatewayConfig {
        gateway: GatewaySettings {
@@ -171,18 +215,16 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "mock-node".into(),
            endpoint: mock_url.to_string(),
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));

    // Seed the node as healthy with a loaded model.
-    // (Bypasses the poller, which is not running in tests.)
    {
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").expect("node must exist");
--- a/crates/cortex-gateway/tests/eviction.rs
+++ b/crates/cortex-gateway/tests/eviction.rs
@@ -2,15 +2,16 @@ mod common;

 use chrono::Utc;
 use cortex_core::config::{
-    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NodeConfig,
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
 };
 use cortex_core::node::{ModelEntry, ModelStatus};
 use cortex_gateway::state::CortexState;
 use serde_json::json;
 use std::sync::Arc;

-/// Spawn a mock backend that accepts `/v1/models/unload` and records the call.
+/// Spawn a mock neuron that accepts `/models/unload` and records unload calls.
 async fn spawn_eviction_mock() -> (String, Arc<tokio::sync::Mutex<Vec<String>>>) {
+    use axum::extract::Path;
    use axum::routing::{get, post};
    use axum::{Json, Router};
    use serde_json::Value;
@@ -18,9 +19,14 @@ async fn spawn_eviction_mock() -> (String, Arc<tokio::sync::Mutex<Vec<String>>>)
    let unloaded: Arc<tokio::sync::Mutex<Vec<String>>> = Arc::new(tokio::sync::Mutex::new(vec![]));
    let unloaded_clone = Arc::clone(&unloaded);

+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let base_url = format!("http://{addr}");
+    let inference_url = base_url.clone();
+
    let app = Router::new()
        .route(
-            "/v1/models/unload",
+            "/models/unload",
            post(move |Json(body): Json<Value>| {
                let unloaded = Arc::clone(&unloaded_clone);
                async move {
@@ -30,30 +36,27 @@ async fn spawn_eviction_mock() -> (String, Arc<tokio::sync::Mutex<Vec<String>>>)
                        .unwrap_or("")
                        .to_string();
                    unloaded.lock().await.push(model_id);
-                    Json(json!({"status": "ok"}))
+                    Json(json!({"status": "unloaded"}))
                }
            }),
        )
+        .route("/models", get(|| async { Json(json!([])) }))
        .route(
-            "/v1/models",
-            get(|| async {
-                Json(json!({
-                    "object": "list",
-                    "data": []
-                }))
+            "/models/{model_id}/endpoint",
+            get(move |Path(_model_id): Path<String>| {
+                let url = inference_url.clone();
+                async move { Json(json!({"url": url})) }
            }),
        );

-    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
-    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

-    (format!("http://{addr}"), unloaded)
+    (base_url, unloaded)
 }

-fn make_fleet(endpoint: &str, pinned: Vec<String>, defrag_after: u32) -> Arc<CortexState> {
+fn make_fleet(endpoint: &str, defrag_after: u32) -> Arc<CortexState> {
    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
@@ -63,12 +66,11 @@ fn make_fleet(endpoint: &str, pinned: Vec<String>, defrag_after: u32) -> Arc<Cor
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: defrag_after,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "gpu-node".into(),
            endpoint: endpoint.to_string(),
-            vram_mb: 24000,
-            pinned,
        }],
+        models_config: "/dev/null".into(),
    };
    Arc::new(CortexState::from_config(&config))
 }
@@ -76,9 +78,8 @@ fn make_fleet(endpoint: &str, pinned: Vec<String>, defrag_after: u32) -> Arc<Cor
 #[tokio::test]
 async fn test_evict_lru_model() {
    let (mock_url, unloaded) = spawn_eviction_mock().await;
-    let fleet = make_fleet(&mock_url, vec![], 0);
+    let fleet = make_fleet(&mock_url, 0);

-    // Seed two loaded models. "old-model" was accessed earlier than "new-model".
    {
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("gpu-node").unwrap();
@@ -107,15 +108,12 @@ async fn test_evict_lru_model() {
        .await
        .expect("eviction should succeed");

-    // The older model should be evicted.
    assert_eq!(evicted, Some("old-model".to_string()));

-    // Mock received the unload call.
    let calls = unloaded.lock().await;
    assert_eq!(calls.len(), 1);
    assert_eq!(calls[0], "old-model");

-    // Local state updated.
    let nodes = fleet.nodes.read().await;
    let node = nodes.get("gpu-node").unwrap();
    assert_eq!(
@@ -128,67 +126,15 @@ async fn test_evict_lru_model() {
    );
 }

-#[tokio::test]
-async fn test_eviction_skips_pinned_models() {
-    let (mock_url, unloaded) = spawn_eviction_mock().await;
-    // Pin "old-model" so it can't be evicted.
-    let fleet = make_fleet(&mock_url, vec!["old-model".into()], 0);
-
-    {
-        let mut nodes = fleet.nodes.write().await;
-        let node = nodes.get_mut("gpu-node").unwrap();
-        node.healthy = true;
-        // old-model is pinned and older — normally it would be evicted.
-        node.models.insert(
-            "old-model".into(),
-            ModelEntry {
-                id: "old-model".into(),
-                status: ModelStatus::Loaded,
-                last_accessed: Some(Utc::now() - chrono::Duration::hours(2)),
-                vram_estimate_mb: Some(8000),
-            },
-        );
-        node.models.insert(
-            "new-model".into(),
-            ModelEntry {
-                id: "new-model".into(),
-                status: ModelStatus::Loaded,
-                last_accessed: Some(Utc::now()),
-                vram_estimate_mb: Some(8000),
-            },
-        );
-    }
-
-    let evicted = cortex_gateway::evictor::evict_lru_on_node(&fleet, "gpu-node")
-        .await
-        .expect("eviction should succeed");
-
-    // new-model is evicted instead because old-model is pinned.
-    assert_eq!(evicted, Some("new-model".to_string()));
-
-    let calls = unloaded.lock().await;
-    assert_eq!(calls[0], "new-model");
-}
-
 #[tokio::test]
 async fn test_eviction_nothing_to_evict() {
    let (mock_url, unloaded) = spawn_eviction_mock().await;
-    // Pin the only model.
-    let fleet = make_fleet(&mock_url, vec!["only-model".into()], 0);
+    let fleet = make_fleet(&mock_url, 0);

+    // No models at all.
    {
        let mut nodes = fleet.nodes.write().await;
-        let node = nodes.get_mut("gpu-node").unwrap();
-        node.healthy = true;
-        node.models.insert(
-            "only-model".into(),
-            ModelEntry {
-                id: "only-model".into(),
-                status: ModelStatus::Loaded,
-                last_accessed: None,
-                vram_estimate_mb: Some(8000),
-            },
-        );
+        nodes.get_mut("gpu-node").unwrap().healthy = true;
    }

    let evicted = cortex_gateway::evictor::evict_lru_on_node(&fleet, "gpu-node")
@@ -196,8 +142,6 @@ async fn test_eviction_nothing_to_evict() {
        .expect("eviction should succeed");

    assert_eq!(evicted, None);
-
-    // No unload call made.
    let calls = unloaded.lock().await;
    assert!(calls.is_empty());
 }
@@ -205,7 +149,7 @@ async fn test_eviction_nothing_to_evict() {
 #[tokio::test]
 async fn test_eviction_increments_lifecycle_cycles() {
    let (mock_url, _) = spawn_eviction_mock().await;
-    let fleet = make_fleet(&mock_url, vec![], 0);
+    let fleet = make_fleet(&mock_url, 0);

    {
        let mut nodes = fleet.nodes.write().await;
@@ -233,10 +177,9 @@ async fn test_eviction_increments_lifecycle_cycles() {

 #[tokio::test]
 async fn test_last_accessed_updated_on_request() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let (fleet, gw_url) = common::spawn_gateway_with_state(&mock_url).await;

-    // Verify last_accessed is None initially.
    {
        let nodes = fleet.nodes.read().await;
        let node = nodes.get("mock-node").unwrap();
@@ -249,7 +192,6 @@ async fn test_last_accessed_updated_on_request() {
        );
    }

-    // Make a request.
    let client = reqwest::Client::new();
    client
        .post(format!("{gw_url}/v1/chat/completions"))
@@ -262,7 +204,6 @@ async fn test_last_accessed_updated_on_request() {
        .await
        .expect("request should succeed");

-    // Verify last_accessed is now set.
    let nodes = fleet.nodes.read().await;
    let node = nodes.get("mock-node").unwrap();
    assert!(
--- a/crates/cortex-gateway/tests/metrics.rs
+++ b/crates/cortex-gateway/tests/metrics.rs
@@ -4,21 +4,17 @@ use serde_json::json;

 #[tokio::test]
 async fn test_metrics_emitted_after_proxy() {
-    // Install a test recorder (no HTTP listener, renders to string).
-    // This sets the global recorder, so only one test can do this.
    let handle = cortex_gateway::metrics::install_test_recorder().expect("recorder should install");

-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

-    // Verify no request metrics yet.
    let before = handle.render();
    assert!(
        !before.contains("cortex_requests_total"),
        "no request metrics before any requests"
    );

-    // Make a successful request.
    let client = reqwest::Client::new();
    let resp = client
        .post(format!("{gw_url}/v1/chat/completions"))
@@ -31,10 +27,8 @@ async fn test_metrics_emitted_after_proxy() {
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 200);
-    // Consume the response body to ensure the proxy completes.
    let _body: serde_json::Value = resp.json().await.unwrap();

-    // Check metrics were emitted.
    let after = handle.render();

    assert!(
@@ -45,7 +39,6 @@ async fn test_metrics_emitted_after_proxy() {
        after.contains("cortex_request_duration_seconds"),
        "cortex_request_duration_seconds should be present.\nMetrics:\n{after}"
    );
-    // Should NOT have error or cold start counters for this request.
    assert!(
        !after.contains("cortex_request_errors_total"),
        "no errors expected for a successful request"
--- a/crates/cortex-gateway/tests/poller.rs
+++ b/crates/cortex-gateway/tests/poller.rs
@@ -1,7 +1,7 @@
 mod common;

 use cortex_core::config::{
-    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NodeConfig,
+    EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
 };
 use cortex_core::node::ModelStatus;
 use cortex_gateway::state::CortexState;
@@ -10,14 +10,11 @@ use std::sync::Arc;

 #[tokio::test]
 async fn test_poller_discovers_models() {
-    // Mock backend reports 2 models: one loaded, one unloaded.
-    let mock_url = common::spawn_mock_backend_with_models(json!({
-        "object": "list",
-        "data": [
-            { "id": "model-a", "object": "model", "status": "loaded" },
-            { "id": "model-b", "object": "model", "status": "unloaded" }
-        ]
-    }))
+    // Mock neuron reports 2 models via /models endpoint (neuron format).
+    let mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "model-a", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
+        {"id": "model-b", "harness": "mistralrs", "status": "unloaded", "devices": [], "vram_used_mb": null}
+    ]))
    .await;

    let config = GatewayConfig {
@@ -29,17 +26,15 @@ async fn test_poller_discovers_models() {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "test-node".into(),
            endpoint: mock_url,
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));

-    // Before polling: node is unhealthy, no models.
    {
        let nodes = fleet.nodes.read().await;
        let node = nodes.get("test-node").unwrap();
@@ -47,10 +42,8 @@ async fn test_poller_discovers_models() {
        assert!(node.models.is_empty());
    }

-    // Poll once.
    cortex_gateway::poller::poll_once(&fleet).await;

-    // After polling: node is healthy, both models discovered with correct status.
    {
        let nodes = fleet.nodes.read().await;
        let node = nodes.get("test-node").unwrap();
@@ -69,14 +62,10 @@ async fn test_poller_discovers_models() {

 #[tokio::test]
 async fn test_poller_updates_gateway_models_endpoint() {
-    // Mock backend with 2 models.
-    let mock_url = common::spawn_mock_backend_with_models(json!({
-        "object": "list",
-        "data": [
-            { "id": "model-x", "object": "model", "status": "loaded" },
-            { "id": "model-y", "object": "model", "status": "loaded" }
-        ]
-    }))
+    let mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "model-x", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
+        {"id": "model-y", "harness": "mistralrs", "status": "loaded", "devices": [1], "vram_used_mb": null}
+    ]))
    .await;

    let config = GatewayConfig {
@@ -88,20 +77,16 @@ async fn test_poller_updates_gateway_models_endpoint() {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "poll-node".into(),
            endpoint: mock_url,
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
-
-    // Poll to discover models and mark node healthy.
    cortex_gateway::poller::poll_once(&fleet).await;

-    // Start gateway with the polled state.
    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
@@ -109,7 +94,6 @@ async fn test_poller_updates_gateway_models_endpoint() {
        axum::serve(listener, app).await.unwrap();
    });

-    // Query /v1/models on the gateway.
    let client = reqwest::Client::new();
    let resp = client
        .get(format!("http://{addr}/v1/models"))
@@ -127,7 +111,6 @@ async fn test_poller_updates_gateway_models_endpoint() {
    assert!(ids.contains(&"model-x"));
    assert!(ids.contains(&"model-y"));

-    // Verify node attribution in locations.
    for model in data {
        let locations = model["locations"].as_array().expect("locations array");
        assert_eq!(locations.len(), 1);
@@ -146,17 +129,15 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "dead-node".into(),
-            endpoint: "http://127.0.0.1:1".into(), // unreachable
-            vram_mb: 24000,
-            pinned: vec![],
+            endpoint: "http://127.0.0.1:1".into(),
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));

-    // Manually mark healthy to verify poller flips it.
    {
        let mut nodes = fleet.nodes.write().await;
        nodes.get_mut("dead-node").unwrap().healthy = true;
@@ -170,14 +151,10 @@ async fn test_poller_marks_unreachable_node_unhealthy() {

 #[tokio::test]
 async fn test_poller_removes_stale_models() {
-    // Start with a mock that reports 2 models.
-    let mock_url = common::spawn_mock_backend_with_models(json!({
-        "object": "list",
-        "data": [
-            { "id": "keep-me", "object": "model", "status": "loaded" },
-            { "id": "drop-me", "object": "model", "status": "loaded" }
-        ]
-    }))
+    let mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
+        {"id": "drop-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
+    ]))
    .await;

    let config = GatewayConfig {
@@ -189,35 +166,27 @@ async fn test_poller_removes_stale_models() {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "test-node".into(),
            endpoint: mock_url,
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
    cortex_gateway::poller::poll_once(&fleet).await;

-    // Verify both models exist.
    {
        let nodes = fleet.nodes.read().await;
        assert_eq!(nodes.get("test-node").unwrap().models.len(), 2);
    }

-    // Now spin up a new mock that only reports one model, and re-point the node.
-    let new_mock_url = common::spawn_mock_backend_with_models(json!({
-        "object": "list",
-        "data": [
-            { "id": "keep-me", "object": "model", "status": "loaded" }
-        ]
-    }))
+    // New mock with only one model.
+    let new_mock_url = common::spawn_mock_neuron_with_models(json!([
+        {"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
+    ]))
    .await;

-    // Update the node endpoint to point at the new mock.
-    // We can't change node_configs (they're immutable), so instead we'll
-    // create a new fleet with the updated endpoint and poll that.
    let config2 = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
@@ -227,17 +196,16 @@ async fn test_poller_removes_stale_models() {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![NodeConfig {
+        neurons: vec![NeuronEndpoint {
            name: "test-node".into(),
            endpoint: new_mock_url,
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };

    let fleet2 = Arc::new(CortexState::from_config(&config2));

-    // Seed the stale model so we can verify it gets removed.
+    // Seed stale model.
    {
        let mut nodes = fleet2.nodes.write().await;
        let node = nodes.get_mut("test-node").unwrap();
--- a/crates/cortex-gateway/tests/proxy_basic.rs
+++ b/crates/cortex-gateway/tests/proxy_basic.rs
@@ -4,7 +4,7 @@ use serde_json::json;

 #[tokio::test]
 async fn test_chat_completion_proxy() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -33,7 +33,7 @@ async fn test_chat_completion_proxy() {

 #[tokio::test]
 async fn test_health_endpoint() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -53,7 +53,7 @@ async fn test_health_endpoint() {

 #[tokio::test]
 async fn test_list_models() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -75,7 +75,7 @@ async fn test_list_models() {

 #[tokio::test]
 async fn test_model_not_found() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -112,12 +112,11 @@ async fn test_no_healthy_nodes() {
            strategy: cortex_core::config::EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
-        nodes: vec![cortex_core::config::NodeConfig {
+        neurons: vec![cortex_core::config::NeuronEndpoint {
            name: "dead-node".into(),
            endpoint: "http://127.0.0.1:1".into(),
-            vram_mb: 24000,
-            pinned: vec![],
        }],
+        models_config: "/dev/null".into(),
    };
    let fleet = std::sync::Arc::new(cortex_gateway::state::CortexState::from_config(&config));

@@ -153,7 +152,7 @@ async fn test_no_healthy_nodes() {

 #[tokio::test]
 async fn test_missing_model_field() {
-    let mock_url = common::spawn_mock_backend().await;
+    let mock_url = common::spawn_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
--- a/crates/cortex-gateway/tests/streaming.rs
+++ b/crates/cortex-gateway/tests/streaming.rs
@@ -8,7 +8,7 @@ use std::time::{Duration, Instant};
 async fn test_streaming_sse_passthrough() {
    let chunk_count = 5;
    let chunk_delay = Duration::from_millis(50);
-    let mock_url = common::spawn_streaming_mock_backend(chunk_count, chunk_delay).await;
+    let mock_url = common::spawn_streaming_mock_neuron(chunk_count, chunk_delay).await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
@@ -33,7 +33,6 @@ async fn test_streaming_sse_passthrough() {
        "text/event-stream"
    );

-    // Collect SSE chunks as they arrive, recording arrival times.
    let start = Instant::now();
    let mut chunk_times = Vec::new();
    let mut chunks = Vec::new();
@@ -51,7 +50,6 @@ async fn test_streaming_sse_passthrough() {
        }
    }

-    // Verify we got all content chunks plus [DONE].
    assert!(
        chunks.len() >= chunk_count + 1,
        "expected at least {} chunks (got {}): {:?}",
@@ -60,10 +58,8 @@ async fn test_streaming_sse_passthrough() {
        chunks,
    );

-    // The last chunk should be [DONE].
    assert_eq!(chunks.last().unwrap(), "[DONE]");

-    // Verify the content chunks contain expected tokens.
    for i in 0..chunk_count {
        let chunk_json: serde_json::Value =
            serde_json::from_str(&chunks[i]).expect("chunk should be valid JSON");
@@ -73,10 +69,6 @@ async fn test_streaming_sse_passthrough() {
        );
    }

-    // Verify streaming behavior: total time should reflect incremental delivery,
-    // not a single batch. With 5 chunks at 50ms each + [DONE], we expect ~300ms total.
-    // If buffered, all chunks would arrive at once after ~300ms with no spread.
-    // We verify that the last chunk arrived noticeably after the first.
    let first = chunk_times.first().unwrap();
    let last = chunk_times.last().unwrap();
    let spread = *last - *first;
@@ -88,7 +80,7 @@ async fn test_streaming_sse_passthrough() {

 #[tokio::test]
 async fn test_streaming_done_terminator() {
-    let mock_url = common::spawn_streaming_mock_backend(2, Duration::from_millis(10)).await;
+    let mock_url = common::spawn_streaming_mock_neuron(2, Duration::from_millis(10)).await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let client = reqwest::Client::new();
--- a/crates/neuron/Cargo.toml
+++ b/crates/neuron/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "neuron"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[lib]
+name = "neuron"
+path = "src/lib.rs"
+
+[[bin]]
+name = "neuron"
+path = "src/main.rs"
+
+[dependencies]
+cortex-core.workspace = true
+tokio.workspace = true
+axum.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+reqwest.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+anyhow.workspace = true
+async-trait.workspace = true
+clap.workspace = true
+figment.workspace = true
+toml.workspace = true
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["test-util"] }
+reqwest.workspace = true
--- a/crates/neuron/src/api.rs
+++ b/crates/neuron/src/api.rs
@@ -0,0 +1,104 @@
+//! HTTP API handlers for the neuron daemon.
+
+use crate::harness::HarnessRegistry;
+use crate::health::HealthCache;
+use axum::Router;
+use axum::extract::{Path, State};
+use axum::http::StatusCode;
+use axum::response::{IntoResponse, Json};
+use axum::routing::{get, post};
+use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
+use cortex_core::harness::ModelSpec;
+use serde_json::{Value, json};
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+/// Shared state for the neuron HTTP server.
+pub struct NeuronState {
+    pub discovery: DiscoveryResponse,
+    pub health_cache: Arc<HealthCache>,
+    pub registry: RwLock<HarnessRegistry>,
+}
+
+/// Build the neuron API router.
+pub fn neuron_routes() -> Router<Arc<NeuronState>> {
+    Router::new()
+        .route("/discovery", get(discovery_handler))
+        .route("/health", get(health_handler))
+        .route("/models", get(list_models))
+        .route("/models/load", post(load_model))
+        .route("/models/unload", post(unload_model))
+        .route("/models/{model_id}/endpoint", get(model_endpoint))
+}
+
+async fn discovery_handler(State(state): State<Arc<NeuronState>>) -> Json<DiscoveryResponse> {
+    Json(state.discovery.clone())
+}
+
+async fn health_handler(State(state): State<Arc<NeuronState>>) -> Json<HealthResponse> {
+    Json(state.health_cache.snapshot().await)
+}
+
+/// `GET /models`: JSON array of every model the registry reports.
+///
+/// Answers 500 with `{"error": ...}` if the registry call returns an error.
+async fn list_models(State(state): State<Arc<NeuronState>>) -> impl IntoResponse {
+    let registry = state.registry.read().await;
+    let (status, body) = match registry.list_all_models().await {
+        Ok(models) => (StatusCode::OK, json!(models)),
+        Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, json!({"error": e.to_string()})),
+    };
+    (status, Json(body)).into_response()
+}
+
+/// `POST /models/load`: ask the registry to load the model described by the
+/// JSON `ModelSpec` body. Answers `{"status": "loaded"}` on success and 400
+/// with the error text when the registry call fails.
+async fn load_model(
+    State(state): State<Arc<NeuronState>>,
+    Json(spec): Json<ModelSpec>,
+) -> impl IntoResponse {
+    let registry = state.registry.read().await;
+    let (status, body) = match registry.load_model(&spec).await {
+        Ok(()) => (StatusCode::OK, json!({"status": "loaded"})),
+        Err(e) => (StatusCode::BAD_REQUEST, json!({"error": e.to_string()})),
+    };
+    (status, Json(body)).into_response()
+}
+
+/// `POST /models/unload`: unload the model named by the `model_id` string
+/// field of the JSON body.
+///
+/// Answers 400 when `model_id` is missing or not a string, 404 with the error
+/// text when the registry call fails, and `{"status": "unloaded"}` otherwise.
+async fn unload_model(
+    State(state): State<Arc<NeuronState>>,
+    Json(body): Json<Value>,
+) -> impl IntoResponse {
+    let Some(model_id) = body.get("model_id").and_then(Value::as_str) else {
+        let missing = Json(json!({"error": "missing model_id"}));
+        return (StatusCode::BAD_REQUEST, missing).into_response();
+    };
+
+    let registry = state.registry.read().await;
+    let (status, reply) = match registry.unload_model(model_id).await {
+        Ok(()) => (StatusCode::OK, json!({"status": "unloaded"})),
+        Err(e) => (StatusCode::NOT_FOUND, json!({"error": e.to_string()})),
+    };
+    (status, Json(reply)).into_response()
+}
+
+/// `GET /models/{model_id}/endpoint`: look up where to send inference requests.
+///
+/// Answers `{"url": ...}`, or 404 when no registered harness yields an endpoint.
+async fn model_endpoint(
+    State(state): State<Arc<NeuronState>>,
+    Path(model_id): Path<String>,
+) -> impl IntoResponse {
+    let registry = state.registry.read().await;
+    let Some(url) = registry.inference_endpoint(&model_id).await else {
+        let error = format!("model '{}' not loaded", model_id);
+        return (StatusCode::NOT_FOUND, Json(json!({"error": error}))).into_response();
+    };
+    Json(json!({"url": url})).into_response()
+}
--- a/crates/neuron/src/config.rs
+++ b/crates/neuron/src/config.rs
@@ -0,0 +1,40 @@
+//! Neuron configuration loaded from neuron.toml.
+
+use cortex_core::harness::HarnessConfig;
+use figment::{
+    Figment,
+    providers::{Env, Format, Toml},
+};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NeuronConfig {
+    #[serde(default = "default_port")]
+    pub port: u16,
+    #[serde(default)]
+    pub harnesses: Vec<HarnessConfig>,
+}
+
+fn default_port() -> u16 {
+    9090
+}
+
+impl NeuronConfig {
+    pub fn load(path: impl AsRef<Path>) -> Result<Self, Box<figment::Error>> {
+        Figment::new()
+            .merge(Toml::file(path))
+            .merge(Env::prefixed("NEURON_").split("__"))
+            .extract()
+            .map_err(Box::new)
+    }
+}
+
+impl Default for NeuronConfig {
+    fn default() -> Self {
+        Self {
+            port: default_port(), // same source as the serde field default
+            harnesses: Vec::new(),
+        }
+    }
+}
--- a/crates/neuron/src/discovery.rs
+++ b/crates/neuron/src/discovery.rs
@@ -0,0 +1,275 @@
+//! GPU discovery via nvidia-smi and system info gathering.
+//!
+//! Pure parsing functions are separated from command execution for testability.
+
+use anyhow::{Context, Result};
+use cortex_core::discovery::{DeviceHealth, DeviceInfo, DiscoveryResponse};
+
+const NVIDIA_SMI_DISCOVERY_QUERY: &str = "index,name,memory.total,compute_cap,driver_version";
+const NVIDIA_SMI_HEALTH_QUERY: &str =
+    "index,memory.used,memory.free,utilization.gpu,temperature.gpu";
+
+// ── Pure parsing functions (testable without GPU) ───────────────────
+
+/// Turn nvidia-smi discovery CSV into one `DeviceInfo` per GPU.
+///
+/// Every non-blank line must hold five comma-separated fields in the order of
+/// `NVIDIA_SMI_DISCOVERY_QUERY`, for example:
+/// ```text
+/// 0, NVIDIA GeForce RTX 5090, 32614, 12.0, 570.86.16
+/// ```
+/// The first short line, or index/VRAM value that fails to parse, aborts the parse.
+pub fn parse_gpu_info(csv_output: &str) -> Result<Vec<DeviceInfo>> {
+    csv_output
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty())
+        .map(|line| -> Result<DeviceInfo> {
+            let fields: Vec<&str> = line.splitn(5, ',').map(str::trim).collect();
+            if fields.len() < 5 {
+                anyhow::bail!("malformed nvidia-smi line (expected 5 fields): {line}");
+            }
+            Ok(DeviceInfo {
+                index: fields[0]
+                    .parse()
+                    .with_context(|| format!("invalid GPU index: {}", fields[0]))?,
+                name: fields[1].to_string(),
+                vram_total_mb: fields[2]
+                    .parse()
+                    .with_context(|| format!("invalid VRAM: {}", fields[2]))?,
+                compute_capability: fields[3].to_string(),
+            })
+        })
+        .collect()
+}
+
+/// Return the driver version reported on the first non-blank nvidia-smi line.
+///
+/// The line is split into at most five comma-separated fields; the fifth,
+/// trimmed, is the driver version. Yields `None` when the input has no
+/// non-blank line or that line has fewer than five fields.
+pub fn parse_driver_version(csv_output: &str) -> Option<String> {
+    let first_row = csv_output.lines().find(|row| !row.trim().is_empty())?;
+    // `nth(4)` is the fifth piece of `splitn(5, ..)`; absent on short lines.
+    let driver = first_row.splitn(5, ',').nth(4)?;
+    Some(driver.trim().to_string())
+}
+
+/// Extract the CUDA release number from `nvcc --version` output.
+///
+/// Scans line by line for the word `release` and returns the text between it
+/// and the following comma, trimmed. For the line
+/// `Cuda compilation tools, release 12.8, V12.8.93` this yields `"12.8"`.
+///
+/// Lines where that text is empty are skipped; `None` means no line matched.
+pub fn parse_cuda_version(nvcc_output: &str) -> Option<String> {
+    nvcc_output
+        .lines()
+        .filter_map(|row| row.split("release").nth(1))
+        .filter_map(|tail| tail.trim().split(',').next())
+        .map(str::trim)
+        .find(|version| !version.is_empty())
+        .map(String::from)
+}
+
+/// Turn nvidia-smi health CSV into one `DeviceHealth` per GPU.
+///
+/// Every non-blank line must hold the five fields of `NVIDIA_SMI_HEALTH_QUERY`:
+/// ```text
+/// 0, 8192, 24372, 45, 62
+/// ```
+/// The first short line or field that fails to parse aborts the whole parse.
+pub fn parse_health_info(csv_output: &str) -> Result<Vec<DeviceHealth>> {
+    csv_output
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty())
+        .map(|line| -> Result<DeviceHealth> {
+            let fields: Vec<&str> = line.splitn(5, ',').map(str::trim).collect();
+            if fields.len() < 5 {
+                anyhow::bail!("malformed nvidia-smi health line (expected 5 fields): {line}");
+            }
+            Ok(DeviceHealth {
+                index: fields[0].parse().context("invalid index")?,
+                vram_used_mb: fields[1].parse().context("invalid vram_used")?,
+                vram_free_mb: fields[2].parse().context("invalid vram_free")?,
+                utilization_pct: fields[3].parse().context("invalid utilization")?,
+                temp_c: fields[4].parse().context("invalid temp")?,
+            })
+        })
+        .collect()
+}
+
+// ── Command execution wrappers ──────────────────────────────────────
+
+/// Run `cmd` with `args` and return its stdout, lossily decoded as UTF-8.
+/// Fails if the process cannot be executed or exits unsuccessfully.
+async fn run_command(cmd: &str, args: &[&str]) -> Result<String> {
+    let mut command = tokio::process::Command::new(cmd);
+    command.args(args);
+    let finished = command.output().await;
+    let output = finished.with_context(|| format!("failed to execute {cmd}"))?;
+    if output.status.success() {
+        return Ok(String::from_utf8_lossy(&output.stdout).into_owned());
+    }
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    anyhow::bail!("{cmd} failed: {stderr}")
+}
+
+async fn run_command_optional(cmd: &str, args: &[&str]) -> Option<String> {
+    run_command(cmd, args).await.ok()
+}
+
+/// Discover the full system: hostname, OS, kernel, GPUs, CUDA version.
+/// Every probe is best-effort: a failing `uname` yields `"unknown"`, a failing
+/// `nvidia-smi` yields no devices or driver version, and a failing `nvcc`
+/// yields no CUDA version, so as written this never returns `Err`.
+pub async fn discover_system() -> Result<DiscoveryResponse> {
+    /// Run `uname <flag>` and return its trimmed output, or `"unknown"` on failure.
+    async fn uname(flag: &str) -> String {
+        match run_command("uname", &[flag]).await {
+            Ok(text) => text.trim().to_string(),
+            Err(_) => "unknown".to_string(),
+        }
+    }
+    let hostname = uname("-n").await;
+    let os = uname("-s").await;
+    let kernel = uname("-r").await;
+
+    let (devices, driver_version) = match run_command_optional(
+        "nvidia-smi",
+        &[
+            &format!("--query-gpu={NVIDIA_SMI_DISCOVERY_QUERY}"),
+            "--format=csv,noheader,nounits",
+        ],
+    )
+    .await
+    {
+        Some(output) => {
+            // Surface unparseable output instead of silently reporting zero GPUs.
+            let devs = parse_gpu_info(&output).unwrap_or_else(|e| {
+                tracing::warn!(error = %e, "failed to parse nvidia-smi output");
+                Vec::new()
+            });
+            (devs, parse_driver_version(&output))
+        }
+        None => {
+            // `None` covers both a missing binary and an unsuccessful exit status.
+            tracing::info!("nvidia-smi unavailable or failed — no GPU devices discovered");
+            (vec![], None)
+        }
+    };
+
+    let cuda_version = run_command_optional("nvcc", &["--version"])
+        .await
+        .and_then(|output| parse_cuda_version(&output));
+
+    Ok(DiscoveryResponse {
+        hostname,
+        os,
+        kernel,
+        cuda_version,
+        driver_version,
+        devices,
+        harnesses: vec![], // populated by harness registry in Phase 8
+    })
+}
+
+/// Run nvidia-smi health query and parse the output.
+pub async fn query_health() -> Result<Vec<DeviceHealth>> {
+    let output = run_command(
+        "nvidia-smi",
+        &[
+            &format!("--query-gpu={NVIDIA_SMI_HEALTH_QUERY}"),
+            "--format=csv,noheader,nounits",
+        ],
+    )
+    .await?;
+    parse_health_info(&output)
+}
+
+// ── Tests ───────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_gpu_info_single_gpu() {
+        let csv = "0, NVIDIA GeForce RTX 4090, 24564, 8.9, 570.86.16\n";
+        let devices = parse_gpu_info(csv).unwrap();
+        assert_eq!(devices.len(), 1);
+        assert_eq!(devices[0].index, 0);
+        assert_eq!(devices[0].name, "NVIDIA GeForce RTX 4090");
+        assert_eq!(devices[0].vram_total_mb, 24564);
+        assert_eq!(devices[0].compute_capability, "8.9");
+    }
+
+    #[test]
+    fn test_parse_gpu_info_multi_gpu() {
+        let csv = "\
+            0, NVIDIA GeForce RTX 5090, 32614, 12.0, 570.86.16\n\
+            1, NVIDIA GeForce RTX 5090, 32614, 12.0, 570.86.16\n";
+        let devices = parse_gpu_info(csv).unwrap();
+        assert_eq!(devices.len(), 2);
+        assert_eq!(devices[0].index, 0);
+        assert_eq!(devices[1].index, 1);
+        assert_eq!(devices[0].vram_total_mb, 32614);
+    }
+
+    #[test]
+    fn test_parse_gpu_info_empty() {
+        let devices = parse_gpu_info("").unwrap();
+        assert!(devices.is_empty());
+    }
+
+    #[test]
+    fn test_parse_gpu_info_malformed() {
+        let result = parse_gpu_info("garbage data");
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_parse_driver_version() {
+        let csv = "0, NVIDIA GeForce RTX 4090, 24564, 8.9, 570.86.16\n";
+        assert_eq!(parse_driver_version(csv), Some("570.86.16".to_string()));
+    }
+
+    #[test]
+    fn test_parse_cuda_version() {
+        let nvcc = "\
+            nvcc: NVIDIA (R) Cuda compiler driver\n\
+            Copyright (c) 2005-2024 NVIDIA Corporation\n\
+            Built on Thu_Sep_12_02:18:05_PDT_2024\n\
+            Cuda compilation tools, release 12.8, V12.8.93\n";
+        assert_eq!(parse_cuda_version(nvcc), Some("12.8".to_string()));
+    }
+
+    #[test]
+    fn test_parse_cuda_version_missing() {
+        assert_eq!(parse_cuda_version("unrelated output"), None);
+    }
+
+    #[test]
+    fn test_parse_health_info() {
+        let csv = "0, 8192, 16372, 45, 62\n";
+        let health = parse_health_info(csv).unwrap();
+        assert_eq!(health.len(), 1);
+        assert_eq!(health[0].index, 0);
+        assert_eq!(health[0].vram_used_mb, 8192);
+        assert_eq!(health[0].vram_free_mb, 16372);
+        assert_eq!(health[0].utilization_pct, 45);
+        assert_eq!(health[0].temp_c, 62);
+    }
+
+    #[test]
+    fn test_parse_health_info_multi_gpu() {
+        let csv = "\
+            0, 8192, 24372, 45, 62\n\
+            1, 4096, 28468, 30, 58\n";
+        let health = parse_health_info(csv).unwrap();
+        assert_eq!(health.len(), 2);
+        assert_eq!(health[1].vram_used_mb, 4096);
+        assert_eq!(health[1].temp_c, 58);
+    }
+}
--- a/crates/neuron/src/harness/llamacpp.rs
+++ b/crates/neuron/src/harness/llamacpp.rs
@@ -0,0 +1 @@
+// llama.cpp harness implementation — Phase 11.
--- a/crates/neuron/src/harness/mistralrs.rs
+++ b/crates/neuron/src/harness/mistralrs.rs
@@ -0,0 +1,163 @@
+//! mistral.rs harness implementation.
+//!
+//! Wraps the mistral.rs HTTP API for model lifecycle management
+//! and optionally manages the process via systemd.
+
+use anyhow::Result;
+use async_trait::async_trait;
+use cortex_core::harness::{Harness, HarnessConfig, HarnessHealth, ModelInfo, ModelSpec};
+use reqwest::Client;
+use serde::Deserialize;
+
+pub struct MistralRsHarness {
+    endpoint: String,
+    systemd_unit: Option<String>,
+    client: Client,
+}
+
+impl MistralRsHarness {
+    pub fn new(endpoint: String, systemd_unit: Option<String>) -> Self {
+        Self {
+            endpoint,
+            systemd_unit,
+            client: Client::builder()
+                .timeout(std::time::Duration::from_secs(30))
+                .build()
+                .expect("failed to build HTTP client"),
+        }
+    }
+}
+
+/// Response from mistral.rs `GET /v1/models`.
+#[derive(Debug, Deserialize)]
+struct ModelsResponse {
+    data: Vec<ModelEntry>,
+}
+
+#[derive(Debug, Deserialize)]
+struct ModelEntry {
+    id: String,
+    #[serde(default)]
+    status: Option<String>,
+}
+
+#[async_trait]
+impl Harness for MistralRsHarness {
+    fn name(&self) -> &str {
+        "mistralrs"
+    }
+
+    /// Start the systemd unit, then poll `/health` (1s apart, 30 tries) for success.
+    async fn start(&self, _config: &HarnessConfig) -> Result<()> {
+        let Some(unit) = &self.systemd_unit else {
+            anyhow::bail!("no systemd unit configured for mistralrs harness");
+        };
+
+        let output = tokio::process::Command::new("systemctl")
+            .args(["start", unit])
+            .output()
+            .await?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr);
+            anyhow::bail!("systemctl start {unit} failed: {stderr}");
+        }
+
+        let url = format!("{}/health", self.endpoint);
+        for _ in 0..30 {
+            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+            let probe = self.client.get(&url).send().await;
+            // An error status means the server answered but is not healthy yet.
+            if probe.is_ok_and(|resp| resp.status().is_success()) {
+                tracing::info!(unit, "mistralrs started and healthy");
+                return Ok(());
+            }
+        }
+        anyhow::bail!("mistralrs started but health endpoint did not respond within 30s");
+    }
+
+    async fn stop(&self) -> Result<()> {
+        let Some(unit) = &self.systemd_unit else {
+            anyhow::bail!("no systemd unit configured for mistralrs harness");
+        };
+
+        let output = tokio::process::Command::new("systemctl")
+            .args(["stop", unit])
+            .output()
+            .await?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr);
+            anyhow::bail!("systemctl stop {unit} failed: {stderr}");
+        }
+        Ok(())
+    }
+
+    /// Report `running` only when `/health` answers with a success status.
+    async fn health(&self) -> HarnessHealth {
+        let url = format!("{}/health", self.endpoint);
+        let probe = self.client.get(&url).send().await;
+        HarnessHealth {
+            name: "mistralrs".into(),
+            running: probe.is_ok_and(|resp| resp.status().is_success()),
+            uptime_secs: None,
+        }
+    }
+
+    async fn list_models(&self) -> Result<Vec<ModelInfo>> {
+        let url = format!("{}/v1/models", self.endpoint);
+        let resp = self.client.get(&url).send().await?;
+
+        if !resp.status().is_success() {
+            anyhow::bail!("GET /v1/models returned {}", resp.status());
+        }
+
+        let listing: ModelsResponse = resp.json().await?;
+        let to_info = |m: ModelEntry| ModelInfo {
+            id: m.id,
+            harness: "mistralrs".into(),
+            status: m.status.unwrap_or_else(|| "loaded".into()),
+            devices: vec![],
+            vram_used_mb: None,
+        };
+        Ok(listing.data.into_iter().map(to_info).collect())
+    }
+
+    async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
+        let url = format!("{}/v1/models/reload", self.endpoint);
+        let resp = self
+            .client
+            .post(&url)
+            .json(&serde_json::json!({ "model_id": spec.model_id }))
+            .send()
+            .await?;
+
+        if !resp.status().is_success() {
+            let body = resp.text().await.unwrap_or_default();
+            anyhow::bail!("POST /v1/models/reload failed: {body}");
+        }
+        Ok(())
+    }
+
+    async fn unload_model(&self, model_id: &str) -> Result<()> {
+        let url = format!("{}/v1/models/unload", self.endpoint);
+        let resp = self
+            .client
+            .post(&url)
+            .json(&serde_json::json!({ "model_id": model_id }))
+            .send()
+            .await?;
+
+        if !resp.status().is_success() {
+            let body = resp.text().await.unwrap_or_default();
+            anyhow::bail!("POST /v1/models/unload failed: {body}");
+        }
+        Ok(())
+    }
+
+    async fn inference_endpoint(&self, _model_id: &str) -> Option<String> {
+        // mistral.rs routes internally by model name in the request body,
+        // so the inference endpoint is always the base URL.
+        Some(self.endpoint.clone())
+    }
+}
--- a/crates/neuron/src/harness/mod.rs
+++ b/crates/neuron/src/harness/mod.rs
@@ -0,0 +1,105 @@
+//! Harness registry — maps harness names to trait implementations.
+
+pub mod llamacpp;
+pub mod mistralrs;
+
+use anyhow::Result;
+use cortex_core::harness::{Harness, HarnessConfig, ModelInfo, ModelSpec};
+use std::collections::HashMap;
+
+/// Registry of available harness implementations.
+pub struct HarnessRegistry {
+    harnesses: HashMap<String, Box<dyn Harness>>,
+}
+
+impl Default for HarnessRegistry {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl HarnessRegistry {
+    pub fn new() -> Self {
+        Self {
+            harnesses: HashMap::new(),
+        }
+    }
+
+    pub fn register(&mut self, harness: Box<dyn Harness>) {
+        self.harnesses.insert(harness.name().to_string(), harness);
+    }
+
+    /// List all registered harness names, sorted so the output is stable across runs.
+    pub fn names(&self) -> Vec<String> {
+        let mut names: Vec<String> = self.harnesses.keys().cloned().collect();
+        names.sort_unstable();
+        names
+    }
+
+    /// List models from all registered harnesses; failing harnesses are logged and skipped.
+    pub async fn list_all_models(&self) -> Result<Vec<ModelInfo>> {
+        let mut all = Vec::new();
+        for harness in self.harnesses.values() {
+            match harness.list_models().await {
+                Ok(models) => all.extend(models),
+                Err(e) => {
+                    tracing::warn!(harness = harness.name(), error = %e, "failed to list models");
+                }
+            }
+        }
+        Ok(all)
+    }
+
+    /// Load a model on the harness named by `spec.harness`; unknown names are an error.
+    pub async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
+        let Some(harness) = self.harnesses.get(&spec.harness) else {
+            anyhow::bail!("unknown harness: {}", spec.harness);
+        };
+        harness.load_model(spec).await
+    }
+
+    /// Unload a model from the first harness that lists it; listing failures are logged.
+    pub async fn unload_model(&self, model_id: &str) -> Result<()> {
+        for harness in self.harnesses.values() {
+            let models = harness.list_models().await.unwrap_or_else(|e| {
+                tracing::warn!(harness = harness.name(), error = %e, "failed to list models");
+                Vec::new()
+            });
+            if models.iter().any(|m| m.id == model_id) {
+                return harness.unload_model(model_id).await;
+            }
+        }
+        anyhow::bail!("model '{model_id}' not found on any harness")
+    }
+
+    /// Get the inference endpoint for a model from the first harness that returns one.
+    pub async fn inference_endpoint(&self, model_id: &str) -> Option<String> {
+        for harness in self.harnesses.values() {
+            if let Some(url) = harness.inference_endpoint(model_id).await {
+                return Some(url);
+            }
+        }
+        None
+    }
+
+    /// Build a registry from harness configs, skipping (with a warning) any it cannot use.
+    pub fn from_configs(configs: &[HarnessConfig]) -> Self {
+        let mut registry = Self::new();
+        for config in configs {
+            match (config.name.as_str(), &config.endpoint) {
+                ("mistralrs", Some(endpoint)) => {
+                    let unit = config.systemd_unit.clone();
+                    let harness = mistralrs::MistralRsHarness::new(endpoint.clone(), unit);
+                    registry.register(Box::new(harness));
+                }
+                ("mistralrs", None) => {
+                    tracing::warn!("mistralrs harness missing endpoint, skipping");
+                }
+                (other, _) => {
+                    tracing::warn!(harness = other, "unknown harness type, skipping");
+                }
+            }
+        }
+        registry
+    }
+}
--- a/crates/neuron/src/health.rs
+++ b/crates/neuron/src/health.rs
@@ -0,0 +1,70 @@
+//! Cached GPU health monitoring via periodic nvidia-smi polling.
+
+use cortex_core::discovery::HealthResponse;
+use std::time::{Duration, Instant};
+use tokio::sync::RwLock;
+
+const POLL_INTERVAL: Duration = Duration::from_secs(5);
+
+/// Thread-safe cache for the latest GPU health reading.
+pub struct HealthCache {
+    inner: RwLock<HealthResponse>,
+    has_gpus: RwLock<bool>,
+}
+
+impl Default for HealthCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl HealthCache {
+    pub fn new() -> Self {
+        Self {
+            inner: RwLock::new(HealthResponse {
+                uptime_secs: 0,
+                devices: vec![],
+            }),
+            has_gpus: RwLock::new(false),
+        }
+    }
+
+    /// Mark whether this node has GPUs (set after discovery).
+    pub async fn set_has_gpus(&self, has_gpus: bool) {
+        *self.has_gpus.write().await = has_gpus;
+    }
+
+    /// Get a snapshot of the current health state.
+    pub async fn snapshot(&self) -> HealthResponse {
+        self.inner.read().await.clone()
+    }
+
+    /// Run forever, refreshing the cache immediately and then every 5 seconds.
+    /// Polling before the first sleep keeps `snapshot()` from serving the empty
+    /// placeholder for the whole first interval after startup.
+    pub async fn poll_loop(&self, start_time: Instant) {
+        loop {
+            // nvidia-smi only runs on GPU nodes; `None` keeps the last reading.
+            let devices = if *self.has_gpus.read().await {
+                match crate::discovery::query_health().await {
+                    Ok(devices) => Some(devices),
+                    Err(e) => {
+                        tracing::warn!(error = %e, "failed to poll GPU health");
+                        None
+                    }
+                }
+            } else {
+                None
+            };
+            {
+                // Scoped so the write guard is released before sleeping.
+                let mut health = self.inner.write().await;
+                health.uptime_secs = start_time.elapsed().as_secs();
+                if let Some(devices) = devices {
+                    health.devices = devices;
+                }
+            }
+            tokio::time::sleep(POLL_INTERVAL).await;
+        }
+    }
+}
--- a/crates/neuron/src/lib.rs
+++ b/crates/neuron/src/lib.rs
@@ -0,0 +1,5 @@
+pub mod api;
+pub mod config;
+pub mod discovery;
+pub mod harness;
+pub mod health;
--- a/crates/neuron/src/main.rs
+++ b/crates/neuron/src/main.rs
@@ -0,0 +1,77 @@
+use anyhow::Result;
+use clap::Parser;
+use neuron::{api, config::NeuronConfig, discovery, harness::HarnessRegistry, health};
+use std::sync::Arc;
+use std::time::Instant;
+use tokio::sync::RwLock;
+use tracing_subscriber::EnvFilter;
+
+#[derive(Parser)]
+#[command(name = "neuron")]
+#[command(about = "Per-node daemon for cortex inference clusters")]
+#[command(version)]
+struct Args {
+    /// Port to listen on (overrides config file).
+    #[arg(short, long)]
+    port: Option<u16>,
+
+    /// Path to the neuron config file.
+    #[arg(short, long, default_value = "neuron.toml")]
+    config: String,
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env()
+                .unwrap_or_else(|_| EnvFilter::new("info,neuron=debug")),
+        )
+        .init();
+
+    let args = Args::parse();
+
+    // figment skips a missing TOML file by default, so this fallback means invalid input.
+    let cfg = NeuronConfig::load(&args.config).unwrap_or_else(|e| {
+        tracing::warn!(path = %args.config, error = %e, "failed to load config, using defaults");
+        NeuronConfig::default()
+    });
+
+    let port = args.port.unwrap_or(cfg.port);
+    let start_time = Instant::now();
+
+    tracing::info!("running hardware discovery");
+    let mut discovery_result = discovery::discover_system().await?;
+    tracing::info!(
+        hostname = %discovery_result.hostname,
+        devices = discovery_result.devices.len(),
+        "discovery complete"
+    );
+
+    // Build harness registry from config.
+    let registry = HarnessRegistry::from_configs(&cfg.harnesses);
+    discovery_result.harnesses = registry.names();
+
+    let has_gpus = !discovery_result.devices.is_empty();
+    let health_cache = Arc::new(health::HealthCache::new());
+    health_cache.set_has_gpus(has_gpus).await;
+
+    let poller_cache = Arc::clone(&health_cache);
+    tokio::spawn(async move {
+        poller_cache.poll_loop(start_time).await;
+    });
+
+    let state = Arc::new(api::NeuronState {
+        discovery: discovery_result,
+        health_cache,
+        registry: RwLock::new(registry),
+    });
+
+    let app = api::neuron_routes().with_state(state);
+    let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port));
+    tracing::info!("neuron listening on {addr}");
+    let listener = tokio::net::TcpListener::bind(addr).await?;
+    axum::serve(listener, app).await?;
+
+    Ok(())
+}
--- a/crates/neuron/tests/api.rs
+++ b/crates/neuron/tests/api.rs
@@ -0,0 +1,249 @@
+use cortex_core::discovery::{DeviceInfo, DiscoveryResponse};
+use neuron::api::{self, NeuronState};
+use neuron::harness::HarnessRegistry;
+use neuron::health::HealthCache;
+use serde_json::json;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+async fn spawn_neuron(discovery: DiscoveryResponse) -> String {
+    let health_cache = Arc::new(HealthCache::new());
+    let registry = HarnessRegistry::new();
+
+    let state = Arc::new(NeuronState {
+        discovery,
+        health_cache,
+        registry: RwLock::new(registry),
+    });
+
+    let app = api::neuron_routes().with_state(state);
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    format!("http://{addr}")
+}
+
+fn fake_discovery() -> DiscoveryResponse {
+    DiscoveryResponse {
+        hostname: "test-node".into(),
+        os: "Linux".into(),
+        kernel: "6.19.0".into(),
+        cuda_version: Some("12.8".into()),
+        driver_version: Some("570.86.16".into()),
+        devices: vec![
+            DeviceInfo {
+                index: 0,
+                name: "NVIDIA GeForce RTX 5090".into(),
+                vram_total_mb: 32614,
+                compute_capability: "12.0".into(),
+            },
+            DeviceInfo {
+                index: 1,
+                name: "NVIDIA GeForce RTX 5090".into(),
+                vram_total_mb: 32614,
+                compute_capability: "12.0".into(),
+            },
+        ],
+        harnesses: vec![],
+    }
+}
+
+#[tokio::test]
+async fn test_discovery_endpoint() {
+    let url = spawn_neuron(fake_discovery()).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(format!("{url}/discovery"))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["hostname"], "test-node");
+    assert_eq!(body["cuda_version"], "12.8");
+
+    let devices = body["devices"].as_array().unwrap();
+    assert_eq!(devices.len(), 2);
+    assert_eq!(devices[0]["name"], "NVIDIA GeForce RTX 5090");
+    assert_eq!(devices[0]["vram_total_mb"], 32614);
+}
+
+#[tokio::test]
+async fn test_health_endpoint() {
+    let url = spawn_neuron(fake_discovery()).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(format!("{url}/health"))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["uptime_secs"], 0);
+}
+
+#[tokio::test]
+async fn test_discovery_no_gpus() {
+    let disc = DiscoveryResponse {
+        hostname: "cpu-only".into(),
+        os: "Linux".into(),
+        kernel: "6.19.0".into(),
+        cuda_version: None,
+        driver_version: None,
+        devices: vec![],
+        harnesses: vec![],
+    };
+    let url = spawn_neuron(disc).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(format!("{url}/discovery"))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["hostname"], "cpu-only");
+    assert!(body["cuda_version"].is_null());
+    assert!(body["devices"].as_array().unwrap().is_empty());
+}
+
+#[tokio::test]
+async fn test_models_empty_registry() {
+    let url = spawn_neuron(fake_discovery()).await;
+
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(format!("{url}/models"))
+        .send()
+        .await
+        .expect("request should succeed");
+
+    assert_eq!(resp.status(), 200);
+
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert!(body.as_array().unwrap().is_empty());
+}
+
+/// Spawn a mock mistral.rs backend and a neuron with the mistralrs harness
+/// pointing at it, then test the full model lifecycle through neuron's API.
+#[tokio::test]
+async fn test_models_via_mistralrs_harness() {
+    use axum::routing::{get, post};
+    use axum::{Json, Router};
+    use cortex_core::harness::HarnessConfig;
+    use serde_json::Value;
+
+    // Mock mistral.rs backend.
+    let mock_app = Router::new()
+        .route(
+            "/v1/models",
+            get(|| async {
+                Json(json!({
+                    "data": [
+                        {"id": "test-model", "status": "loaded"},
+                        {"id": "other-model", "status": "unloaded"}
+                    ]
+                }))
+            }),
+        )
+        .route(
+            "/v1/models/unload",
+            post(|Json(_body): Json<Value>| async { Json(json!({"status": "ok"})) }),
+        )
+        .route(
+            "/v1/models/reload",
+            post(|Json(_body): Json<Value>| async { Json(json!({"status": "ok"})) }),
+        );
+
+    let mock_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let mock_addr = mock_listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(mock_listener, mock_app).await.unwrap();
+    });
+    let mock_url = format!("http://{mock_addr}");
+
+    // Build neuron with mistralrs harness pointing at mock.
+    let registry = HarnessRegistry::from_configs(&[HarnessConfig {
+        name: "mistralrs".into(),
+        endpoint: Some(mock_url.clone()),
+        systemd_unit: None,
+    }]);
+
+    let health_cache = Arc::new(HealthCache::new());
+    let state = Arc::new(NeuronState {
+        discovery: fake_discovery(),
+        health_cache,
+        registry: RwLock::new(registry),
+    });
+
+    let app = api::neuron_routes().with_state(state);
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let neuron_addr = listener.local_addr().unwrap();
+    tokio::spawn(async move {
+        axum::serve(listener, app).await.unwrap();
+    });
+    let neuron_url = format!("http://{neuron_addr}");
+
+    let client = reqwest::Client::new();
+
+    // GET /models — should return models from mock mistralrs.
+    let resp = client
+        .get(format!("{neuron_url}/models"))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let models: Vec<serde_json::Value> = resp.json().await.unwrap();
+    assert_eq!(models.len(), 2);
+    assert_eq!(models[0]["id"], "test-model");
+    assert_eq!(models[0]["harness"], "mistralrs");
+    assert_eq!(models[0]["status"], "loaded");
+    assert_eq!(models[1]["id"], "other-model");
+    assert_eq!(models[1]["status"], "unloaded");
+
+    // GET /models/test-model/endpoint — should return mock URL.
+    let resp = client
+        .get(format!("{neuron_url}/models/test-model/endpoint"))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["url"], mock_url);
+
+    // POST /models/unload — should succeed.
+    let resp = client
+        .post(format!("{neuron_url}/models/unload"))
+        .json(&json!({"model_id": "test-model"}))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["status"], "unloaded");
+
+    // POST /models/load — should succeed.
+    let resp = client
+        .post(format!("{neuron_url}/models/load"))
+        .json(&json!({
+            "model_id": "test-model",
+            "harness": "mistralrs"
+        }))
+        .send()
+        .await
+        .unwrap();
+    assert_eq!(resp.status(), 200);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert_eq!(body["status"], "loaded");
+}
--- a/data/cortex.service
+++ b/data/cortex.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Cortex — inference gateway for multi-node GPU clusters
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/cortex serve --config /etc/cortex/cortex.toml
+Restart=on-failure
+RestartSec=5
+User=cortex
+Group=cortex
+
+[Install]
+WantedBy=multi-user.target
--- a/data/neuron.service
+++ b/data/neuron.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Neuron — per-node GPU discovery and harness daemon for cortex
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/neuron --config /etc/cortex/neuron.toml
+Restart=on-failure
+RestartSec=5
+User=cortex
+Group=cortex
+
+[Install]
+WantedBy=multi-user.target
--- a/models.example.toml
+++ b/models.example.toml
@@ -0,0 +1,29 @@
+# models.example.toml — model catalogue
+#
+# Copy to /etc/cortex/models.toml and adjust for your environment.
+# Describes how to serve each model. Cortex matches these profiles
+# against discovered neuron topologies for placement decisions.
+
+[[models]]
+id = "your-org/large-model"
+harness = "mistralrs"
+quant = "Q4_K_M"
+vram_mb = 19000
+min_devices = 2
+min_device_vram_mb = 10000
+pinned_on = ["gpu-large"]
+
+[[models]]
+id = "your-org/medium-model"
+harness = "mistralrs"
+quant = "Q6_K"
+vram_mb = 12000
+min_devices = 1
+pinned_on = ["gpu-medium"]
+
+[[models]]
+id = "your-org/embedding-model"
+harness = "mistralrs"
+quant = "Q8_0"
+vram_mb = 8000
+min_devices = 1
--- a/neuron.example.toml
+++ b/neuron.example.toml
@@ -0,0 +1,16 @@
+# neuron.example.toml — example configuration
+#
+# Copy to /etc/cortex/neuron.toml and adjust for your environment.
+#
+# Environment variable overrides use the NEURON_ prefix, with __ separating nested keys, e.g.:
+#   NEURON_PORT=9090
+
+port = 9090
+
+# -- Harnesses ---------------------------------------------------------------
+# Each [[harnesses]] entry declares an inference engine managed by neuron.
+
+[[harnesses]]
+name = "mistralrs"
+endpoint = "http://localhost:8080"
+systemd_unit = "mistralrs.service"
--- a/neuron.spec
+++ b/neuron.spec
@@ -0,0 +1,69 @@
+Name:           neuron
+Version:        0.1.0
+Release:        1%{?dist}
+Summary:        Per-node GPU discovery and harness management daemon for cortex
+
+License:        GPL-3.0-or-later
+URL:            https://git.lair.cafe/helexa/cortex
+Source0:        %{name}-%{version}.tar.gz
+Source1:        %{name}-%{version}-vendor.tar.gz
+
+ExclusiveArch:  x86_64
+
+BuildRequires:  rust >= 1.85
+BuildRequires:  cargo
+BuildRequires:  gcc
+BuildRequires:  systemd-rpm-macros
+
+Requires(pre):  shadow-utils
+
+%description
+Neuron is a per-node daemon for cortex inference clusters. It discovers
+local GPU hardware via nvidia-smi, manages inference harnesses (mistral.rs,
+llama.cpp), and exposes an HTTP API for model lifecycle management.
+
+%prep
+%autosetup
+tar xf %{SOURCE1}
+mkdir -p .cargo
+cat > .cargo/config.toml << 'EOF'
+[source.crates-io]
+replace-with = "vendored-sources"
+
+[source.vendored-sources]
+directory = "vendor"
+EOF
+
+%build
+cargo build --release -p neuron
+
+%install
+install -Dm755 target/release/neuron %{buildroot}%{_bindir}/neuron
+install -Dm644 data/neuron.service %{buildroot}%{_unitdir}/neuron.service
+install -dm750 %{buildroot}%{_sysconfdir}/cortex
+install -Dm640 neuron.example.toml %{buildroot}%{_sysconfdir}/cortex/neuron.toml
+
+%pre
+getent group cortex >/dev/null || groupadd -r cortex
+getent passwd cortex >/dev/null || useradd -r -g cortex -d /var/lib/cortex -s /sbin/nologin cortex
+
+%post
+%systemd_post neuron.service
+
+%preun
+%systemd_preun neuron.service
+
+%postun
+%systemd_postun_with_restart neuron.service
+
+%files
+%license LICENSE
+%doc README.md
+%{_bindir}/neuron
+%{_unitdir}/neuron.service
+%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
+%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/neuron.toml
+
+%changelog
+* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
+- Initial package
Author	SHA1	Message	Date
rob thijssen	7befa882d5	fix: yaml syntax Some checks failed CI / Format, lint, build, test (push) Successful in 1m42s Details CI / Build neuron SRPM (push) Successful in 42s Details CI / Build cortex SRPM (push) Successful in 1m40s Details CI / Publish neuron to COPR (push) Failing after 4m11s Details CI / Publish cortex to COPR (push) Failing after 3m16s Details CI / Bump version in source (push) Has been skipped Details	2026-04-16 09:25:02 +03:00
rob thijssen	d03fae960a	fix(ci): unset RUSTC_WRAPPER during sccache install All checks were successful CI / Format, lint, build, test (push) Successful in 2m40s Details CI / Build cortex SRPM (push) Has been skipped Details CI / Build neuron SRPM (push) Has been skipped Details CI / Publish cortex to COPR (push) Has been skipped Details CI / Publish neuron to COPR (push) Has been skipped Details CI / Bump version in source (push) Has been skipped Details The workflow-level env set RUSTC_WRAPPER=sccache for every step, including the install step itself. cargo install sccache then tried to invoke `sccache rustc -vV` to detect the toolchain before sccache existed on PATH, failing with "No such file or directory". Override RUSTC_WRAPPER to empty on the install step so cargo uses rustc directly; subsequent steps still inherit the wrapper. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-16 08:31:26 +03:00
rob thijssen	7b2235d56b	fix(ci): install sccache with S3 feature if missing Some checks failed CI / Format, lint, build, test (push) Failing after 4s Details CI / Build cortex SRPM (push) Has been skipped Details CI / Publish cortex to COPR (push) Has been skipped Details CI / Build neuron SRPM (push) Has been skipped Details CI / Publish neuron to COPR (push) Has been skipped Details CI / Bump version in source (push) Has been skipped Details The distro sccache package lacks S3 support. Install from cargo with --features s3 if the existing binary can't connect to the S3 backend. Skips install if already present and working. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 17:44:21 +03:00
rob thijssen	54f9f3dc36	ci: add sccache with MinIO backend for build caching Some checks failed CI / Format, lint, build, test (push) Failing after 3s Details CI / Build cortex SRPM (push) Has been skipped Details CI / Build neuron SRPM (push) Has been skipped Details CI / Publish cortex to COPR (push) Has been skipped Details CI / Publish neuron to COPR (push) Has been skipped Details CI / Bump version in source (push) Has been skipped Details All Rust compilation steps now use sccache backed by MinIO S3 at caveman.kosherinata.internal:9000. Credentials via repo secrets SCCACHE_S3_ACCESS_KEY and SCCACHE_S3_SECRET_KEY. Cache is shared across all bare metal runners. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 17:38:13 +03:00
rob thijssen	caee8bba11	fix(ci): use GITEA_TOKEN env var for push, not checkout Some checks failed CI / Format, lint, build, test (push) Successful in 2m40s Details CI / Build neuron SRPM (push) Successful in 47s Details CI / Build cortex SRPM (push) Successful in 48s Details CI / Publish cortex to COPR (push) Failing after 7s Details CI / Publish neuron to COPR (push) Failing after 3s Details CI / Bump version in source (push) Has been skipped Details Token is only needed for the authenticated push, not the public checkout. Set remote URL with token inline before pushing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 16:31:13 +03:00
rob thijssen	324dfa05c5	ci: add RPM packaging for cortex and neuron - cortex.spec: gateway binary, cortex.service systemd unit, cortex.toml + models.toml config files - neuron.spec: neuron binary, neuron.service systemd unit, neuron.toml config file - Parallel CI: srpm-cortex and srpm-neuron jobs build SRPMs concurrently, then publish to separate COPR repos (helexa/cortex and helexa/neuron) - bump-version job: after both COPR publishes succeed, stamps tag version into Cargo.toml, specs, Cargo.lock and pushes to main via GITEA_TOKEN - Shared cortex user/group across both packages - Example configs: cortex.example.toml, neuron.example.toml, models.example.toml Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 16:28:31 +03:00
rob thijssen	c85d50066e	ci: add RPM packaging for cortex and neuron - cortex.spec: gateway binary, cortex.service systemd unit, cortex.toml + models.toml config files - neuron.spec: neuron binary, neuron.service systemd unit, neuron.toml config file - Parallel CI: srpm-cortex and srpm-neuron jobs build SRPMs concurrently, then publish to separate COPR repos (helexa/cortex and helexa/neuron) - Shared cortex user/group across both packages - Example configs: cortex.example.toml, neuron.example.toml, models.example.toml Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 16:09:04 +03:00
rob thijssen	6c238f4557	refactor: rename cortex-neuron binary and crate to neuron All checks were successful CI / Format, lint, build, test (push) Successful in 2m28s Details CI / Build SRPM (push) Has been skipped Details CI / Publish to COPR (push) Has been skipped Details Package name, lib name, and binary all now just "neuron" without the cortex- prefix. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 15:51:15 +03:00
rob thijssen	e42e8ee81f	refactor: cortex talks to neurons instead of mistral.rs directly All checks were successful CI / Format, lint, build, test (push) Successful in 2m46s Details CI / Build SRPM (push) Has been skipped Details CI / Publish to COPR (push) Has been skipped Details Replace NodeConfig (static vram_mb, pinned) with NeuronEndpoint. Hardware discovery and model pinning now come from neuron API and models.toml catalogue respectively. - config.rs: nodes -> neurons, add models_config path - catalogue.rs: ModelProfile with pinned_on, ModelCatalogue - poller.rs: poll neuron GET /models (ModelInfo format) - router.rs: resolve inference endpoint via neuron GET /models/{id}/endpoint - evictor.rs: call neuron POST /models/unload - node.rs: remove vram_mb, pinned fields (come from discovery/catalogue) - All 22 gateway tests updated to mock neuron API - Remove MistralModelsResponse, ModelLifecycleRequest (no longer needed) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 14:42:52 +03:00
rob thijssen	26e5e7ead8	feat: implement mistral.rs harness and neuron model API All checks were successful CI / Format, lint, build, test (push) Successful in 2m30s Details CI / Build SRPM (push) Has been skipped Details CI / Publish to COPR (push) Has been skipped Details - MistralRsHarness: Harness trait impl wrapping mistral.rs HTTP API (list/load/unload models, health check, start/stop via systemd) - HarnessRegistry: maps harness name -> Box<dyn Harness>, built from neuron.toml config - Neuron API endpoints: GET /models, POST /models/load, POST /models/unload, GET /models/:id/endpoint - NeuronConfig: figment-based config loading from neuron.toml - Integration test: full model lifecycle through mock mistral.rs Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 14:29:42 +03:00
rob thijssen	6dc717ebcd	feat: add neuron daemon with GPU discovery and health endpoints All checks were successful CI / Format, lint, build, test (push) Successful in 2m29s Details CI / Build SRPM (push) Has been skipped Details CI / Publish to COPR (push) Has been skipped Details Replace cortex-agent stub with neuron (cortex-neuron binary). cortex-core additions: - discovery.rs: DeviceInfo, DiscoveryResponse, DeviceHealth, HealthResponse - harness.rs: Harness async trait, HarnessConfig, ModelSpec, ModelInfo neuron crate (crates/neuron/): - discovery.rs: nvidia-smi CSV parsing (pure functions) + system discovery via uname/nvidia-smi/nvcc - health.rs: cached GPU health polling every 5s - api.rs: GET /discovery and GET /health axum handlers - main.rs: CLI entrypoint with --port flag (default 9090) - harness stubs for mistralrs (Phase 8) and llamacpp (Phase 11) 12 new tests (9 unit + 3 integration), 35 total. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-15 14:23:42 +03:00
				`@@ -0,0 +1 @@`
				`// llama.cpp harness implementation — Phase 11.`