Compare commits
2 Commits
57d7ef8d3c
...
v0.1.15
| Author | SHA1 | Date | |
|---|---|---|---|
|
0184ccab28
|
|||
|
471b9b7629
|
@@ -24,19 +24,6 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Cache cargo registry and target
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/bin
|
||||
~/.cargo/registry/index
|
||||
~/.cargo/registry/cache
|
||||
~/.cargo/git/db
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-cargo-
|
||||
|
||||
- name: Ensure sccache with S3 support
|
||||
env:
|
||||
RUSTC_WRAPPER: ""
|
||||
|
||||
17
CLAUDE.md
17
CLAUDE.md
@@ -125,7 +125,8 @@ automatically. Clippy warnings must be resolved, not suppressed with
|
||||
- One or more GPU nodes running mistral.rs on port 8080
|
||||
- Optionally a metrics-only node (no GPU) for Prometheus/Grafana
|
||||
- Each node runs `mistralrs serve` on port 8080
|
||||
- Gateway listens on port 8000 (API) and 9100 (metrics)
|
||||
- Gateway listens on port 31313 (API) and 31314 (metrics)
|
||||
- neuron listens on port 13131 on each GPU host
|
||||
- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
|
||||
|
||||
## Conventions
|
||||
@@ -380,7 +381,7 @@ processes (one process per loaded model, each on its own port).
|
||||
|
||||
## neuron API
|
||||
|
||||
neuron exposes an HTTP API on port 9090 that cortex polls and calls.
|
||||
neuron exposes an HTTP API on port 13131 that cortex polls and calls.
|
||||
|
||||
```
|
||||
GET /discovery
|
||||
@@ -424,8 +425,8 @@ endpoint. cortex.toml shrinks to:
|
||||
|
||||
```toml
|
||||
[gateway]
|
||||
listen = "0.0.0.0:8000"
|
||||
metrics_listen = "0.0.0.0:9100"
|
||||
listen = "0.0.0.0:31313"
|
||||
metrics_listen = "0.0.0.0:31314"
|
||||
|
||||
[eviction]
|
||||
strategy = "lru"
|
||||
@@ -433,15 +434,15 @@ defrag_after_cycles = 50
|
||||
|
||||
[[neurons]]
|
||||
name = "beast"
|
||||
endpoint = "http://beast.hanzalova.internal:9090"
|
||||
endpoint = "http://beast.hanzalova.internal:13131"
|
||||
|
||||
[[neurons]]
|
||||
name = "benjy"
|
||||
endpoint = "http://benjy.kosherinata.internal:9090"
|
||||
endpoint = "http://benjy.hanzalova.internal:13131"
|
||||
|
||||
[[neurons]]
|
||||
name = "quadbrat"
|
||||
endpoint = "http://quadbrat.hanzalova.internal:9090"
|
||||
endpoint = "http://quadbrat.hanzalova.internal:13131"
|
||||
```
|
||||
|
||||
On startup and periodically, cortex calls `GET /discovery` and
|
||||
@@ -521,7 +522,7 @@ cortex/
|
||||
│ │ └── metrics.rs # prometheus exporter (unchanged)
|
||||
│ ├── neuron/ # node plane (replaces cortex-agent)
|
||||
│ │ └── src/
|
||||
│ │ ├── main.rs # binary entrypoint, axum server on :9090
|
||||
│ │ ├── main.rs # binary entrypoint, axum server on :13131
|
||||
│ │ ├── discovery.rs # nvidia-smi, device enumeration
|
||||
│ │ ├── health.rs # runtime GPU polling
|
||||
│ │ ├── api.rs # HTTP handlers for /discovery, /models, etc.
|
||||
|
||||
@@ -88,8 +88,8 @@ WantedBy=multi-user.target
|
||||
```toml
|
||||
# cortex.toml
|
||||
[gateway]
|
||||
listen = "0.0.0.0:8000"
|
||||
metrics_listen = "0.0.0.0:9100"
|
||||
listen = "0.0.0.0:31313"
|
||||
metrics_listen = "0.0.0.0:31314"
|
||||
|
||||
[eviction]
|
||||
strategy = "lru" # lru | priority
|
||||
@@ -143,7 +143,7 @@ cortex serve --config cortex.toml
|
||||
cortex status
|
||||
|
||||
# list all models across nodes
|
||||
curl http://localhost:8000/v1/models
|
||||
curl http://localhost:31313/v1/models
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
# Copy to cortex.toml and adjust for your environment.
|
||||
#
|
||||
# Environment variable overrides use CORTEX_ prefix with __ separators:
|
||||
# CORTEX_GATEWAY__LISTEN=0.0.0.0:9000
|
||||
# CORTEX_GATEWAY__LISTEN=0.0.0.0:31313
|
||||
|
||||
[gateway]
|
||||
listen = "0.0.0.0:8000"
|
||||
metrics_listen = "0.0.0.0:9100"
|
||||
listen = "0.0.0.0:31313"
|
||||
metrics_listen = "0.0.0.0:31314"
|
||||
|
||||
[eviction]
|
||||
strategy = "lru"
|
||||
|
||||
@@ -23,7 +23,7 @@ enum Commands {
|
||||
/// Print the fleet status (models, nodes, health).
|
||||
Status {
|
||||
/// Gateway API endpoint to query.
|
||||
#[arg(short, long, default_value = "http://localhost:8000")]
|
||||
#[arg(short, long, default_value = "http://localhost:31313")]
|
||||
endpoint: String,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -22,9 +22,9 @@ fn default_models_path() -> String {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct GatewaySettings {
|
||||
/// Address to listen on for API requests (e.g. "0.0.0.0:8000")
|
||||
/// Address to listen on for API requests (e.g. "0.0.0.0:31313")
|
||||
pub listen: String,
|
||||
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
|
||||
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:31314")
|
||||
pub metrics_listen: String,
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ pub enum EvictionStrategy {
|
||||
pub struct NeuronEndpoint {
|
||||
/// Human-readable node name (e.g. "beast")
|
||||
pub name: String,
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131")
|
||||
pub endpoint: String,
|
||||
}
|
||||
|
||||
@@ -70,8 +70,8 @@ impl Default for GatewayConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
gateway: GatewaySettings {
|
||||
listen: "0.0.0.0:8000".into(),
|
||||
metrics_listen: "0.0.0.0:9100".into(),
|
||||
listen: "0.0.0.0:31313".into(),
|
||||
metrics_listen: "0.0.0.0:31314".into(),
|
||||
},
|
||||
eviction: EvictionSettings {
|
||||
strategy: EvictionStrategy::Lru,
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::collections::HashMap;
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NodeState {
|
||||
pub name: String,
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
|
||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131").
|
||||
pub endpoint: String,
|
||||
pub healthy: bool,
|
||||
pub models: HashMap<String, ModelEntry>,
|
||||
|
||||
@@ -17,7 +17,7 @@ pub struct NeuronConfig {
|
||||
}
|
||||
|
||||
fn default_port() -> u16 {
|
||||
9090
|
||||
13131
|
||||
}
|
||||
|
||||
impl NeuronConfig {
|
||||
@@ -33,7 +33,7 @@ impl NeuronConfig {
|
||||
impl Default for NeuronConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
port: 9090,
|
||||
port: 13131,
|
||||
harnesses: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
# Copy to /etc/neuron/neuron.toml and adjust for your environment.
|
||||
#
|
||||
# Environment variable overrides use NEURON_ prefix with __ separators:
|
||||
# NEURON_PORT=9090
|
||||
# NEURON_PORT=13131
|
||||
|
||||
port = 9090
|
||||
port = 13131
|
||||
|
||||
# -- Harnesses ---------------------------------------------------------------
|
||||
# Each [[harnesses]] entry declares an inference engine managed by neuron.
|
||||
|
||||
Reference in New Issue
Block a user