Compare commits

..

2 Commits

Author SHA1 Message Date
a5bc992590 fix(rpm): explicitly Provides user(cortex)/group(cortex)
Some checks failed
CI / Format, lint, build, test (push) Successful in 1m4s
CI / Build cortex SRPM (push) Successful in 44s
CI / Build neuron SRPM (push) Successful in 1m46s
CI / Publish cortex to COPR (push) Successful in 8m49s
CI / Publish neuron to COPR (push) Successful in 9m51s
CI / Bump version in source (push) Failing after 47s
dnf5 was silently rejecting neuron-0.1.3 with "Nothing to do" because
it had an unresolvable Requires. Inspection showed:

  Requires: user(cortex)               ← unversioned
  Provides: user(cortex) = <base64>    ← versioned only, no unversioned

rpm's sysusers provides-generator only emits the unversioned user()
provide when the u-line is minimal. Our sysusers.conf specifies GECOS,
home dir, and shell, which pushes the generator to versioned-only.
The matching Requires (auto-generated from %attr(,,cortex) on config
files) is unversioned, so resolution failed silently.

Explicitly declare Provides: user(cortex) and Provides: group(cortex)
to guarantee that the unversioned forms exist. group(cortex) was already
emitted unversioned, but we declare it as well for symmetry and to
protect against future generator changes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 12:04:19 +03:00
5a86f7cc16 ci: dump COPR per-chroot build logs to CI output
Previously the COPR publish steps only surfaced copr-cli's status
updates (pending/importing/running). When a build failed, diagnosing
required clicking through to the COPR web UI. Now we submit with
--nowait, watch the build, then use copr-cli download-build to fetch
each chroot's builder-live.log and cat them as collapsible ::group::
blocks in the CI output.

Logic is factored into .gitea/scripts/copr-build.sh so cortex and
neuron jobs share it. Both COPR jobs now check out the repo to access
the script.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 11:41:15 +03:00
211 changed files with 927 additions and 61110 deletions

61
.gitea/scripts/copr-build.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/bin/bash
# Submit one or more SRPMs to COPR, watch the resulting build(s), and dump
# each chroot's builder log to stdout so it is captured in CI output.
#
# Usage: copr-build.sh <project> <srpm> [srpm...]
# Example: copr-build.sh helexa/cortex ./cortex-0.1.2-1.fc43.src.rpm
#
# Exit status:
#   2  usage error
#   1  no build ID could be parsed from copr-cli's output
#   otherwise the exit status of `copr-cli watch-build`
set -o pipefail

PROJECT="$1"
shift
if [ -z "$PROJECT" ] || [ "$#" -eq 0 ]; then
    echo "usage: $0 <project> <srpm> [srpm...]" >&2
    exit 2
fi

# Submit without waiting; capture the build ID(s) from stdout.
SUBMIT_OUT=$(copr-cli build --nowait "$PROJECT" "$@")
echo "$SUBMIT_OUT"

# Collect every ID reported on "Created builds:" lines. Passing several
# SRPMs is expected to yield several IDs (space-separated and/or on
# separate lines); taking only the first would leave the remaining builds
# unwatched and their failures unreported.
BUILD_IDS=()
read -r -a BUILD_IDS <<< "$(echo "$SUBMIT_OUT" | grep -oP 'Created builds: \K[0-9 ]+' | tr '\n' ' ')"
if [ "${#BUILD_IDS[@]}" -eq 0 ]; then
    echo "error: could not parse build ID from copr-cli output" >&2
    exit 1
fi

echo
for build_id in "${BUILD_IDS[@]}"; do
    echo "Build $build_id submitted to $PROJECT"
    echo "Follow live: https://copr.fedorainfracloud.org/coprs/build/$build_id"
done
echo

# Watch the build(s); status transitions go to stdout. A non-zero exit
# means a build failed, but propagation is deferred until after the logs
# have been fetched so the CI output contains diagnostics either way.
if copr-cli watch-build "${BUILD_IDS[@]}"; then
    STATUS=0
else
    STATUS=$?
fi

for build_id in "${BUILD_IDS[@]}"; do
    # Fetch per-chroot results (logs + rpms) into a per-build directory so
    # that builds sharing a chroot name cannot overwrite each other's logs.
    # A download failure is deliberately non-fatal: the watch status above
    # is what decides the job result.
    dest="copr-logs/${build_id}"
    mkdir -p "$dest"
    copr-cli download-build --dest "$dest" "$build_id" || {
        echo "warning: failed to download build artifacts for build $build_id" >&2
    }

    # Dump each chroot's builder-live.log as a collapsible group.
    for chroot_dir in "$dest"/*/; do
        [ -d "$chroot_dir" ] || continue
        chroot=$(basename "$chroot_dir")
        log="${chroot_dir}builder-live.log"
        # Accept both the plain log and a gzip-compressed copy, so a
        # compressed result is not skipped silently.
        if [ -f "$log" ]; then
            show_log=(cat "$log")
        elif [ -f "${log}.gz" ]; then
            show_log=(gzip -dc "${log}.gz")
        else
            continue
        fi
        echo
        echo "::group::${chroot} builder-live.log (build ${build_id})"
        "${show_log[@]}"
        echo "::endgroup::"
    done
done

exit "$STATUS"

View File

@@ -1,618 +0,0 @@
name: build-prerelease
# Builds CUDA-flavoured neuron binaries (and a single cortex binary),
# packages each as a Fedora RPM, signs them, and publishes to the
# `unstable` channel at rpm.lair.cafe.
#
# Change-aware: the `prepare` job diffs HEAD against the git sha
# embedded in the most recently *published* unstable RPM (per package)
# and skips builds whose inputs didn't change. Docs-only commits build
# nothing; gateway-only commits skip the 3 CUDA builds (and, via
# deploy.yml's own check-update gate, the neuron restarts + model
# cold-loads). Diffing against the published sha — not the previous
# push — means a failed run can never cause a change to be missed.
#
# Lint (fmt+clippy) and test run here as parallel jobs and gate
# `publish`; ci.yml no longer runs on pushes to main (see its trigger
# comment), so the two workflows stop competing for the same runners.
#
# The published packages are versioned as e.g.
# helexa-neuron-blackwell-0.1.16-0.1.20260518140530.gitabcdef0.fc43.x86_64
#                                    ^^^^^^^^^^^^^^    ^^^^^^^
#                                    commit time (s)   commit sha
# so they sort BELOW the eventual 0.1.16-1 stable release, and so two
# commits on the same day are still strictly ordered by their commit
# timestamps (rather than by RPM-vercmp's alpha-vs-digit precedence
# on the SHA fragment).
on:
# Auto-build on every push to main so the unstable channel tracks
# head without a manual dispatch step.
push:
branches: [main]
# Manual dispatch still available to build from a non-main ref.
# Dispatched runs skip change detection and build everything.
workflow_dispatch:
inputs:
ref:
description: "Git ref to build (branch / tag / commit). Defaults to the workflow's branch."
required: false
default: ""
# Coalesce same-ref pushes: a newer push cancels the older in-flight
# run — the newest commit is the one we want on the fleet. The publish
# job keeps its own `rpm-publish` group (cancel=false) so an in-flight
# repo update is never interrupted. Runners are ephemeral (one VM per
# job) so concurrent runs no longer race on a shared workspace; the
# old shared `cortex-runner-pool` group with ci.yml is gone.
concurrency:
group: build-prerelease-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_INCREMENTAL: "0"
CARGO_TERM_COLOR: "always"
jobs:
prepare:
name: Resolve version stamps + change detection
timeout-minutes: 10
runs-on: rust
outputs:
version: ${{ steps.info.outputs.version }}
release: ${{ steps.info.outputs.release }}
short_sha: ${{ steps.info.outputs.short_sha }}
commit_timestamp: ${{ steps.info.outputs.commit_timestamp }}
build_cortex: ${{ steps.changes.outputs.build_cortex }}
build_neuron: ${{ steps.changes.outputs.build_neuron }}
build_bench: ${{ steps.changes.outputs.build_bench }}
check_rust: ${{ steps.changes.outputs.check_rust }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
fetch-depth: 0
- id: info
run: |
# Derives the version/release stamps shared by every packaging job and
# publishes them as step outputs (version, release, short_sha,
# commit_timestamp).
set -eux
# Version = the quoted value on the first line of the root Cargo.toml
# that starts with `version =`.
VERSION=$(awk -F\" '/^version[[:space:]]*=/ { print $2; exit }' Cargo.toml)
SHORT_SHA=$(git rev-parse --short=7 HEAD)
# Second-precise commit timestamp gives the release stamp a
# strictly monotonic numeric prefix. The earlier %Y%m%d-only
# form let same-day builds be ordered by RPM's rpmvercmp
# rules over the SHA, which is non-chronological — e.g.
# "git602e8e1" sorts newer than "gitf9f5fa4" purely because
# rpmvercmp ranks digit-prefixed segments above alpha ones.
# The SHA stays only as a debug identifier; sort order is
# decided entirely by the timestamp.
# NOTE(review): `--date=format:` appears to render %cd in the timezone
# recorded in the commit rather than in UTC, so commits made under
# different UTC offsets (or across a DST change) could produce stamps
# that are not chronological — confirm against the git documentation
# before relying on strict monotonicity. Switching to UTC would also
# shift new stamps relative to already-published ones.
COMMIT_TIMESTAMP=$(git log -1 --format=%cd --date=format:%Y%m%d%H%M%S HEAD)
# Release stamp layout: 0.1.<YYYYmmddHHMMSS>.git<7-char sha>.
RELEASE="0.1.${COMMIT_TIMESTAMP}.git${SHORT_SHA}"
echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
echo "release=${RELEASE}" >> "$GITHUB_OUTPUT"
echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
echo "commit_timestamp=${COMMIT_TIMESTAMP}" >> "$GITHUB_OUTPUT"
- id: changes
run: |
set -ux
# Default: build everything. Detection only ever narrows
# this, and any failure along the way (manifest unreachable,
# unparsable, sha not in history after a force-push) leaves
# the full build in place. Manual dispatches always build
# everything — predictable when building odd refs.
BUILD_CORTEX=true
BUILD_NEURON=true
BUILD_BENCH=true
CHECK_RUST=true
if [ "${GITHUB_EVENT_NAME}" = "push" ]; then
MANIFEST_URL="https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json"
if curl -fsS --max-time 20 -o /tmp/packages.json "$MANIFEST_URL"; then
# base_for <package-name>
# Prints the git sha embedded in the release string of the most recently
# published package with that name (largest buildTime in
# /tmp/packages.json). Prints nothing on stdout when no sha can be
# determined; the caller treats an empty result as "no usable base".
# Failures stay non-fatal, but the reason is reported on stderr so the job
# log shows why change detection fell back to a full build.
base_for() {
    python3 - "$1" <<'PY'
import json, re, sys

name = sys.argv[1]
try:
    with open("/tmp/packages.json") as f:
        pkgs = json.load(f)["packages"]
    cands = [p for p in pkgs if p.get("name") == name]
    if cands:
        latest = max(cands, key=lambda p: p.get("buildTime", 0))
        m = re.search(r"git\.?([0-9a-f]{7,40})", latest.get("release", ""))
        if m:
            print(m.group(1))
except Exception as exc:
    # Best-effort by design: an unreadable or unexpected manifest must not
    # fail the job, it only disables narrowing. stdout stays empty because
    # the caller captures it as the base sha.
    print(f"base_for({name}): no base sha ({exc!r})", file=sys.stderr)
PY
}
# decide <base-sha> <path-regex>
# Echoes "true" when the package must be built and "false" otherwise.
# A build is required when there is no usable base (empty sha, sha not a
# known commit, or sha not an ancestor of HEAD), or when any path changed
# since the base matches the extended regex.
decide() {
    local published_sha="$1" path_re="$2"

    # Guard clauses: every "no usable base" case means build.
    [ -n "$published_sha" ] || { echo true; return; }
    git cat-file -e "${published_sha}^{commit}" 2>/dev/null || { echo true; return; }
    git merge-base --is-ancestor "$published_sha" HEAD 2>/dev/null || { echo true; return; }

    # Usable base: build only if the diff touches a matching path.
    local verdict=false
    if git diff --name-only "${published_sha}..HEAD" | grep -qE "$path_re"; then
        verdict=true
    fi
    echo "$verdict"
}
# cortex-core is shared by both binaries; Cargo.{toml,lock}
# affect both; this workflow file affects both.
NEURON_RE='^crates/neuron/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-neuron-prerelease\.spec$|^data/neuron|^neuron\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
CORTEX_RE='^crates/cortex-gateway/|^crates/cortex-cli/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/cortex-prerelease\.spec$|^data/cortex|^cortex\.example\.toml$|^models\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
BENCH_RE='^crates/helexa-bench/|^crates/cortex-core/|^Cargo\.toml$|^Cargo\.lock$|^rpm/helexa-bench-prerelease\.spec$|^data/helexa-bench|^helexa-bench\.example\.toml$|^\.gitea/workflows/build-prerelease\.yml$'
# Any Rust change (incl. crates not packaged here, e.g.
# helexa-acp) still needs lint+test on main.
RUST_RE='\.rs$|^crates/|Cargo\.toml$|^Cargo\.lock$'
CORTEX_BASE=$(base_for cortex)
NEURON_BASE=$(base_for helexa-neuron-blackwell)
BENCH_BASE=$(base_for helexa-bench)
BUILD_CORTEX=$(decide "$CORTEX_BASE" "$CORTEX_RE")
BUILD_NEURON=$(decide "$NEURON_BASE" "$NEURON_RE")
BUILD_BENCH=$(decide "$BENCH_BASE" "$BENCH_RE")
if [ "$BUILD_CORTEX" = "true" ] || [ "$BUILD_NEURON" = "true" ] || [ "$BUILD_BENCH" = "true" ]; then
CHECK_RUST=true
else
CHECK_RUST=$(decide "$CORTEX_BASE" "$RUST_RE")
fi
fi
fi
echo "build_cortex=${BUILD_CORTEX}" >> "$GITHUB_OUTPUT"
echo "build_neuron=${BUILD_NEURON}" >> "$GITHUB_OUTPUT"
echo "build_bench=${BUILD_BENCH}" >> "$GITHUB_OUTPUT"
echo "check_rust=${CHECK_RUST}" >> "$GITHUB_OUTPUT"
echo "### change detection: build_cortex=${BUILD_CORTEX} build_neuron=${BUILD_NEURON} build_bench=${BUILD_BENCH} check_rust=${CHECK_RUST}"
# fmt + clippy + test moved here from ci.yml for main pushes so the
# two workflows stop queueing against each other (ci.yml's checks
# used to delay build-cortex by ~12 minutes on the shared runner
# pool). They run in parallel with the builds and gate `publish`,
# not the builds themselves — a clippy warning still can't reach the
# fleet, but it also doesn't serialize the pipeline.
lint:
name: Lint (fmt + clippy)
timeout-minutes: 25
needs: prepare
if: needs.prepare.outputs.check_rust == 'true'
runs-on: rust
env:
RUSTC_WRAPPER: sccache
SCCACHE_BUCKET: sccache
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
SCCACHE_REGION: auto
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- run: cargo fmt --check --all
# Failure-aware sccache escalation lives in the shared script: a
# signal death (rustc SIGSEGV / OOM-kill) keeps the cache and fails
# fast instead of triggering a slower uncached rebuild; only a real
# sccache fault drops the cache. See script/ci-cargo-escalate.sh.
- name: Clippy (sccache escalation)
run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings
test:
name: Test
timeout-minutes: 25
needs: prepare
if: needs.prepare.outputs.check_rust == 'true'
runs-on: rust
env:
RUSTC_WRAPPER: sccache
SCCACHE_BUCKET: sccache
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
SCCACHE_REGION: auto
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
# See script/ci-cargo-escalate.sh for the escalation rationale.
- name: Test (sccache escalation)
run: script/ci-cargo-escalate.sh cargo test --workspace
build-cortex:
name: Build cortex binary
timeout-minutes: 25
needs: prepare
if: needs.prepare.outputs.build_cortex == 'true'
# runner-rust image already provides rust/cargo/clippy/rustfmt via
# dnf — no rustup install step needed.
runs-on: rust
env:
RUSTC_WRAPPER: sccache
SCCACHE_BUCKET: sccache
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
SCCACHE_REGION: auto
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
# See script/ci-cargo-escalate.sh for the escalation rationale.
- name: Build cortex (release, sccache escalation)
run: script/ci-cargo-escalate.sh cargo build --release -p cortex-cli
- name: Stage binary
run: |
mkdir --parents artifacts
cp target/release/cortex artifacts/cortex
./artifacts/cortex --version || true
- uses: actions/upload-artifact@v3
with:
name: cortex-fc43
path: artifacts/cortex
retention-days: 1
build-bench:
name: Build helexa-bench binary
timeout-minutes: 25
needs: prepare
if: needs.prepare.outputs.build_bench == 'true'
# Pure-Rust, non-CUDA binary — same runner as cortex.
runs-on: rust
env:
RUSTC_WRAPPER: sccache
SCCACHE_BUCKET: sccache
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
SCCACHE_REGION: auto
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- name: Build helexa-bench (release, sccache escalation)
run: |
# Stamp the SHA helexa-bench records as bench_sha against every
# run (option_env! in sweep.rs reads it at compile time).
export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
script/ci-cargo-escalate.sh cargo build --release -p helexa-bench
- name: Stage binary
run: |
mkdir --parents artifacts
cp target/release/helexa-bench artifacts/helexa-bench
./artifacts/helexa-bench --version || true
- uses: actions/upload-artifact@v3
with:
name: bench-fc43
path: artifacts/helexa-bench
retention-days: 1
build-neuron:
name: Build neuron-${{ matrix.flavour }}
timeout-minutes: 35
needs: prepare
if: needs.prepare.outputs.build_neuron == 'true'
strategy:
fail-fast: false
matrix:
include:
- flavour: ampere
compute_cap: "86"
runner: cuda-13.0
cuda_home: /usr/local/cuda-13.0
build_jobs: 8
nvcc_threads: 4
cargo_features: "cuda cudnn"
- flavour: ada
compute_cap: "89"
runner: cuda-13.0
cuda_home: /usr/local/cuda-13.0
build_jobs: 8
nvcc_threads: 4
cargo_features: "cuda cudnn"
- flavour: blackwell
compute_cap: "120"
runner: cuda-13.0
cuda_home: /usr/local/cuda-13.0
build_jobs: 8
nvcc_threads: 4
cargo_features: "cuda cudnn"
runs-on: ${{ matrix.runner }}
env:
SCCACHE_BUCKET: sccache
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
SCCACHE_REGION: auto
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
# sccache handling + failure classification lives in
# script/ci-cargo-escalate.sh: it probes for sccache (the CUDA
# image may not ship it — a missing binary degrades to an uncached
# build rather than failing at `sccache rustc -vV`), and a rustc
# SIGSEGV / OOM-kill keeps the cache and fails fast instead of
# escalating to a slower uncached rebuild. The cache covers the
# ~600-crate host-side dep tree (the bulk of the 10-14 min build),
# shared across all three flavours, so even one run seeds the next.
- name: Build neuron with CUDA (${{ matrix.flavour }})
run: |
export PATH="${{ matrix.cuda_home }}/bin:${PATH}"
export LD_LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LD_LIBRARY_PATH:-}"
export LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LIBRARY_PATH:-}"
# Pin the build SHA neuron reports from GET /version. The git
# fallback in build.rs would also work on a full checkout, but
# injecting the exact checked-out commit is unambiguous under
# shallow/detached states and makes the artifact self-describing.
export HELEXA_BUILD_SHA="$(git rev-parse HEAD)"
script/ci-cargo-escalate.sh cargo build --release -p neuron --features "${{ matrix.cargo_features }}"
env:
CUDA_COMPUTE_CAP: ${{ matrix.compute_cap }}
CARGO_BUILD_JOBS: ${{ matrix.build_jobs }}
NVCC_THREADS: ${{ matrix.nvcc_threads }}
- name: Stage binary
run: |
mkdir --parents artifacts
cp target/release/neuron artifacts/neuron-${{ matrix.flavour }}
file "artifacts/neuron-${{ matrix.flavour }}"
- uses: actions/upload-artifact@v3
with:
name: neuron-${{ matrix.flavour }}-fc43
path: artifacts/neuron-${{ matrix.flavour }}
retention-days: 1
package-cortex:
name: Package cortex RPM
timeout-minutes: 20
needs: [prepare, build-cortex]
runs-on: rpm
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- uses: actions/download-artifact@v3
with:
name: cortex-fc43
path: artifacts/
- name: Build RPM
run: |
set -eux
rm -f ~/.rpmmacros
rpmdev-setuptree
cp artifacts/cortex ~/rpmbuild/SOURCES/
cp data/cortex.service ~/rpmbuild/SOURCES/
cp data/cortex-sysusers.conf ~/rpmbuild/SOURCES/
cp data/cortex-firewalld.xml ~/rpmbuild/SOURCES/
cp cortex.example.toml ~/rpmbuild/SOURCES/
cp models.example.toml ~/rpmbuild/SOURCES/
cp LICENSE ~/rpmbuild/SOURCES/
rpmbuild -bb rpm/cortex-prerelease.spec \
--define "cortex_version ${{ needs.prepare.outputs.version }}" \
--define "cortex_prerelease ${{ needs.prepare.outputs.release }}" \
--undefine dist \
--define "dist .fc43"
- uses: actions/upload-artifact@v3
with:
name: rpm-cortex-fc43
path: ~/rpmbuild/RPMS/x86_64/*.rpm
retention-days: 7
package-bench:
name: Package helexa-bench RPM
timeout-minutes: 20
needs: [prepare, build-bench]
runs-on: rpm
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- uses: actions/download-artifact@v3
with:
name: bench-fc43
path: artifacts/
- name: Build RPM
run: |
set -eux
rm -f ~/.rpmmacros
rpmdev-setuptree
cp artifacts/helexa-bench ~/rpmbuild/SOURCES/
cp data/helexa-bench.service ~/rpmbuild/SOURCES/
cp data/helexa-bench-sysusers.conf ~/rpmbuild/SOURCES/
cp data/helexa-bench-firewalld.xml ~/rpmbuild/SOURCES/
cp helexa-bench.example.toml ~/rpmbuild/SOURCES/
cp LICENSE ~/rpmbuild/SOURCES/
rpmbuild -bb rpm/helexa-bench-prerelease.spec \
--define "bench_version ${{ needs.prepare.outputs.version }}" \
--define "bench_prerelease ${{ needs.prepare.outputs.release }}" \
--undefine dist \
--define "dist .fc43"
- uses: actions/upload-artifact@v3
with:
name: rpm-bench-fc43
path: ~/rpmbuild/RPMS/x86_64/*.rpm
retention-days: 7
package-neuron:
name: Package helexa-neuron-${{ matrix.flavour }} RPM
timeout-minutes: 20
needs: [prepare, build-neuron]
runs-on: rpm
strategy:
fail-fast: false
matrix:
include:
- flavour: ampere
- flavour: ada
- flavour: blackwell
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- uses: actions/download-artifact@v3
with:
name: neuron-${{ matrix.flavour }}-fc43
path: artifacts/
- name: Build RPM
run: |
set -eux
rm -f ~/.rpmmacros
rpmdev-setuptree
cp artifacts/neuron-${{ matrix.flavour }} ~/rpmbuild/SOURCES/
cp data/neuron.service ~/rpmbuild/SOURCES/
cp data/neuron-sysusers.conf ~/rpmbuild/SOURCES/
cp data/neuron-firewalld.xml ~/rpmbuild/SOURCES/
cp neuron.example.toml ~/rpmbuild/SOURCES/
cp LICENSE ~/rpmbuild/SOURCES/
rpmbuild -bb rpm/helexa-neuron-prerelease.spec \
--define "neuron_version ${{ needs.prepare.outputs.version }}" \
--define "neuron_flavour ${{ matrix.flavour }}" \
--define "neuron_prerelease ${{ needs.prepare.outputs.release }}" \
--undefine dist \
--define "dist .fc43"
- uses: actions/upload-artifact@v3
with:
name: rpm-neuron-${{ matrix.flavour }}-fc43
path: ~/rpmbuild/RPMS/x86_64/*.rpm
retention-days: 7
publish:
name: Publish to rpm.lair.cafe (unstable)
timeout-minutes: 25
needs: [lint, test, package-cortex, package-neuron, package-bench]
# Runs when at least one package was built and nothing failed.
# lint/test may be skipped (docs-only refs never get here because
# no packages build), but a real failure in any blocks the
# fleet from receiving the RPMs.
if: >-
${{
!cancelled()
&& (needs.lint.result == 'success' || needs.lint.result == 'skipped')
&& (needs.test.result == 'success' || needs.test.result == 'skipped')
&& (needs.package-cortex.result == 'success' || needs.package-neuron.result == 'success' || needs.package-bench.result == 'success')
&& needs.package-cortex.result != 'failure'
&& needs.package-neuron.result != 'failure'
&& needs.package-bench.result != 'failure'
}}
runs-on: rpm
concurrency:
group: rpm-publish
cancel-in-progress: false
env:
RPM_REPO_HOST: oolon.kosherinata.internal
FEDORA_VERSION: "43"
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- name: Download all built RPMs
uses: actions/download-artifact@v3
with:
path: rpms/
pattern: rpm-*-fc43
- name: Flatten RPM artifacts
run: |
set -eux
find rpms/ -name '*.rpm' -exec mv --target-directory=rpms/ {} +
find rpms/ -mindepth 1 -type d -empty -delete
ls -la rpms/
- name: Check for sequoia-sq
run: |
if ! command -v sq &> /dev/null; then
echo "ERROR: sequoia-sq is not installed. Install with: sudo dnf install sequoia-sq"
exit 1
fi
- name: Import signing key
env:
# Pass secrets via env so values stay out of the rendered shell
# script (which Gitea includes in step logs). Template
# expansion of ${{ secrets.X }} inside `run:` writes the literal
# value into the script and depends on Gitea's log masker to
# scrub it — fragile for multi-line keys.
RPM_SIGNING_KEY: ${{ secrets.RPM_SIGNING_KEY }}
RPM_SIGNING_KEY_ID: ${{ secrets.RPM_SIGNING_KEY_ID }}
run: |
echo "$RPM_SIGNING_KEY" | gpg --batch --import
fpr=$(gpg --batch --with-colons --list-keys "$RPM_SIGNING_KEY_ID" | awk -F: '/^fpr:/ { print $10; exit }')
echo "${fpr}:6:" | gpg --batch --import-ownertrust
sed "s/@GPG_NAME@/$RPM_SIGNING_KEY_ID/" rpm/rpmmacros > ~/.rpmmacros
- name: Sign RPMs
run: |
set -eux
for rpm in rpms/*.rpm; do
echo "signing ${rpm}..."
rpm --addsign "${rpm}"
done
- name: Set up SSH for rsync
run: |
install --directory --mode 700 ~/.ssh
echo "${RSYNC_SSH_KEY}" | install --mode 600 /dev/stdin ~/.ssh/id_ed25519
env:
RSYNC_SSH_KEY: ${{ secrets.RSYNC_SSH_KEY }}
- name: Test SSH connectivity
run: |
ssh -o StrictHostKeyChecking=accept-new "gitea_ci@${RPM_REPO_HOST}" exit
- name: Ensure unstable repo directory exists
run: |
ssh "gitea_ci@${RPM_REPO_HOST}" \
"mkdir --parents /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable"
- name: Sync RPMs to unstable repo
run: |
rsync \
--archive \
--verbose \
--chmod D755,F644 \
rpms/*.rpm \
"gitea_ci@${RPM_REPO_HOST}:/var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/"
- name: Update unstable repo metadata
run: |
ssh "gitea_ci@${RPM_REPO_HOST}" \
"cd /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable && createrepo_c --update ."
- name: Generate packages.json manifest
run: |
scp script/generate-packages-json.py "gitea_ci@${RPM_REPO_HOST}:/tmp/"
ssh "gitea_ci@${RPM_REPO_HOST}" \
"python3 /tmp/generate-packages-json.py \
--repodata-dir /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/repodata \
--output /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/packages.json \
--base-url https://rpm.lair.cafe/fedora/${FEDORA_VERSION}/x86_64/unstable"

View File

@@ -1,26 +1,12 @@
name: CI
# Pushes to main are deliberately excluded: build-prerelease.yml runs
# its own lint/test jobs there (gating publish), and running both
# workflows on the same push made them queue against each other on the
# same runner labels — ~12 minutes of added latency per deploy. Feature
# branches, PRs to main, and release tags keep the full gate here.
on:
push:
branches-ignore: [main]
branches: ["**"]
tags: ["v*"]
pull_request:
branches: [main]
# Coalesce same-ref pushes; a newer push supersedes the in-flight run.
# (The old shared `cortex-runner-pool` group with build-prerelease.yml
# is gone — the workflows no longer trigger on the same refs, and
# ephemeral one-VM-per-job runners removed the shared-workspace race
# that group existed to serialize.)
concurrency:
group: ci-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_INCREMENTAL: "0"
RUSTC_WRAPPER: sccache
@@ -30,103 +16,56 @@ env:
SCCACHE_S3_USE_SSL: "false"
AWS_ACCESS_KEY_ID: ${{ secrets.SCCACHE_S3_ACCESS_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
# fmt, clippy, and test all run in parallel on the same `rust` runner
# and would otherwise share /root/.cache/act/<hash>/hostexecutor/target/,
# racing each other's cargo temp files (.tmpXXXXXX) and failing builds
# mid-compile. Give each job its own target directory so the invocations
# don't collide. sccache still backs the actual rustc cache, so the
# rebuild penalty is small.
CARGO_TARGET_DIR: target-${{ github.job }}
jobs:
fmt:
name: Format
timeout-minutes: 15
runs-on: rust
check:
name: Format, lint, build, test
runs-on: fedora
steps:
- uses: actions/checkout@v4
- run: cargo fmt --check --all
clippy:
name: Clippy
timeout-minutes: 25
runs-on: rust
steps:
- uses: actions/checkout@v4
# Failure-aware sccache escalation lives in the shared script (kept
# in sync with build-prerelease.yml): a signal death (rustc SIGSEGV
# / OOM-kill) keeps the cache and fails fast instead of an uncached
# rebuild; only a real sccache fault drops the cache.
- name: Clippy (sccache escalation)
run: script/ci-cargo-escalate.sh cargo clippy --workspace -- -D warnings
- name: Cache cargo registry and target
uses: actions/cache@v4
with:
path: |
~/.cargo/bin
~/.cargo/registry/index
~/.cargo/registry/cache
~/.cargo/git/db
target
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
${{ runner.os }}-cargo-
test:
name: Test
timeout-minutes: 25
runs-on: rust
steps:
- uses: actions/checkout@v4
# See script/ci-cargo-escalate.sh for the escalation rationale.
- name: Test (sccache escalation)
run: script/ci-cargo-escalate.sh cargo test --workspace
# Type-check the CUDA-only code path. Borrow-check-only — we
# never run the tests here (the runner has no GPU). This catches
# the category of bug where a refactor compiles fine under the
# default feature set (which is what the `clippy` and `test` jobs
# exercise) but fails inside a `#[cfg(feature = "cuda")]` block.
# `runs-on: cuda-13.0` selects the runner that ships nvcc /
# cudarc's build prerequisites. The generic `rust` and `rpm`
# runners don't have them (the previous label `rpm` was tried
# first and tripped cudarc's `nvcc --version` build script —
# see commit history).
cuda-check:
name: CUDA type-check
timeout-minutes: 35
runs-on: cuda-13.0
# The workflow-level env sets `RUSTC_WRAPPER: sccache`
# unconditionally, which hard-fails cargo if the CUDA image
# doesn't ship sccache. Clear it at job level; the "Enable
# sccache when available" step opts back in only after probing
# for the binary. SCCACHE_*/AWS creds stay set — harmless when
# the wrapper is off, required when it's on.
env:
RUSTC_WRAPPER: ""
# candle-kernels' build script falls back to `nvidia-smi` for
# compute-cap detection when this is unset — and the GPU-less
# builder image doesn't ship nvidia-smi. Any valid cap works for
# a borrow-check; the real per-flavour caps live in
# build-prerelease.yml's matrix.
CUDA_COMPUTE_CAP: "86"
steps:
- uses: actions/checkout@v4
# sccache probing + failure classification lives in the shared
# script (see build-prerelease.yml's neuron build for the same
# pattern). It probes for sccache and, on a rustc SIGSEGV / OOM,
# keeps the cache and fails fast rather than rebuilding uncached.
- name: cargo check --features cuda (sccache escalation)
- name: Ensure sccache with S3 support
env:
RUSTC_WRAPPER: ""
run: |
# act launches the step shell without /etc/profile, so the
# gitea_runner user's inherited PATH lacks /usr/local/cuda-13.0/bin.
# cudarc's build.rs shells out to `nvcc --version` (the neuron
# crate enables cuda-version-from-build-system) and panics with
# ENOENT if nvcc isn't resolvable — keep this export in sync
# with build-prerelease.yml.
export PATH="/usr/local/cuda-13.0/bin:${PATH}"
export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
script/ci-cargo-escalate.sh cargo check -p neuron --features cuda --all-targets
if sccache --version 2>/dev/null && sccache --show-stats 2>/dev/null; then
echo "sccache with S3 support already installed"
else
cargo install sccache --features s3 --locked
fi
- name: Check formatting
run: cargo fmt --check --all
- name: Clippy
run: cargo clippy --workspace -- -D warnings
- name: Test
run: cargo test --workspace
- name: Show sccache stats
run: sccache --show-stats
srpm-cortex:
name: Build cortex SRPM
timeout-minutes: 25
runs-on: rpm
needs: [fmt, clippy, test, cuda-check]
runs-on: fedora
needs: check
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine version
id: version
@@ -140,12 +79,6 @@ jobs:
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
- name: Generate changelog entry
uses: https://git.lair.cafe/actions/rpm-changelog@v1
with:
spec: cortex.spec
version: ${{ steps.version.outputs.VERSION }}
- name: Generate source tarball
run: |
set -ex
@@ -180,14 +113,11 @@ jobs:
srpm-neuron:
name: Build neuron SRPM
timeout-minutes: 25
runs-on: rpm
needs: [fmt, clippy, test, cuda-check]
runs-on: fedora
needs: check
if: startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine version
id: version
@@ -199,37 +129,31 @@ jobs:
run: |
VERSION="${{ steps.version.outputs.VERSION }}"
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
sed -i "s/^Version:.*/Version: ${VERSION}/" helexa-neuron.spec
- name: Generate changelog entry
uses: https://git.lair.cafe/actions/rpm-changelog@v1
with:
spec: helexa-neuron.spec
version: ${{ steps.version.outputs.VERSION }}
sed -i "s/^Version:.*/Version: ${VERSION}/" neuron.spec
- name: Generate source tarball
run: |
set -ex
VERSION="${{ steps.version.outputs.VERSION }}"
tar czf /tmp/helexa-neuron-${VERSION}.tar.gz \
--transform "s,^\.,helexa-neuron-${VERSION}," \
tar czf /tmp/neuron-${VERSION}.tar.gz \
--transform "s,^\.,neuron-${VERSION}," \
--exclude='./target' \
--exclude='./.git' \
--exclude='*.tar.gz' \
--exclude='*.src.rpm' \
.
mv /tmp/helexa-neuron-${VERSION}.tar.gz .
mv /tmp/neuron-${VERSION}.tar.gz .
- name: Vendor Rust dependencies
run: |
VERSION="${{ steps.version.outputs.VERSION }}"
cargo vendor vendor/
tar czf helexa-neuron-${VERSION}-vendor.tar.gz vendor/
tar czf neuron-${VERSION}-vendor.tar.gz vendor/
rm -rf vendor/
- name: Build SRPM
run: |
rpmbuild -bs helexa-neuron.spec \
rpmbuild -bs neuron.spec \
--define "_sourcedir $(pwd)" \
--define "_srcrpmdir $(pwd)"
@@ -241,86 +165,67 @@ jobs:
copr-cortex:
name: Publish cortex to COPR
timeout-minutes: 60
runs-on: fedora-43
runs-on: fedora
needs: srpm-cortex
steps:
- uses: actions/checkout@v4
- name: Download SRPM
uses: actions/download-artifact@v3
with:
name: srpm-cortex
- name: Publish to COPR
uses: https://git.lair.cafe/actions/copr-publish@v1
with:
project: helexa/helexa
srpm: "*.src.rpm"
copr-config: ${{ secrets.COPR_CONFIG }}
- name: Configure copr-cli
run: |
mkdir -p ~/.config
echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
- name: Submit build to COPR
run: bash .gitea/scripts/copr-build.sh helexa/cortex *.src.rpm
copr-neuron:
name: Publish neuron to COPR
timeout-minutes: 60
runs-on: fedora-43
runs-on: fedora
needs: srpm-neuron
steps:
- uses: actions/checkout@v4
- name: Download SRPM
uses: actions/download-artifact@v3
with:
name: srpm-neuron
- name: Publish to COPR
uses: https://git.lair.cafe/actions/copr-publish@v1
with:
project: helexa/helexa
srpm: "*.src.rpm"
copr-config: ${{ secrets.COPR_CONFIG }}
- name: Configure copr-cli
run: |
mkdir -p ~/.config
echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
- name: Submit build to COPR
run: bash .gitea/scripts/copr-build.sh helexa/neuron *.src.rpm
bump-version:
name: Bump version in source
timeout-minutes: 15
runs-on: rust
runs-on: fedora
needs: [copr-cortex, copr-neuron]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine version
id: version
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
- name: Stamp version
run: |
VERSION="${{ steps.version.outputs.VERSION }}"
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
sed -i "s/^Version:.*/Version: ${VERSION}/" helexa-neuron.spec
cargo check --workspace 2>/dev/null || true
- name: Generate cortex changelog entry
uses: https://git.lair.cafe/actions/rpm-changelog@v1
with:
spec: cortex.spec
version: ${{ steps.version.outputs.VERSION }}
- name: Generate helexa-neuron changelog entry
uses: https://git.lair.cafe/actions/rpm-changelog@v1
with:
spec: helexa-neuron.spec
version: ${{ steps.version.outputs.VERSION }}
- name: Commit and push
- name: Stamp version and push
env:
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
run: |
VERSION="${{ steps.version.outputs.VERSION }}"
VERSION="${GITHUB_REF#refs/tags/v}"
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
sed -i "s/^Version:.*/Version: ${VERSION}/" neuron.spec
cargo check --workspace 2>/dev/null || true
git config user.name "Gitea Actions"
git config user.email "actions@git.lair.cafe"
git add Cargo.toml Cargo.lock cortex.spec helexa-neuron.spec
git add Cargo.toml Cargo.lock cortex.spec neuron.spec
if git diff --cached --quiet; then
echo "Nothing to commit for ${VERSION}"
echo "Version already at ${VERSION}"
else
git commit -m "chore: bump version to ${VERSION}"
git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/${{ github.repository }}.git"
git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
git push origin HEAD:main
fi

View File

@@ -1,136 +0,0 @@
name: deploy-dev
# Fast-path iteration deploy for a SINGLE neuron host: build one CUDA
# flavour, copy the raw binary to the host, restart neuron.service.
# Skips the other two flavours, all RPM packaging, signing, repo
# publish, and dnf — push-to-testable drops from ~20 min to roughly
# one CUDA build plus a service restart.
#
# This is a DEV convenience, not a release path:
# - the binary lands at /usr/bin/neuron *outside* RPM ownership;
# the next regular deploy.yml run reconciles the host back to the
# packaged binary (dnf sees the newer RPM and reinstalls). `rpm -V
# helexa-neuron-<flavour>` flagging a modified /usr/bin/neuron in
# the interim is expected.
# - nothing is published; other hosts are untouched.
# - requires the `install` sudoers rule from
# asset/sudoers.d/neuron-host.conf (re-run script/infra-setup.sh
# after updating it).
#
# Trigger from the Gitea UI: Actions → deploy-dev → Run workflow,
# pick the target host. Defaults to the ref you dispatch from, so it
# works from feature branches without touching main.
on:
workflow_dispatch:
inputs:
target:
description: "neuron host to deploy to"
required: true
type: choice
options: [beast, benjy, quadbrat]
default: beast
# One dev deploy at a time; a newer dispatch for the same host wins.
concurrency:
group: deploy-dev-${{ inputs.target }}
cancel-in-progress: true
env:
CARGO_INCREMENTAL: "0"
CARGO_TERM_COLOR: "always"
jobs:
build:
name: Build neuron (${{ inputs.target }})
runs-on: cuda-13.0
outputs:
flavour: ${{ steps.map.outputs.flavour }}
steps:
- uses: actions/checkout@v4
# host → flavour → compute cap. Keep in sync with the
# build-neuron matrix in build-prerelease.yml and the
# deploy-neurons matrix in deploy.yml.
- id: map
run: |
case "${{ inputs.target }}" in
beast) flavour=blackwell cap=120 ;;
benjy) flavour=ada cap=89 ;;
quadbrat) flavour=ampere cap=86 ;;
*) echo "unknown target ${{ inputs.target }}"; exit 1 ;;
esac
echo "flavour=${flavour}" >> "$GITHUB_OUTPUT"
echo "cap=${cap}" >> "$GITHUB_OUTPUT"
- name: Build neuron with CUDA
run: |
set -eux
export PATH="/usr/local/cuda-13.0/bin:${PATH}"
export LD_LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH:-}"
export LIBRARY_PATH="/usr/local/cuda-13.0/targets/x86_64-linux/lib:/usr/local/cuda-13.0/lib64:${LIBRARY_PATH:-}"
cargo build --release -p neuron --features "cuda cudnn"
env:
CUDA_COMPUTE_CAP: ${{ steps.map.outputs.cap }}
CARGO_BUILD_JOBS: "8"
NVCC_THREADS: "4"
- name: Stage binary
run: |
mkdir --parents artifacts
cp target/release/neuron artifacts/neuron-dev
file artifacts/neuron-dev
- uses: actions/upload-artifact@v3
with:
name: neuron-dev-${{ inputs.target }}
path: artifacts/neuron-dev
retention-days: 1
deploy:
name: Deploy to ${{ inputs.target }}
needs: build
runs-on: fedora-43
env:
DEPLOY_KEY: |
${{ secrets.RSYNC_SSH_KEY }}
TARGET_HOST: ${{ inputs.target }}.hanzalova.internal
steps:
- name: SSH init
run: |
mkdir -p ~/.ssh
echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
"gitea_ci@${TARGET_HOST}" 'hostname -f'
- uses: actions/download-artifact@v3
with:
name: neuron-dev-${{ inputs.target }}
path: artifacts/
- name: Copy binary to host
run: |
scp artifacts/neuron-dev "gitea_ci@${TARGET_HOST}:/var/lib/gitea_ci/neuron-dev"
- name: Install binary and restart neuron.service
run: |
ssh "gitea_ci@${TARGET_HOST}" '
set -eu
if systemctl is-active --quiet neuron.service; then
sudo /usr/bin/systemctl stop neuron.service
fi
# Exact command form required by the sudoers rule in
# asset/sudoers.d/neuron-host.conf — change both together.
sudo /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron
# enable --now so a dev deploy also leaves the unit enabled
# for boot, consistent with deploy.yml.
sudo /usr/bin/systemctl enable --now neuron.service
rm -f /var/lib/gitea_ci/neuron-dev'
- name: Capture neuron.service startup journal
if: always()
run: |
sleep 10
ssh "gitea_ci@${TARGET_HOST}" \
'journalctl --unit neuron.service -I --no-pager'

View File

@@ -1,448 +0,0 @@
name: deploy
# Roll the freshly-published unstable RPMs onto the helexa fleet:
# cortex on the gateway, helexa-neuron-<flavour> on each neuron host,
# and helexa-bench on bob (the bench host).
#
# Triggered automatically after `build-prerelease` succeeds (by which
# point the new RPMs are live on rpm.lair.cafe/unstable), and also
# re-runnable manually from the Gitea UI.
#
# Each host self-gates: if dnf sees no newer package than what is
# installed, the service is left alone — no stop, no restart, no model
# cold-load. Combined with build-prerelease's change detection this
# means a docs- or gateway-only push never restarts the neurons (a
# neuron restart costs ~5 min of 27B cold-load, see issue #1).
#
# Per-host one-time setup (gitea_ci user, authorized_keys, scoped
# sudoers drop-in) lives in script/infra-setup.sh — run that once per
# host before this workflow can succeed.
on:
workflow_run:
workflows: [build-prerelease]
types: [completed]
workflow_dispatch:
# Serialize deploys. Overlapping runs would race on dnf metadata
# refresh and service-restart timing; queueing keeps the fleet
# predictable. Don't cancel an in-flight deploy — a half-applied dnf
# transaction is worse than a slightly stale deploy.
concurrency:
group: deploy
cancel-in-progress: false
env:
DEPLOY_KEY: |
${{ secrets.RSYNC_SSH_KEY }}
jobs:
deploy-cortex:
runs-on: fedora-43
# Two trigger paths: manual dispatch always runs; workflow_run
# only runs if the upstream `build-prerelease` actually succeeded.
if: >-
${{
github.event_name == 'workflow_dispatch'
|| github.event.workflow_run.conclusion == 'success'
}}
steps:
- name: SSH init
run: |
mkdir -p ~/.ssh
echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
gitea_ci@hanzalova.internal 'hostname -f'
# Gating compares `rpm -q` against the packages.json manifest the
# publish job maintains — NOT unprivileged `dnf check-update`,
# which proved unreliable as the gitea_ci user (hung on metadata
# locks on one host, silently reported "no updates" on others).
# An unreadable/unparsable manifest fails open: deploy proceeds.
- name: Deploy cortex (skips when already current)
run: |
ssh gitea_ci@hanzalova.internal 'bash -s' <<'DEPLOY'
set -eu
pkg=cortex
installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
| python3 -c '
import json, sys
name = sys.argv[1]
cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
if cands:
p = max(cands, key=lambda p: p.get("buildTime", 0))
print(p["version"] + "-" + p["release"])
' "${pkg}" 2>/dev/null || true)
if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
echo "${pkg}-${installed} already current — leaving service untouched"
exit 0
fi
echo "installed=${installed} published=${latest:-unknown} — deploying"
if systemctl is-active --quiet cortex.service; then
sudo /usr/bin/systemctl stop cortex.service
fi
if rpm -q "${pkg}" >/dev/null 2>&1; then
sudo /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
else
sudo /usr/bin/dnf install --refresh --allowerasing -y cortex
fi
sudo /usr/bin/systemctl daemon-reload
# enable --now: start the service AND enable it for boot so the
# fleet self-heals after a host reboot.
sudo /usr/bin/systemctl enable --now cortex.service
DEPLOY
# Wait for the service to either come up or wedge, then capture
# the latest-invocation journal. Runs even on prior failure so a
# failed start step still leaves a usable record in the deploy log.
- name: Capture cortex.service startup journal
if: always()
run: |
sleep 10
ssh gitea_ci@hanzalova.internal \
'journalctl --unit cortex.service -I --no-pager'
deploy-neurons:
needs: [deploy-cortex]
runs-on: fedora-43
strategy:
# One neuron failing must not cancel the others. Cortex is up
# already; a partial neuron deploy is strictly better than
# rolling back to zero.
fail-fast: false
matrix:
include:
# load_timeout: how long to wait for default_models to finish
# loading after a restart. beast cold-loads Qwen3.6-27B Q6K
# TP=2 (~5-6 min typical, see #1); benjy/quadbrat load small
# single-GPU models in well under a minute.
#
# max_prompt_tokens: per-model context cap, written to the
# neuron.service.d/model.conf drop-in (NEURON_MAX_PROMPT_TOKENS).
# A change here restarts the neuron even with no new RPM. Values
# are VRAM-safe ceilings derived per model — see
# doc/context-limits.md. beast (Qwen3.6-27B, hybrid linear, 2x
# 32GB) has ample KV headroom; benjy (Qwen3-8B dense, ~6GB free)
# is VRAM-bound and stays at the default; quadbrat (Qwen3-1.7B)
# likewise conservative.
- host: beast.hanzalova.internal
flavour: blackwell
load_timeout: 900
max_prompt_tokens: 131072
- host: benjy.hanzalova.internal
flavour: ada
load_timeout: 300
max_prompt_tokens: 16384
- host: quadbrat.hanzalova.internal
flavour: ampere
load_timeout: 300
max_prompt_tokens: 16384
steps:
- name: SSH init
run: |
mkdir -p ~/.ssh
echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
gitea_ci@${{ matrix.host }} 'hostname -f'
# See deploy-cortex for why gating uses the publish manifest and
# not unprivileged `dnf check-update`.
- name: Deploy helexa-neuron-${{ matrix.flavour }} (skips when already current)
run: |
ssh gitea_ci@${{ matrix.host }} 'bash -s' <<'DEPLOY'
set -eu
pkg=helexa-neuron-${{ matrix.flavour }}
max_prompt_tokens="${{ matrix.max_prompt_tokens }}"
# ── Desired per-model systemd drop-in ─────────────────────────
# model.conf carries NEURON_MAX_PROMPT_TOKENS so the context cap
# is deterministic per host and rolled out (with a restart) by
# this workflow, not hand-edited. It sorts after local.conf, so a
# deploy-managed value wins over any manual local override of the
# same variable. See doc/context-limits.md.
conf=/etc/systemd/system/neuron.service.d/model.conf
config_changed=0
if [ -n "${max_prompt_tokens}" ]; then
desired=$(printf '%s\n%s\n%s\n%s' \
"# Managed by .gitea/workflows/deploy.yml - do not edit by hand." \
"# Per-model context cap; see doc/context-limits.md." \
"[Service]" \
"Environment=NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}")
[ "${desired}" = "$(cat "${conf}" 2>/dev/null || true)" ] || config_changed=1
fi
# ── Package version gate (manifest rationale: see deploy-cortex) ──
installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
| python3 -c '
import json, sys
name = sys.argv[1]
cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
if cands:
p = max(cands, key=lambda p: p.get("buildTime", 0))
print(p["version"] + "-" + p["release"])
' "${pkg}" 2>/dev/null || true)
pkg_changed=1
if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
pkg_changed=0
fi
# Skip only when BOTH the package and the drop-in are unchanged —
# a context-cap change must restart the neuron even with no new RPM.
if [ "${pkg_changed}" -eq 0 ] && [ "${config_changed}" -eq 0 ]; then
echo "${pkg}-${installed} current; NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens:-<unset>} unchanged — leaving service untouched"
exit 0
fi
echo "installed=${installed} published=${latest:-unknown} pkg_changed=${pkg_changed} config_changed=${config_changed} — deploying"
# Write the drop-in (staged in gitea_ci's dir, installed root-owned).
if [ "${config_changed}" -eq 1 ]; then
printf '%s\n' "${desired}" > /var/lib/gitea_ci/model.conf
sudo /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf "${conf}"
rm -f /var/lib/gitea_ci/model.conf
echo "applied ${conf}: NEURON_MAX_PROMPT_TOKENS=${max_prompt_tokens}"
fi
if systemctl is-active --quiet neuron.service; then
sudo /usr/bin/systemctl stop neuron.service
fi
if [ "${pkg_changed}" -eq 1 ]; then
if rpm -q "${pkg}" >/dev/null 2>&1; then
sudo /usr/bin/dnf upgrade --refresh --allowerasing -y "${pkg}"
else
sudo /usr/bin/dnf install --refresh --allowerasing -y "${pkg}"
fi
fi
# daemon-reload picks up both a new unit (dnf) and the drop-in.
sudo /usr/bin/systemctl daemon-reload
# enable --now: start the service AND enable it for boot so the
# fleet self-heals after a host reboot.
sudo /usr/bin/systemctl enable --now neuron.service
# ── Post-deploy validation ────────────────────────────────
# A deploy only goes green if the neuron (a) finishes loading
# its default models and (b) answers a trivial prompt like an
# LLM should. Catches the class of bug where the binary
# starts fine but model load or inference is broken — which
# previously surfaced only when a human noticed. The wait
# polls /health activation (the structured source of the
# "loaded default model" journal line, plus per-model failure
# detail); the journal-capture step below still runs for
# forensics either way.
load_timeout=${{ matrix.load_timeout }}
echo "waiting for default models (timeout ${load_timeout}s)"
deadline=$(( $(date +%s) + load_timeout ))
health=""
while :; do
health=$(curl -fsS --max-time 5 http://localhost:13131/health 2>/dev/null || true)
state=$(printf %s "${health}" | python3 -c '
import json, sys
try:
print(json.load(sys.stdin).get("activation", {}).get("state", ""))
except Exception:
print("")
')
if [ "${state}" = "ready" ]; then
break
fi
if [ "$(date +%s)" -ge "${deadline}" ]; then
echo "FAIL: activation not ready within ${load_timeout}s (last state: ${state:-unreachable})"
exit 1
fi
sleep 10
done
model=$(printf %s "${health}" | python3 -c '
import json, sys
a = json.load(sys.stdin).get("activation", {})
failed = a.get("failed", [])
if failed:
for f in failed:
msg = "FAILED " + str(f.get("model_id")) + ": " + str(f.get("error", ""))[:400]
sys.stderr.write(msg + chr(10))
sys.exit(1)
completed = a.get("completed", [])
print(completed[0] if completed else "")
')
if [ -z "${model}" ]; then
echo "no default models configured — skipping LLM probe"
exit 0
fi
echo "LLM probe against ${model}"
probe_body=$(printf '{"model":"%s","messages":[{"role":"user","content":"Reply with exactly one word: pineapple"}],"max_tokens":512,"temperature":0}' "${model}")
resp=$(curl -fsS --max-time 180 -H "content-type: application/json" \
-d "${probe_body}" http://localhost:13131/v1/chat/completions) || {
echo "FAIL: probe request errored"
exit 1
}
if printf %s "${resp}" | grep -qi pineapple; then
echo "LLM probe passed"
else
echo "FAIL: probe response missing expected token"
printf %s "${resp}" | head -c 2000
echo
exit 1
fi
DEPLOY
- name: Ensure firewalld allows helexa-neuron
run: |
ssh gitea_ci@${{ matrix.host }} '
if ! sudo /usr/bin/firewall-cmd --query-service=helexa-neuron --quiet 2>/dev/null; then
sudo /usr/bin/firewall-cmd --add-service=helexa-neuron --permanent
sudo /usr/bin/firewall-cmd --reload
fi'
# Wait for the service to either come up or wedge, then capture
# the latest-invocation journal. Runs even on prior failure so a
# failed start step still leaves a usable record in the deploy log.
- name: Capture neuron.service startup journal
if: always()
run: |
sleep 10
ssh gitea_ci@${{ matrix.host }} \
'journalctl --unit neuron.service -I --no-pager'
# helexa-bench is a separate package on a separate host (bob), and it
# only consumes the fleet's HTTP APIs — it has no deploy-ordering
# dependency on cortex or the neurons (the sweep loop is version-aware
# and picks up whatever each neuron reports whenever). So it runs
# alongside the cortex→neurons chain rather than after it.
deploy-bench:
runs-on: fedora-43
if: >-
${{
github.event_name == 'workflow_dispatch'
|| github.event.workflow_run.conclusion == 'success'
}}
steps:
- name: SSH init
run: |
mkdir -p ~/.ssh
echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
gitea_ci@bob.hanzalova.internal 'hostname -f'
# See deploy-cortex for why gating uses the publish manifest and
# not unprivileged `dnf check-update`.
- name: Deploy helexa-bench (skips when already current)
run: |
ssh gitea_ci@bob.hanzalova.internal 'bash -s' <<'DEPLOY'
set -eu
pkg=helexa-bench
installed=$(rpm -q --qf '%{VERSION}-%{RELEASE}' "${pkg}" 2>/dev/null || echo "not-installed")
latest=$(curl -fsS --max-time 15 "https://rpm.lair.cafe/fedora/43/x86_64/unstable/packages.json" 2>/dev/null \
| python3 -c '
import json, sys
name = sys.argv[1]
cands = [p for p in json.load(sys.stdin)["packages"] if p.get("name") == name]
if cands:
p = max(cands, key=lambda p: p.get("buildTime", 0))
print(p["version"] + "-" + p["release"])
' "${pkg}" 2>/dev/null || true)
if [ -n "${latest}" ] && [ "${latest}" = "${installed}" ]; then
echo "${pkg}-${installed} already current — leaving service untouched"
exit 0
fi
echo "installed=${installed} published=${latest:-unknown} — deploying"
if systemctl is-active --quiet helexa-bench.service; then
sudo /usr/bin/systemctl stop helexa-bench.service
fi
if rpm -q "${pkg}" >/dev/null 2>&1; then
sudo /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
else
sudo /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
fi
sudo /usr/bin/systemctl daemon-reload
# enable --now: start the service AND enable it for boot so the
# bench resumes collecting after a host reboot.
sudo /usr/bin/systemctl enable --now helexa-bench.service
# ── Post-deploy validation ────────────────────────────────
# The bench serves a read-only API on :13132 alongside the
# outbound sweep loop. Probe the API over localhost (bypasses
# firewalld) — catches a crash-on-start or a bad bind. Bail
# early if the unit drops out of active (Restart backoff).
echo "waiting for bench API on :13132"
deadline=$(( $(date +%s) + 30 ))
while :; do
if curl -fsS --max-time 5 http://localhost:13132/api/health >/dev/null 2>&1; then
echo "bench API healthy"
break
fi
if ! systemctl is-active --quiet helexa-bench.service; then
echo "FAIL: helexa-bench.service is not active"
systemctl --no-pager status helexa-bench.service | head -20 || true
exit 1
fi
if [ "$(date +%s)" -ge "${deadline}" ]; then
echo "FAIL: bench API not healthy within 30s"
exit 1
fi
sleep 3
done
DEPLOY
- name: Ensure firewalld allows helexa-bench
run: |
ssh gitea_ci@bob.hanzalova.internal '
if ! sudo /usr/bin/firewall-cmd --query-service=helexa-bench --quiet 2>/dev/null; then
sudo /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
sudo /usr/bin/firewall-cmd --reload
fi'
# Wait for the service to either come up or wedge, then capture
# the latest-invocation journal. Runs even on prior failure so a
# failed start step still leaves a usable record in the deploy log.
- name: Capture helexa-bench.service startup journal
if: always()
run: |
sleep 10
ssh gitea_ci@bob.hanzalova.internal \
'journalctl --unit helexa-bench.service -I --no-pager'
# Build the bench UI and publish it to the public nginx vhost on the
# gateway (https://bench.helexa.ai). The vhost + Let's Encrypt cert are
# one-time host setup (script/infra-setup.sh); this job just refreshes
# the static assets. nginx reverse-proxies /api to the bob API, so the
# SPA is built same-origin (no VITE_API_BASE). Independent of the other
# deploy jobs.
deploy-bench-ui:
runs-on: fedora-43
if: >-
${{
github.event_name == 'workflow_dispatch'
|| github.event.workflow_run.conclusion == 'success'
}}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: "20"
- name: Build UI
run: |
cd bench
npm ci
npm run build
- name: SSH init
run: |
mkdir -p ~/.ssh
echo "${DEPLOY_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new \
gitea_ci@hanzalova.internal 'hostname -f'
- name: Rsync built UI to gateway webroot
run: |
rsync --archive --compress --delete \
--rsync-path 'sudo rsync' \
bench/dist/ \
gitea_ci@hanzalova.internal:/var/www/bench.helexa.ai/

5
.gitignore vendored
View File

@@ -1,12 +1,7 @@
/target
/bench/node_modules
/bench/dist
*.swp
*.swo
.idea/
.vscode/
cortex.toml
models.toml
doc/plan/*
/target-cuda/
.claude/

268
AGENTS.md
View File

@@ -1,268 +0,0 @@
# AGENTS.md — helexa/cortex
## Project Overview
helexa is a self-hosted LLM serving stack for multi-node GPU inference clusters. It has two components:
- **cortex** — the per-operator control plane and LLM proxy. A Rust reverse-proxy that sits in front of the fleet and presents a unified OpenAI + Anthropic compatible API surface. It handles model routing, lifecycle management (load/unload/evict), request translation, and metrics collection.
- **neuron** — the per-host LLM harness. One instance runs on every GPU host, serving candle-based in-process inference and managing local hardware discovery and model lifecycle.
## Repository Layout
```
cortex/
├── Cargo.toml # workspace root (Rust 2024 edition, GPL-3.0)
├── cortex.example.toml # example gateway config
├── models.example.toml # example model catalogue
├── neuron.example.toml # example neuron config
├── README.md # public-facing documentation
├── CLAUDE.md # detailed design rationale and implementation history
├── AGENTS.md # ← you are here
├── cortex.spec # RPM spec for cortex
├── helexa-neuron.spec # RPM spec for neuron (renamed to avoid Fedora collision)
├── rpm/ # prerelease RPM specs
│ ├── cortex-prerelease.spec
│ ├── helexa-neuron-prerelease.spec
│ └── helexa-bench-prerelease.spec
├── data/ # systemd units and example configs for packaging
│ ├── cortex.service
│ ├── neuron.service
│ ├── cortex.example.toml
│ ├── neuron.example.toml
│ └── models.example.toml
└── crates/
├── cortex-core/ # shared types, config, envelopes
│ └── src/
│ ├── lib.rs
│ ├── build_info.rs # BuildInfo type for /version endpoint
│ ├── config.rs # figment-based config structs
│ ├── catalogue.rs # ModelProfile, placement matching
│ ├── discovery.rs # DeviceInfo, DiscoveryResponse
│ ├── harness.rs # Harness trait, HarnessConfig, HarnessHealth
│ ├── node.rs # NodeState, ModelStatus
│ ├── openai.rs # OpenAI request/response types
│ ├── anthropic.rs # Anthropic request/response types
│ ├── translate.rs # OpenAI <-> Anthropic translation
│ └── metrics.rs # RequestMetrics, histogram helpers
├── cortex-gateway/ # the HTTP proxy server
│ └── src/
│ ├── lib.rs
│ ├── state.rs # CortexState: Arc<RwLock<...>>
│ ├── router.rs # model -> node routing logic
│ ├── proxy.rs # streaming HTTP proxy to backends
│ ├── evictor.rs # LRU/priority eviction logic
│ ├── poller.rs # background task polling neuron status
│ ├── handlers.rs # axum handlers (chat, completions, models, etc.)
│ └── metrics.rs # prometheus exporter endpoint
├── cortex-cli/ # CLI entrypoint
│ └── src/main.rs # binary: `cortex`
├── neuron/ # per-host LLM daemon (replaces cortex-agent)
│ ├── Cargo.toml # features: cuda, cudnn, flash-attn, cuda-integration
│ ├── build.rs # compiles CUDA kernels, emits build metadata
│ └── src/
│ ├── main.rs # binary: `neuron`
│ ├── discovery.rs # nvidia-smi parsing, device enumeration
│ ├── health.rs # runtime GPU polling
│ ├── api.rs # HTTP handlers for /discovery, /models, etc.
│ ├── version.rs # GET /version endpoint with BuildInfo
│ ├── models.rs # local model lifecycle orchestration
│ └── harness/ # in-process candle inference
│ ├── device_worker/ # per-device CUDA worker threads
│ │ ├── mod.rs # canonical narrative for worker architecture
│ │ ├── jobs.rs # Job enum, dispatch handlers
│ │ └── dispatch.rs # DeviceWorkerState struct
│ ├── candle.rs # candle model implementation
│ └── tp/ # tensor parallelism
│ └── worker.rs # TP worker subprocesses
├── helexa-acp/ # Agent Client Protocol bridge (Apache-2.0)
│ └── src/main.rs # binary: `helexa-acp`, self-contained (no workspace deps)
└── helexa-bench/ # benchmark harness
└── src/main.rs # binary: `helexa-bench`, SQLite-backed, version-aware
```
## Key Design Decisions
### Architecture
- **cortex** is the control plane. It exposes the unified API, routes requests, manages model lifecycle across the fleet, and collects metrics.
- **neuron** is the node plane. One instance runs on every GPU host. It discovers local hardware, manages in-process candle inference, handles NCCL tensor parallelism, and reports runtime state.
- cortex never shells out to `nvidia-smi`, never touches systemd units, and never talks directly to a harness. It talks only to neurons, via their HTTP API on port 13131.
### Per-device worker thread (neuron)
Every CUDA device gets one dedicated OS thread that owns its `CudaContext` for the daemon's lifetime. All CUDA operations route through this thread via a `std::sync::mpsc` job channel. Tensors never escape the worker thread alive. Inference replies carry `Vec<f32>` CPU-side logits; sampled tokens come back as `u32`. The opaque `ArchHandle(u64)` and `TpHandle(u64)` are indices into the worker's state slab, not pointers.
CPU loads (`Device::Cpu` fallback) keep the legacy `tokio::task::spawn_blocking + Arc<Mutex<ModelArch>>` path — there's no context to own and the channel hop would only add latency. The four `spawn_blocking` references in `harness/candle.rs` are that deliberate CPU fallback.
### candle-native (not mistral.rs)
neuron builds directly on [candle](https://github.com/huggingface/candle). Every model architecture it serves is implemented in this repository, ported against the HuggingFace reference. No external inference server to babysit. The Harness trait remains as an internal seam for adding future engines (vision/audio/diffusion) but its only implementation is in-process candle.
### Streaming proxy
Chat completions are proxied as SSE streams. The gateway must:
1. Parse the inbound request to extract the model name
2. Route to the correct backend neuron
3. Stream the response back, capturing token timing for metrics
4. NOT buffer the full response — true streaming passthrough
### Anthropic translation
When a request arrives at `/v1/messages` (Anthropic format), the gateway translates it to OpenAI format before proxying to neuron, then translates the response back. This is a stateless envelope transformation. The non-streaming round-trip is implemented; streaming SSE translation is deferred.
### Eviction
The evictor runs as a background task. Before loading a model on a node where VRAM is tight:
1. Check if the model is already loaded elsewhere → route there instead
2. Find the LRU model on the target node (excluding pinned models)
3. Call `POST {neuron}/models/unload` on that model
4. The incoming request's lazy-load triggers the new model load
### Metrics
Recorded per request: model, node, prompt_tokens, completion_tokens, total_tokens, tok_per_sec, time_to_first_token_ms, total_latency_ms. These are exposed as Prometheus histograms/counters on a separate port (31314).
## Tech Stack
- **Rust 2024 edition** — workspace with 6 crates
- **Axum 0.8** — HTTP framework
- **reqwest** — HTTP client for proxying to backends
- **figment** — config loading (TOML + env vars)
- **tokio** — async runtime
- **metrics + metrics-exporter-prometheus** — observability
- **tracing** — structured logging
- **candle** — in-process inference engine (neuron only, with CUDA support)
- **cudarc** — patched for neuron's needs (see workspace `[patch]`)
- **clap** — CLI parsing
- **rusqlite** (bundled) — helexa-bench SQLite system-of-record
## Build Commands
```sh
cargo build --release # build all crates
cargo run -p cortex-cli -- serve # run the gateway
cargo test # run all tests
cargo clippy --workspace # lint
```
### neuron Features
- `cuda`: Enables CUDA acceleration in candle and cudarc/nccl bindings. Without it, falls back to CPU.
- `cudnn`: Use cuDNN for convolution/attention kernels (requires `cuda`).
- `flash-attn`: FlashAttention kernels (requires `cuda`).
- `cuda-integration`: Reserved for GPU-only integration tests (requires multiple CUDA devices + libnccl).
### Build Scripts
- `neuron/build.rs`: Compiles CUDA kernels (`src/cuda/*.cu`) using `cudaforge::KernelBuilder` when the `cuda` feature is enabled. Handles compute-capability checks (targets below sm_80 disable bf16 intrinsics). Also captures build metadata: git SHA, dirty flag, timestamp, rustc version, profile, features, candle-core version.
## CI
Gitea Actions runs on every push to any branch. All three checks must pass before merging:
```sh
cargo fmt --check --all # formatting
cargo clippy --workspace -- -D warnings # lint (warnings are errors)
cargo test --workspace # tests
```
Run these locally before pushing. `cargo fmt --all` fixes formatting automatically. Clippy warnings must be resolved, not suppressed with `#[allow(...)]` unless there is a clear rationale.
Tagged releases (`v*`) build SRPMs for `cortex`, `helexa-neuron`, and `helexa-bench` and publish them to COPR (`helexa/helexa`). To inject the build-metadata SHA, CI sets `HELEXA_BUILD_SHA=$(git rev-parse HEAD)`.
## Environment
- Targets Fedora 43 (systemd, SELinux enforcing)
- Nodes communicate over a private network (e.g. WireGuard mesh)
- cortex listens on port 31313 (API) and 31314 (metrics)
- neuron listens on port 13131 on each GPU host
- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
## Conventions
- Error handling: `anyhow` for binaries, `thiserror` for library crates
- No `unwrap()` in library code; `expect()` only with clear rationale
- All public types derive `Debug, Clone, Serialize, Deserialize` where sensible
- Config structs use `figment` with TOML as primary source, env vars as override
- Prefer `Arc<RwLock<...>>` for shared fleet state; minimize lock duration
- SSE streaming uses `tokio_stream` + `eventsource-stream` for parsing
- Log at `info` for request routing, `debug` for proxy details, `warn` for eviction and node health, `error` for proxy failures
## Testing
### Gateway tests
Use mock neurons spawned via axum in `crates/cortex-gateway/tests/common/mod.rs`. Helpers: `spawn_mock_backend()`, `spawn_gateway()`.
### neuron integration tests
- Numerical reference tests (`numerical_reference.rs`) require `NEURON_REF_MODEL_PATH` env var pointing to a HF snapshot directory. Fixtures are f32-based for precision validation against HuggingFace transformers.
- CUDA integration tests (`tp_worker_lifecycle_cuda.rs`) gated behind `cuda-integration` feature; requires 2+ CUDA devices (e.g., 2x RTX 5090).
### Metrics testing
Use `install_test_recorder()` in test code to capture metrics without the HTTP listener.
## helexa-bench
A continuous, version-aware benchmark harness. Hits each neuron directly on `:13131`, exercises each warm model with a Scenario suite (chat-latency family), and records results into SQLite stamped with the neuron's full `BuildInfo`. The loop is version-aware: skips any (target, build SHA, model, scenario) cell already at `samples_per_version`.
Packaged as `helexa-bench` RPM (prebuilt-binary spec). One systemd unit, typically on the metrics host.
## helexa-acp
Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default. Intentionally self-contained: no workspace crate dependencies. Uses `agent-client-protocol` with `unstable_session_model` feature for Zed model picker support. Licensed Apache-2.0 (workspace is GPL-3.0).
## RPM Packaging
- `cortex.spec` — installs the `cortex` binary
- `helexa-neuron.spec` — installs the `neuron` binary under package name `helexa-neuron` (renamed to avoid Fedora's NEURON neural-simulation package collision)
- Systemd units in `data/cortex.service`, `data/neuron.service`
- Example configs: `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
Install:
```sh
dnf copr enable helexa/helexa
dnf install cortex # gateway host
dnf install helexa-neuron # GPU nodes
```
## Configuration Files
### cortex.toml (gateway)
```toml
[gateway]
listen = "0.0.0.0:31313"
metrics_listen = "0.0.0.0:31314"
[eviction]
strategy = "lru" # lru | priority
defrag_after_cycles = 50
[[neurons]]
name = "beast"
endpoint = "http://beast.internal:13131"
```
### models.toml (catalogue)
```toml
[[models]]
id = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
harness = "candle"
quant = "Q4_K_M"
vram_mb = 19000
min_devices = 2
min_device_vram_mb = 10000
pinned_on = ["beast"] # optional: never evict from these neurons
```
### neuron.toml (per-host)
Configured via figment + env override. See `neuron.example.toml` for reference.
## neuron API Endpoints
```
GET /discovery → hardware discovery (hostname, OS, CUDA, devices, harnesses)
GET /health → runtime GPU stats (VRAM, utilization, temperature)
GET /models → loaded/unloaded models with VRAM usage
POST /models/load → load a model with spec (quant, TP, devices)
POST /models/unload → unload a model, freeing device memory
GET /models/{id}/endpoint → inference URL for a model
GET /version → build metadata (SHA, features, candle version, etc.)
```
## Sources of Truth
When prose documentation conflicts with code, trust:
1. Executable configuration (`*.toml`, `Cargo.toml` features)
2. Type definitions in `cortex-core/`
3. Test files in `crates/*/tests/` and `*/src/**/*_test.rs`
4. `CLAUDE.md` for historical design rationale

272
CLAUDE.md
View File

@@ -1,26 +1,16 @@
# CLAUDE.md — helexa
# CLAUDE.md — cortex
## Project overview
helexa is a self-hosted LLM serving stack for multi-node GPU inference
clusters. It has two components:
- **cortex** — the per-operator control plane and LLM proxy. A Rust
reverse-proxy that sits in front of the fleet and presents a unified
OpenAI + Anthropic compatible API surface. It handles model routing,
lifecycle management (load/unload/evict), request translation, and
metrics collection.
- **neuron** — the per-host LLM harness. One instance runs on every GPU
host, serving candle-based in-process inference and managing local
hardware discovery and model lifecycle.
(Historical note: cortex originally proxied to mistral.rs nodes; neuron
replaced that — see the 2026-05-18 candle-native addendum below.)
cortex is a Rust reverse-proxy that sits in front of multiple
mistral.rs inference nodes and presents a unified OpenAI + Anthropic
compatible API surface. It handles model routing, lifecycle management
(load/unload/evict), request translation, and metrics collection.
## Repository layout
```
helexa/
cortex/
├── Cargo.toml # workspace root
├── cortex.toml # example gateway config
├── README.md
@@ -94,63 +84,6 @@ Per-request: model, node, prompt_tokens, completion_tokens, total_tokens,
tok_per_sec, time_to_first_token_ms, total_latency_ms.
Exposed as Prometheus histograms/counters on a separate port.
### Per-device worker thread (neuron)
The neuron daemon dedicates one OS thread per CUDA device it loads
onto. That thread binds the device's `CudaContext` once at startup and
owns it for the daemon's lifetime; every model load, forward step,
KV-cache reset, VRAM query, NCCL init/sanity, NCCL all_reduce, and
model drop on that device routes through this thread via a
`std::sync::mpsc` job channel. Replies cross back via
`tokio::sync::oneshot`.
Three properties this gives us, in order of weight:
1. **Context locality.** cudarc binds the CUDA context per OS thread
via `cuCtxSetCurrent`. Before this refactor, ad-hoc
`tokio::task::spawn_blocking` calls bound the context onto a
different thread per request — and `device_vram_mb()` from an
async task bound it onto whichever tokio worker happened to be
running. Pinning the context to one named thread ends that.
2. **Drop safety.** Every `CudaSlice` in a `Tensor`, every
`cudarc::nccl::Comm`, and the `CudaContext` itself call `cuMemFree` /
`ncclCommDestroy` / `cuCtxDestroy` during `Drop` — and require the
right context current. With the worker owning the model slab,
`Drop` always runs on the right thread. The cudarc Drop constraint
is structurally enforced.
3. **Poisoning blast radius.** When a CUDA driver error makes the
context unrecoverable, the poison flag lives on the
`DeviceWorkerHandle` itself. Subsequent `submit()` calls fast-reject
at the channel boundary with a clear "device worker is poisoned"
error before any further CUDA work is attempted. The thread doesn't
exit (dropping the slab would re-touch the broken context) — it
enters a drain-only mode and replies error to everything until the
daemon restarts.
Tensors never escape the worker thread alive. Inference replies carry
`Vec<f32>` CPU-side logits; the async caller wraps them in a CPU
candle tensor and runs `apply_repeat_penalty` + `LogitsProcessor::sample`
without ever rebinding the device context. Sampled tokens come back as
`u32`; VRAM queries as `(u64, u64)`. The opaque `ArchHandle(u64)` and
`TpHandle(u64)` are the only "references" callers hold to loaded
models — they're indices into the worker's state slab, not pointers.
The TP worker subprocesses in `harness/tp/worker.rs` are the same
pattern out-of-process — a dedicated context-owning process per
non-zero NCCL rank. The in-process worker in `harness/device_worker/`
brings the discipline to rank 0.
CPU loads (`Device::Cpu` fallback when CUDA is unavailable) keep the
legacy `tokio::task::spawn_blocking + Arc<Mutex<ModelArch>>` path —
there's no context to own and the channel hop would only add latency.
Four `spawn_blocking` references in `harness/candle.rs` are deliberate
CPU fallback.
Canonical narrative lives in
`crates/neuron/src/harness/device_worker/mod.rs`'s module
doc-comment; touch points (the `Job` enum, the dispatch handlers, the
`DeviceWorkerState` struct) are in the sibling `jobs.rs` and
`dispatch.rs`.
## Tech stack
- **Rust 2024 edition** — workspace with 4 crates
@@ -192,8 +125,7 @@ automatically. Clippy warnings must be resolved, not suppressed with
- One or more GPU nodes running mistral.rs on port 8080
- Optionally a metrics-only node (no GPU) for Prometheus/Grafana
- Each node runs `mistralrs serve` on port 8080
- Gateway listens on port 31313 (API) and 31314 (metrics)
- neuron listens on port 13131 on each GPU host
- Gateway listens on port 8000 (API) and 9100 (metrics)
- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
## Conventions
@@ -448,7 +380,7 @@ processes (one process per loaded model, each on its own port).
## neuron API
neuron exposes an HTTP API on port 13131 that cortex polls and calls.
neuron exposes an HTTP API on port 9090 that cortex polls and calls.
```
GET /discovery
@@ -492,8 +424,8 @@ endpoint. cortex.toml shrinks to:
```toml
[gateway]
listen = "0.0.0.0:31313"
metrics_listen = "0.0.0.0:31314"
listen = "0.0.0.0:8000"
metrics_listen = "0.0.0.0:9100"
[eviction]
strategy = "lru"
@@ -501,15 +433,15 @@ defrag_after_cycles = 50
[[neurons]]
name = "beast"
endpoint = "http://beast.hanzalova.internal:13131"
endpoint = "http://beast.hanzalova.internal:9090"
[[neurons]]
name = "benjy"
endpoint = "http://benjy.hanzalova.internal:13131"
endpoint = "http://benjy.kosherinata.internal:9090"
[[neurons]]
name = "quadbrat"
endpoint = "http://quadbrat.hanzalova.internal:13131"
endpoint = "http://quadbrat.hanzalova.internal:9090"
```
On startup and periodically, cortex calls `GET /discovery` and
@@ -558,7 +490,7 @@ and the hardcoded `vram_mb` per node.
## Revised repository layout
```
helexa/
cortex/
├── Cargo.toml
├── cortex.toml # gateway config (neurons only)
├── models.toml # model catalogue
@@ -589,7 +521,7 @@ helexa/
│ │ └── metrics.rs # prometheus exporter (unchanged)
│ ├── neuron/ # node plane (replaces cortex-agent)
│ │ └── src/
│ │ ├── main.rs # binary entrypoint, axum server on :13131
│ │ ├── main.rs # binary entrypoint, axum server on :9090
│ │ ├── discovery.rs # nvidia-smi, device enumeration
│ │ ├── health.rs # runtime GPU polling
│ │ ├── api.rs # HTTP handlers for /discovery, /models, etc.
@@ -663,140 +595,70 @@ placement matching can be added incrementally.
Completed. Both packages have RPM specs, systemd units, and example configs.
CI builds parallel SRPMs on tag push and publishes to separate COPR repos.
- `cortex.spec` — installs the `cortex` binary. Package name keeps the
short `cortex` because no Fedora package collides with it.
- `helexa-neuron.spec` — installs the `neuron` binary under package name
`helexa-neuron`. Renamed from bare `neuron` to avoid collision with
Fedora's NEURON neural-simulation package
(https://src.fedoraproject.org/rpms/neuron); binary, systemd unit,
system user, and config dir all stay named `neuron` since those are
project-local contexts.
- `cortex.spec` → `helexa/cortex` COPR: binary, systemd unit, config files
- `neuron.spec` → `helexa/neuron` COPR: binary, systemd unit, config
- `data/cortex.service`, `data/neuron.service` — systemd units
- `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR
publish to a single project `helexa/helexa` hosting both packages.
- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR publish
Install:
```sh
dnf copr enable helexa/helexa
dnf install cortex # gateway host
dnf install helexa-neuron # GPU nodes
dnf copr enable helexa/cortex && dnf install cortex # gateway host
dnf copr enable helexa/neuron && dnf install neuron # GPU nodes
```
## 2026-05-18 addendum: candle-native pivot
### Phase 11: llama.cpp harness stub
Phases 11 (llama.cpp harness) and 12 (mistral.rs COPR) below are
**superseded**. The project no longer treats mistral.rs or llama.cpp as
dependencies — both are conceptually out of scope. neuron becomes a
candle-native inference daemon, with `Harness` retained as an
internal seam for adding future engines (vision/audio/diffusion) but
its only implementation being in-process candle.
**Goal:** Prove the harness abstraction works with a second engine.
The full staged plan for this pivot lives at
`~/.claude/plans/create-a-more-aggressive-calm-naur.md`. Summary:
**Steps:**
1. `crates/neuron/src/harness/llamacpp.rs` — implement the `Harness`
trait for llama.cpp's `llama-server`.
- `start()` — launch `llama-server` with the correct model path,
`--port`, `--n-gpu-layers`, `--tensor-split` args. Track the
child process.
- `stop()` — send SIGTERM to the child process.
- `list_models()` — llama-server serves one model per process, so
return a single-element list.
- `load_model()` — start a new llama-server process for this model.
- `unload_model()` — stop the process.
- `inference_endpoint()` — return `http://localhost:{assigned_port}`.
2. Port allocation: neuron assigns ports from a range (e.g. 8100-8199)
to llama-server instances.
3. Register in `HarnessRegistry` when configured:
```toml
[[harnesses]]
name = "llamacpp"
binary = "/usr/local/bin/llama-server"
port_range = [8100, 8199]
```
4. Tests: mock llama-server (simple HTTP server returning canned
responses), test load/unload/endpoint lifecycle.
- **Stage 1 (this commit):** delete `mistralrs.rs` and `llamacpp.rs`,
scaffold inert `CandleHarness`, drop `endpoint`/`systemd_unit` from
`HarnessConfig`, default no-op `start`/`stop` on the `Harness` trait.
- **Stages 2–4:** wire up candle model load/unload (quantized Qwen3
first), add OpenAI-compatible inference endpoint in neuron, then SSE
streaming.
- **Stages 5–6:** load-on-activation (default models in config) and
unload-on-deactivation (graceful shutdown).
- **Stages 7–8:** multi-GPU tensor parallelism and broader model/quant
coverage.
**Done when:** A model with `harness = "llamacpp"` in `models.toml` can
be loaded and served through cortex. Tests pass with mock llama-server.
Sections of this document that describe mistral.rs HTTP behaviour
("mistral.rs API gotchas") are retained as historical context for
Phases 1–10 — they document what was true while the project depended
on mistral.rs. They do not describe current behaviour.
### Phase 12 (lower priority): mistral.rs COPR packaging
---
**Goal:** Fedora RPMs for mistral.rs built against specific CUDA versions.
### Phase 11 (superseded): llama.cpp harness stub
**Steps:**
1. `mistralrs-cuda.spec` — RPM spec that clones a pinned mistral.rs git
tag, builds with `--features cuda`, links against the system CUDA
toolkit. Produces `mistralrs-cuda13-server` (CUDA 13.x / sm_120) and
`mistralrs-cuda12-server` (CUDA 12.x / sm_89). Install binary to
`/usr/local/bin/mistralrs`.
2. COPR build config: enable the NVIDIA CUDA repo as a build dependency.
Pin the CUDA toolkit version in `BuildRequires`.
3. Gitea Actions or manual workflow: bump the mistral.rs tag in the spec,
trigger COPR rebuild.
4. neuron's mistralrs harness config references which binary/package
provides the mistral.rs binary. neuron could warn at startup if the
installed mistral.rs CUDA version doesn't match the discovered driver.
~~Originally planned as a second engine to prove the harness
abstraction.~~ Replaced by the candle harness work in the 2026-05-18
addendum above. llama.cpp's any-model/any-hardware breadth is no
longer in scope for helexa.
**Done when:** `dnf install mistralrs-cuda13-server` on beast provides a
working `mistralrs` binary built for Blackwell GPUs. `dnf install
mistralrs-cuda12-server` on benjy provides one built for Ada GPUs.
### Phase 12 (superseded): mistral.rs COPR packaging
~~Originally planned to ship CUDA-versioned mistral.rs RPMs.~~ Replaced
by the candle harness work in the 2026-05-18 addendum above. With
mistral.rs out of the dependency tree, there is nothing to package.
## 2026-05-27 addendum: per-device worker thread
Replaced the ad-hoc `tokio::task::spawn_blocking` pattern that drove
every leader-side CUDA op with one dedicated OS thread per CUDA device,
permanently bound to that device's `CudaContext`. All leader-side
inference work (GGUF + dense + TP shard load, forward, kv-cache clear,
NCCL init/sanity, NCCL all_reduce, VRAM query, model drop) routes
through the worker via a `std::sync::mpsc` channel; tensors never
escape the worker thread alive. See "Per-device worker thread (neuron)"
above and `crates/neuron/src/harness/device_worker/mod.rs` for the
canonical narrative.
Motivated by the 2026-05-26 silent-hang on beast: a CUDA OOM cascade
poisoned the device context on whichever spawn_blocking thread caught
it, and subsequent requests stalled invisibly on the pool lock. After
the refactor, the same failure mode shows up in journalctl as
`prefill sample failed; logits unhealthy nan: 248320/248320` followed
by `failed, model marked poisoned`. The thread stays alive and rejects
subsequent requests at the channel boundary.
Landed in four PRs:
- **Phase 1** (`081b532`) — device_worker module + 8 VRAM-query sites
route through the worker. CPU build only; smoke on beast confirmed
a persistent `cuda-dev-0` thread.
- **Phase 2** (`b179204`) — single-GPU forward + clear_kv + drop via
the worker. `LoadedModel.arch_handle: Option<ArchHandle>` replaces
`Arc<Mutex<ModelArch>>` for CUDA loads. CPU keeps the legacy path.
- **Phase 3** (`76ab24d`) — TP forward + NCCL init/sanity + leader
KV-clear routed through the worker. `WorkerPool.leader_nccl` moves
into the worker's state. `TpLoadedModel.leader_handle: TpHandle`
replaces `Arc<Mutex<TpLeaderModel>>`. CUDA-only TP smoke deferred to
next deploy.
- **Phase 4** (`b4f3576`) — GGUF + dense + TP shard loads move onto
the worker. The `Job::TransferIn` / `Job::CloneLeaderComm` bridges
from Phases 2/3 deleted; `SendComm` newtype no longer needed in the
load path. `grep -rn spawn_blocking crates/neuron/src/harness/`
returns only deliberate CPU-fallback hits after this PR.
## 2026-06-13 addendum: build metadata + helexa-bench
Two coupled additions so fleet performance can be tracked automatically
across neuron updates instead of by hand-running `script/bench.py` and
editing `doc/benchmarks.md`.
**neuron build metadata + `GET /version`.** neuron's `build.rs` now also
captures build identity (`HELEXA_GIT_SHA` — preferring a CI/RPM-injected
`HELEXA_BUILD_SHA`, falling back to git, else `unknown` — plus dirty
flag, build timestamp, rustc version, profile, enabled cargo features,
and a best-effort `candle-core` version from `Cargo.lock`). These are
exposed as `cortex_core::build_info::BuildInfo` (new module) from a new
`GET /version` endpoint (`neuron/src/version.rs`, wired in `api.rs`) and
in clap's `--version` long form. The SHA is injected in CI
(`build-prerelease.yml` build-neuron step: `export HELEXA_BUILD_SHA=$(git
rev-parse HEAD)`) and via `--define helexa_commit` in the source-build
spec, so tarball-built RPMs report the real SHA. `/version` is now the
canonical "which build is live" probe (supersedes the per-host RPM-sha
check in the fleet-validation flow).
**`crates/helexa-bench`** — a new binary: a continuous, version-aware
benchmark harness (one systemd unit, typically on the metrics host). It
hits each neuron **directly** on `:13131`, exercises each **warm**
(`status == "loaded"`) model with an extensible `Scenario` suite (phase
1: the chat-latency family ported verbatim from `bench.py` — synthetic
128/4096-tok prompts, `/no_think`, streamed TTFT + decode-window
tok/s), and records each run into a SQLite system-of-record stamped with
the neuron's full `BuildInfo`. The loop is **version-aware**: it skips
any (target, build SHA, model, scenario) cell already at
`samples_per_version`, so a steady fleet costs only cheap `/version` +
`/models` polls until a new SHA ships. `helexa-bench report` regenerates
the `benchmarks.md`-style table from the DB. `kind = "openai"` targets
(mistral.rs/llama.cpp comparison) are scaffolded but not yet wired.
Packaged as the `helexa-bench` RPM (prebuilt-binary spec, outbound-only
so no firewalld service) via the same `build-prerelease.yml` pipeline.
This is a separate repo/spec — not part of the cortex workspace — but
tightly coupled operationally. Track it as a sibling project.

2628
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -5,15 +5,13 @@ members = [
"crates/cortex-gateway",
"crates/cortex-cli",
"crates/neuron",
"crates/helexa-acp",
"crates/helexa-bench",
]
[workspace.package]
version = "0.1.16"
version = "0.1.2"
edition = "2024"
license = "GPL-3.0-or-later"
repository = "https://git.lair.cafe/helexa/helexa"
repository = "https://git.lair.cafe/helexa/cortex"
[workspace.dependencies]
# async runtime
@@ -29,7 +27,7 @@ serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
# http client (for proxying to neuron backends)
# http client (for proxying to mistralrs backends)
reqwest = { version = "0.12", features = ["json", "stream"] }
# observability
@@ -62,12 +60,3 @@ eventsource-stream = "0.2"
# workspace crates
cortex-core = { path = "crates/cortex-core" }
cortex-gateway = { path = "crates/cortex-gateway" }
# Patched cudarc (affects neuron's 0.19.x only; candle's 0.17.x is
# untouched since the fork is 0.19.7 and doesn't satisfy a 0.17 req). Adds
# Comm::abort / get_async_error / raw comm() — needed for #17 Stage 2 TP
# hang-recovery (abort a wedged collective from another thread, then
# rebuild the comm). Pinned to a fork revision pending upstream review
# (grenade/cudarc @ nccl-comm-abort).
[patch.crates-io]
cudarc = { git = "https://github.com/grenade/cudarc", rev = "63327a256059f8252641ae46c6bb9eefe707f382" }

227
README.md
View File

@@ -1,68 +1,24 @@
# helexa
# cortex
**Near-frontier AI for mortals.**
A Rust reverse-proxy and fleet management layer for multi-node
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) inference clusters.
helexa is a self-hosted LLM serving stack, written in Rust, for people
who run open-weight models on their own consumer GPUs. It has two
components:
## Problem
- **cortex** — the per-operator control plane and LLM proxy. It sits in
front of your GPU fleet and presents a unified OpenAI + Anthropic
compatible API surface, handling model routing, lifecycle management
(load / unload / evict), request translation, and metrics.
- **neuron** — the per-host LLM harness. One instance runs on every GPU
host, serving candle-based in-process inference and managing local
hardware discovery and model lifecycle.
Running local LLMs across multiple GPU nodes (different VRAM tiers, different
model affinities) requires a unified API surface that:
## Why
Two principles constrain everything in this repository:
1. **Frontier or close to it.** helexa serves the open-weight models
that get nearest to frontier capability — not every architecture
ever published.
2. **Consumer hardware.** Everything must run on the cards mortals can
actually buy: a 3060 here, a 4090 there, a 5090 if you got lucky.
Mixed VRAM tiers across mismatched boxes are the expected topology,
not a degraded case.
GPU acquisition is harder than it was a year ago, and the gap between
what cloud providers charge and what your own silicon costs keeps
widening. The intersection of those two principles — near-frontier
models, squeezed onto hardware you own — is helexa's entire niche.
The secondary objective is **predictable consumption**. If you own the
hardware, your tooling shouldn't break because a cloud provider changed
billing, deprecated a model, or reshaped an API. cortex's OpenAI and
Anthropic surfaces are a stability contract: point your editor, agent,
or CLI at it once, and it keeps working.
## What helexa is not
This is an intentionally different path from vLLM, SGLang, and peers —
not a smaller version of them. Out of scope, permanently:
- Any-model breadth. Architectures are ported because they're at or
near the frontier, not to complete a compatibility matrix.
- Datacenter-class scheduling. No sophisticated continuous-batching /
paged-attention machinery — the workload is a handful of operators
and their agents, not 200 QPS.
- Wrapping external inference engines. neuron builds directly on
[candle](https://github.com/huggingface/candle); every model
architecture it serves is implemented in this repository, ported
against the HuggingFace reference.
One thing that is *not* a principle: CUDA exclusivity. All high-end
consumer hardware is in scope. helexa is CUDA-only today because
that's the hardware on the bench — nothing ships untested — and ROCm
or other consumer accelerators join as soon as there's real hardware
to build against.
In scope, and where the engineering effort goes: aggressive
quantization (GGUF Q4_K_M / Q6_K / Q8_0), NCCL tensor parallelism
across heterogeneous consumer GPUs, careful CUDA failure handling, and
single-request latency — the performance that one operator at a
keyboard actually feels.
- Presents a **single `/v1/models` catalogue** merging every model across every
node.
- **Routes requests** to the correct node based on where a model is loaded (or
*can* be loaded).
- Manages **model lifecycle** — unload cold models, reload on demand, pin
critical ones — using the mistral.rs
`/v1/models/{unload,reload,status}` HTTP API (PR #1828+).
- Translates between **OpenAI and Anthropic** request/response envelopes so
every client in the homelab speaks whichever dialect it prefers.
- Captures **per-request metrics** (tokens, tok/s, TTFT, latency) and exposes
them as Prometheus counters/histograms.
## Architecture
@@ -72,119 +28,102 @@ keyboard actually feels.
└──────┬───────┘ └─────┬────┘ └──────┬─────┘ └──────┬─────┘
│ │ │ │
└────────────────┴──────┬───────┴───────────────┘
OpenAI + Anthropic APIs
┌──────────▼──────────┐
cortex
│ (cortex-gateway) │
│ cortex │
(cortex-gateway)
│ │
│ Router · Metrics │
│ Evictor · Translate│
└──┬──────┬────────┬──┘
│ │ │
┌──────────▼┐ ┌──▼─────┐ ┌▼──────────┐
neuron │ │ neuron │ │ neuron
:13131 │ │ :13131 │ │ :13131
candle │ │ candle │ │ candle
gpu-large │ │gpu-med │ │ gpu-small
mistralrs │ │mistral │ │ mistralrs
serve │ │rs serve│ │ serve
│ :8080 │ │ :8080 │ │ :8080 │
└───────────┘ └────────┘ └───────────┘
private network (.internal)
```
cortex discovers each neuron's hardware (devices, VRAM, compute
capability) at runtime and matches it against a model catalogue
(`models.toml`) to decide placement: which models fit where, what to
evict when VRAM is tight, where to route a request right now. Adding a
GPU host to the fleet is one `[[neurons]]` entry — no device specs in
config.
### Crates
| Crate | Purpose |
|---|---|
| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic envelopes, harness trait, discovery types |
| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, poller, metrics exporter |
| `neuron` | Per-host daemon: GPU discovery, in-process candle inference, NCCL tensor parallelism, model lifecycle API |
| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic request/response envelopes |
| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, metrics exporter |
| `cortex-agent` | Per-node sidecar: polls local mistralrs, reports to gateway, handles restart/defrag |
| `cortex-cli` | CLI entrypoint (`cortex serve`, `cortex status`, etc.) |
| `helexa-acp` | Agent Client Protocol bridge — connects ACP editors (Zed, etc.) to any OpenAI-compatible endpoint, cortex by default |
## The engine
## Node setup
neuron runs inference in-process on candle — there is no external
inference server to babysit. The parts that earn their keep:
Each GPU node runs `mistralrs serve` with a multi-model config. Models are
declared but start **unloaded** — mistral.rs lazy-loads on first request and
the gateway can explicitly unload/reload via the HTTP API.
- **Per-device worker threads.** Every CUDA device gets one dedicated
OS thread that owns its CUDA context for the daemon's lifetime. All
loads, forward passes, KV-cache resets, NCCL collectives, VRAM
queries, and unloads route through it; tensors never escape it
alive. Context binding is pinned to a known thread, the CUDA `Drop`
contract is structurally safe, and a driver error poisons one worker
— visibly — instead of hanging the whole process.
- **Tensor parallelism on consumer cards.** Megatron-style row/column
parallel layers with NCCL all-reduce, spanning the mismatched GPUs
you actually have. A step watchdog aborts wedged collectives instead
of letting a request hang forever.
- **Current model focus: the Qwen3 family** — dense and GGUF-quantized,
including the hybrid linear-attention (Gated DeltaNet) generation.
Vision support is in progress. Each architecture is ported against
its HuggingFace reference implementation.
Example node systemd unit:
See `CLAUDE.md` for design rationale and
`crates/neuron/src/harness/device_worker/` for the worker narrative.
```ini
# /etc/systemd/system/mistralrs.service
[Unit]
Description=mistral.rs inference server
After=network-online.target
Wants=network-online.target
## Install
[Service]
Type=simple
ExecStart=/usr/local/bin/mistralrs serve \
--from-config /etc/mistralrs/config.toml \
--port 8080
Restart=on-failure
RestartSec=5
Environment=CUDA_VISIBLE_DEVICES=0,1
Pre-built RPMs for Fedora:
```sh
dnf copr enable helexa/helexa
dnf install cortex # on the gateway host
dnf install helexa-neuron # on each GPU host
systemctl enable --now cortex # or neuron, respectively
[Install]
WantedBy=multi-user.target
```
## Configure
## Gateway config
```toml
# /etc/cortex/cortex.toml
# cortex.toml
[gateway]
listen = "0.0.0.0:31313"
metrics_listen = "0.0.0.0:31314"
listen = "0.0.0.0:8000"
metrics_listen = "0.0.0.0:9100"
[eviction]
strategy = "lru" # lru | priority
defrag_after_cycles = 50
[[neurons]]
name = "beast"
endpoint = "http://beast.internal:13131"
[[nodes]]
name = "gpu-large"
endpoint = "http://gpu-large.internal:8080"
vram_mb = 49_152 # e.g. 2x RTX 4090
pinned = ["your-org/large-model"]
[[neurons]]
name = "benjy"
endpoint = "http://benjy.internal:13131"
[[nodes]]
name = "gpu-medium"
endpoint = "http://gpu-medium.internal:8080"
vram_mb = 24_576 # e.g. RTX 4090
pinned = ["your-org/medium-model"]
[[nodes]]
name = "gpu-small"
endpoint = "http://gpu-small.internal:8080"
vram_mb = 12_288 # e.g. RTX 3060
pinned = ["your-org/embedding-model"]
```
Model placement profiles (VRAM requirements, quant, device minimums,
pinning) live in `models.toml` — see `models.example.toml`.
## Run
```sh
# start the gateway
cortex serve --config /etc/cortex/cortex.toml
# check fleet status
cortex status
# one catalogue across every node
curl http://localhost:31313/v1/models
```
## Build from source
## Building
```sh
cargo build --release
```
CI runs on every push; keep it green locally:
## CI
Every push triggers format, lint, and test checks. Ensure these pass
locally before pushing:
```sh
cargo fmt --check --all # must be clean
@@ -192,18 +131,20 @@ cargo clippy --workspace -- -D warnings # warnings are errors
cargo test --workspace # all tests must pass
```
Tagged releases (`v*`) build SRPMs for `cortex` and `helexa-neuron`
and publish to COPR.
Tagged releases (`v*`) additionally build an SRPM and publish to COPR.
## Status
## Running
Pre-1.0 and moving fast. The gateway path (routing, eviction,
translation, metrics) is stable and tested; the candle-native engine
is under active development — expect the supported-model list to track
the open-weight frontier, deliberately narrowly.
```sh
# start the gateway
cortex serve --config cortex.toml
Development happens at <https://git.lair.cafe/helexa/helexa>;
<https://github.com/helexa-ai/helexa> is a read-only mirror.
# check fleet status
cortex status
# list all models across nodes
curl http://localhost:8000/v1/models
```
## License

View File

@@ -1,38 +0,0 @@
# helexa-bench config for bob.hanzalova.internal.
#
# Synced to /etc/helexa-bench/helexa-bench.toml by script/infra-setup.sh
# (the helexa-bench RPM ships helexa-bench.example.toml as a
# %config(noreplace) default; this per-host file overrides it).
#
# bob is a client host (it also runs Agent Zero); helexa-bench here hits
# every neuron on the fleet directly and records build-stamped results
# into the local SQLite store.
[bench]
sweep_interval_secs = 1800
samples_per_version = 5
iteration_pause_secs = 2
request_timeout_secs = 600
db_path = "/var/lib/helexa-bench/bench.sqlite"
[scenarios]
prompt_sizes = [128, 4096]
max_tokens = 256
# Read-only JSON API consumed by the bench UI (hosted separately) and for
# programmatic access. Served alongside the sweep loop.
[api]
enabled = true
listen = "0.0.0.0:13132"
[[targets]]
name = "beast"
endpoint = "http://beast.hanzalova.internal:13131"
[[targets]]
name = "benjy"
endpoint = "http://benjy.hanzalova.internal:13131"
[[targets]]
name = "quadbrat"
endpoint = "http://quadbrat.hanzalova.internal:13131"

View File

@@ -1,24 +0,0 @@
# neuron.toml for beast.hanzalova.internal
#
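# 2x RTX 5090 (32 GB each) — TP-2 capable. Pre-warms Qwen3.6-27B with
# q5k ISQ across both GPUs at activation, matching the validate-neuron
# invocation: `validate-neuron.sh beast.hanzalova.internal
# Qwen/Qwen3.6-27B q5k 2`.
#
# NOTE(review): the `default_models` entry below sets quant = "q6k", not
# q5k — confirm which is intended and bring the other into line.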
#
# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh. Edits
# take effect after the next deploy workflow run restarts the service
# (default_models is read at activation).
port = 13131
[[harnesses]]
name = "candle"
[harness.candle]
[[default_models]]
model_id = "Qwen/Qwen3.6-27B"
harness = "candle"
quant = "q6k"
tensor_parallel = 2
devices = [0, 1]

View File

@@ -1,19 +0,0 @@
# neuron.toml for benjy.hanzalova.internal
#
# 1x RTX 4090 (24 GB) — largest single-GPU host on the fleet. Pre-warms
# Qwen3-8B (bf16, ~18 GB), leaving ~6 GB for KV cache + activations on
# moderate-length contexts.
#
# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.
port = 13131
[[harnesses]]
name = "candle"
[harness.candle]
[[default_models]]
model_id = "Qwen/Qwen3-8B"
harness = "candle"
devices = [0]

View File

@@ -1,19 +0,0 @@
# neuron.toml for quadbrat.hanzalova.internal
#
# 1x RTX 3060 (12 GB) — small / quantised tier. Pre-warms Qwen3-1.7B
# (bf16, ~4 GB), leaving ~7 GB for KV cache so long contexts on a small
# model still have plenty of room.
#
# Synced to /etc/neuron/neuron.toml by script/infra-setup.sh.
port = 13131
[[harnesses]]
name = "candle"
[harness.candle]
[[default_models]]
model_id = "Qwen/Qwen3-1.7B"
harness = "candle"
devices = [0]

View File

@@ -1,15 +0,0 @@
# Bootstrap vhost for bench.helexa.ai — http-only, used ONLY to obtain
# the initial Let's Encrypt cert via the webroot challenge (the full TLS
# vhost can't load before the cert file exists). script/infra-setup.sh
# installs this, runs certbot, then swaps in bench.helexa.ai.conf.
server {
listen 80;
server_name bench.helexa.ai;
location /.well-known/acme-challenge/ {
root /var/www/bench.helexa.ai;
}
location / {
try_files $uri $uri/ =404;
}
}

View File

@@ -1,56 +0,0 @@
# Public, auth-less bench UI at https://bench.helexa.ai.
#
# Serves the static SPA from /var/www/bench.helexa.ai (rsynced by
# .gitea/workflows/deploy.yml's deploy-bench-ui job) and reverse-proxies
# /api to the helexa-bench read API on bob over the WireGuard mesh — so
# the browser stays same-origin (no CORS) and the internal API never
# needs to be exposed publicly.
#
# TLS via Let's Encrypt; the cert is obtained/renewed by certbot
# (bootstrapped one-time in script/infra-setup.sh). Mirrors the
# dev.swym.hanzalova.internal vhost convention on this host.
server {
listen 80;
server_name bench.helexa.ai;
# Keep serving the ACME webroot so certbot can renew.
location /.well-known/acme-challenge/ {
root /var/www/bench.helexa.ai;
}
location / {
return 301 https://$host$request_uri;
}
}
server {
listen 443 ssl;
http2 on;
server_name bench.helexa.ai;
ssl_certificate /etc/letsencrypt/live/bench.helexa.ai/fullchain.pem;
ssl_certificate_key /etc/letsencrypt/live/bench.helexa.ai/privkey.pem;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers HIGH:!aNULL:!MD5;
ssl_prefer_server_ciphers on;
ssl_session_cache shared:SSL:10m;
root /var/www/bench.helexa.ai;
index index.html;
# Bench read API on bob (internal WireGuard); browser stays same-origin.
location /api/ {
proxy_pass http://bob.hanzalova.internal:13132;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_read_timeout 60s;
}
# SPA fallback — client-side routes (/trends, /runs) resolve to index.html.
location / {
try_files $uri $uri/ /index.html;
}
}

View File

@@ -1,34 +0,0 @@
# Internal bench UI vhost — https://bench.internal, reachable from inside
# the WireGuard mesh (the public bench.helexa.ai dead-ends at the OPNsense
# LAN interface, which only port-forwards :443 from the WAN). Same SPA +
# /api→bob proxy as bench.helexa.ai, but with an internal-CA cert
# (smallstep "lair", renewed by step@bench.timer). Mirrors the
# *.internal vhost convention on oolon.kosherinata.internal.
server {
server_name bench.internal;
listen 443 ssl;
http2 on;
ssl_certificate /etc/nginx/tls/cert/bench.internal.pem;
ssl_certificate_key /etc/nginx/tls/key/bench.internal.pem;
ssl_trusted_certificate /etc/pki/ca-trust/source/anchors/root-internal.pem;
ssl_protocols TLSv1.3;
# Shared webroot with the public vhost — same built SPA.
root /var/www/bench.helexa.ai;
index index.html;
location /api/ {
proxy_pass http://bob.hanzalova.internal:13132;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_read_timeout 60s;
}
location / {
try_files $uri $uri/ /index.html;
}
}

View File

@@ -1,25 +0,0 @@
# Install on the bench host (bob) as /etc/sudoers.d/helexa_gitea_ci
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
# which SSHes as gitea_ci@bob to roll out helexa-bench package upgrades
# and config changes.
#
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
# helexa-org apps can drop their own sudoers files on the same host
# without overwriting this one.
#
# helexa-bench polls the neuron fleet (outbound) and serves a read-only
# JSON API on tcp/13132 for the bench UI — hence the firewall-cmd grants.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/helexa-bench/helexa-bench.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now helexa-bench.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-bench
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-bench
# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service=helexa-bench --permanent
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload

View File

@@ -1,23 +0,0 @@
# Install on the cortex gateway host as /etc/sudoers.d/helexa_gitea_ci
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
# which SSHes as gitea_ci@<gateway> to roll out cortex package upgrades
# and config changes.
#
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
# helexa-org apps can drop their own sudoers files on the same host
# without overwriting this one.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/cortex.toml
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/cortex/models.toml
# deploy-bench-ui rsyncs the built bench SPA into the nginx webroot.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /var/www/bench.helexa.ai/
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now cortex.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y cortex
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y cortex
# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1

View File

@@ -1,43 +0,0 @@
# Install on every neuron host as /etc/sudoers.d/helexa_gitea_ci
# (owner root:root, mode 0440). Required by .gitea/workflows/deploy.yml,
# which SSHes as gitea_ci@<neuron-host> to roll out helexa-neuron-<flavour>
# package upgrades and config changes.
#
# Filename convention `helexa_gitea_ci` (vs bare `gitea_ci`) so other
# helexa-org apps can drop their own sudoers files on the same host
# without overwriting this one.
#
# All three CUDA flavours are listed because a host's flavour can change
# (e.g. GPU swap) and we don't want the sudoers file to need to change
# in lockstep. Only one flavour can be installed at a time (the packages
# Conflict: with each other), so the attack surface is bounded to "wrong
# flavour installed" — vandalism, not privilege escalation.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/rsync * /etc/neuron/neuron.toml
# deploy.yml writes the per-model systemd drop-in carrying
# NEURON_MAX_PROMPT_TOKENS: gitea_ci stages it in its own dir, then
# installs it root-owned. Exact source/dest paths; see doc/context-limits.md.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0644 -D /var/lib/gitea_ci/model.conf /etc/systemd/system/neuron.service.d/model.conf
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl start neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl stop neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl enable --now neuron.service
gitea_ci ALL=(root) NOPASSWD: /usr/bin/systemctl daemon-reload
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ampere
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ampere
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-ada
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-ada
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install --refresh --allowerasing -y helexa-neuron-blackwell
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf upgrade --refresh --allowerasing -y helexa-neuron-blackwell
# sudoers reserves `:` and `=` and requires `\` escaping inside command
# arguments — without it visudo errors at the first `:` in `https://`.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://rpm.lair.cafe/lair-cafe-unstable.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager setopt lair-cafe-unstable.enabled\=1
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf config-manager addrepo --from-repofile\=https\://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
gitea_ci ALL=(root) NOPASSWD: /usr/bin/dnf install -y libcudnn9-cuda-13
# `=` inside a command argument is escaped, same as in the dnf
# config-manager rules above (sudoers(5) lists `=` among the characters
# that must be backslash-escaped in command arguments).
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --add-service\=helexa-neuron --permanent
gitea_ci ALL=(root) NOPASSWD: /usr/bin/firewall-cmd --reload
# deploy-dev.yml fast path: install a freshly-built dev binary over the
# packaged one. Exact source path + args; the workflow must use this
# command form verbatim. The next deploy.yml run reconciles the host
# back to the RPM-owned binary.
gitea_ci ALL=(root) NOPASSWD: /usr/bin/install -o root -g root -m 0755 /var/lib/gitea_ci/neuron-dev /usr/bin/neuron

View File

@@ -1,20 +0,0 @@
# Internal-CA cert renewal for %i.internal, driven by step@%i.timer.
# Replicated from oolon.kosherinata.internal (the kosherinata DC proxy).
# Renews an EXISTING cert via mTLS (step ca renew) — the initial cert
# must be issued once with a provisioner (see script/infra-setup.sh).
# Installed to /etc/systemd/system/step@.service.
[Unit]
Description=step cert renew for %i.internal
Documentation=https://smallstep.com/docs/step-ca/renewal
[Service]
Type=oneshot
ExecCondition=/usr/bin/step certificate needs-renewal \
/etc/nginx/tls/cert/%i.internal.pem
ExecStart=/usr/bin/step ca renew \
--force \
--ca-url https://ca.internal \
--root /etc/pki/ca-trust/source/anchors/root-internal.pem \
/etc/nginx/tls/cert/%i.internal.pem \
/etc/nginx/tls/key/%i.internal.pem
ExecStartPost=/usr/bin/systemctl reload nginx.service

View File

@@ -1,15 +0,0 @@
# Periodic internal-cert renewal for %i.internal (every 15 min, jittered).
# Replicated from oolon.kosherinata.internal. Installed to
# /etc/systemd/system/step@.timer; enable per-cert with
# `systemctl enable --now step@bench.timer`.
[Unit]
Description=step cert renew timer for %i.internal
[Timer]
Persistent=true
OnCalendar=*:1/15
AccuracySec=1us
RandomizedDelaySec=5m
[Install]
WantedBy=timers.target

3
bench/.gitignore vendored
View File

@@ -1,3 +0,0 @@
node_modules
dist
*.local

View File

@@ -1,45 +0,0 @@
# helexa bench UI
A Vite + React (SWC, TypeScript) app that visualises the fleet benchmark
data collected by `helexa-bench`. It reads the read-only JSON API the
bench daemon serves (`crates/helexa-bench/src/api.rs`, default
`:13132` on bob).
Stack: React Router, react-bootstrap, Recharts.
## Pages
- **Overview** — latest median results per (host, model, scenario) cell.
- **Trends** — decode-tok/s and TTFT plotted across neuron build SHAs as
releases roll out (the headline view). Pick host / model / scenario.
- **Runs** — filterable raw-run explorer.
## Develop
```sh
cd bench
npm install
npm run dev # http://localhost:5173
```
`vite.config.ts` proxies `/api` → `http://bob.hanzalova.internal:13132`,
so the dev server talks to the live bench API with no CORS fuss. Point
the proxy elsewhere (or run a local `helexa-bench serve`) to develop
against other data.
## Production hosting
Public at **https://bench.helexa.ai** — nginx on the gateway
(`hanzalova.internal`) serves the static `dist/` and reverse-proxies
`/api` to the bench API on bob over WireGuard, so the SPA is same-origin
(no CORS) and the internal API stays off the public internet.
- `npm run build` is run with **no** `VITE_API_BASE` (the app calls
`/api/...` on its own origin; nginx proxies it to bob).
- `.gitea/workflows/deploy.yml` (`deploy-bench-ui`) builds and rsyncs
`dist/` to `/var/www/bench.helexa.ai` on every deploy.
- The nginx vhost (`asset/nginx/bench.helexa.ai.conf`) and the
Let's Encrypt cert are one-time host setup in `script/infra-setup.sh`.
To host elsewhere instead, build with
`VITE_API_BASE=<bob-api-origin>` and serve the static `dist/`.

View File

@@ -1,12 +0,0 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>helexa bench</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

2191
bench/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,28 +0,0 @@
{
"name": "helexa-bench-ui",
"private": true,
"version": "0.1.0",
"type": "module",
"description": "Visualisation app for helexa-bench fleet benchmark data.",
"scripts": {
"dev": "vite",
"build": "tsc && vite build",
"preview": "vite preview"
},
"dependencies": {
"bootstrap": "^5.3.3",
"react": "^18.3.1",
"react-bootstrap": "^2.10.5",
"react-dom": "^18.3.1",
"react-router-dom": "^6.26.2",
"recharts": "^2.12.7"
},
"devDependencies": {
"@types/node": "^20.14.0",
"@types/react": "^18.3.5",
"@types/react-dom": "^18.3.0",
"@vitejs/plugin-react-swc": "^3.7.0",
"typescript": "^5.5.4",
"vite": "^5.4.0"
}
}

View File

@@ -1,30 +0,0 @@
import { Container, Nav, Navbar } from "react-bootstrap";
import { NavLink, Outlet } from "react-router-dom";
// Navbar entries in display order; `end` is forwarded to NavLink as-is
// (only the Overview link sets it).
const NAV_ITEMS: { to: string; label: string; end?: boolean }[] = [
  { to: "/", label: "Overview", end: true },
  { to: "/trends", label: "Trends" },
  { to: "/runs", label: "Runs" },
];

/**
 * Application shell: a dark top navbar (brand + one link per page) with
 * the matched child route rendered underneath through `<Outlet />`.
 */
export default function App() {
  const links = NAV_ITEMS.map(({ to, label, end }) => (
    <Nav.Link key={to} as={NavLink} to={to} end={end}>
      {label}
    </Nav.Link>
  ));

  return (
    <>
      <Navbar bg="dark" variant="dark" expand="md">
        <Container>
          <Navbar.Brand as={NavLink} to="/">
            helexa&nbsp;bench
          </Navbar.Brand>
          <Nav className="me-auto">{links}</Nav>
        </Container>
      </Navbar>
      <Container className="py-4">
        <Outlet />
      </Container>
    </>
  );
}

View File

@@ -1,45 +0,0 @@
import type { Dimensions, ReportRow, RunRow, SeriesPoint } from "./types";
// API origin prefix. When VITE_API_BASE is unset this is the empty string,
// so requests go to `/api/...` on the page's own origin (or through the
// dev proxy in vite.config.ts). A separately-hosted build sets
// VITE_API_BASE to the bob API origin (e.g. http://bob.hanzalova.internal:13132).
const BASE = import.meta.env.VITE_API_BASE ?? "";

/**
 * GETs `BASE + path` and returns the parsed JSON body typed as `T`.
 *
 * Throws an `Error` whose message is "<status> <statusText>: <body text>"
 * when the response status is not OK.
 */
async function getJson<T>(path: string): Promise<T> {
  const response = await fetch(BASE + path);
  if (response.ok) {
    return (await response.json()) as T;
  }
  const detail = await response.text();
  throw new Error(`${response.status} ${response.statusText}: ${detail}`);
}
/** Fetches `/api/dimensions` (the values the pages offer as filter options). */
export const getDimensions = () => getJson<Dimensions>("/api/dimensions");
/** Fetches `/api/summary` (the rows rendered by the Overview table). */
export const getSummary = () => getJson<ReportRow[]>("/api/summary");
// host is resolved server-side (each model maps to one host today), so the
// public UI selects by model + scenario alone.
/**
 * Fetches `/api/series` for one (model, scenario) pair. Both values are
 * percent-encoded with `encodeURIComponent` before being placed in the
 * query string.
 */
export const getSeries = (model: string, scenario: string) =>
getJson<SeriesPoint[]>(
`/api/series?model=${encodeURIComponent(model)}&scenario=${encodeURIComponent(scenario)}`,
);
/**
 * Optional filters accepted by `getRuns`. Each field that is set is sent
 * as the query parameter of the same name.
 */
export interface RunsParams {
// String filters: omitted from the query when empty or undefined.
host?: string;
model?: string;
scenario?: string;
sha?: string;
// Sent as "true"/"false" whenever it is not undefined.
ok?: boolean;
// Omitted from the query when undefined or 0.
limit?: number;
}
/**
 * Fetches `/api/runs`, translating the set fields of `p` into query
 * parameters (host, model, scenario, sha, ok, limit — in that order).
 * With no filters set the request has no query string at all.
 */
export const getRuns = (p: RunsParams = {}) => {
  const query = new URLSearchParams();

  // Plain string filters: only truthy values are forwarded.
  const textFilters: [string, string | undefined][] = [
    ["host", p.host],
    ["model", p.model],
    ["scenario", p.scenario],
    ["sha", p.sha],
  ];
  for (const [key, value] of textFilters) {
    if (value) {
      query.set(key, value);
    }
  }

  // `ok` is forwarded even when false; `limit` only when truthy.
  if (p.ok !== undefined) {
    query.set("ok", String(p.ok));
  }
  if (p.limit) {
    query.set("limit", String(p.limit));
  }

  const encoded = query.toString();
  const path = encoded ? `/api/runs?${encoded}` : "/api/runs";
  return getJson<RunRow[]>(path);
};

View File

@@ -1,52 +0,0 @@
// Pre-helexa-bench baseline, transcribed verbatim from doc/benchmarks.md.
//
// IMPORTANT — different measurement regime. These were measured by
// script/bench.py *through the cortex gateway* (so TTFT/total include a
// proxy hop), reported as medians only, before helexa-bench existed.
// helexa-bench measures each neuron *directly*. So these points are an
// honest historical anchor, NOT apples-to-apples with the live series —
// the Trends view renders them dashed + labelled, never merged into the
// live line.
//
// Host is inferred from the model via the doc's Fleet table
// (beast=27B, benjy=8B, quadbrat=1.7B). Timestamps are the two 2026-06-12
// snapshots in the doc, ordered (08:00 = pre-#11, 16:00 = post-#11) so
// they sort before the bench era on the shared time axis.
/**
 * One pre-helexa-bench measurement for a (host, model, scenario) cell at
 * a given build.
 */
export interface BaselinePoint {
host: string;
model: string;
scenario: string;
git_sha: string;
// baselineFor orders points by comparing this string.
build_timestamp: string;
// Plotted as the dashed baseline series on the Trends TTFT chart.
ttft_s: number;
// Plotted as the dashed baseline series on the Trends decode tok/s chart.
decode_tps: number;
total_s: number;
}
/** Source: bench.py via cortex gateway — see doc/benchmarks.md. */
export const BASELINE_SOURCE = "bench.py · via cortex gateway";
export const BASELINE: BaselinePoint[] = [
// ── 8f6f1d3 — baseline (2026-06-12) ────────────────────────────────
{ host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.658, decode_tps: 35.0, total_s: 8.981 },
{ host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 7.067, decode_tps: 33.7, total_s: 14.63 },
{ host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.884, decode_tps: 62.4, total_s: 4.938 },
{ host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 1.818, decode_tps: 46.5, total_s: 7.27 },
{ host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 0.685, decode_tps: 81.3, total_s: 3.741 },
{ host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "8f6f1d3", build_timestamp: "2026-06-12T08:00:00Z", ttft_s: 2.743, decode_tps: 35.4, total_s: 9.884 },
// ── a1952a4 — post prefix-KV-cache (#11, 2026-06-12) ───────────────
{ host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.355, decode_tps: 45.8, total_s: 4.147 },
{ host: "beast", model: "Qwen/Qwen3.6-27B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.431, decode_tps: 43.3, total_s: 4.387 },
{ host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.886, decode_tps: 78.6, total_s: 2.478 },
{ host: "benjy", model: "Qwen/Qwen3-8B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 1.824, decode_tps: 58.3, total_s: 3.969 },
{ host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:128", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 0.702, decode_tps: 104.8, total_s: 1.895 },
{ host: "quadbrat", model: "Qwen/Qwen3-1.7B", scenario: "chat:4096", git_sha: "a1952a4", build_timestamp: "2026-06-12T16:00:00Z", ttft_s: 2.749, decode_tps: 44.9, total_s: 5.534 },
];
/**
 * Returns the baseline points whose model and scenario both match,
 * ordered by ascending `build_timestamp` (oldest first). The result is a
 * fresh array; `BASELINE` itself is not reordered.
 */
export function baselineFor(model: string, scenario: string): BaselinePoint[] {
  const matches: BaselinePoint[] = [];
  for (const point of BASELINE) {
    if (point.model === model && point.scenario === scenario) {
      matches.push(point);
    }
  }
  matches.sort((left, right) =>
    left.build_timestamp.localeCompare(right.build_timestamp),
  );
  return matches;
}

View File

@@ -1,22 +0,0 @@
import React from "react";
import ReactDOM from "react-dom/client";
import { BrowserRouter, Route, Routes } from "react-router-dom";
import "bootstrap/dist/css/bootstrap.min.css";
import App from "./App";
import Overview from "./pages/Overview";
import Trends from "./pages/Trends";
import Runs from "./pages/Runs";
// Entry point: mount the router into the #root element. App is the layout
// route at "/"; Overview is its index route and Trends / Runs are child
// routes at /trends and /runs. The `!` asserts that #root exists in the
// host page.
ReactDOM.createRoot(document.getElementById("root")!).render(
<React.StrictMode>
<BrowserRouter>
<Routes>
<Route path="/" element={<App />}>
<Route index element={<Overview />} />
<Route path="trends" element={<Trends />} />
<Route path="runs" element={<Runs />} />
</Route>
</Routes>
</BrowserRouter>
</React.StrictMode>,
);

View File

@@ -1,64 +0,0 @@
import { useEffect, useState } from "react";
import { Alert, Spinner, Table } from "react-bootstrap";
import { getSummary } from "../api";
import type { ReportRow } from "../types";
/**
 * Formats a metric for a table cell: `n` with `p` decimals (default 2),
 * or an em dash when the value is missing (null/undefined).
 */
function f(n: number | null, p = 2): string {
  if (n == null) {
    return "—";
  }
  return n.toFixed(p);
}
/**
 * Overview page: fetches the summary once on mount and renders one table
 * row per report row. While the request is pending a spinner is shown; if
 * it fails, the error text is shown in a danger alert instead of the table.
 */
export default function Overview() {
  const [rows, setRows] = useState<ReportRow[]>([]);
  const [err, setErr] = useState<string | null>(null);
  const [loading, setLoading] = useState(true);

  useEffect(() => {
    const load = async () => {
      try {
        setRows(await getSummary());
      } catch (e) {
        setErr(String(e));
      } finally {
        setLoading(false);
      }
    };
    void load();
  }, []);

  if (loading) return <Spinner animation="border" />;
  if (err) return <Alert variant="danger">{err}</Alert>;

  // Column headings in display order; numeric columns are right-aligned.
  const columns: { title: string; numeric: boolean }[] = [
    { title: "GPU", numeric: false },
    { title: "model", numeric: false },
    { title: "prompt tok", numeric: true },
    { title: "TTFT (s)", numeric: true },
    { title: "decode tok/s", numeric: true },
    { title: "total (s)", numeric: true },
    { title: "build", numeric: false },
    { title: "n", numeric: true },
  ];

  const bodyRows = rows.map((row, index) => {
    // Fall back to the approximate prompt size when no exact count exists.
    const promptTokens = row.prompt_tokens ?? `~${row.prompt_size_approx}`;
    return (
      <tr key={index}>
        <td>{row.gpu ?? row.target_name}</td>
        <td>{row.model_id}</td>
        <td className="text-end">{promptTokens}</td>
        <td className="text-end">{f(row.ttft_s_median, 3)}</td>
        <td className="text-end">{f(row.decode_tps_median, 1)}</td>
        <td className="text-end">{f(row.total_s_median, 3)}</td>
        <td>
          <code>{row.git_sha}</code>
        </td>
        <td className="text-end">{row.samples}</td>
      </tr>
    );
  });

  return (
    <>
      <h3 className="mb-3">Latest results per cell</h3>
      <p className="text-muted">
        Median of each cell's samples on the most recent build seen for that
        (host, model, scenario).
      </p>
      <Table striped bordered hover responsive size="sm">
        <thead>
          <tr>
            {columns.map(({ title, numeric }) => (
              <th key={title} className={numeric ? "text-end" : undefined}>
                {title}
              </th>
            ))}
          </tr>
        </thead>
        <tbody>{bodyRows}</tbody>
      </Table>
    </>
  );
}

View File

@@ -1,141 +0,0 @@
import { useEffect, useState } from "react";
import { Alert, Badge, Col, Form, Row, Spinner, Table } from "react-bootstrap";
import { getDimensions, getRuns } from "../api";
import type { Dimensions, RunRow } from "../types";
/**
 * Formats a metric for a table cell: `n` with `p` decimals (default 2),
 * or an em dash when the value is missing (null/undefined).
 */
function f(n: number | null, p = 2): string {
  if (n == null) {
    return "—";
  }
  return n.toFixed(p);
}
interface PickerProps {
  label: string;
  value: string;
  set: (v: string) => void;
  options: string[];
}

/**
 * Labelled dropdown filter. The first entry is always "(all)" with an
 * empty value; the rest are `options` verbatim. `set` receives the newly
 * selected value on every change.
 */
function Picker(props: PickerProps) {
  const { label, value, set, options } = props;

  const choices = options.map((option) => (
    <option key={option} value={option}>
      {option}
    </option>
  ));

  return (
    <Form.Group as={Col}>
      <Form.Label>{label}</Form.Label>
      <Form.Select value={value} onChange={(event) => set(event.target.value)}>
        <option value="">(all)</option>
        {choices}
      </Form.Select>
    </Form.Group>
  );
}
/**
 * Runs page: a filterable table of raw benchmark runs.
 *
 * Filter options come from `getDimensions()` (loaded once); the rows are
 * re-fetched through `getRuns()` (capped at 200) whenever the host, model
 * or scenario filter changes. Any request failure replaces the page with
 * a danger alert carrying the error text.
 *
 * Each effect keeps an `ignore` flag that its cleanup sets, so a response
 * that arrives after the filters changed (or after unmount) is dropped.
 * Without it, a slow response for an older filter could overwrite the rows
 * of the current one and clear the spinner while the newest request was
 * still in flight.
 */
export default function Runs() {
  const [dims, setDims] = useState<Dimensions | null>(null);
  const [host, setHost] = useState("");
  const [model, setModel] = useState("");
  const [scenario, setScenario] = useState("");
  const [rows, setRows] = useState<RunRow[]>([]);
  const [err, setErr] = useState<string | null>(null);
  const [loading, setLoading] = useState(false);

  // Load the filter options once on mount.
  useEffect(() => {
    let ignore = false;
    getDimensions()
      .then((d) => {
        if (!ignore) setDims(d);
      })
      .catch((e) => {
        if (!ignore) setErr(String(e));
      });
    return () => {
      ignore = true;
    };
  }, []);

  // Re-query whenever a filter changes; an empty filter means "all".
  useEffect(() => {
    let ignore = false;
    setLoading(true);
    getRuns({
      host: host || undefined,
      model: model || undefined,
      scenario: scenario || undefined,
      limit: 200,
    })
      .then((fetched) => {
        if (!ignore) setRows(fetched);
      })
      .catch((e) => {
        if (!ignore) setErr(String(e));
      })
      .finally(() => {
        // Only the latest request may clear the spinner.
        if (!ignore) setLoading(false);
      });
    return () => {
      ignore = true;
    };
  }, [host, model, scenario]);

  if (err) return <Alert variant="danger">{err}</Alert>;

  return (
    <>
      <h3 className="mb-3">Runs</h3>
      {dims && (
        <Row className="g-3 mb-3">
          {/* GPU filter — labelled by GPU, but filters by the underlying host. */}
          <Form.Group as={Col}>
            <Form.Label>GPU</Form.Label>
            <Form.Select value={host} onChange={(e) => setHost(e.target.value)}>
              <option value="">(all)</option>
              {dims.hosts.map((h) => (
                <option key={h} value={h}>
                  {dims.host_gpus[h] ?? h}
                </option>
              ))}
            </Form.Select>
          </Form.Group>
          <Picker
            label="Model"
            value={model}
            set={setModel}
            options={dims.models}
          />
          <Picker
            label="Scenario"
            value={scenario}
            set={setScenario}
            options={dims.scenarios}
          />
        </Row>
      )}
      {loading ? (
        <Spinner animation="border" />
      ) : (
        <Table striped bordered hover responsive size="sm">
          <thead>
            <tr>
              <th>ts</th>
              <th>GPU</th>
              <th>model</th>
              <th>scenario</th>
              <th>build</th>
              <th className="text-end">TTFT</th>
              <th className="text-end">tok/s</th>
              <th className="text-end">total</th>
              <th>ok</th>
            </tr>
          </thead>
          <tbody>
            {rows.map((r) => (
              <tr key={r.id}>
                <td>{r.ts}</td>
                <td>{r.gpu ?? r.host}</td>
                <td>{r.model_id}</td>
                <td>{r.scenario_id}</td>
                <td>
                  <code>{r.git_sha}</code>
                </td>
                <td className="text-end">{f(r.ttft_s, 3)}</td>
                <td className="text-end">{f(r.decode_tps, 1)}</td>
                <td className="text-end">{f(r.total_s, 3)}</td>
                <td>
                  {r.ok ? (
                    <Badge bg="success">ok</Badge>
                  ) : (
                    <Badge bg="danger" title={r.error ?? ""}>
                      fail
                    </Badge>
                  )}
                </td>
              </tr>
            ))}
          </tbody>
        </Table>
      )}
    </>
  );
}

View File

@@ -1,221 +0,0 @@
import { useEffect, useMemo, useState } from "react";
import { Alert, Col, Form, Row, Spinner } from "react-bootstrap";
import {
CartesianGrid,
Legend,
Line,
LineChart,
ReferenceLine,
ResponsiveContainer,
Tooltip,
XAxis,
YAxis,
} from "recharts";
import { getDimensions, getSeries } from "../api";
import type { Dimensions, SeriesPoint } from "../types";
import { BASELINE_SOURCE, baselineFor } from "../baseline";
interface PickerProps {
  label: string;
  value: string;
  set: (v: string) => void;
  options: string[];
}

/**
 * Labelled dropdown listing `options` verbatim (no "all" entry). `set`
 * receives the newly selected value on every change.
 */
function Picker(props: PickerProps) {
  const { label, value, set, options } = props;

  const choices = options.map((option) => (
    <option key={option} value={option}>
      {option}
    </option>
  ));

  return (
    <Form.Group as={Col}>
      <Form.Label>{label}</Form.Label>
      <Form.Select value={value} onChange={(event) => set(event.target.value)}>
        {choices}
      </Form.Select>
    </Form.Group>
  );
}
/** Props for one metric chart on the Trends page. */
interface TrendChartProps {
  /** Heading shown above the chart. */
  title: string;
  /** Class applied to the heading (spacing differs between charts). */
  headingClass: string;
  /** Chart rows; every row carries `label` plus the metric keys. */
  data: object[];
  /** Row key, legend name and colour of the live (solid) line. */
  liveKey: string;
  liveName: string;
  liveStroke: string;
  /** Row key of the baseline (dashed) line. */
  baseKey: string;
  /** Whether to draw the baseline line at all. */
  showBaseline: boolean;
  /** X value at which to draw the regime divider; omitted → no divider. */
  dividerAt?: string;
}

// Renders one heading + line chart: the live series, optionally the dashed
// baseline series, and optionally the bench.py → helexa-bench divider.
function TrendChart({
  title,
  headingClass,
  data,
  liveKey,
  liveName,
  liveStroke,
  baseKey,
  showBaseline,
  dividerAt,
}: TrendChartProps) {
  return (
    <>
      <h5 className={headingClass}>{title}</h5>
      <ResponsiveContainer width="100%" height={280}>
        <LineChart data={data} margin={{ top: 8, right: 24, bottom: 8, left: 0 }}>
          <CartesianGrid strokeDasharray="3 3" />
          <XAxis dataKey="label" />
          <YAxis />
          <Tooltip />
          <Legend />
          {dividerAt ? (
            <ReferenceLine
              x={dividerAt}
              stroke="#bbb"
              strokeDasharray="3 3"
              label={{
                value: "bench.py → helexa-bench",
                position: "top",
                fill: "#999",
                fontSize: 11,
              }}
            />
          ) : null}
          <Line
            type="monotone"
            dataKey={liveKey}
            name={liveName}
            stroke={liveStroke}
            connectNulls
          />
          {showBaseline && (
            <Line
              type="monotone"
              dataKey={baseKey}
              name="baseline (bench.py · gateway)"
              stroke="#888"
              strokeDasharray="5 5"
              connectNulls
            />
          )}
        </LineChart>
      </ResponsiveContainer>
    </>
  );
}

/**
 * Trends page: decode tok/s and TTFT plotted per build for one
 * (model, scenario) selection.
 *
 * Dimensions are loaded once and the first model / scenario are
 * preselected; the series is re-fetched whenever the selection changes.
 * Any request failure replaces the page with a danger alert.
 *
 * Each effect keeps an `ignore` flag that its cleanup sets, so a response
 * arriving after the selection changed (or after unmount) is dropped
 * instead of overwriting the series for the current selection.
 */
export default function Trends() {
  const [dims, setDims] = useState<Dimensions | null>(null);
  const [model, setModel] = useState("");
  const [scenario, setScenario] = useState("");
  const [series, setSeries] = useState<SeriesPoint[]>([]);
  const [err, setErr] = useState<string | null>(null);

  // Load the picker options once and preselect the first of each.
  useEffect(() => {
    let ignore = false;
    getDimensions()
      .then((d) => {
        if (ignore) return;
        setDims(d);
        if (d.models[0]) setModel(d.models[0]);
        if (d.scenarios[0]) setScenario(d.scenarios[0]);
      })
      .catch((e) => {
        if (!ignore) setErr(String(e));
      });
    return () => {
      ignore = true;
    };
  }, []);

  // Fetch the live series once both selections are known.
  useEffect(() => {
    if (!model || !scenario) return;
    let ignore = false;
    getSeries(model, scenario)
      .then((points) => {
        if (!ignore) setSeries(points);
      })
      .catch((e) => {
        if (!ignore) setErr(String(e));
      });
    return () => {
      ignore = true;
    };
  }, [model, scenario]);

  // Prepend the pre-helexa-bench baseline (dashed, separate keys) so it
  // anchors the timeline without being merged into the live line. Different
  // measurement regime — see baseline.ts / doc/benchmarks.md.
  const base = useMemo(
    () => baselineFor(model, scenario),
    [model, scenario],
  );
  const data = useMemo(
    () => [
      ...base.map((p) => ({
        label: p.git_sha,
        baseTtft: p.ttft_s,
        baseDecode: p.decode_tps,
        baseTotal: p.total_s,
      })),
      ...series.map((p) => ({
        label: p.git_sha,
        ttft: p.ttft_s_median,
        decode: p.decode_tps_median,
        total: p.total_s_median,
      })),
    ],
    [series, base],
  );

  // Divider marking the boundary between the two regimes (drawn at the
  // first live build, with baseline points to its left). Only shown when
  // both a baseline and a live series exist.
  const hasBaseline = base.length > 0;
  const dividerAt =
    hasBaseline && series.length > 0 ? series[0]?.git_sha : undefined;

  if (err) return <Alert variant="danger">{err}</Alert>;
  if (!dims) return <Spinner animation="border" />;

  return (
    <>
      <h3 className="mb-3">Trends over builds</h3>
      <Row className="g-3 mb-4">
        <Picker
          label="Model"
          value={model}
          set={setModel}
          options={dims.models}
        />
        <Picker
          label="Scenario"
          value={scenario}
          set={setScenario}
          options={dims.scenarios}
        />
      </Row>
      {dims.model_gpus[model] && (
        <p className="text-muted mb-3">
          Measured on <strong>{dims.model_gpus[model]}</strong>.
        </p>
      )}
      {data.length === 0 ? (
        <Alert variant="info">No data for this selection yet.</Alert>
      ) : (
        <>
          {hasBaseline && (
            <p className="text-muted small mb-3">
              Dashed = pre-helexa-bench baseline ({BASELINE_SOURCE}); solid =
              helexa-bench (direct to neuron). Different measurement regimes —
              see <code>doc/benchmarks.md</code>.
            </p>
          )}
          <TrendChart
            title="decode tok/s (higher is better)"
            headingClass="mt-3"
            data={data}
            liveKey="decode"
            liveName="decode tok/s"
            liveStroke="#0d6efd"
            baseKey="baseDecode"
            showBaseline={hasBaseline}
            dividerAt={dividerAt}
          />
          <TrendChart
            title="TTFT seconds (lower is better)"
            headingClass="mt-4"
            data={data}
            liveKey="ttft"
            liveName="TTFT (s)"
            liveStroke="#dc3545"
            baseKey="baseTtft"
            showBaseline={hasBaseline}
            dividerAt={dividerAt}
          />
        </>
      )}
    </>
  );
}

View File

@@ -1,69 +0,0 @@
// Mirrors the JSON served by helexa-bench's read API (crates/helexa-bench/src/api.rs).
/**
 * Reference to one build that appears in the benchmark data.
 * NOTE(review): presumably the build of the daemon under test — confirm
 * against crates/helexa-bench/src/api.rs.
 */
export interface BuildRef {
/** Git SHA identifying the build. */
git_sha: string;
/** Build timestamp as a string; may be null. */
build_timestamp: string | null;
/** Package version string; may be null. */
package_version: string | null;
}
/**
 * The selectable dimensions of the benchmark data: which hosts, models,
 * scenarios and builds exist, plus GPU labels for display.
 */
export interface Dimensions {
/** Known host names. */
hosts: string[];
/** Known model ids. */
models: string[];
/** Known scenario ids. */
scenarios: string[];
/** Known builds. */
builds: BuildRef[];
/** host → GPU label, e.g. "2× RTX 5090". */
host_gpus: Record<string, string>;
/** model → GPU label (model maps to one host today). */
model_gpus: Record<string, string>;
}
/**
 * Latest-SHA-per-cell medians (the report table).
 *
 * Every `*_median` field and `prompt_tokens` may be null.
 * NOTE(review): presumably null means no successful sample produced a
 * value for that metric — confirm against the API.
 */
export interface ReportRow {
/** Name of the benchmark target this row belongs to. */
target_name: string;
model_id: string;
scenario_id: string;
/** Approximate prompt size for the scenario. NOTE(review): unit looks like tokens — confirm. */
prompt_size_approx: number;
/** Git SHA of the build these medians were computed for. */
git_sha: string;
prompt_tokens: number | null;
/** Median time-to-first-token, in seconds. */
ttft_s_median: number | null;
/** Median decode throughput, in tokens per second. */
decode_tps_median: number | null;
/** Median total duration, in seconds. */
total_s_median: number | null;
/** Number of samples behind the medians. */
samples: number;
/** Public-facing resource name (the host's GPU(s)). */
gpu: string | null;
}
/**
 * One point in a per-build time-series for a (host, model, scenario) cell.
 *
 * Carries the same build-identifying fields as `BuildRef` plus the median
 * metrics for that build; each metric may be null.
 */
export interface SeriesPoint {
/** Git SHA of the build this point represents. */
git_sha: string;
build_timestamp: string | null;
package_version: string | null;
/** Median time-to-first-token, in seconds. */
ttft_s_median: number | null;
/** Median decode throughput, in tokens per second. */
decode_tps_median: number | null;
/** Median total duration, in seconds. */
total_s_median: number | null;
/** Number of samples behind the medians. */
samples: number;
}
/**
 * One individual benchmark run, including its outcome.
 * NOTE(review): field meanings not already documented here are inferred
 * from their names — confirm against crates/helexa-bench/src/api.rs.
 */
export interface RunRow {
/** Numeric row identifier. */
id: number;
/** Timestamp of the run, as a string. */
ts: string;
host: string;
/** Public-facing resource name (the host's GPU(s)). */
gpu: string | null;
hostname: string | null;
git_sha: string;
build_timestamp: string | null;
/** Unlike `BuildRef`/`SeriesPoint`, this is typed as non-nullable here. */
package_version: string;
model_id: string;
harness: string;
scenario_id: string;
prompt_size_approx: number;
prompt_tokens_actual: number | null;
max_tokens: number;
/** Time-to-first-token, in seconds; may be null. */
ttft_s: number | null;
/** Decode throughput, in tokens per second; may be null. */
decode_tps: number | null;
/** Total duration, in seconds; may be null. */
total_s: number | null;
completion_tokens: number | null;
/** Whether the run succeeded. */
ok: boolean;
/** Error text; may be null. NOTE(review): presumably set only when `ok` is false — confirm. */
error: string | null;
}

View File

@@ -1,9 +0,0 @@
/// <reference types="vite/client" />
/** Typed view of the Vite environment variables this app reads. */
interface ImportMetaEnv {
/** Base origin of the bench API. Empty → use the dev proxy / same origin. */
readonly VITE_API_BASE?: string;
}
/** Declares `import.meta.env` with the `ImportMetaEnv` shape declared in this file. */
interface ImportMeta {
readonly env: ImportMetaEnv;
}

View File

@@ -1,22 +0,0 @@
{
"compilerOptions": {
"target": "ES2022",
"useDefineForClassFields": true,
"lib": ["ES2022", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"moduleDetection": "force",
"noEmit": true,
"jsx": "react-jsx",
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true,
"types": ["node", "vite/client"]
},
"include": ["src", "vite.config.ts"]
}

View File

@@ -1,18 +0,0 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react-swc";
// Dev server proxies /api to the bench API on bob so `fetch('/api/...')`
// works without CORS/mixed-origin fuss during local development.
// For a production build hosted elsewhere, set VITE_API_BASE to the bob
// API origin (e.g. http://bob.hanzalova.internal:13132) instead.
// Vite configuration: React (SWC) plugin plus a dev-server proxy for the API.
export default defineConfig({
plugins: [react()],
server: {
proxy: {
// Requests whose path starts with /api are forwarded to the target below.
"/api": {
target: "http://bob.hanzalova.internal:13132",
// Proxy option that changes the origin of the Host header to the target URL.
changeOrigin: true,
},
},
},
});

View File

@@ -3,27 +3,22 @@
# Copy to cortex.toml and adjust for your environment.
#
# Environment variable overrides use CORTEX_ prefix with __ separators:
# CORTEX_GATEWAY__LISTEN=0.0.0.0:31313
# Path to the model catalogue (limits, cost, pinning, aliases, feasibility).
# Defaults to the packaged location below; uncomment to override for a
# non-packaged / local run.
# models_config = "/etc/cortex/models.toml"
# CORTEX_GATEWAY__LISTEN=0.0.0.0:9000
[gateway]
listen = "0.0.0.0:31313"
metrics_listen = "0.0.0.0:31314"
listen = "0.0.0.0:8000"
metrics_listen = "0.0.0.0:9100"
[eviction]
strategy = "lru"
# Restart neurons after this many load/unload cycles to defragment VRAM.
# Restart mistralrs after this many load/unload cycles to defragment VRAM.
# Set to 0 to disable.
defrag_after_cycles = 50
# -- Nodes ---------------------------------------------------------------
# Each [[nodes]] entry declares a neuron daemon in the fleet.
# Models are discovered by polling the neuron's /models endpoint.
# Pinned models (see models.toml) are never evicted.
# Each [[nodes]] entry declares a mistral.rs instance in the fleet.
# Models are discovered by polling the node's /v1/models endpoint.
# Pinned models are never evicted.
[[nodes]]
name = "gpu-large"
@@ -48,45 +43,3 @@ vram_mb = 12288 # e.g. RTX 3060 (12 GB)
pinned = [
"your-org/embedding-model",
]
# -- Entitlements (multi-tenant governance, #47) -------------------------
# Identity + per-key token budgets. Omit this section entirely for the
# legacy single-operator behaviour: requests are anonymous and uncapped.
#
# The local/static provider below is the source of truth for accounts,
# keys, and hard caps until the upstream clearing house exists. Identity
# rides standard bearer auth only — clients send
# Authorization: Bearer <key>
# no custom headers or body fields.
[entitlements]
# Reject unauthenticated requests with 401 invalid_api_key. Leave false
# (allow-anonymous) during rollout; flip to true once keys are issued.
require_auth = false
# One entry per API key.
[[entitlements.keys]]
key = "sk-example-rolling" # the bearer token the client sends
account_id = "team-research" # billable account (keys may share one)
key_id = "research-ci" # stable label for ledger/metrics (optional)
hard_cap = 5_000_000 # hard token cap over the window
# Rolling window that resets — over-cap requests get 429 rate_limit_exceeded
# + Retry-After, so well-behaved clients (opencode/AI SDK) back off and retry.
window = { kind = "rolling", seconds = 3600 }
[[entitlements.keys]]
key = "sk-example-balance"
account_id = "team-research"
key_id = "research-prepaid"
hard_cap = 20_000_000
# Hard balance, no reset — exhaustion returns 429 insufficient_quota
# (the client surfaces and stops). This is the default when `window` is
# omitted. Never 402.
window = { kind = "balance" }
[[entitlements.keys]]
key = "sk-example-infra"
account_id = "operator"
key_id = "infra"
# No hard_cap → uncapped operator infra key (own fleet, own use). Still
# metered for visibility.

View File

@@ -1,10 +1,10 @@
Name: cortex
Version: 0.1.16
Version: 0.1.2
Release: 1%{?dist}
Summary: Inference gateway for multi-node GPU clusters
License: GPL-3.0-or-later
URL: https://git.lair.cafe/helexa/helexa
URL: https://git.lair.cafe/helexa/cortex
Source0: %{name}-%{version}.tar.gz
Source1: %{name}-%{version}-vendor.tar.gz
@@ -21,16 +21,12 @@ BuildRequires: systemd-rpm-macros
Requires(pre): shadow-utils
Requires: systemd
Requires: firewalld-filesystem
# systemd-rpm-macros ships a unit dep generator that parses User=/Group=
# from our .service file and emits Requires: user(cortex)/group(cortex).
# rpm's sysusers provides-generator emits the unversioned form for groups
# but only a versioned user(cortex) = <base64> for users with GECOS/home/
# shell. Provide the unversioned user(cortex) explicitly so dnf can resolve
# the auto-generated Requires. Without this, dnf5 silently filters the
# package and reports "Nothing to do".
# rpm's sysusers provides-generator only emits versioned user(cortex) when
# the u-line has GECOS/home/shell fields. %attr(,,cortex) in %files emits
# an unversioned Requires: user(cortex), so we provide it explicitly.
Provides: user(cortex)
Provides: group(cortex)
%description
Cortex is a Rust reverse-proxy that sits in front of multiple inference
@@ -57,10 +53,9 @@ cargo build --release -p cortex-cli
install -Dm755 target/release/cortex %{buildroot}%{_bindir}/cortex
install -Dm644 data/cortex.service %{buildroot}%{_unitdir}/cortex.service
install -Dm644 data/cortex-sysusers.conf %{buildroot}%{_sysusersdir}/cortex.conf
install -Dm644 data/cortex-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/cortex.xml
install -dm755 %{buildroot}%{_sysconfdir}/cortex
install -Dm644 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
install -Dm644 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
install -dm750 %{buildroot}%{_sysconfdir}/cortex
install -Dm640 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
install -Dm640 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
%pre
%sysusers_create_compat %{_builddir}/%{name}-%{version}/data/cortex-sysusers.conf
@@ -74,53 +69,16 @@ install -Dm644 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
%postun
%systemd_postun_with_restart cortex.service
%posttrans
# Migration: older cortex packages shipped the firewalld service as
# `helexa-cortex` and (in some build streams) with wrong port numbers
# (9301/9302/9304). Operators who enabled that legacy service in their
# zone end up with the wrong-port override taking precedence over the
# vendor `cortex.xml` now in /usr/lib/firewalld/services/. Clean up the
# stale /etc/ override here and migrate any zone bindings to the new
# service name.
if [ -f /etc/firewalld/services/helexa-cortex.xml ]; then
rm -f /etc/firewalld/services/helexa-cortex.xml
fi
if [ -x /usr/bin/firewall-cmd ] && /usr/bin/firewall-cmd --state >/dev/null 2>&1; then
# Drop the legacy service name from every zone where it was enabled
# and add the new `cortex` service in its place. Operators who never
# ran firewall-cmd against either name see no zone change.
for zone in $(/usr/bin/firewall-cmd --get-active-zones 2>/dev/null \
| awk '!/^[[:space:]]/ {print $1}'); do
if /usr/bin/firewall-cmd --permanent --zone="$zone" --query-service=helexa-cortex >/dev/null 2>&1; then
/usr/bin/firewall-cmd --permanent --zone="$zone" --remove-service=helexa-cortex >/dev/null 2>&1 || :
/usr/bin/firewall-cmd --permanent --zone="$zone" --add-service=cortex >/dev/null 2>&1 || :
fi
done
/usr/bin/firewall-cmd --reload >/dev/null 2>&1 || :
fi
:
%files
%license LICENSE
%doc README.md
%{_bindir}/cortex
%{_unitdir}/cortex.service
%{_sysusersdir}/cortex.conf
%{_prefix}/lib/firewalld/services/cortex.xml
%dir %{_sysconfdir}/cortex
%config(noreplace) %{_sysconfdir}/cortex/cortex.toml
%config(noreplace) %{_sysconfdir}/cortex/models.toml
%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/cortex.toml
%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/models.toml
%changelog
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.16-1
- chore: ignore local deploy script
- chore: move default ports out of common-collision ranges
- ci: drop actions/cache for cargo registry and target
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.14-1
- ci: publish both packages to a single helexa/helexa COPR project
- fix(rpm): rename neuron package to helexa-neuron
- ci: commit generated %changelog entries back to main
* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
* Tue Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
- Initial package

View File

@@ -5,7 +5,7 @@ use tracing_subscriber::EnvFilter;
#[derive(Parser)]
#[command(name = "cortex")]
#[command(about = "Unified inference gateway for multi-node GPU clusters")]
#[command(about = "Unified inference gateway for multi-node mistral.rs clusters")]
#[command(version)]
struct Cli {
#[command(subcommand)]
@@ -23,7 +23,7 @@ enum Commands {
/// Print the fleet status (models, nodes, health).
Status {
/// Gateway API endpoint to query.
#[arg(short, long, default_value = "http://localhost:31313")]
#[arg(short, long, default_value = "http://localhost:8000")]
endpoint: String,
},
}

View File

@@ -2,7 +2,7 @@
//!
//! These mirror the `/v1/messages` format used by the Anthropic API.
//! The gateway accepts these, translates to OpenAI format, proxies to
//! the inference backend (neuron), then translates the response back.
//! mistral.rs, then translates the response back.
use serde::{Deserialize, Serialize};
use serde_json::Value;

View File

@@ -1,119 +0,0 @@
//! Build/version metadata shared between cortex and neuron.
//!
//! neuron captures these facts at compile time in its `build.rs`
//! (git SHA, enabled cargo features, rustc/candle versions, …) and
//! serves them from `GET /version`. cortex and `helexa-bench`
//! deserialize the same struct so a benchmark run can be attributed to
//! the exact daemon build that produced it — not just the host's CUDA
//! and driver versions that `/discovery` already reports.
//!
//! Every field beyond the always-present package version is
//! `#[serde(default)]` so a newer reader stays compatible with an
//! older neuron that omits a field (and vice versa) — the same
//! forward/backward-compat discipline as
//! [`crate::discovery::ActivationStatus`].
use serde::{Deserialize, Serialize};
/// Build-time identity of a neuron daemon.
///
/// Returned by `GET /version`. The `git_sha` is the canonical "which
/// build is live" key — benchmark records are bucketed by it, so a
/// regression can be pinned to a daemon change rather than a host
/// change. When neuron is built from a source tarball with no git
/// metadata available (and no `HELEXA_BUILD_SHA` injected by CI/RPM),
/// `git_sha` is the string `"unknown"`.
///
/// Deserialization contract: `package_version` is the only field
/// without a serde default, so it is the only key a payload must
/// contain; every other field falls back to its default when absent.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct BuildInfo {
/// Crate version from `CARGO_PKG_VERSION` (e.g. `"0.1.16"`).
/// Required: there is no `#[serde(default)]` on this field.
pub package_version: String,
/// Short git SHA, or `"unknown"` when unavailable at build time.
/// Defaults to `"unknown"` (via the `unknown` helper) when the key is
/// missing from the payload.
#[serde(default = "unknown")]
pub git_sha: String,
/// Full 40-char git SHA when available.
#[serde(default)]
pub git_sha_long: Option<String>,
/// Whether the working tree had uncommitted changes at build time.
/// `false` when the SHA is unknown (tarball build).
#[serde(default)]
pub git_dirty: bool,
/// RFC3339 build timestamp.
#[serde(default)]
pub build_timestamp: Option<String>,
/// `rustc --version` output of the compiler used.
#[serde(default)]
pub rustc_version: Option<String>,
/// Cargo build profile: `"release"` or `"debug"`.
#[serde(default)]
pub profile: Option<String>,
/// Target triple the binary was compiled for.
#[serde(default)]
pub target: Option<String>,
/// Enabled cargo features (e.g. `["cuda", "cudnn"]`). These define
/// the performance envelope, so they are recorded against every
/// benchmark run. Defaults to an empty list when absent.
#[serde(default)]
pub features: Vec<String>,
/// Locked `candle-core` version, best-effort from `Cargo.lock`.
#[serde(default)]
pub candle_version: Option<String>,
}
/// Serde default for `BuildInfo::git_sha`: the literal `"unknown"`.
fn unknown() -> String {
    String::from("unknown")
}
impl BuildInfo {
    /// Build a placeholder value for callers that have no real build
    /// metadata (non-neuron benchmark targets, tests).
    ///
    /// `package_version` is the compile-time `CARGO_PKG_VERSION` of this
    /// crate, `git_sha` is `"unknown"`, `git_dirty` is `false`, the
    /// feature list is empty and every optional field is `None`.
    pub fn unknown() -> Self {
        // Inside the impl, the bare `unknown()` call resolves to the
        // module-level helper, not to this associated function.
        let git_sha = unknown();
        let package_version = String::from(env!("CARGO_PKG_VERSION"));
        Self {
            package_version,
            git_sha,
            git_sha_long: None,
            git_dirty: false,
            build_timestamp: None,
            rustc_version: None,
            profile: None,
            target: None,
            features: vec![],
            candle_version: None,
        }
    }
}
// Serde round-trip and default-handling tests for `BuildInfo`.
#[cfg(test)]
mod tests {
use super::*;
/// A fully-populated value must survive JSON serialize → deserialize
/// unchanged (relies on the derived `PartialEq`).
#[test]
fn round_trips_full() {
let info = BuildInfo {
package_version: "0.1.16".into(),
git_sha: "30d50d6".into(),
git_sha_long: Some("30d50d6abc123".into()),
git_dirty: true,
build_timestamp: Some("2026-06-13T10:00:00+00:00".into()),
rustc_version: Some("rustc 1.85.0".into()),
profile: Some("release".into()),
target: Some("x86_64-unknown-linux-gnu".into()),
features: vec!["cuda".into(), "cudnn".into()],
candle_version: Some("0.10.2".into()),
};
let json = serde_json::to_string(&info).unwrap();
let back: BuildInfo = serde_json::from_str(&json).unwrap();
assert_eq!(info, back);
}
/// A payload carrying only `package_version` must deserialize, with
/// every other field taking its serde default.
#[test]
fn deserializes_minimal_payload() {
// An older neuron might send only the package version; every
// other field must default rather than fail.
let back: BuildInfo = serde_json::from_str(r#"{"package_version":"0.1.0"}"#).unwrap();
assert_eq!(back.package_version, "0.1.0");
assert_eq!(back.git_sha, "unknown");
assert!(!back.git_dirty);
assert!(back.features.is_empty());
assert!(back.candle_version.is_none());
}
}

View File

@@ -1,9 +1,6 @@
//! Model catalogue — profiles describing how to serve each model.
use crate::discovery::DeviceInfo;
use crate::harness::{ModelCost, ModelLimit};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
/// A model serving profile loaded from models.toml.
@@ -25,32 +22,6 @@ pub struct ModelProfile {
/// Neurons where this model should never be evicted.
#[serde(default)]
pub pinned_on: Vec<String>,
/// Source scheme this profile's weights come from. When set, the
/// router prefixes `id` with `scheme:` before forwarding the load
/// request to neuron, ensuring the daemon fetches from the right
/// registry regardless of which entry happens to match `id`.
///
/// `None` lets neuron substitute its own `default_source` (typically
/// `huggingface`). Set to `"helexa"` when the model is hosted in
/// the helexa registry — operator-procurement-grade audit relies
/// on this being explicit per model rather than implicit.
#[serde(default)]
pub source: Option<String>,
// ── Enrichment (issue #62) ────────────────────────────────
/// Per-model token budget. When present, advertised in `/v1/models`
/// so clients can size and compact their context automatically.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub limit: Option<ModelLimit>,
/// Operator-set pricing (USD per 1M tokens). `0.0` for self-hosted.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cost: Option<ModelCost>,
/// Static capability flags the operator wants to advertise even
/// before the model is loaded on any neuron (e.g. `"reasoning"`,
/// `"tool_call"`). Runtime-detected capabilities from the harness
/// are unioned with this set in the gateway's `/v1/models` response.
#[serde(default)]
pub capabilities: Vec<String>,
}
fn default_min_devices() -> u32 {
@@ -62,14 +33,6 @@ fn default_min_devices() -> u32 {
pub struct ModelCatalogue {
#[serde(default)]
pub models: Vec<ModelProfile>,
/// Tier aliases — clients can send a request with `model: "helexa/small"`
/// and the gateway transparently rewrites + routes to the concrete
/// model id this maps to. Lets operators define latency/quality
/// tiers (`small`/`balanced`/`large`, `fast`/`thinking`, etc.)
/// without imposing knowledge of specific model ids on clients.
/// Loaded from the `[aliases]` table in models.toml.
#[serde(default)]
pub aliases: HashMap<String, String>,
}
impl ModelCatalogue {
@@ -101,165 +64,4 @@ impl ModelCatalogue {
.iter()
.any(|p| p.id == model_id && p.pinned_on.contains(&neuron_name.to_string()))
}
/// Look up a profile by its model id.
///
/// Scans the catalogue in order and returns the first profile whose
/// `id` equals `model_id`, or `None` when no profile matches.
pub fn get(&self, model_id: &str) -> Option<&ModelProfile> {
    for profile in &self.models {
        if profile.id == model_id {
            return Some(profile);
        }
    }
    None
}
/// Map an alias to the concrete model id it stands for.
///
/// When `id` is a key in the alias table the mapped target is
/// returned; otherwise `id` is handed back unchanged. Only one lookup
/// is performed, so an alias that points at another alias is not
/// followed further.
pub fn resolve_alias<'a>(&'a self, id: &'a str) -> &'a str {
    match self.aliases.get(id) {
        Some(target) => target.as_str(),
        None => id,
    }
}
}
impl ModelProfile {
    /// Decide whether the named neuron, with the given devices, can
    /// satisfy this profile's placement constraints.
    ///
    /// All three conditions must hold:
    /// - `pinned_on` is empty, or it contains `neuron_name`;
    /// - the neuron has at least `min_devices` devices;
    /// - when `min_device_vram_mb` is set, at least `min_devices` of the
    ///   devices each have `vram_total_mb` at or above that floor.
    pub fn is_feasible_on(&self, neuron_name: &str, devices: &[DeviceInfo]) -> bool {
        // An empty pin list places no restriction on the neuron.
        let pin_ok = self.pinned_on.is_empty()
            || self.pinned_on.iter().any(|pinned| pinned == neuron_name);

        let enough_devices = (devices.len() as u32) >= self.min_devices;

        // Without a VRAM floor, the device count alone decides.
        let vram_ok = match self.min_device_vram_mb {
            None => true,
            Some(floor) => {
                let qualifying = devices
                    .iter()
                    .filter(|dev| dev.vram_total_mb >= floor)
                    .count() as u32;
                qualifying >= self.min_devices
            }
        };

        pin_ok && enough_devices && vram_ok
    }
}
// Tests for placement feasibility, alias resolution, and TOML parsing of
// the `source` field and `[aliases]` table.
#[cfg(test)]
mod tests {
use super::*;
use crate::discovery::DeviceInfo;
/// Build a device with the given index and total VRAM (MB); the name
/// and compute capability are fixed filler values.
fn device(idx: u32, vram_mb: u64) -> DeviceInfo {
DeviceInfo {
index: idx,
name: format!("DEV-{idx}"),
vram_total_mb: vram_mb,
compute_capability: "8.6".into(),
}
}
/// Baseline profile used by the feasibility tests: needs two devices,
/// each with at least 24 000 MB, and is pinned nowhere.
fn profile() -> ModelProfile {
ModelProfile {
id: "Qwen/Qwen3.6-27B".into(),
harness: "candle".into(),
quant: None,
vram_mb: Some(45_000),
min_devices: 2,
min_device_vram_mb: Some(24_000),
pinned_on: vec![],
source: None,
limit: None,
cost: None,
capabilities: vec![],
}
}
/// Two devices, both above the VRAM floor → feasible.
#[test]
fn feasible_when_two_devices_meet_vram_floor() {
let p = profile();
let devices = [device(0, 32_000), device(1, 32_000)];
assert!(p.is_feasible_on("beast", &devices));
}
/// A single device fails `min_devices = 2` no matter how large it is.
#[test]
fn infeasible_when_only_one_device() {
let p = profile();
let devices = [device(0, 64_000)];
assert!(!p.is_feasible_on("benjy", &devices));
}
/// Two devices but only one meets the VRAM floor → infeasible.
#[test]
fn infeasible_when_one_device_underspec() {
let p = profile();
let devices = [device(0, 32_000), device(1, 12_000)];
assert!(!p.is_feasible_on("mixed", &devices));
}
/// A non-empty `pinned_on` restricts feasibility to the listed neurons.
#[test]
fn pinned_on_excludes_other_neurons() {
let mut p = profile();
p.pinned_on = vec!["beast".into()];
let devices = [device(0, 32_000), device(1, 32_000)];
assert!(p.is_feasible_on("beast", &devices));
assert!(!p.is_feasible_on("benjy", &devices));
}
/// With no VRAM floor, only the device count matters.
#[test]
fn no_vram_floor_just_needs_min_devices() {
let mut p = profile();
p.min_device_vram_mb = None;
let devices = [device(0, 1_000), device(1, 1_000)];
assert!(p.is_feasible_on("anywhere", &devices));
}
/// A configured alias resolves to its target id.
#[test]
fn resolve_alias_returns_target_when_alias_present() {
let mut cat = ModelCatalogue::default();
cat.aliases
.insert("helexa/small".into(), "Qwen/Qwen3-1.7B".into());
assert_eq!(cat.resolve_alias("helexa/small"), "Qwen/Qwen3-1.7B");
}
/// An id that is not an alias is returned unchanged.
#[test]
fn resolve_alias_passes_through_when_not_an_alias() {
let mut cat = ModelCatalogue::default();
cat.aliases
.insert("helexa/small".into(), "Qwen/Qwen3-1.7B".into());
assert_eq!(cat.resolve_alias("Qwen/Qwen3-8B"), "Qwen/Qwen3-8B");
}
/// Omitting `source` in TOML leaves the field as `None`.
#[test]
fn source_defaults_to_none_when_absent_from_toml() {
let src = r#"
[[models]]
id = "Qwen/Qwen3-30B"
harness = "candle"
"#;
let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
assert!(cat.models[0].source.is_none());
}
/// An explicit `source` value in TOML is parsed into the field.
#[test]
fn source_round_trips_through_toml() {
let src = r#"
[[models]]
id = "Helexa/Qwen3.6-27B-Uncensored"
harness = "candle"
source = "helexa"
"#;
let cat: ModelCatalogue = toml::from_str(src).expect("parse models table");
assert_eq!(cat.models[0].source.as_deref(), Some("helexa"));
}
/// The `[aliases]` table is parsed and usable through `resolve_alias`.
#[test]
fn aliases_table_round_trips_through_toml() {
let src = r#"
[aliases]
"helexa/small" = "Qwen/Qwen3-1.7B"
"helexa/large" = "Qwen/Qwen3.6-27B"
"#;
let cat: ModelCatalogue = toml::from_str(src).expect("parse aliases table");
assert_eq!(cat.resolve_alias("helexa/small"), "Qwen/Qwen3-1.7B");
assert_eq!(cat.resolve_alias("helexa/large"), "Qwen/Qwen3.6-27B");
}
}

View File

@@ -1,4 +1,3 @@
use crate::entitlements::CapWindow;
use figment::{
Figment,
providers::{Env, Format, Toml},
@@ -12,68 +11,20 @@ pub struct GatewayConfig {
pub eviction: EvictionSettings,
/// Neuron endpoints (replaces old NodeConfig with static vram_mb/pinned).
pub neurons: Vec<NeuronEndpoint>,
/// Path to the model catalogue file. Defaults to the packaged
/// location (`/etc/cortex/models.toml`); set explicitly for
/// non-packaged / local runs.
/// Path to the model catalogue file (default: "models.toml").
#[serde(default = "default_models_path")]
pub models_config: String,
/// Multi-tenant governance: auth + per-key token budgets (#47). Empty
/// by default — anonymous, uncapped — so existing single-operator
/// setups keep working until keys are configured.
#[serde(default)]
pub entitlements: EntitlementsConfig,
}
/// `[entitlements]` — the local/static [`crate::entitlements::EntitlementProvider`]
/// source of truth (#50). Accounts, keys, and hard caps live here; the
/// future upstream client (#57) ignores this section.
///
/// The derived `Default` yields `require_auth = false` and an empty key
/// list, and both fields carry `#[serde(default)]`, so an empty or absent
/// section deserializes to that same value.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct EntitlementsConfig {
/// Reject unauthenticated requests with `401 invalid_api_key` when
/// true. Default `false` (allow-anonymous) for dev / single-operator
/// continuity.
#[serde(default)]
pub require_auth: bool,
/// Static API keys and their budgets, consumed by the local provider.
/// Defaults to an empty list.
#[serde(default)]
pub keys: Vec<ApiKeyConfig>,
}
/// One configured API key: the bearer token, the account it bills to, and
/// its hard cap. `[[entitlements.keys]]` in TOML.
///
/// `key` and `account_id` have no serde default and are therefore
/// required in every entry; the remaining fields are optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ApiKeyConfig {
/// The bearer token clients send in `Authorization: Bearer <key>`.
/// Required.
pub key: String,
/// Billable account. Multiple keys may share one account. Required.
pub account_id: String,
/// Stable per-key identifier for ledger/metrics labels. Defaults to
/// `account_id` when omitted, so the secret is never used as a label.
/// NOTE(review): this struct only stores `None` when the field is omitted;
/// the fallback to `account_id` must happen in the consumer — confirm.
#[serde(default)]
pub key_id: Option<String>,
/// Hard token cap. `None`/omitted = uncapped (e.g. operator infra key).
#[serde(default)]
pub hard_cap: Option<u64>,
/// Cap-window semantics. Default: a non-resetting [`CapWindow::Balance`].
#[serde(default)]
pub window: CapWindow,
}
fn default_models_path() -> String {
// Absolute, so the systemd-launched binary finds the catalogue
// regardless of its working directory. The RPM installs the catalogue
// here (`cortex.spec`); a relative "models.toml" silently resolved to
// the service cwd and left the catalogue empty in production
// (pinning / aliases / limits all no-ops). Override via `models_config`
// in cortex.toml for local runs.
"/etc/cortex/models.toml".into()
"models.toml".into()
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GatewaySettings {
/// Address to listen on for API requests (e.g. "0.0.0.0:31313")
/// Address to listen on for API requests (e.g. "0.0.0.0:8000")
pub listen: String,
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:31314")
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
pub metrics_listen: String,
}
@@ -99,7 +50,7 @@ pub enum EvictionStrategy {
pub struct NeuronEndpoint {
/// Human-readable node name (e.g. "beast")
pub name: String,
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131")
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
pub endpoint: String,
}
@@ -119,8 +70,8 @@ impl Default for GatewayConfig {
fn default() -> Self {
Self {
gateway: GatewaySettings {
listen: "0.0.0.0:31313".into(),
metrics_listen: "0.0.0.0:31314".into(),
listen: "0.0.0.0:8000".into(),
metrics_listen: "0.0.0.0:9100".into(),
},
eviction: EvictionSettings {
strategy: EvictionStrategy::Lru,
@@ -128,7 +79,6 @@ impl Default for GatewayConfig {
},
neurons: vec![],
models_config: default_models_path(),
entitlements: EntitlementsConfig::default(),
}
}
}

View File

@@ -22,23 +22,6 @@ pub struct DiscoveryResponse {
pub driver_version: Option<String>,
pub devices: Vec<DeviceInfo>,
pub harnesses: Vec<String>,
/// Set when the host has an NVIDIA stack that is currently
/// unusable — specifically the userspace↔kernel-module version
/// skew after an un-rebooted driver update ("Driver/library
/// version mismatch"), where every CUDA call including nvidia-smi
/// fails (#19). `None` on healthy hosts AND on hosts with no
/// NVIDIA stack at all (CPU-only is not an error). Carries an
/// operator-actionable description; cortex can read it to route
/// around the node instead of cold-loading into a guaranteed
/// failure.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cuda_unavailable_reason: Option<String>,
/// The neuron's effective maximum prompt size in tokens
/// (`NEURON_MAX_PROMPT_TOKENS`) — the enforced prompt cap on this
/// host. `#[serde(default)]` (→ 0) for forward-compat with neurons
/// that predate this field; cortex treats 0 as "unknown".
#[serde(default)]
pub max_prompt_tokens: u64,
}
/// Runtime health metrics for a single GPU device.
@@ -53,72 +36,8 @@ pub struct DeviceHealth {
/// Runtime health response from a neuron endpoint.
/// Returned by `GET /health`.
///
/// `activation` was added in 2026-05-26 to distinguish "process is up
/// and reachable" from "process is ready to serve traffic". A `Type=simple`
/// systemd unit reports `active` the moment the binary starts — but a
/// neuron whose `default_models` list takes minutes to materialise
/// won't bind its listener (or, in the new flow, won't have any models
/// loaded) until pre-warm completes. The new field is `#[serde(default)]`
/// so a pre-2026-05-26 gateway polling a new neuron — or vice versa —
/// keeps working.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthResponse {
/// Daemon uptime in seconds.
pub uptime_secs: u64,
/// Per-device runtime health, one entry per reported device.
pub devices: Vec<DeviceHealth>,
/// Activation progress. When the key is absent from the payload this
/// falls back to `ActivationStatus::default()`, whose state is `Ready`.
#[serde(default)]
pub activation: ActivationStatus,
}
/// High-level activation state of the neuron daemon. The HTTP listener
/// is bound during both states; what differs is whether the configured
/// `default_models` have finished loading.
///
/// Serialized in snake_case (`"pre_warming"` / `"ready"`) because of
/// `rename_all = "snake_case"`. The default variant is `Ready`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ActivationState {
/// At least one `default_models` entry is still loading. The
/// neuron's other endpoints work, but inference against
/// not-yet-loaded models will 404.
PreWarming,
/// Every `default_models` entry has either loaded or failed; the
/// neuron is steady-state. Subsequent on-demand loads via
/// `/models/load` don't flip back to PreWarming — that field
/// reflects the activation-time set only.
#[default]
Ready,
}
/// Per-model failure record surfaced in [`ActivationStatus::failed`].
/// The error string is the rendered anyhow chain at the time of the
/// failure; operators read it from `/health` to decide whether to
/// retry, edit the spec, or unload+reload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreWarmFailure {
/// Id of the model that failed to load.
pub model_id: String,
/// Rendered error text describing the failure.
pub error: String,
}
/// Activation-time progress snapshot. The four progress fields
/// (`pending`, `in_progress`, `completed`, `failed`) are populated by
/// the neuron's pre-warm task and read by the `/health` handler. The
/// snapshot is consistent: a model id appears in exactly one of
/// `pending`, `in_progress` (an `Option<String>`, not a list),
/// `completed`, or `failed` at any point in time.
///
/// The derived `Default` is `state = Ready` with every progress field
/// empty / `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ActivationStatus {
/// Overall state. This field has no `#[serde(default)]`, so it must be
/// present whenever an `ActivationStatus` object is deserialized.
pub state: ActivationState,
/// Model ids queued but not yet started. Empty in `Ready` state.
#[serde(default)]
pub pending: Vec<String>,
/// Model id currently materialising. None when between models or
/// in `Ready` state.
#[serde(default)]
pub in_progress: Option<String>,
/// Model ids that finished loading successfully during this
/// activation. Cleared on process restart.
#[serde(default)]
pub completed: Vec<String>,
/// Model ids that failed during this activation, with the rendered
/// error chain. Cleared on process restart.
#[serde(default)]
pub failed: Vec<PreWarmFailure>,
}

View File

@@ -1,145 +0,0 @@
//! Identity and entitlement primitives for multi-tenant governance (#47).
//!
//! Identity is the shared substrate the whole epic hangs off:
//! `identity (principal) → accounting (spend) → policy → enforcement`. This
//! module defines the seam — the [`EntitlementProvider`] trait and its data
//! types — so the local/static provider (operator-config caps, in
//! cortex-gateway) can land the auth + per-key-cap + amplification fix
//! *before* any upstream clearing house exists. The future helexa-upstream
//! client (#57) is just another impl of this trait.
//!
//! The provider owns three jobs:
//! 1. **resolve** a bearer key to a [`Principal`] (drives auth, #49);
//! 2. **reserve → settle/release** token budget around a request so spend
//! can never overshoot a hard cap under concurrency (drives budget
//! enforcement, #52);
//! 3. expose a [`BudgetSnapshot`] for metering/metrics (#51).
//!
//! [`BudgetError`] carries the cap-window semantics so the caller can pick
//! the correct #63 rejection (`rate_limit_exceeded` + `Retry-After` for a
//! resetting window vs `insufficient_quota` for a hard balance) without the
//! provider knowing anything about HTTP.
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
/// Internal header carrying the resolved account id from cortex to neuron.
/// neuron trusts these over the WireGuard link (#54); cortex **strips** any
/// client-supplied copy before stamping the authoritative value, so a client
/// can never assert a principal directly.
pub const HEADER_ACCOUNT_ID: &str = "x-helexa-account-id";
/// Internal header carrying the resolved key id from cortex to neuron.
/// NOTE(review): presumably stripped and re-stamped by cortex in the same
/// way as the account-id header — confirm in cortex-gateway.
pub const HEADER_KEY_ID: &str = "x-helexa-key-id";
/// Who a request is for. Resolved once at the edge from the bearer key and
/// carried through the request context. `account_id` is the billable owner
/// (spendable at any operator, by decision); `key_id` identifies the
/// specific API key for per-key hard caps and ledger/metrics labels.
///
/// Derives `PartialEq`/`Eq`, so two principals compare equal only when both
/// ids match.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Principal {
/// Billable owner of the request.
pub account_id: String,
/// The specific API key the request was made with; the unit for per-key
/// hard caps and ledger/metrics labels.
pub key_id: String,
}
/// Cap-window semantics for a key's hard cap. Determines which #63 code an
/// over-cap reservation maps to.
///
/// Serialised as an internally tagged object keyed on `kind`, with snake_case
/// variant names: `{"kind": "balance"}` or
/// `{"kind": "rolling", "seconds": 3600}`. [`CapWindow::Balance`] is the
/// `Default` value.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum CapWindow {
/// Hard balance — the cap never resets. Exhaustion is permanent
/// (`429 insufficient_quota`, no `Retry-After`).
#[default]
Balance,
/// Rolling window of `seconds` that resets. Exhaustion is transient
/// (`429 rate_limit_exceeded` + `Retry-After` until reset).
Rolling { seconds: u64 },
}
/// An outstanding budget reservation. The caller holds this opaque handle
/// between [`EntitlementProvider::reserve`] and exactly one of
/// [`EntitlementProvider::settle`] / [`EntitlementProvider::release`]. Not
/// `Clone` — a reservation is consumed once.
///
/// Both `settle` and `release` take the reservation by value, so together
/// with the missing `Clone` derive the type system prevents using the same
/// handle twice.
#[derive(Debug)]
pub struct Reservation {
/// Provider-local handle; opaque to the caller.
pub id: u64,
/// The principal this reservation belongs to.
pub principal: Principal,
/// Tokens reserved against the cap.
pub reserved: u64,
}
/// A point-in-time view of a key's budget, for metering and metrics (#51).
///
/// Returned by [`EntitlementProvider::snapshot`]. All quantities are token
/// counts. NOTE(review): remaining headroom is presumably
/// `hard_cap - spent - reserved` — confirm against the provider
/// implementation before relying on it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BudgetSnapshot {
/// Hard cap in tokens. `None` means uncapped (e.g. an operator infra
/// key, #58).
pub hard_cap: Option<u64>,
/// Settled spend in the current window.
pub spent: u64,
/// Sum of outstanding (un-settled) reservations.
pub reserved: u64,
}
/// Authentication failure — the bearer key could not be resolved. Maps to
/// `401 invalid_api_key` (#49/#63).
///
/// Returned by [`EntitlementProvider::resolve`].
#[derive(Debug, thiserror::Error)]
pub enum AuthError {
/// The supplied key was empty or did not match any known key.
#[error("invalid or unknown API key")]
InvalidKey,
}
/// Why a reservation was refused. Carries enough for the caller to build the
/// correct #63 envelope without the provider touching HTTP.
///
/// Returned by [`EntitlementProvider::reserve`]. Both variants report the
/// `requested` and `available` amounts, which are interpolated into the
/// `Display` message.
#[derive(Debug, thiserror::Error)]
pub enum BudgetError {
/// A resetting window is exhausted → `429 rate_limit_exceeded` +
/// `Retry-After: retry_after_secs`.
#[error(
"rolling-window budget exhausted ({requested} requested, {available} available); \
resets in {retry_after_secs}s"
)]
RateLimited {
/// Amount the refused reservation asked for.
requested: u64,
/// Amount that was still available when the reservation was refused.
available: u64,
/// Seconds until the window resets; intended for the `Retry-After` header.
retry_after_secs: u64,
},
/// A hard balance is exhausted → `429 insufficient_quota` (no
/// `Retry-After`; the client surfaces and stops). Never `402`.
///
/// `requested` is the amount the refused reservation asked for and
/// `available` is what was left of the balance.
#[error("hard balance exhausted ({requested} requested, {available} available)")]
InsufficientQuota { requested: u64, available: u64 },
}
/// The seam between cortex's enforcement and whatever decides entitlement —
/// a local/static config provider today (#50), the helexa-upstream client
/// later (#57). All methods are async so the upstream impl can do network
/// I/O; the local impl resolves in-process.
///
/// Expected call sequence per request: `resolve` the bearer key, `reserve`
/// the maximum the request could consume, then exactly one of `settle` (with
/// the actual usage) or `release` (nothing consumed). Implementors must be
/// `Send + Sync`.
#[async_trait]
pub trait EntitlementProvider: Send + Sync {
/// Resolve a bearer API key to its principal. `Err(InvalidKey)` for an
/// unknown/empty key.
///
/// # Errors
///
/// [`AuthError::InvalidKey`] when the key cannot be resolved.
async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError>;
/// Reserve up to `max_tokens` against the principal's cap. Returns a
/// handle on success, or a [`BudgetError`] (which the caller maps to a
/// #63 `429`) if the reservation would exceed the cap. Reserving the
/// *maximum* a request could consume before dispatch is what prevents
/// overshoot under concurrency.
///
/// # Errors
///
/// [`BudgetError::RateLimited`] for an exhausted resetting window,
/// [`BudgetError::InsufficientQuota`] for an exhausted hard balance.
async fn reserve(
&self,
principal: &Principal,
max_tokens: u64,
) -> Result<Reservation, BudgetError>;
/// Settle a reservation with the tokens actually consumed, releasing the
/// unused remainder back to the cap.
///
/// Consumes the reservation; it cannot be settled or released again.
async fn settle(&self, reservation: Reservation, actual_tokens: u64);
/// Release a reservation in full — e.g. dispatch failed before any
/// tokens were consumed.
///
/// Consumes the reservation; it cannot be settled or released again.
async fn release(&self, reservation: Reservation);
/// Current budget snapshot for a principal, for metering/metrics.
/// `None` if the provider doesn't track this principal.
async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot>;
}

View File

@@ -1,257 +0,0 @@
//! The OpenAI-standard error envelope (#60) and the rejection contract
//! that rides on it (#63).
//!
//! Every non-2xx response cortex and neuron emit uses the shape
//!
//! ```json
//! { "error": { "message": "...", "type": "...", "code": "...", "param": null } }
//! ```
//!
//! because OpenAI-compatible clients (opencode, the AI SDK, litellm, the
//! OpenAI SDKs) read `error.type` / `error.code` to decide what to do —
//! most importantly `code == "context_length_exceeded"` triggers
//! auto-compaction, and a `429` with `Retry-After` makes them back off and
//! retry rather than surfacing an opaque failure. A flat `{"error":"..."}`
//! string is invisible to that logic.
//!
//! This module is the single source of truth for that envelope. It is
//! deliberately **axum-agnostic** — cortex-core is a pure types crate — so
//! it carries the response as data (`status`, `body()`, `retry_after_secs`)
//! and each HTTP crate (cortex-gateway, neuron) owns a tiny adapter that
//! turns an [`OpenAiError`] into its framework's response type, setting the
//! `Retry-After` header when present.
//!
//! Retryable conditions **must** carry `Retry-After` (per #63). The named
//! constructors below encode that: [`OpenAiError::rate_limit_exceeded`] and
//! [`OpenAiError::service_unavailable`] take a retry hint;
//! [`OpenAiError::insufficient_quota`] (hard balance, no reset) and
//! [`OpenAiError::context_length_exceeded`] / [`OpenAiError::invalid_api_key`]
//! (permanent) do not. `402 Payment Required` is banned by the contract — use
//! `429 insufficient_quota` for hard budget exhaustion.
use serde_json::{Map, Value, json};
/// A rejection rendered in the OpenAI error envelope.
///
/// Build with [`OpenAiError::new`] (or a named constructor), refine with the
/// `with_*` builders, then hand to the consuming crate's adapter to turn into
/// an HTTP response.
///
/// Only `message`, `error_type`, `code`, `param` and `extra` end up in the
/// JSON produced by [`OpenAiError::body`]. `status` and `retry_after_secs`
/// are transport metadata that the adapter must apply itself (status line and
/// `Retry-After` header).
#[derive(Debug, Clone)]
pub struct OpenAiError {
/// HTTP status code (e.g. `401`, `429`, `503`).
pub status: u16,
/// Broad OpenAI category — `"invalid_request_error"`, `"api_error"`,
/// `"rate_limit_error"`, …
pub error_type: String,
/// Specific machine-readable code clients key on (`"invalid_api_key"`,
/// `"rate_limit_exceeded"`, `"context_length_exceeded"`, …). `None`
/// renders as JSON `null`.
pub code: Option<String>,
/// Human-readable, actionable message.
pub message: String,
/// OpenAI's `param` field — the offending request parameter, if any.
/// `None` renders as JSON `null`.
pub param: Option<String>,
/// Seconds to advertise in the `Retry-After` header. Set only on
/// retryable conditions; `None` means no header.
pub retry_after_secs: Option<u64>,
/// Diagnostic fields merged *inside* the `error` object (e.g.
/// `prompt_len`, `max`, `free_mb`) so they don't break the envelope
/// shape. Clients ignore unknown keys.
pub extra: Map<String, Value>,
}
impl OpenAiError {
    /// Construct an envelope with an explicit code. For a `null` code use
    /// [`OpenAiError::without_code`].
    ///
    /// `param`, `retry_after_secs` and `extra` start out empty; set them with
    /// the `with_*` builders.
    pub fn new(
        status: u16,
        error_type: impl Into<String>,
        code: impl Into<String>,
        message: impl Into<String>,
    ) -> Self {
        Self {
            status,
            error_type: error_type.into(),
            code: Some(code.into()),
            message: message.into(),
            param: None,
            retry_after_secs: None,
            extra: Map::new(),
        }
    }

    /// Construct an envelope whose `code` is `null` (e.g. an unclassified
    /// internal error).
    pub fn without_code(
        status: u16,
        error_type: impl Into<String>,
        message: impl Into<String>,
    ) -> Self {
        Self {
            status,
            error_type: error_type.into(),
            code: None,
            message: message.into(),
            param: None,
            retry_after_secs: None,
            extra: Map::new(),
        }
    }

    /// Advertise a `Retry-After` (seconds). Use on retryable rejections.
    pub fn with_retry_after(mut self, secs: u64) -> Self {
        self.retry_after_secs = Some(secs);
        self
    }

    /// Set the OpenAI `param` field.
    pub fn with_param(mut self, param: impl Into<String>) -> Self {
        self.param = Some(param.into());
        self
    }

    /// Merge one diagnostic field into the error object. A later call with
    /// the same key replaces the earlier value.
    ///
    /// Keys that collide with the standard envelope fields (`message`,
    /// `type`, `code`, `param`) are dropped when the body is rendered; see
    /// [`OpenAiError::body`].
    pub fn with_extra(mut self, key: impl Into<String>, value: Value) -> Self {
        self.extra.insert(key.into(), value);
        self
    }

    /// Merge a bag of diagnostic fields into the error object. Keys already
    /// present are overwritten by the incoming values.
    pub fn with_extras(mut self, extras: Map<String, Value>) -> Self {
        self.extra.extend(extras);
        self
    }

    /// Render the `{ "error": { … } }` body. Field order is irrelevant to
    /// clients (they parse JSON); the standard keys come first, then any
    /// diagnostic extras.
    ///
    /// The standard keys always win: an extra whose key is `message`, `type`,
    /// `code` or `param` is skipped, so diagnostics can never corrupt the
    /// fields clients key their retry/compaction logic on. `status` and
    /// `retry_after_secs` are not part of the body.
    pub fn body(&self) -> Value {
        let mut error = Map::new();
        error.insert("message".into(), Value::String(self.message.clone()));
        error.insert("type".into(), Value::String(self.error_type.clone()));
        error.insert(
            "code".into(),
            self.code.clone().map_or(Value::Null, Value::String),
        );
        error.insert(
            "param".into(),
            self.param.clone().map_or(Value::Null, Value::String),
        );
        for (key, value) in &self.extra {
            // The four standard keys are already present at this point, so a
            // membership check is enough to protect the envelope shape.
            if !error.contains_key(key) {
                error.insert(key.clone(), value.clone());
            }
        }
        json!({ "error": Value::Object(error) })
    }

    // ── Named constructors for the #63 standard codes ──────────────────

    /// `401 invalid_api_key` — missing/invalid bearer token (#49). Permanent.
    pub fn invalid_api_key(message: impl Into<String>) -> Self {
        Self::new(401, "invalid_request_error", "invalid_api_key", message)
    }

    /// `429 rate_limit_exceeded` + `Retry-After` — transient overload,
    /// fair-share/in-flight cap, admission rejection, or a rolling budget
    /// window that resets (#52/#53/#54/#55). Clients back off and retry.
    pub fn rate_limit_exceeded(message: impl Into<String>, retry_after_secs: u64) -> Self {
        Self::new(429, "rate_limit_error", "rate_limit_exceeded", message)
            .with_retry_after(retry_after_secs)
    }

    /// `429 insufficient_quota` — hard balance exhausted, no reset (#52).
    /// No `Retry-After`; the client surfaces and stops. (Never `402`.)
    pub fn insufficient_quota(message: impl Into<String>) -> Self {
        Self::new(429, "insufficient_quota", "insufficient_quota", message)
    }

    /// `400 context_length_exceeded` — prompt exceeds the model's context
    /// window (#56/#60). Permanent for this request; opencode auto-compacts.
    pub fn context_length_exceeded(message: impl Into<String>) -> Self {
        Self::new(
            400,
            "invalid_request_error",
            "context_length_exceeded",
            message,
        )
    }

    /// `503 service_unavailable` + optional `Retry-After` — transient
    /// backend unavailability (no healthy nodes, recovery, fail-closed
    /// upstream). Retryable when a hint is given.
    pub fn service_unavailable(message: impl Into<String>, retry_after_secs: Option<u64>) -> Self {
        let mut err = Self::new(503, "api_error", "service_unavailable", message);
        // Assigned directly rather than via `with_retry_after` because the
        // hint is optional here.
        err.retry_after_secs = retry_after_secs;
        err
    }
}
// Unit tests for the envelope shape and the Retry-After rules encoded by the
// named constructors.
#[cfg(test)]
mod tests {
use super::*;
// The four standard keys are present and `param` defaults to null.
#[test]
fn body_has_standard_envelope_shape() {
let env = OpenAiError::new(429, "rate_limit_error", "rate_limit_exceeded", "slow down");
let body = env.body();
let error = body.get("error").and_then(Value::as_object).unwrap();
assert_eq!(error["message"], "slow down");
assert_eq!(error["type"], "rate_limit_error");
assert_eq!(error["code"], "rate_limit_exceeded");
assert_eq!(error["param"], Value::Null);
}
// `without_code` must render `code` as JSON null rather than omit it.
#[test]
fn without_code_renders_null_code() {
let env = OpenAiError::without_code(500, "api_error", "kaboom");
assert_eq!(env.body()["error"]["code"], Value::Null);
}
// Diagnostic extras live inside `error`, next to the standard keys.
#[test]
fn extras_ride_inside_the_error_object() {
let env = OpenAiError::context_length_exceeded("too long")
.with_extra("prompt_len", json!(60_000))
.with_extra("max", json!(49_152));
let error = &env.body()["error"];
assert_eq!(error["prompt_len"], 60_000);
assert_eq!(error["max"], 49_152);
assert_eq!(error["code"], "context_length_exceeded");
}
// A resetting window is retryable, so the hint must be carried.
#[test]
fn rolling_window_rejection_carries_retry_after() {
let env = OpenAiError::rate_limit_exceeded("budget window", 30);
assert_eq!(env.status, 429);
assert_eq!(env.retry_after_secs, Some(30));
}
// A hard balance never resets: 429 insufficient_quota without a hint.
#[test]
fn hard_balance_rejection_has_no_retry_after() {
let env = OpenAiError::insufficient_quota("out of credit");
assert_eq!(env.status, 429);
assert_eq!(env.code.as_deref(), Some("insufficient_quota"));
assert_eq!(env.retry_after_secs, None);
}
// Permanent rejections must not invite a retry.
#[test]
fn permanent_rejections_have_no_retry_after() {
assert_eq!(OpenAiError::invalid_api_key("nope").retry_after_secs, None);
assert_eq!(
OpenAiError::context_length_exceeded("too long").retry_after_secs,
None
);
}
// `service_unavailable` passes the optional hint through unchanged.
#[test]
fn service_unavailable_retry_after_is_optional() {
assert_eq!(
OpenAiError::service_unavailable("recovering", Some(5)).retry_after_secs,
Some(5)
);
assert_eq!(
OpenAiError::service_unavailable("gone", None).retry_after_secs,
None
);
}
}

View File

@@ -9,13 +9,13 @@ use async_trait::async_trait;
use serde::{Deserialize, Serialize};
/// Configuration for a harness instance on a neuron.
///
/// All current harnesses are in-process (candle); per-harness tuning
/// (cache paths, device policies, etc.) lives in dedicated config
/// blocks rather than on this struct.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HarnessConfig {
pub name: String,
/// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
pub endpoint: Option<String>,
/// Systemd unit name, if the harness is managed via systemd.
pub systemd_unit: Option<String>,
}
/// Health status of a harness process.
@@ -36,44 +36,6 @@ pub struct ModelSpec {
pub devices: Option<Vec<u32>>,
}
/// Per-model token budget advertised by the catalogue or neuron.
///
/// `context` is the hard wall (the served max-seq-len). `input` is the
/// compaction trigger — when set, opencode treats it as "usable context =
/// input - reserved". When omitted, clients fall back to `context - output`.
/// `output` is the maximum number of generation tokens.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelLimit {
/// Hard wall — served max-seq-len in tokens.
pub context: usize,
/// Compaction trigger / usable input budget. When absent clients fall
/// back to `context - output`. Left out of the serialised form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input: Option<usize>,
/// Maximum number of generation tokens.
pub output: usize,
}
/// Operator-set pricing in USD per 1M tokens.
///
/// Self-hosted deployments typically leave both at `0.0`. Cache fields are
/// optional — set when the backend supports a prefix-cache discount tier.
///
/// `input` and `output` deserialise to `0.0` when missing (`serde(default)`);
/// the cache fields are left out of the serialised form when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelCost {
/// USD per 1M input (prompt) tokens.
#[serde(default)]
pub input: f64,
/// USD per 1M output (completion) tokens.
#[serde(default)]
pub output: f64,
/// USD per 1M cache-hit tokens (optional).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_read: Option<f64>,
/// USD per 1M cache-write tokens (optional).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_write: Option<f64>,
}
/// A model as reported by a harness.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
@@ -82,54 +44,19 @@ pub struct ModelInfo {
pub status: String,
pub devices: Vec<u32>,
pub vram_used_mb: Option<u64>,
/// Modalities this loaded model supports. Today: `["text"]` for
/// text-only checkpoints, `["text", "vision"]` for vision-capable
/// ones (Stage B7). Clients like litellm / agent0 can gate
/// `image_url` submission on the advertised set.
///
/// Optional in the wire format so older clients that don't read
/// it stay compatible. Default-empty for absent/older data, which
/// callers can interpret as "text".
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub capabilities: Vec<String>,
// ── Enrichment (issue #62) ────────────────────────────────
/// Token budget advertised by the catalogue or discovered at load time.
/// `None` when neither the catalogue nor the loaded model can provide it.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub limit: Option<ModelLimit>,
/// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cost: Option<ModelCost>,
/// `true` when the model's tokenizer contains recognised tool-call
/// marker tokens (`<tool_call>` / `</tool_call>` convention).
#[serde(default)]
pub tool_call: bool,
/// `true` when the model's tokenizer contains recognised reasoning
/// marker tokens (`<think>` / `</think>` or similar).
#[serde(default)]
pub reasoning: bool,
}
/// What an inference harness must do, from neuron's perspective.
///
/// All current harnesses are in-process — they share neuron's address
/// space and lifecycle. `start`/`stop` therefore default to no-ops; a
/// future process-supervising harness would override them.
#[async_trait]
pub trait Harness: Send + Sync {
/// Human-readable name (e.g. "candle").
/// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
fn name(&self) -> &str;
/// Start the harness. Default no-op for in-process harnesses.
async fn start(&self, _config: &HarnessConfig) -> Result<()> {
Ok(())
}
/// Start the harness process if it is not already running.
async fn start(&self, config: &HarnessConfig) -> Result<()>;
/// Stop the harness. Default no-op for in-process harnesses.
async fn stop(&self) -> Result<()> {
Ok(())
}
/// Stop the harness process gracefully.
async fn stop(&self) -> Result<()>;
/// Health check. Returns the harness process status.
async fn health(&self) -> HarnessHealth;

View File

@@ -1,14 +1,9 @@
pub mod anthropic;
pub mod build_info;
pub mod catalogue;
pub mod config;
pub mod discovery;
pub mod entitlements;
pub mod error_envelope;
pub mod harness;
pub mod metrics;
pub mod node;
pub mod openai;
pub mod responses;
pub mod source;
pub mod translate;

View File

@@ -1,5 +1,3 @@
use crate::discovery::{ActivationStatus, DiscoveryResponse};
use crate::harness::{ModelCost, ModelLimit};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
@@ -8,25 +6,13 @@ use std::collections::HashMap;
#[derive(Debug, Clone)]
pub struct NodeState {
pub name: String,
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131").
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
pub endpoint: String,
pub healthy: bool,
pub models: HashMap<String, ModelEntry>,
/// Number of load/unload cycles since last process restart.
pub lifecycle_cycles: u32,
pub last_poll: Option<DateTime<Utc>>,
/// Result of the most recent successful `GET /discovery` against
/// this neuron. Cached forever once obtained — device topology is
/// invariant for a given neuron process. `None` until the first
/// successful poll. Used by the router and `/v1/models` to do
/// catalogue × topology feasibility checks.
pub discovery: Option<DiscoveryResponse>,
/// Last-seen pre-warm progress from this neuron's `/health`
/// endpoint. `None` until the first /health poll succeeds. The
/// `/v1/models` handler reads `in_progress` + `pending` from here
/// to synthesize `Loading` locations so clients see a catalogued
/// model that's mid-prewarm as "loading", not "missing".
pub activation: Option<ActivationStatus>,
}
/// A model registered on a node, with its runtime status.
@@ -38,102 +24,25 @@ pub struct ModelEntry {
pub last_accessed: Option<DateTime<Utc>>,
/// Estimated VRAM usage in MB when loaded.
pub vram_estimate_mb: Option<u64>,
/// Modalities the loaded model advertises (e.g. `["text", "vision"]`),
/// copied verbatim from the neuron's `ModelInfo.capabilities` at poll
/// time. Empty when the neuron reports none. `#[serde(default)]` keeps
/// older persisted/serialised entries deserialisable.
#[serde(default)]
pub capabilities: Vec<String>,
/// Runtime-detected capability flags from the neuron's `/models`
/// response (`ModelInfo`). `false` when the neuron predates these
/// fields or hasn't reported them yet.
#[serde(default)]
pub tool_call: bool,
#[serde(default)]
pub reasoning: bool,
/// Self-derived token budget the neuron computed for this loaded
/// model (#67), copied from `ModelInfo.limit` at poll time. `None`
/// when the neuron doesn't compute one (arch without a context
/// profile, or derivation disabled). This is the authoritative
/// source the gateway advertises — operator-declared catalogue
/// limits are no longer consulted.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub limit: Option<ModelLimit>,
}
/// Model lifecycle status.
///
/// `Loading` is a gateway-side synthetic status: neurons never emit it
/// on `/models` (that endpoint only knows about already-loaded handles).
/// The gateway populates it from a neuron's `/health` activation
/// snapshot so the unified `/v1/models` can distinguish "model is
/// catalogued but no one has it" from "model is materialising on
/// neuron N right now". Other status values are reported verbatim by
/// neurons.
///
/// Serialised as the lowercase variant name (`"loaded"`, `"unloaded"`,
/// `"reloading"`, `"loading"`, `"recovering"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ModelStatus {
Loaded,
Unloaded,
Reloading,
Loading,
/// Reported by neuron while a poisoned model auto-recovers via
/// unload→reload (#17/#20). Temporarily unservable but NOT
/// evicted: the gateway holds the route, answers with a transient
/// retry error instead of 404, and must not race a second
/// placement elsewhere.
Recovering,
}
/// Unified model entry as exposed by the gateway's `/v1/models` endpoint.
///
/// The first four fields (`id`, `object`, `created`, `owned_by`) match
/// OpenAI's `/v1/models` shape verbatim, so existing OpenAI-aware
/// tooling deserialises this without custom code. The remaining fields
/// are helexa-specific extensions — OpenAI clients ignore unknown
/// fields and other consumers can read them for placement / debugging.
/// Includes which node(s) host this model and their status.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CortexModelEntry {
pub id: String,
/// Always `"model"` per OpenAI's contract.
pub object: String,
/// Unix-second timestamp; cortex stamps this at response time.
pub created: u64,
/// OpenAI's "publisher" field — `"helexa"` for everything we serve.
pub owned_by: String,
/// True if any neuron currently has this model loaded. False for
/// catalogue entries that are feasible but not yet loaded.
pub loaded: bool,
/// Neurons whose discovered topology can satisfy this model's
/// catalogue placement constraints. Empty for models that are
/// loaded somewhere but not present in the catalogue (cortex has
/// no feasibility opinion on those).
pub feasible_on: Vec<String>,
/// Where this model is actually loaded right now. Subset of (or
/// disjoint from) `feasible_on` depending on whether the catalogue
/// covers this model.
/// Which nodes have this model (and their status).
pub locations: Vec<ModelLocation>,
/// Union of the modalities advertised by every neuron that has this
/// model loaded (e.g. `["text", "vision"]`). Empty for catalogue-only
/// entries with no loaded location — filled from catalogue profile
/// capabilities when available, then unioned with runtime-detected
/// values from loaded neurons.
#[serde(default)]
pub capabilities: Vec<String>,
// ── Enrichment (issue #62) ────────────────────────────────
/// Per-model token budget from the catalogue profile or discovered
/// at load time. `None` when neither source provides it.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub limit: Option<ModelLimit>,
/// Operator-set pricing in USD per 1M tokens (0.0 = free/self-hosted).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cost: Option<ModelCost>,
/// `true` when any neuron reports this model supports tool calls.
#[serde(default)]
pub tool_call: bool,
/// `true` when any neuron reports this model supports reasoning tokens.
#[serde(default)]
pub reasoning: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -3,7 +3,7 @@
//! These are a subset sufficient for chat completions (streaming + non-streaming).
//! Fields not relevant to proxying are captured as `serde_json::Value` via
//! `#[serde(flatten)]` so we forward them without needing to enumerate every
//! extension field a backend might support.
//! extension field mistral.rs supports.
use serde::{Deserialize, Serialize};
use serde_json::Value;
@@ -22,7 +22,7 @@ pub struct ChatCompletionRequest {
pub max_tokens: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub stream: Option<bool>,
/// All other fields (tools, response_format, backend extensions, etc.)
/// All other fields (tools, response_format, mistral.rs extensions, etc.)
#[serde(flatten)]
pub extra: Value,
}
@@ -71,18 +71,10 @@ pub struct ChatCompletionChoice {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionChunk {
#[serde(default)]
pub id: String,
#[serde(default)]
pub object: String,
#[serde(default)]
pub created: u64,
// Lenient deserialization throughout: the gateway parses chunks
// from arbitrary OpenAI-compatible upstreams, and some engines
// omit fields on special frames (e.g. usage-only final chunks).
#[serde(default)]
pub model: String,
#[serde(default)]
pub choices: Vec<ChunkChoice>,
#[serde(skip_serializing_if = "Option::is_none")]
pub usage: Option<Usage>,
@@ -106,31 +98,6 @@ pub struct Usage {
pub prompt_tokens: u64,
pub completion_tokens: u64,
pub total_tokens: u64,
/// OpenAI-standard breakdown of `completion_tokens`. Optional and
/// additive — clients that don't read it are unaffected. Carries
/// `reasoning_tokens` for reasoning models (a sub-count of
/// `completion_tokens`, never added into `total_tokens`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub completion_tokens_details: Option<CompletionTokensDetails>,
/// OpenAI-standard breakdown of `prompt_tokens`. Populated once
/// prompt caching lands (#11); `None` until then.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prompt_tokens_details: Option<PromptTokensDetails>,
}
/// Sub-counts of `Usage::completion_tokens`.
///
/// Surfaced as the optional `completion_tokens_details` object on `Usage`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionTokensDetails {
/// Tokens generated inside the model's reasoning span. A sub-count of
/// `completion_tokens`, never added into `total_tokens`.
pub reasoning_tokens: u64,
}
/// Sub-counts of `Usage::prompt_tokens`.
///
/// Surfaced as the optional `prompt_tokens_details` object on `Usage`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PromptTokensDetails {
/// Prompt tokens served from cache (cache-read rate). Populated
/// once prompt caching lands (#11).
pub cached_tokens: u64,
}
// ── Models list response ─────────────────────────────────────────────

View File

@@ -1,372 +0,0 @@
//! OpenAI Responses API (`POST /v1/responses`) envelope types.
//!
//! This is OpenAI's newer chat surface, distinct from
//! `/v1/chat/completions` in three ways that matter for us:
//!
//! 1. **Input shape**. Instead of a `messages` array, the request
//! carries `input` — either a plain string (single user turn)
//! or an array of typed items (messages, function calls,
//! function-call outputs, reasoning blocks, …).
//! 2. **Output shape**. The response carries a single `output`
//! array of items, each typed. We always emit one
//! `OutputItem::Message` containing the assistant's reply (plus,
//! when we get there, separate `function_call` items).
//! 3. **Streaming events**. Where chat completions stream
//! structurally-identical `chat.completion.chunk` frames over
//! `data:` lines, Responses streams *named* events
//! (`response.created`, `response.output_text.delta`,
//! `response.completed`, …) over `event:` + `data:` SSE pairs.
//! The wire projector in `neuron::wire::openai_responses` builds
//! these from the same [`crate::openai`]-shaped
//! `InferenceEvent` stream the chat projector consumes.
//!
//! Scope limits for this first iteration:
//!
//! - **`previous_response_id` is rejected at parse time**. Stateful
//! chained conversations need a persistence layer we don't have.
//! - **Reasoning items are accepted-and-ignored** (no Qwen3
//! `<think>` routing yet). Audio and embedded resources are
//! rejected as unsupported.
//! - **Tool calls** (function_call / function_call_output) are
//! carried as round-trip types but the candle harness doesn't
//! emit them yet — wired so the surface is in place for the
//! day we add proper tool-call extraction.
use serde::{Deserialize, Serialize};
use serde_json::Value;
// ── Request ──────────────────────────────────────────────────────────
/// Body of a `POST /v1/responses` request.
///
/// `model` and `input` are required; every other field is optional on the
/// wire. Unknown top-level keys are collected into `extra` instead of being
/// rejected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResponsesRequest {
/// Identifier of the model to run.
pub model: String,
/// The conversation input: a bare string or a list of typed items.
pub input: ResponsesInput,
/// System-prompt-style instructions. The Responses API
/// separates these from input so a caller doesn't have to
/// build a `system` message item by hand.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub instructions: Option<String>,
/// Whether the caller wants a streamed response. Deserialises to `false`
/// when absent.
#[serde(default)]
pub stream: bool,
/// Optional output-token limit requested by the caller.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_output_tokens: Option<u64>,
/// Optional sampling temperature.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub temperature: Option<f64>,
/// Optional nucleus-sampling (`top_p`) value.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub top_p: Option<f64>,
/// Chained-conversation identifier. We don't store responses
/// server-side yet; if this is `Some`, the handler returns 400.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub previous_response_id: Option<String>,
/// Catch-all for anything we don't model yet (tools, tool_choice,
/// reasoning, response_format, …). Lets a client send a
/// forward-compatible request without our parser rejecting it.
#[serde(flatten)]
pub extra: Value,
}
/// `input` is either a single string or an array of typed items.
/// `#[serde(untagged)]` so the wire shape `"input": "hi"` and
/// `"input": [{...}]` both deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ResponsesInput {
/// A plain string, i.e. a single user turn.
Text(String),
/// An array of typed items (messages, function calls, function-call
/// outputs, reasoning blocks).
Items(Vec<ResponsesInputItem>),
}
/// One typed element of a `ResponsesInput::Items` array.
///
/// Internally tagged on `type` with snake_case variant names, so the wire
/// discriminators are `"message"`, `"function_call"`,
/// `"function_call_output"` and `"reasoning"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ResponsesInputItem {
/// A user / assistant / system turn.
Message {
role: String,
content: ResponsesMessageContent,
},
/// Assistant emitted a tool call. Round-trip only — neuron
/// doesn't synthesise these yet.
FunctionCall {
call_id: String,
name: String,
arguments: String,
},
/// User is feeding a tool result back into the model.
FunctionCallOutput { call_id: String, output: String },
/// Reasoning items emitted by o-series models. Accepted but
/// not forwarded to the model — neuron's candle path doesn't
/// surface reasoning separately yet.
Reasoning {
// Kept as raw JSON values; defaults to an empty list when absent.
#[serde(default)]
content: Vec<Value>,
},
}
/// Inside a `Message` item, content is either a plain string or an
/// array of typed parts. Mirrors the chat-completions Parts shape.
///
/// Untagged: a JSON string deserialises as `Text`, a JSON array as `Parts`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ResponsesMessageContent {
/// The whole message body as one string.
Text(String),
/// The message body as a list of typed content parts.
Parts(Vec<ResponsesContentPart>),
}
/// One typed part of a message's content.
///
/// Internally tagged on `type` with snake_case variant names: `"input_text"`,
/// `"input_image"` and `"output_text"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ResponsesContentPart {
/// Plain text inside a user / system turn.
InputText { text: String },
/// An image. `image_url` is either a remote URL or a
/// `data:image/png;base64,…` URI; the request translator just
/// forwards the string.
InputImage {
image_url: String,
// Optional; left out of the serialised form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
detail: Option<String>,
},
/// Returned text inside an assistant turn — only relevant when
/// the caller is feeding an assistant turn back in to continue
/// a conversation manually (no `previous_response_id`).
OutputText {
text: String,
// Raw JSON annotations; defaults to empty and is omitted when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
annotations: Vec<Value>,
},
}
// ── Response (non-streaming) ─────────────────────────────────────────
/// Body of a `POST /v1/responses` response.
///
/// The same shape is used for the initial event of a streaming response,
/// where `status` is `"in_progress"` and `usage` is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResponsesResponse {
/// Identifier of this response.
pub id: String,
/// Always `"response"`.
pub object: String,
/// Creation timestamp. NOTE(review): presumably Unix seconds, as in the
/// OpenAI API — confirm against the code that fills it in.
pub created_at: u64,
/// `"completed"`, `"incomplete"`, or — for the initial event of
/// a streaming response — `"in_progress"`.
pub status: String,
/// The model that produced the response.
pub model: String,
/// Typed output items.
pub output: Vec<ResponsesOutputItem>,
/// Populated on completion; `None` while streaming.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub usage: Option<ResponsesUsage>,
}
/// One item of `ResponsesResponse::output`. Internally tagged on `type`
/// with snake_case variant names (`message`, `function_call`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ResponsesOutputItem {
/// A message produced by the model.
Message {
/// Identifier of this output item.
id: String,
/// Always `"assistant"` for model output.
role: String,
/// Output content parts. We always emit a single
/// `OutputText` today; multi-part output would land here
/// once we have e.g. image generation.
content: Vec<ResponsesOutputContent>,
/// Item-level status. `"in_progress"` while streaming the
/// content parts, `"completed"` when done.
///
/// Falls back to `default_item_status()` when absent on input.
#[serde(default = "default_item_status")]
status: String,
},
/// Reserved for the day tool-call extraction lands. The wire
/// shape mirrors `ResponsesInputItem::FunctionCall`.
FunctionCall {
/// Identifier of this output item (separate from `call_id`).
id: String,
call_id: String,
name: String,
arguments: String,
/// Falls back to `default_item_status()` when absent on input.
#[serde(default = "default_item_status")]
status: String,
},
}
/// Serde default for an output item's `status` field: `"completed"`.
fn default_item_status() -> String {
    String::from("completed")
}
/// One content part of an output `Message`. Internally tagged on `type`;
/// the only variant today serialises as `output_text`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ResponsesOutputContent {
/// Text generated by the model.
OutputText {
text: String,
/// Citations / inline annotations. Empty today; reserved
/// for the day we wire in web search / file search.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
annotations: Vec<Value>,
},
}
/// Token accounting attached to a `ResponsesResponse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResponsesUsage {
/// Tokens consumed by the request input.
pub input_tokens: u64,
/// Tokens generated as output.
pub output_tokens: u64,
/// Overall total as reported by the producer (presumably
/// `input_tokens + output_tokens` — confirm at the construction site).
pub total_tokens: u64,
/// OpenAI-standard breakdown of `output_tokens`. Optional and
/// additive. Carries `reasoning_tokens` for reasoning models (a
/// sub-count of `output_tokens`, never added into `total_tokens`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_tokens_details: Option<OutputTokensDetails>,
/// OpenAI-standard breakdown of `input_tokens`. Populated once
/// prompt caching lands (#11); `None` until then.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input_tokens_details: Option<InputTokensDetails>,
}
/// Sub-counts of `ResponsesUsage::output_tokens`.
///
/// Serialises as an object with the single field `reasoning_tokens`; the
/// field has no serde default, so it is required on input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputTokensDetails {
/// Tokens generated inside the model's reasoning span.
pub reasoning_tokens: u64,
}
/// Sub-counts of `ResponsesUsage::input_tokens`.
///
/// Serialises as an object with the single field `cached_tokens`; the
/// field has no serde default, so it is required on input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InputTokensDetails {
/// Input tokens served from cache (cache-read rate). Populated
/// once prompt caching lands (#11).
pub cached_tokens: u64,
}
// ── Streaming event names ────────────────────────────────────────────
/// Event names the SSE projector emits, hoisted as constants so
/// the projector and the wire shape stay in sync without
/// string-typos. The strings are dictated by OpenAI's published
/// Responses API.
pub mod events {
/// Event name `response.created`.
pub const CREATED: &str = "response.created";
/// Fired between `response.created` and the first output-item
/// event. Marks "request validated, model is generating" —
/// some clients use it to differentiate the "warming up" state
/// from "streaming tokens" in their UI.
pub const IN_PROGRESS: &str = "response.in_progress";
/// Event name `response.output_item.added`.
pub const OUTPUT_ITEM_ADDED: &str = "response.output_item.added";
/// Event name `response.content_part.added`.
pub const CONTENT_PART_ADDED: &str = "response.content_part.added";
/// Event name `response.output_text.delta`.
pub const OUTPUT_TEXT_DELTA: &str = "response.output_text.delta";
/// Event name `response.output_text.done`.
pub const OUTPUT_TEXT_DONE: &str = "response.output_text.done";
/// Event name `response.content_part.done`.
pub const CONTENT_PART_DONE: &str = "response.content_part.done";
/// Event name `response.output_item.done`.
pub const OUTPUT_ITEM_DONE: &str = "response.output_item.done";
/// Event name `response.completed`.
pub const COMPLETED: &str = "response.completed";
}
// Serde wire-shape tests for the Responses request/response types.
#[cfg(test)]
mod tests {
use super::*;
// `input` given as a bare JSON string must deserialise to `ResponsesInput::Text`.
#[test]
fn deserialises_input_string_form() {
let raw = r#"{"model": "m", "input": "hello"}"#;
let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
match req.input {
ResponsesInput::Text(s) => assert_eq!(s, "hello"),
other => panic!("expected Text, got {other:?}"),
}
}
// `input` given as an array must deserialise to `ResponsesInput::Items`, and a
// message whose `content` is a string must land in `ResponsesMessageContent::Text`.
#[test]
fn deserialises_input_items_form() {
let raw = r#"{
"model": "m",
"input": [
{"type": "message", "role": "user", "content": "hi"}
]
}"#;
let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
match req.input {
ResponsesInput::Items(items) => {
assert_eq!(items.len(), 1);
match &items[0] {
ResponsesInputItem::Message { role, content } => {
assert_eq!(role, "user");
match content {
ResponsesMessageContent::Text(t) => assert_eq!(t, "hi"),
other => panic!("expected Text content, got {other:?}"),
}
}
other => panic!("expected Message item, got {other:?}"),
}
}
other => panic!("expected Items, got {other:?}"),
}
}
// A message whose `content` is an array must deserialise to typed parts,
// preserving order and the image's data URI verbatim.
#[test]
fn deserialises_input_with_image() {
let raw = r#"{
"model": "m",
"input": [
{"type": "message", "role": "user", "content": [
{"type": "input_text", "text": "what is this"},
{"type": "input_image", "image_url": "data:image/png;base64,AAA="}
]}
]
}"#;
let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
let items = match req.input {
ResponsesInput::Items(i) => i,
other => panic!("expected Items, got {other:?}"),
};
let parts = match &items[0] {
ResponsesInputItem::Message {
content: ResponsesMessageContent::Parts(p),
..
} => p,
other => panic!("expected Parts, got {other:?}"),
};
assert_eq!(parts.len(), 2);
assert!(matches!(
&parts[0],
ResponsesContentPart::InputText { text } if text == "what is this"
));
assert!(matches!(
&parts[1],
ResponsesContentPart::InputImage { image_url, .. }
if image_url == "data:image/png;base64,AAA="
));
}
// Request fields exercised here (`tools`, `reasoning`) must be retrievable
// from `req.extra` after deserialisation.
#[test]
fn unknown_fields_round_trip_via_extra() {
let raw = r#"{
"model": "m",
"input": "hi",
"tools": [{"type": "web_search"}],
"reasoning": {"effort": "medium"}
}"#;
let req: ResponsesRequest = serde_json::from_str(raw).unwrap();
assert!(req.extra.get("tools").is_some());
assert!(req.extra.get("reasoning").is_some());
}
// Serialise a fully populated response and parse it back; only `id` and the
// number of output items are checked.
#[test]
fn response_round_trips_through_serde() {
let r = ResponsesResponse {
id: "resp_1".into(),
object: "response".into(),
created_at: 1700,
status: "completed".into(),
model: "m".into(),
output: vec![ResponsesOutputItem::Message {
id: "msg_1".into(),
role: "assistant".into(),
content: vec![ResponsesOutputContent::OutputText {
text: "hi there".into(),
annotations: vec![],
}],
status: "completed".into(),
}],
usage: Some(ResponsesUsage {
input_tokens: 5,
output_tokens: 3,
total_tokens: 8,
output_tokens_details: None,
input_tokens_details: None,
}),
};
let json = serde_json::to_string(&r).unwrap();
let parsed: ResponsesResponse = serde_json::from_str(&json).unwrap();
assert_eq!(parsed.id, "resp_1");
assert_eq!(parsed.output.len(), 1);
}
}

View File

@@ -1,267 +0,0 @@
//! Scheme-qualified model identifiers.
//!
//! cortex/neuron historically resolves every model id through hf-hub
//! against `https://huggingface.co`. Helexa is adding an EU-hosted
//! registry (`registry.helexa.ai`) alongside HF — both speak the same
//! HF-compatible wire format, but the bytes, jurisdiction, and trust
//! root differ. Model ids therefore need a scheme:
//!
//! - `huggingface:Qwen/Qwen3.6-27B` — HF-hosted bytes
//! - `helexa:Qwen/Qwen3.6-27B-Uncensored` — helexa registry bytes
//! - `helexa:SomeOperator/CustomFinetune` — operator publishing
//! under the helexa namespace; same scheme handles all `org/name`
//! pairs hosted in that registry.
//!
//! Bare `org/name` parses with an empty scheme; the caller (typically
//! a harness) substitutes its configured default scheme so existing
//! configs keep working through the transition.
use serde::{Deserialize, Serialize};
use std::fmt;
use std::str::FromStr;
/// Parsed `scheme:org/name`. Bare `org/name` produces an empty scheme
/// — call `with_default_scheme` (or check `is_scheme_unset`) to
/// resolve before using.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ModelSourceId {
/// Registry scheme (e.g. `huggingface`, `helexa`); the empty string
/// when the id was given without a `scheme:` prefix.
pub scheme: String,
/// Part of the repository path before the first `/`.
pub org: String,
/// Part of the repository path after the first `/`.
pub name: String,
}
/// Errors from `ModelSourceId::from_str`. Carries the offending input
/// so log lines / API errors can echo what the operator typed.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum ParseError {
/// The input was the empty string.
#[error("empty model id")]
Empty,
/// No `/` was found after the optional `scheme:` prefix.
#[error("model id '{0}' is missing the '/' between org and name")]
MissingSlash(String),
/// The input starts with `:` — nothing precedes the first colon.
#[error("model id '{0}' has an empty scheme before ':'")]
EmptyScheme(String),
/// Nothing sits between the scheme prefix (if any) and the first `/`.
#[error("model id '{0}' has an empty org")]
EmptyOrg(String),
/// Nothing follows the first `/`.
#[error("model id '{0}' has an empty name")]
EmptyName(String),
/// The text before the first `:` contains a `/`.
#[error("model id '{0}' has a scheme containing '/' which is reserved for org/name")]
SchemeContainsSlash(String),
/// A further `:` appears in the name after the scheme prefix was split off.
#[error("model id '{0}' has a name containing ':' which is reserved for the scheme prefix")]
NameContainsColon(String),
}
impl ModelSourceId {
    /// Assemble an id from three already-validated components. No checks
    /// are performed here; untrusted text should go through `FromStr`.
    pub fn new(scheme: impl Into<String>, org: impl Into<String>, name: impl Into<String>) -> Self {
        let scheme = scheme.into();
        let org = org.into();
        let name = name.into();
        Self { scheme, org, name }
    }

    /// Whether the scheme is still empty, i.e. the id came from a bare
    /// `org/name` and no default has been substituted yet.
    pub fn is_scheme_unset(&self) -> bool {
        self.scheme.is_empty()
    }

    /// Fill in `default` when no scheme is set; an explicit scheme is left
    /// untouched. Consumes and returns the id so it chains after `parse`:
    /// `id.parse::<ModelSourceId>()?.with_default_scheme("huggingface")`.
    pub fn with_default_scheme(self, default: &str) -> Self {
        if self.is_scheme_unset() {
            Self {
                scheme: default.to_owned(),
                ..self
            }
        } else {
            self
        }
    }

    /// The scheme-less `org/name` pair — the form an hf-hub
    /// `Api::model(...)` call takes whichever endpoint is in use.
    pub fn repo_path(&self) -> String {
        [self.org.as_str(), self.name.as_str()].join("/")
    }
}
impl fmt::Display for ModelSourceId {
    /// Render the compact textual form: `scheme:org/name`, or just
    /// `org/name` when the scheme is empty.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { scheme, org, name } = self;
        match scheme.as_str() {
            "" => write!(f, "{}/{}", org, name),
            prefix => write!(f, "{}:{}/{}", prefix, org, name),
        }
    }
}
impl FromStr for ModelSourceId {
    type Err = ParseError;

    /// Parse `scheme:org/name` or bare `org/name`.
    ///
    /// The text before the first `:` (if any) is the scheme; the remainder
    /// is split at its first `/` into org and name. A bare id yields an
    /// empty scheme.
    ///
    /// # Errors
    ///
    /// Every error other than `Empty` carries the full original input.
    /// Checks run in this order:
    /// - `Empty` — `s` is the empty string.
    /// - `EmptyScheme` — `s` starts with `:`.
    /// - `SchemeContainsSlash` — the text before the first `:` contains `/`.
    /// - `MissingSlash` — no `/` after the scheme prefix.
    /// - `EmptyOrg` / `EmptyName` — one side of the `/` is empty.
    /// - `NameContainsColon` — a further `:` appears anywhere in the
    ///   `org/name` part. The variant is reused for a colon in the org so
    ///   that the public error enum does not change.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.is_empty() {
            return Err(ParseError::Empty);
        }

        // Scheme split. Only the *first* colon counts — any later colon
        // falls into org/name and is rejected below, because `:` is
        // reserved for the scheme prefix.
        let (scheme, rest) = match s.split_once(':') {
            Some(("", _)) => return Err(ParseError::EmptyScheme(s.to_string())),
            Some((scheme, _)) if scheme.contains('/') => {
                return Err(ParseError::SchemeContainsSlash(s.to_string()));
            }
            Some((scheme, rest)) => (scheme, rest),
            None => ("", s),
        };

        let (org, name) = rest
            .split_once('/')
            .ok_or_else(|| ParseError::MissingSlash(s.to_string()))?;
        if org.is_empty() {
            return Err(ParseError::EmptyOrg(s.to_string()));
        }
        if name.is_empty() {
            return Err(ParseError::EmptyName(s.to_string()));
        }
        // Previously only `name` was checked, so `hf:a:b/c` parsed with
        // org `a:b` despite `:` being reserved. Check both halves.
        if org.contains(':') || name.contains(':') {
            return Err(ParseError::NameContainsColon(s.to_string()));
        }

        Ok(Self {
            scheme: scheme.to_string(),
            org: org.to_string(),
            name: name.to_string(),
        })
    }
}
// Parser, Display and serde tests for `ModelSourceId`.
#[cfg(test)]
mod tests {
use super::*;
// A fully qualified id splits into scheme / org / name.
#[test]
fn parses_qualified() {
let id: ModelSourceId = "huggingface:Qwen/Qwen3.6-27B".parse().unwrap();
assert_eq!(id.scheme, "huggingface");
assert_eq!(id.org, "Qwen");
assert_eq!(id.name, "Qwen3.6-27B");
assert_eq!(id.repo_path(), "Qwen/Qwen3.6-27B");
assert!(!id.is_scheme_unset());
}
// The `helexa` scheme is parsed like any other scheme.
#[test]
fn parses_helexa_scheme() {
let id: ModelSourceId = "helexa:SomeOperator/Qwen3.6-27B-Uncensored"
.parse()
.unwrap();
assert_eq!(id.scheme, "helexa");
assert_eq!(id.org, "SomeOperator");
assert_eq!(id.name, "Qwen3.6-27B-Uncensored");
}
// A bare `org/name` parses successfully and leaves the scheme empty.
#[test]
fn parses_bare_id_with_empty_scheme() {
let id: ModelSourceId = "Qwen/Qwen3-30B-A3B-Instruct".parse().unwrap();
assert_eq!(id.scheme, "");
assert_eq!(id.org, "Qwen");
assert_eq!(id.name, "Qwen3-30B-A3B-Instruct");
assert!(id.is_scheme_unset());
}
// `with_default_scheme` fills an empty scheme but never replaces an explicit one.
#[test]
fn substitutes_default_scheme_only_when_unset() {
let id: ModelSourceId = "Qwen/Q3".parse().unwrap();
assert_eq!(id.with_default_scheme("huggingface").scheme, "huggingface");
let id: ModelSourceId = "helexa:Qwen/Q3".parse().unwrap();
assert_eq!(
id.with_default_scheme("huggingface").scheme,
"helexa",
"default substitution must not override an explicit scheme"
);
}
// parse → Display reproduces a qualified id exactly.
#[test]
fn display_roundtrips_qualified_id() {
let s = "helexa:Helexa/Qwen3.6-27B";
let id: ModelSourceId = s.parse().unwrap();
assert_eq!(id.to_string(), s);
}
// parse → Display reproduces a bare id exactly (no leading `:`).
#[test]
fn display_roundtrips_bare_id() {
let s = "Qwen/Q3";
let id: ModelSourceId = s.parse().unwrap();
assert_eq!(id.to_string(), s);
}
#[test]
fn rejects_empty() {
assert_eq!("".parse::<ModelSourceId>().unwrap_err(), ParseError::Empty);
}
// A missing `/` is reported with the full original input, with or without a scheme.
#[test]
fn rejects_missing_slash() {
match "Qwen".parse::<ModelSourceId>().unwrap_err() {
ParseError::MissingSlash(s) => assert_eq!(s, "Qwen"),
other => panic!("expected MissingSlash, got {other:?}"),
}
match "huggingface:Qwen".parse::<ModelSourceId>().unwrap_err() {
ParseError::MissingSlash(s) => assert_eq!(s, "huggingface:Qwen"),
other => panic!("expected MissingSlash, got {other:?}"),
}
}
// A leading `:` means an empty scheme, which is an error rather than a bare id.
#[test]
fn rejects_empty_scheme() {
match ":Qwen/Q3".parse::<ModelSourceId>().unwrap_err() {
ParseError::EmptyScheme(s) => assert_eq!(s, ":Qwen/Q3"),
other => panic!("expected EmptyScheme, got {other:?}"),
}
}
// A `/` before the first `:` is rejected.
#[test]
fn rejects_scheme_with_slash() {
match "hugg/ingface:Q/N".parse::<ModelSourceId>().unwrap_err() {
ParseError::SchemeContainsSlash(s) => assert_eq!(s, "hugg/ingface:Q/N"),
other => panic!("expected SchemeContainsSlash, got {other:?}"),
}
}
// Either side of the `/` being empty is its own error variant.
#[test]
fn rejects_empty_org_or_name() {
match "huggingface:/N".parse::<ModelSourceId>().unwrap_err() {
ParseError::EmptyOrg(_) => {}
other => panic!("expected EmptyOrg, got {other:?}"),
}
match "huggingface:Q/".parse::<ModelSourceId>().unwrap_err() {
ParseError::EmptyName(_) => {}
other => panic!("expected EmptyName, got {other:?}"),
}
}
// A second `:` inside the name is rejected.
#[test]
fn rejects_name_with_colon() {
match "huggingface:Q/N:weird"
.parse::<ModelSourceId>()
.unwrap_err()
{
ParseError::NameContainsColon(s) => assert_eq!(s, "huggingface:Q/N:weird"),
other => panic!("expected NameContainsColon, got {other:?}"),
}
}
#[test]
fn serde_roundtrips_via_struct() {
// We serialize as a struct (scheme/org/name fields) so the
// shape is self-describing in API payloads. Callers that want
// the compact `scheme:org/name` string use `Display`/`FromStr`.
let id = ModelSourceId::new("helexa", "Helexa", "Qwen3.6-27B");
let json = serde_json::to_string(&id).unwrap();
let back: ModelSourceId = serde_json::from_str(&json).unwrap();
assert_eq!(back, id);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,6 @@ license.workspace = true
[dependencies]
cortex-core.workspace = true
async-trait.workspace = true
tokio.workspace = true
axum.workspace = true
tower.workspace = true
@@ -25,7 +24,6 @@ tokio-stream.workspace = true
eventsource-stream.workspace = true
bytes = "1"
urlencoding = "2"
url = "2"
[dev-dependencies]
tokio = { workspace = true, features = ["test-util"] }

View File

@@ -1,235 +0,0 @@
//! Streaming Anthropic SSE translation (#24).
//!
//! The `/v1/messages` handler translates the request envelope to
//! OpenAI before proxying (see `cortex_core::translate`); this module
//! completes the round trip for `stream: true` — the upstream OpenAI
//! SSE stream is re-framed, event by event, into Anthropic's
//! `message_start` / `content_block_*` / `message_delta` /
//! `message_stop` sequence as it arrives. True streaming: each
//! upstream chunk is translated and forwarded immediately; nothing is
//! buffered beyond the current SSE event's bytes.
//!
//! The translation state machine itself is pure and lives in
//! [`cortex_core::translate::AnthropicStreamTranslator`]; this module
//! owns the wire concerns — splitting the upstream byte stream into
//! SSE events, parsing `data:` payloads, and framing the translated
//! events as `event: <name>\ndata: <json>\n\n`.
use axum::body::Body;
use axum::http::StatusCode;
use axum::response::Response;
use bytes::Bytes;
use cortex_core::openai::ChatCompletionChunk;
use cortex_core::translate::AnthropicStreamTranslator;
use futures::StreamExt;
use tokio_stream::wrappers::ReceiverStream;
/// Forward the translated OpenAI request to the upstream node and
/// return the response translated to Anthropic SSE framing.
///
/// POSTs `openai_body` to `{endpoint}/v1/chat/completions`, copying the
/// cortex-stamped principal headers from `inbound_headers`. On success a
/// background task reads the upstream SSE stream, feeds each `data:`
/// payload through an [`AnthropicStreamTranslator`], and pushes the
/// translated frames to the client through a bounded channel.
///
/// `model_id` and `node_name` are used for logging only.
///
/// `usage_sink`, when present, is invoked exactly once with the
/// `(prompt_tokens, completion_tokens)` observed on the stream. It is
/// invoked with `(0, 0)` when no usage frame was seen, and also when the
/// upstream request fails or returns non-2xx before any streaming starts,
/// so the metering reservation behind the sink is resolved on every path.
/// NOTE(review): this assumes `UsageSink` is a call-once settle callback
/// with no drop-time fallback — confirm against `crate::metering`.
///
/// Returns an Anthropic-shaped JSON error (`502` on transport failure,
/// the upstream status on non-2xx), otherwise a `200 text/event-stream`
/// response.
pub async fn stream_translated(
    client: &reqwest::Client,
    endpoint: &str,
    openai_body: axum::body::Bytes,
    model_id: &str,
    node_name: &str,
    inbound_headers: &axum::http::HeaderMap,
    usage_sink: Option<crate::metering::UsageSink>,
) -> Response {
    let url = format!("{endpoint}/v1/chat/completions");
    tracing::info!(
        handler = "anthropic_messages",
        model = %model_id,
        node = %node_name,
        url = %url,
        "proxying streaming request (anthropic SSE translation)"
    );
    let request = crate::auth::forward_principal_headers(
        client
            .post(&url)
            .header("content-type", "application/json")
            .body(openai_body),
        inbound_headers,
    );
    let upstream = match request.send().await {
        Ok(r) => r,
        Err(e) => {
            tracing::warn!(
                handler = "anthropic_messages",
                node = %node_name,
                url = %url,
                error = %e,
                "anthropic stream: upstream request failed"
            );
            // Nothing was generated: settle with zero usage so the
            // reservation is not left outstanding.
            if let Some(sink) = usage_sink {
                sink(0, 0);
            }
            return anthropic_error(StatusCode::BAD_GATEWAY, "upstream request failed");
        }
    };
    let status = upstream.status();
    if !status.is_success() {
        tracing::warn!(
            handler = "anthropic_messages",
            node = %node_name,
            url = %url,
            status = status.as_u16(),
            "anthropic stream: upstream returned non-2xx"
        );
        // Same as above: the pump never starts, so settle here.
        if let Some(sink) = usage_sink {
            sink(0, 0);
        }
        return anthropic_error(
            StatusCode::from_u16(status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY),
            "upstream returned an error",
        );
    }
    // Bounded channel: a slow client back-pressures the pump task,
    // which back-pressures the upstream read — same propagation
    // discipline as neuron's own projectors.
    let (tx, rx) = tokio::sync::mpsc::channel::<Result<Bytes, std::convert::Infallible>>(32);
    let node = node_name.to_string();
    let model = model_id.to_string();
    tokio::spawn(async move {
        let mut upstream = upstream.bytes_stream();
        let mut translator = AnthropicStreamTranslator::new();
        let mut buf: Vec<u8> = Vec::new();
        let mut done = false;
        // Wire-debug accounting for the stream summary emitted at the
        // end: did the model emit a structured tool call, what was the
        // final finish_reason, and how many upstream frames did we see.
        let mut saw_tool_call = false;
        let mut last_finish: Option<String> = None;
        let mut frames = 0u64;
        // Engine-truth usage for metering (#51), scanned from the upstream
        // frames (neuron emits a final `usage` object on the stream, #48).
        let mut usage_prompt = 0u64;
        let mut usage_completion = 0u64;
        'outer: while let Some(block) = upstream.next().await {
            let block = match block {
                Ok(b) => b,
                Err(e) => {
                    tracing::warn!(node = %node, error = %e, "anthropic stream: upstream read failed mid-stream");
                    break;
                }
            };
            buf.extend_from_slice(&block);
            // SSE events are separated by a blank line. `pos + 2` is the
            // end of the boundary, so the drained bytes are exactly one
            // complete event.
            while let Some(pos) = find_event_boundary(&buf) {
                let event: Vec<u8> = buf.drain(..pos + 2).collect();
                let text = String::from_utf8_lossy(&event);
                for line in text.lines() {
                    // Only `data:` lines carry payloads; other SSE fields
                    // are ignored.
                    let Some(data) = line.strip_prefix("data:") else {
                        continue;
                    };
                    let data = data.trim();
                    if data == "[DONE]" {
                        done = true;
                        if !send_frames(&tx, translator.finish()).await {
                            break 'outer;
                        }
                        continue;
                    }
                    tracing::trace!(node = %node, frame = %data, "anthropic stream: upstream frame");
                    // Capture usage for metering before translation — the
                    // usage object rides on a late frame (often after the
                    // last content delta).
                    if let Some(p) = crate::proxy::last_count_for(data, "prompt_tokens") {
                        usage_prompt = p;
                    }
                    if let Some(c) = crate::proxy::last_count_for(data, "completion_tokens") {
                        usage_completion = c;
                    }
                    let Ok(chunk) = serde_json::from_str::<ChatCompletionChunk>(data) else {
                        tracing::debug!(node = %node, "anthropic stream: unparsable upstream frame skipped");
                        continue;
                    };
                    frames += 1;
                    if chunk
                        .choices
                        .iter()
                        .any(|c| c.delta.get("tool_calls").is_some())
                    {
                        saw_tool_call = true;
                    }
                    if let Some(fr) = chunk.choices.iter().find_map(|c| c.finish_reason.clone()) {
                        last_finish = Some(fr);
                    }
                    // `false` means the receiver is gone (client
                    // disconnected): stop reading upstream.
                    if !send_frames(&tx, translator.on_chunk(&chunk)).await {
                        break 'outer;
                    }
                }
            }
        }
        // Upstream ended without [DONE] (error or truncation): still
        // close the Anthropic event sequence so clients aren't left
        // with an unterminated message.
        if !done {
            let _ = send_frames(&tx, translator.finish()).await;
        }
        // Stream summary: the streaming counterpart to the non-streaming
        // handler's "upstream response" line. `upstream_tool_calls =
        // false` on a tools-bearing request is the fingerprint of the
        // model improvising an unparsed tool-call format.
        tracing::debug!(
            wire = "anthropic",
            model = %model,
            node = %node,
            frames,
            upstream_tool_calls = saw_tool_call,
            finish_reason = ?last_finish,
            terminated = done,
            "anthropic stream complete"
        );
        // Settle metering with the observed usage (#51). Runs on every exit
        // path of the pump — clean end, early break, or upstream error — so
        // the reservation is always resolved. `(0, 0)` when no usage frame
        // was seen, which releases without recording spend.
        if let Some(sink) = usage_sink {
            sink(usage_prompt, usage_completion);
        }
    });
    Response::builder()
        .status(StatusCode::OK)
        .header("content-type", "text/event-stream")
        .header("cache-control", "no-cache")
        .body(Body::from_stream(ReceiverStream::new(rx)))
        .unwrap_or_else(|_| {
            anthropic_error(
                StatusCode::INTERNAL_SERVER_ERROR,
                "failed to build response",
            )
        })
}
/// Locate the blank line that ends the first complete SSE event in `buf`.
///
/// Returns an index `pos` such that `buf[..pos + 2]` is exactly the first
/// event including its terminating blank line, or `None` when no complete
/// event is buffered yet.
///
/// Two framings are recognised:
/// - LF-only (`\n\n`): `pos` is the index of the first `\n`.
/// - CRLF (`\r\n\r\n`, matched through its `\n\r\n` tail): `pos` is the
///   index of the second `\r`.
///
/// Without the CRLF case a CRLF-framed upstream would never produce a
/// boundary and the caller's buffer would grow without bound. Bare-CR
/// framing is not handled.
fn find_event_boundary(buf: &[u8]) -> Option<usize> {
    (0..buf.len()).find_map(|i| match &buf[i..] {
        [b'\n', b'\n', ..] => Some(i),
        [b'\n', b'\r', b'\n', ..] => Some(i + 1),
        _ => None,
    })
}
/// Serialise each `(event name, payload)` pair as one SSE frame
/// (`event: <name>\ndata: <payload>\n\n`) and push it down `tx` in order.
///
/// Stops at the first failed send and returns `false` — the receiver has
/// been dropped, i.e. the client went away. Returns `true` once every
/// frame was accepted (including when `events` is empty).
async fn send_frames(
    tx: &tokio::sync::mpsc::Sender<Result<Bytes, std::convert::Infallible>>,
    events: Vec<(String, serde_json::Value)>,
) -> bool {
    for (name, payload) in events {
        let frame = Bytes::from(format!("event: {name}\ndata: {payload}\n\n"));
        if let Err(_closed) = tx.send(Ok(frame)).await {
            return false;
        }
    }
    true
}
/// Build a JSON error response in Anthropic's envelope
/// (`{"type":"error","error":{"type":"api_error","message":…}}`) with the
/// given HTTP `status`.
///
/// # Panics
///
/// Panics if the response builder rejects its inputs. The status is
/// already a valid `StatusCode` and the header is a static literal.
fn anthropic_error(status: StatusCode, message: &str) -> Response {
    let payload = serde_json::json!({
        "type": "error",
        "error": { "type": "api_error", "message": message }
    })
    .to_string();
    let builder = Response::builder()
        .status(status)
        .header("content-type", "application/json");
    builder
        .body(Body::from(payload))
        .expect("static error response must build")
}

View File

@@ -1,119 +0,0 @@
//! API-key authentication + principal resolution (#49).
//!
//! Identity rides standard bearer auth only — `Authorization: Bearer <key>`
//! — which is what keeps every tier OpenAI-compatible by construction (no
//! custom required headers or body fields, per #47). The middleware resolves
//! the key to a [`Principal`] via the [`EntitlementProvider`], carries it in
//! the request extensions for cortex-side metering/enforcement (#51/#52), and
//! stamps it as internal headers on the request so it reaches neuron, which
//! trusts cortex's assertion over WireGuard (#54).
//!
//! Anti-spoofing: any client-supplied principal header is **stripped** before
//! the authoritative value is stamped, so a client can never assert a
//! principal it didn't authenticate as.
//!
//! Rejection contract (#63): missing key under `require_auth`, or any present
//! but unresolvable key, yields `401 invalid_api_key` in the #60 envelope.
use crate::error::envelope_response;
use crate::state::CortexState;
use axum::extract::{Request, State};
use axum::http::header::AUTHORIZATION;
use axum::http::{HeaderMap, HeaderValue};
use axum::middleware::Next;
use axum::response::Response;
use cortex_core::entitlements::{HEADER_ACCOUNT_ID, HEADER_KEY_ID};
use cortex_core::error_envelope::OpenAiError;
use std::sync::Arc;
/// Whether `path` is one of the unauthenticated probe endpoints
/// (`/health` or `/`). The match is exact; every other path goes through
/// key resolution.
fn is_public(path: &str) -> bool {
    matches!(path, "/health" | "/")
}
/// Pull the bearer token out of the `Authorization` header.
///
/// Returns `None` when the header is absent, is not valid visible-ASCII
/// text, contains no space separating scheme from credentials, uses a
/// scheme other than `bearer` (compared case-insensitively, per RFC 7235),
/// or carries only whitespace after the scheme. The returned token has
/// surrounding whitespace trimmed.
fn parse_bearer(headers: &HeaderMap) -> Option<String> {
    let value = headers.get(AUTHORIZATION)?.to_str().ok()?;
    let (scheme, credentials) = value.split_once(' ')?;
    if !scheme.eq_ignore_ascii_case("bearer") {
        return None;
    }
    match credentials.trim() {
        "" => None,
        token => Some(token.to_string()),
    }
}
/// Axum middleware that authenticates a request by its bearer key. Wired
/// in `build_app` via `from_fn_with_state`.
///
/// Flow:
/// 1. Public paths (see `is_public`) pass straight through.
/// 2. Any client-supplied principal headers are removed (anti-spoofing).
/// 3. No bearer token: reject with 401 when `fleet.require_auth` is set,
///    otherwise forward the request without a principal.
/// 4. Bearer token that does not resolve: always 401, even when anonymous
///    access is allowed.
/// 5. Resolved: stamp the principal headers for the downstream node,
///    attach the typed `Principal` to the request extensions for
///    cortex-side metering (#51) and budget enforcement (#52), and forward.
pub async fn require_principal(
    State(fleet): State<Arc<CortexState>>,
    mut req: Request,
    next: Next,
) -> Response {
    if is_public(req.uri().path()) {
        return next.run(req).await;
    }

    // Anti-spoof: whatever the client sent under these names is discarded
    // before anything authoritative is written.
    let inbound = req.headers_mut();
    inbound.remove(HEADER_ACCOUNT_ID);
    inbound.remove(HEADER_KEY_ID);

    let Some(key) = parse_bearer(req.headers()) else {
        return if fleet.require_auth {
            unauthorized("missing API key; supply 'Authorization: Bearer <key>'")
        } else {
            next.run(req).await
        };
    };

    // A present-but-invalid credential is always an error, even when
    // anonymous access is otherwise allowed.
    let Ok(principal) = fleet.entitlements.resolve(&key).await else {
        return unauthorized("invalid API key");
    };

    // Account/key ids come from operator config, so they should be valid
    // header values. If either is not, skip stamping rather than panic.
    let account = HeaderValue::from_str(&principal.account_id);
    let key_id = HeaderValue::from_str(&principal.key_id);
    if let (Ok(account), Ok(key_id)) = (account, key_id) {
        let outbound = req.headers_mut();
        outbound.insert(HEADER_ACCOUNT_ID, account);
        outbound.insert(HEADER_KEY_ID, key_id);
    }

    req.extensions_mut().insert(principal);
    next.run(req).await
}
/// `401 invalid_api_key` in the standard envelope (#63).
fn unauthorized(message: &str) -> Response {
envelope_response(OpenAiError::invalid_api_key(message))
}
/// Copy whichever of the principal headers (`HEADER_ACCOUNT_ID`,
/// `HEADER_KEY_ID`) are present in `headers` onto `builder` and return it.
/// Headers that are absent are skipped.
///
/// Used by the Anthropic proxy paths, which construct their own upstream
/// requests instead of going through [`crate::proxy::forward_request`]
/// (which forwards all headers verbatim).
pub fn forward_principal_headers(
    builder: reqwest::RequestBuilder,
    headers: &HeaderMap,
) -> reqwest::RequestBuilder {
    [HEADER_ACCOUNT_ID, HEADER_KEY_ID]
        .into_iter()
        .fold(builder, |outbound, name| match headers.get(name) {
            Some(value) => outbound.header(name, value),
            None => outbound,
        })
}

View File

@@ -1,317 +0,0 @@
//! The local/static [`EntitlementProvider`] (#50).
//!
//! Accounts, keys, and hard caps come from operator config
//! ([`cortex_core::config::EntitlementsConfig`]); reservations and settled
//! spend are tracked in-process. This lands auth + per-key caps + the
//! amplification fix before any upstream clearing house exists; the future
//! helexa-upstream client (#57) implements the same trait.
//!
//! Budget math is serialized under a single [`std::sync::Mutex`] so
//! reserve/settle/release are atomic — a key's `spent + reserved` can never
//! exceed its hard cap even under concurrent requests (the #52 guarantee).
//! The lock is held only for the in-memory arithmetic, never across an
//! await.
use cortex_core::config::{ApiKeyConfig, EntitlementsConfig};
use cortex_core::entitlements::{
AuthError, BudgetError, BudgetSnapshot, CapWindow, EntitlementProvider, Principal, Reservation,
};
use std::collections::HashMap;
use std::sync::Mutex;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;
/// Per-key budget configuration (resolved from [`ApiKeyConfig`]).
struct Budget {
/// Maximum tokens the key may have spent plus reserved; `None` means
/// the key is uncapped.
hard_cap: Option<u64>,
/// Whether the cap is a one-off balance or resets on a rolling window.
window: CapWindow,
}
/// Live, mutable accounting for one key over its current window.
///
/// `Default` yields zero spend, zero reservations and no window start,
/// which is the state a key's ledger is created in on its first reserve.
#[derive(Default)]
struct Ledger {
/// Settled spend in the current window.
spent: u64,
/// Sum of outstanding (un-settled) reservations.
reserved: u64,
/// Start of the current rolling window; `None` until the first reserve.
/// Unused for [`CapWindow::Balance`].
window_start: Option<Instant>,
}
/// In-process entitlement provider: a static key → principal table and
/// per-key budgets taken from config, plus mutable ledgers guarded by a
/// single mutex.
pub struct LocalEntitlementProvider {
/// Bearer token → principal.
keys: HashMap<String, Principal>,
/// `key_id` → budget config.
budgets: HashMap<String, Budget>,
/// `key_id` → live ledger.
ledgers: Mutex<HashMap<String, Ledger>>,
/// Monotonic source of opaque reservation handles.
next_id: AtomicU64,
}
impl LocalEntitlementProvider {
    /// Construct the provider from the `[entitlements]` config section.
    ///
    /// Each configured key contributes one token → principal mapping and
    /// one budget entry keyed by `key_id`. When a key has no explicit
    /// `key_id`, its `account_id` is used instead, so such keys are
    /// tracked at account granularity and the secret itself never serves
    /// as a label. If several entries resolve to the same `key_id`, the
    /// budget of the last one wins. Ledgers start empty and reservation
    /// ids start at 1.
    pub fn from_config(config: &EntitlementsConfig) -> Self {
        let mut keys = HashMap::new();
        let mut budgets = HashMap::new();

        for entry in &config.keys {
            let key_id = match &entry.key_id {
                Some(explicit) => explicit.clone(),
                None => entry.account_id.clone(),
            };
            let principal = Principal {
                account_id: entry.account_id.clone(),
                key_id: key_id.clone(),
            };
            let budget = Budget {
                hard_cap: entry.hard_cap,
                window: entry.window.clone(),
            };
            keys.insert(entry.key.clone(), principal);
            budgets.insert(key_id, budget);
        }

        Self {
            keys,
            budgets,
            ledgers: Mutex::new(HashMap::new()),
            next_id: AtomicU64::new(1),
        }
    }
}
/// Headroom left under `cap` once `spent` and `reserved` are deducted.
///
/// Returns `None` for an unlimited (`None`) cap. The subtraction
/// saturates, so an over-committed ledger yields `Some(0)`.
fn available(cap: Option<u64>, spent: u64, reserved: u64) -> Option<u64> {
    let limit = cap?;
    let after_spend = limit.saturating_sub(spent);
    Some(after_spend.saturating_sub(reserved))
}
#[async_trait::async_trait]
impl EntitlementProvider for LocalEntitlementProvider {
    /// Look the bearer token up in the configured key table.
    ///
    /// # Errors
    ///
    /// `AuthError::InvalidKey` when the token is not configured.
    async fn resolve(&self, api_key: &str) -> Result<Principal, AuthError> {
        self.keys.get(api_key).cloned().ok_or(AuthError::InvalidKey)
    }

    /// Reserve `max_tokens` against the principal's budget.
    ///
    /// For a rolling-window budget the window is started on the first
    /// reserve and reset lazily (settled spend back to zero) once it has
    /// fully elapsed. Outstanding reservations carry across a reset.
    ///
    /// # Errors
    ///
    /// When the key is capped and `max_tokens` exceeds what is left after
    /// spend and outstanding reservations:
    /// - `BudgetError::RateLimited` for rolling windows, with
    ///   `retry_after_secs` set to the time left in the window (minimum 1);
    /// - `BudgetError::InsufficientQuota` for balance budgets.
    ///
    /// # Panics
    ///
    /// Panics if the ledger mutex is poisoned.
    async fn reserve(
        &self,
        principal: &Principal,
        max_tokens: u64,
    ) -> Result<Reservation, BudgetError> {
        // A principal with no configured budget (or an uncapped one) always
        // reserves; we still track spend for metrics.
        let (cap, window) = match self.budgets.get(&principal.key_id) {
            Some(b) => (b.hard_cap, b.window.clone()),
            None => (None, CapWindow::Balance),
        };
        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
        let ledger = ledgers.entry(principal.key_id.clone()).or_default();

        // Lazily reset a rolling window that has elapsed before checking.
        let mut retry_after_secs = 0;
        if let CapWindow::Rolling { seconds } = window {
            let now = Instant::now();
            let elapsed = ledger
                .window_start
                .map(|start| now.duration_since(start).as_secs());
            match elapsed {
                Some(elapsed) if elapsed < seconds => {
                    retry_after_secs = seconds - elapsed;
                }
                _ => {
                    // First reserve, or the window has fully elapsed: reset.
                    ledger.spent = 0;
                    ledger.window_start = Some(now);
                    retry_after_secs = seconds;
                }
            }
        }

        if let Some(avail) = available(cap, ledger.spent, ledger.reserved)
            && max_tokens > avail
        {
            return Err(match window {
                CapWindow::Rolling { .. } => BudgetError::RateLimited {
                    requested: max_tokens,
                    available: avail,
                    // At least 1s so clients don't hot-loop on a sub-second
                    // remainder.
                    retry_after_secs: retry_after_secs.max(1),
                },
                CapWindow::Balance => BudgetError::InsufficientQuota {
                    requested: max_tokens,
                    available: avail,
                },
            });
        }

        // Saturate rather than overflow. On an uncapped key nothing bounds
        // the running total, and an overflow panic here would poison the
        // ledger mutex for every later request.
        ledger.reserved = ledger.reserved.saturating_add(max_tokens);
        Ok(Reservation {
            id: self.next_id.fetch_add(1, Ordering::Relaxed),
            principal: principal.clone(),
            reserved: max_tokens,
        })
    }

    /// Resolve a reservation: drop its hold and record `actual_tokens` as
    /// settled spend. Does nothing when the key has no ledger.
    ///
    /// # Panics
    ///
    /// Panics if the ledger mutex is poisoned.
    async fn settle(&self, reservation: Reservation, actual_tokens: u64) {
        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
            // Saturating for the same reason as in `reserve`.
            ledger.spent = ledger.spent.saturating_add(actual_tokens);
        }
    }

    /// Drop a reservation's hold without recording any spend. Does nothing
    /// when the key has no ledger.
    ///
    /// # Panics
    ///
    /// Panics if the ledger mutex is poisoned.
    async fn release(&self, reservation: Reservation) {
        let mut ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
        if let Some(ledger) = ledgers.get_mut(&reservation.principal.key_id) {
            ledger.reserved = ledger.reserved.saturating_sub(reservation.reserved);
        }
    }

    /// Current cap, settled spend and outstanding reservations for the
    /// principal. Always returns `Some`; a key without a ledger reports
    /// zero spend and zero reserved, and a key without a budget reports
    /// no cap.
    ///
    /// # Panics
    ///
    /// Panics if the ledger mutex is poisoned.
    async fn snapshot(&self, principal: &Principal) -> Option<BudgetSnapshot> {
        let ledgers = self.ledgers.lock().expect("ledger mutex poisoned");
        let (spent, reserved) = ledgers
            .get(&principal.key_id)
            .map(|l| (l.spent, l.reserved))
            .unwrap_or((0, 0));
        let hard_cap = self.budgets.get(&principal.key_id).and_then(|b| b.hard_cap);
        Some(BudgetSnapshot {
            hard_cap,
            spent,
            reserved,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: a provider configured with three keys — one with a
    /// 1 000-token balance cap, one with a 500-token cap on a one-hour
    /// rolling window, and one with no cap at all.
    fn provider() -> LocalEntitlementProvider {
        let balance_key = ApiKeyConfig {
            key: "sk-balance".into(),
            account_id: "acct-a".into(),
            key_id: Some("key-balance".into()),
            hard_cap: Some(1_000),
            window: CapWindow::Balance,
        };
        let rolling_key = ApiKeyConfig {
            key: "sk-rolling".into(),
            account_id: "acct-b".into(),
            key_id: Some("key-rolling".into()),
            hard_cap: Some(500),
            window: CapWindow::Rolling { seconds: 3_600 },
        };
        let infra_key = ApiKeyConfig {
            key: "sk-infra".into(),
            account_id: "operator".into(),
            key_id: Some("key-infra".into()),
            hard_cap: None,
            window: CapWindow::Balance,
        };
        let config = EntitlementsConfig {
            require_auth: true,
            keys: vec![balance_key, rolling_key, infra_key],
        };
        LocalEntitlementProvider::from_config(&config)
    }

    /// A configured secret resolves to its account and key ids.
    #[tokio::test]
    async fn resolves_configured_key_to_principal() {
        let entitlements = provider();
        let who = entitlements
            .resolve("sk-balance")
            .await
            .expect("known key resolves");
        assert_eq!(who.account_id, "acct-a");
        assert_eq!(who.key_id, "key-balance");
    }

    /// A secret that is not in the config is rejected as an invalid key.
    #[tokio::test]
    async fn unknown_key_is_invalid() {
        let entitlements = provider();
        let outcome = entitlements.resolve("sk-nope").await;
        assert!(matches!(outcome, Err(AuthError::InvalidKey)));
    }

    /// Walks a reservation through reserve → settle, then a second one
    /// through reserve → release, checking the snapshot after each step.
    #[tokio::test]
    async fn reserve_settle_release_round_trip() {
        let entitlements = provider();
        let who = entitlements.resolve("sk-balance").await.unwrap();

        // After reserving: the amount is held but nothing is spent yet.
        let held = entitlements.reserve(&who, 400).await.expect("within cap");
        let after_reserve = entitlements.snapshot(&who).await.unwrap();
        assert_eq!(after_reserve.hard_cap, Some(1_000));
        assert_eq!(after_reserve.reserved, 400);
        assert_eq!(after_reserve.spent, 0);

        // Settling below the reserved amount frees the hold and records
        // exactly the tokens actually used.
        entitlements.settle(held, 250).await;
        let after_settle = entitlements.snapshot(&who).await.unwrap();
        assert_eq!(after_settle.reserved, 0);
        assert_eq!(after_settle.spent, 250);

        // Releasing a reservation leaves the spend untouched.
        let abandoned = entitlements.reserve(&who, 100).await.unwrap();
        entitlements.release(abandoned).await;
        let after_release = entitlements.snapshot(&who).await.unwrap();
        assert_eq!(after_release.reserved, 0);
        assert_eq!(after_release.spent, 250);
    }

    /// Exceeding a balance cap reports `InsufficientQuota` with the
    /// requested amount and what was still available.
    #[tokio::test]
    async fn balance_over_cap_is_insufficient_quota() {
        let entitlements = provider();
        let who = entitlements.resolve("sk-balance").await.unwrap();
        // Hold 900 of the 1 000 cap so only 100 remain, then ask for 200.
        let _held = entitlements.reserve(&who, 900).await.unwrap();
        let other = entitlements
            .reserve(&who, 200)
            .await
            .expect_err("over cap");
        if let BudgetError::InsufficientQuota {
            requested,
            available,
        } = &other
        {
            assert_eq!(*requested, 200);
            assert_eq!(*available, 100);
        } else {
            panic!("expected InsufficientQuota, got {other:?}");
        }
    }

    /// Exceeding a rolling-window cap reports `RateLimited` with a retry
    /// hint between one second and the window length.
    #[tokio::test]
    async fn rolling_over_cap_is_rate_limited_with_retry_after() {
        let entitlements = provider();
        let who = entitlements.resolve("sk-rolling").await.unwrap();
        let _held = entitlements.reserve(&who, 500).await.unwrap();
        let other = entitlements.reserve(&who, 1).await.expect_err("over cap");
        if let BudgetError::RateLimited {
            retry_after_secs, ..
        } = &other
        {
            assert!(*retry_after_secs >= 1, "must advertise a retry hint");
            assert!(*retry_after_secs <= 3_600);
        } else {
            panic!("expected RateLimited, got {other:?}");
        }
    }

    /// A key with no hard cap accepts an arbitrarily large reservation and
    /// still records the spend.
    #[tokio::test]
    async fn uncapped_infra_key_never_refuses() {
        let entitlements = provider();
        let who = entitlements.resolve("sk-infra").await.unwrap();
        let held = entitlements
            .reserve(&who, 10_000_000)
            .await
            .expect("uncapped");
        entitlements.settle(held, 10_000_000).await;
        let totals = entitlements.snapshot(&who).await.unwrap();
        assert_eq!(totals.hard_cap, None);
        assert_eq!(totals.spent, 10_000_000);
    }
}

View File

@@ -1,24 +0,0 @@
//! Gateway adapter that turns the shared, axum-agnostic
//! [`cortex_core::error_envelope::OpenAiError`] into an axum [`Response`],
//! setting the `Retry-After` header when the envelope carries one.
//!
//! cortex-core owns the envelope shape and the rejection contract (#60/#63);
//! this is the only place the gateway crosses from that data into axum.
use axum::http::{HeaderValue, StatusCode, header};
use axum::response::{IntoResponse, Json, Response};
use cortex_core::error_envelope::OpenAiError;
/// Convert an [`OpenAiError`] into an axum [`Response`].
///
/// The response carries the envelope's status (falling back to 500 when the
/// stored code is not a valid HTTP status), its JSON body, and a
/// `Retry-After` header when the envelope has a retry hint.
pub fn envelope_response(err: OpenAiError) -> Response {
    // Copy the hint out before `err` is used to build the body.
    let retry_hint = err.retry_after_secs;
    let status = StatusCode::from_u16(err.status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
    let mut response = (status, Json(err.body())).into_response();
    // A hint that cannot be rendered as a header value is silently dropped.
    let retry_header = retry_hint.and_then(|secs| HeaderValue::from_str(&secs.to_string()).ok());
    if let Some(value) = retry_header {
        response.headers_mut().insert(header::RETRY_AFTER, value);
    }
    response
}

View File

@@ -11,8 +11,6 @@ use axum::http::HeaderMap;
use axum::response::{IntoResponse, Json, Response};
use axum::routing::{get, post};
use chrono::Utc;
use cortex_core::error_envelope::OpenAiError;
use cortex_core::harness::ModelLimit;
use cortex_core::node::{CortexModelEntry, ModelLocation};
use serde_json::{Value, json};
use std::sync::Arc;
@@ -22,7 +20,6 @@ pub fn api_routes() -> Router<Arc<CortexState>> {
Router::new()
.route("/v1/chat/completions", post(chat_completions))
.route("/v1/completions", post(completions))
.route("/v1/responses", post(responses))
.route("/v1/models", get(list_models))
.route("/v1/messages", post(anthropic_messages))
.route("/health", get(health))
@@ -35,104 +32,25 @@ async fn chat_completions(
headers: HeaderMap,
body: Bytes,
) -> Response {
log_inbound("openai-chat", "/v1/chat/completions", &body);
let model_id = match extract_model(&body) {
Some(m) => m,
None => {
tracing::warn!(
handler = "chat_completions",
"rejected: missing 'model' field in request body"
);
return error_response(
400,
"invalid_request_error",
"missing_model_field",
"missing 'model' field in request body",
);
}
None => return error_response(400, "missing 'model' field in request body"),
};
let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
handler = "chat_completions",
model = %model_id,
error = %e,
"route resolve failed"
);
return route_error_response(&e);
}
Err(e) => return error_response(404, &e.to_string()),
};
touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
touch_model(&fleet, &route.node_name, &model_id).await;
let body = rewrite_model_in_body(body, &route.resolved_model_id);
proxy_with_metrics(
&fleet,
&route,
"/v1/chat/completions",
headers,
body,
&route.resolved_model_id,
)
.await
}
/// `POST /v1/responses` — proxy to the appropriate backend node.
///
/// Same routing shape as [`chat_completions`]: extract `model` from
/// the body, resolve to a node, forward verbatim. No translation —
/// neuron speaks the Responses API natively (see
/// `crates/neuron/src/wire/openai_responses.rs`), so the gateway is
/// a pass-through. Streaming and non-streaming are handled
/// identically; the upstream `Content-Type` (text/event-stream vs.
/// application/json) propagates through the proxy.
async fn responses(
State(fleet): State<Arc<CortexState>>,
headers: HeaderMap,
body: Bytes,
) -> Response {
log_inbound("openai-responses", "/v1/responses", &body);
let model_id = match extract_model(&body) {
Some(m) => m,
None => {
tracing::warn!(
handler = "responses",
"rejected: missing 'model' field in request body"
);
return error_response(
400,
"invalid_request_error",
"missing_model_field",
"missing 'model' field in request body",
);
}
};
let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
handler = "responses",
model = %model_id,
error = %e,
"route resolve failed"
);
return route_error_response(&e);
}
};
touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
let body = rewrite_model_in_body(body, &route.resolved_model_id);
proxy_with_metrics(
&fleet,
&route,
"/v1/responses",
headers,
body,
&route.resolved_model_id,
&model_id,
)
.await
}
@@ -143,48 +61,19 @@ async fn completions(
headers: HeaderMap,
body: Bytes,
) -> Response {
log_inbound("openai-completions", "/v1/completions", &body);
let model_id = match extract_model(&body) {
Some(m) => m,
None => {
tracing::warn!(
handler = "completions",
"rejected: missing 'model' field in request body"
);
return error_response(
400,
"invalid_request_error",
"missing_model_field",
"missing 'model' field in request body",
);
}
None => return error_response(400, "missing 'model' field in request body"),
};
let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
handler = "completions",
model = %model_id,
error = %e,
"route resolve failed"
);
return route_error_response(&e);
}
Err(e) => return error_response(404, &e.to_string()),
};
touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
touch_model(&fleet, &route.node_name, &model_id).await;
let body = rewrite_model_in_body(body, &route.resolved_model_id);
proxy_with_metrics(
&fleet,
&route,
"/v1/completions",
headers,
body,
&route.resolved_model_id,
)
.await
proxy_with_metrics(&fleet, &route, "/v1/completions", headers, body, &model_id).await
}
/// `POST /v1/messages` — accept Anthropic format, translate, proxy, translate back.
@@ -196,108 +85,28 @@ async fn anthropic_messages(
// Parse as Anthropic request.
let anth_req: cortex_core::anthropic::MessagesRequest = match serde_json::from_slice(&body) {
Ok(r) => r,
Err(e) => {
tracing::warn!(
handler = "anthropic_messages",
error = %e,
"rejected: invalid Anthropic request body"
);
return error_response(
400,
"invalid_request_error",
"invalid_anthropic_body",
"invalid Anthropic request body",
);
}
Err(e) => return error_response(400, &format!("invalid Anthropic request: {e}")),
};
let model_id = anth_req.model.clone();
let is_streaming = anth_req.stream.unwrap_or(false);
// Wire-debug: make the exercised path and request shape concrete
// rather than guesswork. `tool_history` flags whether the client is
// continuing a tool conversation (tool_use/tool_result blocks in the
// message history) vs. opening a fresh one. Full bodies ride at
// trace! (cortex/neuron ship at info; operator infra runs at debug).
if tracing::enabled!(tracing::Level::DEBUG) {
let n_tools = anth_req
.extra
.get("tools")
.and_then(Value::as_array)
.map(|a| a.len())
.unwrap_or(0);
let tool_history = anth_req
.messages
.iter()
.any(|m| anthropic_message_has_tool_blocks(&m.content));
tracing::debug!(
wire = "anthropic",
endpoint = "/v1/messages",
model = %model_id,
stream = is_streaming,
messages = anth_req.messages.len(),
tools = n_tools,
tool_history,
system = anth_req.system.is_some(),
"inbound request"
);
}
tracing::trace!(wire = "anthropic", body = %body_preview(&body), "inbound anthropic body");
// Translate to OpenAI format.
let openai_req = cortex_core::translate::anthropic_to_openai(anth_req);
let openai_body = match serde_json::to_vec(&openai_req) {
Ok(b) => Bytes::from(b),
Err(e) => {
tracing::error!(
handler = "anthropic_messages",
model = %model_id,
error = %e,
"internal: failed to serialise translated OpenAI request"
);
return error_response(
500,
"api_error",
"internal_translation_error",
"internal translation error",
);
}
Err(e) => return error_response(500, &format!("translation error: {e}")),
};
let route = match router::resolve(&fleet, &model_id).await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
error = %e,
"route resolve failed"
);
// RouteError's Display strings are short and informative
// ("model 'X' not found...", "no healthy nodes available")
// — fine to surface to the caller. The warn above carries
// any extra context for operators.
return route_error_response(&e);
}
Err(e) => return error_response(404, &e.to_string()),
};
touch_model(&fleet, &route.node_name, &route.resolved_model_id).await;
// Swap the alias for the concrete id in the translated body so
// neuron's harness sees a model name that matches what it has
// loaded.
let openai_body = rewrite_model_in_body(openai_body, &route.resolved_model_id);
// The translated body is what neuron actually sees — the reshaped
// OpenAI-form tools live here. Tracing it makes "did the tool
// definitions survive translation?" a log line, not a guess.
tracing::trace!(
wire = "anthropic",
body = %body_preview(&openai_body),
"translated openai body (sent upstream)"
);
touch_model(&fleet, &route.node_name, &model_id).await;
let labels = [
("model", route.resolved_model_id.clone()),
("model", model_id.clone()),
("node", route.node_name.clone()),
];
metrics::counter!("cortex_requests_total", &labels).increment(1);
@@ -306,127 +115,57 @@ async fn anthropic_messages(
}
let start = Instant::now();
// Per-request metering (#51), same lifecycle as the OpenAI paths:
// reserve (0 tokens this phase) and build the completion sink. Consumed
// by whichever branch runs below; dropping it unused releases the
// reservation.
let usage_sink = match crate::metering::principal_from_headers(&headers) {
Some(principal) => {
let guard = crate::metering::ReservationGuard::reserve(
Arc::clone(&fleet.entitlements),
&principal,
0,
)
.await;
Some(crate::metering::usage_sink(principal, guard))
}
None => None,
};
if is_streaming {
// Anthropic SSE translation (#24): upstream speaks OpenAI SSE;
// re-frame it event-by-event into Anthropic's message_start /
// content_block_* / message_delta / message_stop sequence.
let resp = crate::anthropic_sse::stream_translated(
// TODO: streaming Anthropic translation requires converting SSE format.
// For now, proxy the OpenAI SSE stream directly (clients that can handle
// OpenAI SSE will work; full Anthropic SSE translation is a follow-up).
let result = proxy::forward_request(
&fleet.http_client,
&route.endpoint,
&route,
"/v1/chat/completions",
headers,
openai_body,
&model_id,
&route.node_name,
&headers,
usage_sink,
)
.await;
metrics::histogram!("cortex_request_duration_seconds", &labels)
.record(start.elapsed().as_secs_f64());
if !resp.status().is_success() {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
match result {
Ok(resp) => resp,
Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
e.into_response()
}
}
resp
} else {
// Non-streaming: proxy, buffer full response, translate back to Anthropic.
let target_url = format!("{}/v1/chat/completions", route.endpoint);
tracing::info!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
cold_start = route.cold_start,
"proxying request"
);
let upstream_resp = crate::auth::forward_principal_headers(
fleet
.http_client
.post(&target_url)
.body(openai_body)
.header("content-type", "application/json"),
&headers,
)
.send()
.await;
let upstream_resp = fleet
.http_client
.post(format!("{}/v1/chat/completions", route.endpoint))
.body(openai_body)
.header("content-type", "application/json")
.send()
.await;
let upstream_resp = match upstream_resp {
Ok(r) => r,
Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
error = %e,
"upstream request failed (network)"
);
return error_response(
502,
"api_error",
"upstream_connection_error",
"upstream request failed",
);
return error_response(502, &format!("upstream request failed: {e}"));
}
};
let upstream_status = upstream_resp.status();
if !upstream_status.is_success() {
if !upstream_resp.status().is_success() {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
let status = upstream_status.as_u16();
let status = upstream_resp.status().as_u16();
let body = upstream_resp.text().await.unwrap_or_default();
let body_snippet = body.chars().take(512).collect::<String>();
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
status,
body = %body_snippet,
"upstream returned non-2xx"
);
return error_response(
status,
"api_error",
"upstream_error",
&format!("upstream returned {status}"),
);
return error_response(status, &format!("upstream error: {body}"));
}
let body_bytes = match upstream_resp.bytes().await {
Ok(b) => b,
Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
error = %e,
"failed to read upstream response body"
);
return error_response(
502,
"api_error",
"upstream_connection_error",
"failed to read upstream response",
);
return error_response(502, &format!("failed to read upstream response: {e}"));
}
};
@@ -435,151 +174,23 @@ async fn anthropic_messages(
Ok(r) => r,
Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
let body_snippet = String::from_utf8_lossy(&body_bytes)
.chars()
.take(512)
.collect::<String>();
tracing::warn!(
handler = "anthropic_messages",
model = %model_id,
node = %route.node_name,
url = %target_url,
error = %e,
body = %body_snippet,
"failed to parse upstream response as OpenAI ChatCompletionResponse"
);
return error_response(
502,
"api_error",
"upstream_malformed_response",
"malformed upstream response",
);
return error_response(502, &format!("failed to parse upstream response: {e}"));
}
};
metrics::histogram!("cortex_request_duration_seconds", &labels)
.record(start.elapsed().as_secs_f64());
// Settle metering with the upstream usage (#51). Scanned from the
// raw body — same engine-truth source as the streaming path — so we
// don't depend on the typed usage struct's optionality.
if let Some(sink) = usage_sink {
let tail = String::from_utf8_lossy(&body_bytes);
let prompt = proxy::last_count_for(&tail, "prompt_tokens").unwrap_or(0);
let completion = proxy::last_count_for(&tail, "completion_tokens").unwrap_or(0);
sink(prompt, completion);
}
// Did the model actually produce a structured tool call, or just
// text? This is the single most useful signal for "is tool
// calling working end-to-end" — a `false` here alongside a
// request that carried tools means the model improvised an
// unparsed format (the original failure mode).
let upstream_tool_calls = openai_resp.choices.iter().any(|c| {
c.message
.extra
.get("tool_calls")
.and_then(Value::as_array)
.map(|a| !a.is_empty())
.unwrap_or(false)
});
let finish_reason = openai_resp
.choices
.first()
.and_then(|c| c.finish_reason.clone());
tracing::debug!(
wire = "anthropic",
model = %model_id,
node = %route.node_name,
upstream_tool_calls,
finish_reason = ?finish_reason,
"upstream non-streaming response"
);
let anthropic_resp = cortex_core::translate::openai_to_anthropic(openai_resp);
Json(json!(anthropic_resp)).into_response()
}
}
/// Merge the limits two neurons report for the same model (#67) by keeping
/// the one with the smaller `context`, so the advertised limit never
/// exceeds what the most-constrained deployment can take.
///
/// When only one side is present it is returned unchanged; when neither is
/// present the result is `None`. On equal `context` values `a` is kept.
fn tightest_limit(a: Option<ModelLimit>, b: Option<ModelLimit>) -> Option<ModelLimit> {
    match (a, b) {
        (Some(first), Some(second)) => {
            // Strict `<` so the first argument wins ties.
            if second.context < first.context {
                Some(second)
            } else {
                Some(first)
            }
        }
        (lone, None) | (None, lone) => lone,
    }
}
/// `GET /v1/models` — union of (catalogue × topology feasibility) and
/// (currently loaded somewhere). The result is what the fleet *could*
/// serve, not just what's already loaded — so OpenAI-compatible tools
/// see every model the operator has provisioned, and cortex
/// transparently cold-loads the first time one is requested.
/// `GET /v1/models` — aggregate models from all nodes.
async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
use std::collections::HashMap;
let now = Utc::now().timestamp() as u64;
let nodes = fleet.nodes.read().await;
let catalogue = &fleet.catalogue;
let mut model_map: std::collections::HashMap<String, CortexModelEntry> =
std::collections::HashMap::new();
let mut entries: HashMap<String, CortexModelEntry> = HashMap::new();
// Pass 1: catalogue × topology. For every catalogue profile, find
// healthy neurons whose discovered devices satisfy the profile.
// Catalogue-defined models surface here even if nothing has loaded
// them yet — that's the point of the unified endpoint.
for profile in &catalogue.models {
let mut feasible_on = Vec::new();
for node in nodes.values() {
if !node.healthy {
continue;
}
let Some(disc) = node.discovery.as_ref() else {
continue;
};
if profile.is_feasible_on(&node.name, &disc.devices) {
feasible_on.push(node.name.clone());
}
}
if feasible_on.is_empty() {
// The catalogue lists this model but no neuron's topology
// matches — surface it as not-loaded with no feasible
// location. Hides nothing; lets operators see why a
// configured model isn't reachable.
feasible_on.clear();
}
entries.insert(
profile.id.clone(),
CortexModelEntry {
id: profile.id.clone(),
object: "model".into(),
created: now,
owned_by: "helexa".into(),
loaded: false,
feasible_on,
locations: Vec::new(),
// Start with catalogue-declared capabilities; Pass 2 unions
// runtime-detected ones from loaded neurons.
capabilities: profile.capabilities.clone(),
// `limit` is no longer operator-declared (#67): the neuron
// self-derives it from live VRAM + throughput and reports it
// per loaded model — Pass 2 fills it from the neuron's
// ModelEntry. A catalogue `limit`, if present, is ignored
// (it can't track hot-swapped models or live capacity).
// `cost` stays operator-set and flows from the catalogue.
limit: None,
cost: profile.cost.clone(),
// Runtime-detected — will be OR-ed in Pass 2 from neuron data.
tool_call: false,
reasoning: false,
},
);
}
// Pass 2: layer the actually-loaded state on top. For each
// (node, model) entry, attach a ModelLocation. If the model isn't
// in the catalogue, create a new CortexModelEntry from scratch —
// cortex doesn't refuse to surface a manually-loaded model just
// because the operator didn't enumerate it in models.toml.
for node in nodes.values() {
for (model_id, entry) in &node.models {
let location = ModelLocation {
@@ -587,142 +198,19 @@ async fn list_models(State(fleet): State<Arc<CortexState>>) -> Json<Value> {
status: entry.status,
vram_estimate_mb: entry.vram_estimate_mb,
};
let was_loaded = matches!(entry.status, cortex_core::node::ModelStatus::Loaded);
entries
model_map
.entry(model_id.clone())
.and_modify(|e| {
e.locations.push(location.clone());
if was_loaded {
e.loaded = true;
}
// Union the per-node capabilities so a model loaded
// on several neurons reports every modality any of
// them advertises.
for cap in &entry.capabilities {
if !e.capabilities.contains(cap) {
e.capabilities.push(cap.clone());
}
}
// OR-in runtime-detected capability flags from the neuron.
e.tool_call = e.tool_call || entry.tool_call;
e.reasoning = e.reasoning || entry.reasoning;
// Adopt the neuron's self-derived limit (#67). When a
// model is loaded on several neurons with different
// headroom, advertise the tightest (smallest context)
// so a client never overflows the most-constrained
// deployment that might serve it.
e.limit = tightest_limit(e.limit.take(), entry.limit.clone());
})
.and_modify(|e| e.locations.push(location.clone()))
.or_insert_with(|| CortexModelEntry {
id: model_id.clone(),
object: "model".into(),
created: now,
owned_by: "helexa".into(),
loaded: was_loaded,
// Not in catalogue — cortex has no opinion on
// feasibility; leave empty.
feasible_on: Vec::new(),
locations: vec![location],
capabilities: entry.capabilities.clone(),
limit: entry.limit.clone(),
cost: None,
tool_call: entry.tool_call,
reasoning: entry.reasoning,
});
}
}
// Pass 3: surface pre-warming models. Each neuron's `/health`
// activation snapshot (polled separately from /models) reports
// `in_progress` (the model currently materialising) and `pending`
// (queued behind it). Neither appears on the neuron's `/models`
// yet — that endpoint only knows about fully-loaded handles — so
// without this pass a client polling `/v1/models` during pre-warm
// sees Qwen3.6-27B with no location and concludes "not there".
// Synthesising a Loading location instead tells clients the model
// is on its way. Idempotent against Pass 2: if a Loading location
// for this node already exists (shouldn't, but be safe) we skip.
for node in nodes.values() {
let Some(activation) = node.activation.as_ref() else {
continue;
};
let mut loading_ids: Vec<&str> = Vec::new();
if let Some(id) = activation.in_progress.as_deref() {
loading_ids.push(id);
}
for id in &activation.pending {
loading_ids.push(id.as_str());
}
for model_id in loading_ids {
let location = ModelLocation {
node: node.name.clone(),
status: cortex_core::node::ModelStatus::Loading,
vram_estimate_mb: None,
};
entries
.entry(model_id.to_string())
.and_modify(|e| {
let already = e.locations.iter().any(|l| {
l.node == node.name && l.status == cortex_core::node::ModelStatus::Loading
});
if !already {
e.locations.push(location.clone());
}
})
.or_insert_with(|| CortexModelEntry {
id: model_id.to_string(),
object: "model".into(),
created: now,
owned_by: "helexa".into(),
loaded: false,
feasible_on: Vec::new(),
locations: vec![location],
// A model that's only mid-prewarm has no loaded
// location to read capabilities from yet.
capabilities: Vec::new(),
limit: None,
cost: None,
tool_call: false,
reasoning: false,
});
}
}
let data: Vec<Value> = model_map.values().map(|e| json!(e)).collect();
// Pass 4: surface aliases as their own entries pointing at the
// same locations as the target id, so a client browsing /v1/models
// sees "helexa/small" / "helexa/balanced" / "helexa/large" (or
// whatever the operator defined) and can request inference
// against them directly. Aliases that point at unknown targets
// are skipped — surfacing a dead alias would be misleading.
for (alias, target) in &catalogue.aliases {
let Some(target_entry) = entries.get(target).cloned() else {
tracing::warn!(
alias = alias,
target = target,
"alias points at a model not present in catalogue or fleet; skipping"
);
continue;
};
entries.insert(
alias.clone(),
CortexModelEntry {
id: alias.clone(),
object: "model".into(),
created: now,
owned_by: "helexa".into(),
loaded: target_entry.loaded,
feasible_on: target_entry.feasible_on,
locations: target_entry.locations,
capabilities: target_entry.capabilities,
limit: target_entry.limit.clone(),
cost: target_entry.cost.clone(),
tool_call: target_entry.tool_call,
reasoning: target_entry.reasoning,
},
);
}
let data: Vec<Value> = entries.values().map(|e| json!(e)).collect();
Json(json!({
"object": "list",
"data": data,
@@ -765,35 +253,8 @@ async fn proxy_with_metrics(
metrics::counter!("cortex_cold_starts_total", &labels).increment(1);
}
// Per-request metering (#51): reconstruct the principal from the
// middleware-stamped headers, reserve (0 tokens this phase — metering
// only; #52 makes it the real cap), and build the completion sink that
// settles spend when the response finishes. Anonymous requests get no
// sink. Must happen before `headers`/`body` are moved into the proxy.
let usage_sink = match crate::metering::principal_from_headers(&headers) {
Some(principal) => {
let guard = crate::metering::ReservationGuard::reserve(
Arc::clone(&fleet.entitlements),
&principal,
0,
)
.await;
Some(crate::metering::usage_sink(principal, guard))
}
None => None,
};
let start = Instant::now();
let result = proxy::forward_request(
&fleet.http_client,
route,
path,
headers,
body,
model_id,
usage_sink,
)
.await;
let result = proxy::forward_request(&fleet.http_client, route, path, headers, body).await;
let duration = start.elapsed();
match result {
@@ -804,9 +265,6 @@ async fn proxy_with_metrics(
}
Err(e) => {
metrics::counter!("cortex_request_errors_total", &labels).increment(1);
// proxy::forward_request already warn'd with wire-level
// detail (target URL, error, status). ProxyError::into_response
// now returns a generic message — no body leak.
e.into_response()
}
}
@@ -827,99 +285,14 @@ fn extract_model(body: &[u8]) -> Option<String> {
v.get("model")?.as_str().map(|s| s.to_string())
}
/// Log a wire-debug summary of an inbound OpenAI-family request.
///
/// At debug level the body is parsed as JSON and one line is emitted with
/// the `model` (or `"?"`), the `stream` flag (default `false`) and the
/// number of `tools` (default `0`). At trace level a capped preview of the
/// raw body is emitted as well.
///
/// The JSON parse only happens when debug logging is enabled. Note that a
/// body which fails to parse while debug is enabled returns early, so the
/// trace line is skipped for it too.
fn log_inbound(wire: &str, endpoint: &str, body: &[u8]) {
    let debug_on = tracing::enabled!(tracing::Level::DEBUG);
    if debug_on {
        let Ok(parsed) = serde_json::from_slice::<Value>(body) else {
            return;
        };
        let model = match parsed.get("model").and_then(Value::as_str) {
            Some(name) => name,
            None => "?",
        };
        let stream = parsed.get("stream").and_then(Value::as_bool) == Some(true);
        let tools = parsed
            .get("tools")
            .and_then(Value::as_array)
            .map_or(0, Vec::len);
        tracing::debug!(wire, endpoint, model, stream, tools, "inbound request");
    }
    tracing::trace!(wire, endpoint, body = %body_preview(body), "inbound body");
}
/// Report whether an Anthropic message's content contains at least one
/// block whose type is `tool_use` or `tool_result`.
///
/// Plain-text content never counts as carrying tool blocks.
fn anthropic_message_has_tool_blocks(content: &cortex_core::anthropic::AnthropicContent) -> bool {
    use cortex_core::anthropic::AnthropicContent;
    match content {
        AnthropicContent::Blocks(blocks) => blocks
            .iter()
            .map(|block| block.block_type.as_str())
            .any(|kind| kind == "tool_use" || kind == "tool_result"),
        AnthropicContent::Text(_) => false,
    }
}
/// Render a UTF-8-safe, length-capped preview of a request/response body
/// for trace logging.
///
/// Invalid UTF-8 is replaced lossily. Bodies of at most 8192 characters are
/// returned whole; longer ones are cut after the 8192nd character and
/// suffixed with a marker stating the original size in bytes. The cap is
/// counted in characters (not bytes), so the cut can never split a
/// multi-byte codepoint.
fn body_preview(body: &[u8]) -> String {
    const MAX_CHARS: usize = 8192;
    let text = String::from_utf8_lossy(body);
    // `nth(MAX_CHARS)` yields the byte offset of the first character beyond
    // the cap, or `None` when the text fits. This inspects at most
    // MAX_CHARS + 1 characters instead of counting the whole body and then
    // walking the head a second time.
    let cut = text.char_indices().nth(MAX_CHARS).map(|(offset, _)| offset);
    match cut {
        Some(offset) => {
            // `offset` comes from `char_indices`, so it is a char boundary.
            let head = &text[..offset];
            format!("{head}…<truncated, {} bytes total>", body.len())
        }
        None => text.into_owned(),
    }
}
/// Replace the `model` field of an OpenAI-style JSON body with the resolved
/// concrete id.
///
/// The original bytes are returned untouched when the body is not valid
/// JSON, when it has no string `model` field, when that field already
/// equals `new_model`, or when re-serialising fails — proxying the body
/// unchanged is preferred over failing the request.
///
/// The swap is needed because neuron rejects requests whose `model` does
/// not match a loaded model, so an alias such as `helexa/small` would 404
/// at the harness unless it is replaced by the id it resolved to.
fn rewrite_model_in_body(body: Bytes, new_model: &str) -> Bytes {
    let mut parsed: Value = match serde_json::from_slice(&body) {
        Ok(value) => value,
        Err(_) => return body,
    };
    // Only a string `model` that differs from the target needs rewriting.
    match parsed.get("model").and_then(Value::as_str) {
        Some(current) if current != new_model => {}
        _ => return body,
    }
    if let Some(fields) = parsed.as_object_mut() {
        fields.insert("model".into(), Value::String(new_model.to_string()));
    }
    serde_json::to_vec(&parsed).map(Bytes::from).unwrap_or(body)
}
fn error_response(status: u16, typ: &str, code: &str, message: &str) -> Response {
crate::error::envelope_response(OpenAiError::new(status, typ, code, message))
}
/// Render a [`RouteError`] in the standard envelope, attaching `Retry-After`
/// for its transient variants (#63).
fn route_error_response(e: &router::RouteError) -> Response {
let mut env = OpenAiError::new(e.http_status(), e.broad_type(), e.code(), e.to_string());
if let Some(secs) = e.retry_after_secs() {
env = env.with_retry_after(secs);
}
crate::error::envelope_response(env)
fn error_response(status: u16, message: &str) -> Response {
let code = axum::http::StatusCode::from_u16(status)
.unwrap_or(axum::http::StatusCode::INTERNAL_SERVER_ERROR);
let body = json!({
"error": {
"message": message,
"type": "gateway_error",
}
});
(code, Json(body)).into_response()
}

View File

@@ -1,10 +1,5 @@
pub mod anthropic_sse;
pub mod auth;
pub mod entitlements_local;
pub mod error;
pub mod evictor;
pub mod handlers;
pub mod metering;
pub mod metrics;
pub mod poller;
pub mod proxy;
@@ -13,26 +8,15 @@ pub mod state;
use anyhow::Result;
use axum::Router;
use axum::middleware::from_fn_with_state;
use cortex_core::config::GatewayConfig;
use std::sync::Arc;
use tower_http::cors::CorsLayer;
use tower_http::trace::TraceLayer;
/// Build the Axum application router with all routes wired up.
///
/// Layer order (outermost first): trace → CORS → auth → handlers. CORS is
/// outer to auth so preflight `OPTIONS` short-circuits before resolution;
/// auth (`require_principal`) resolves the bearer key, attaches the
/// principal, and stamps the internal principal headers before any handler
/// runs.
pub fn build_app(fleet: Arc<state::CortexState>) -> Router {
Router::new()
.merge(handlers::api_routes())
.layer(from_fn_with_state(
Arc::clone(&fleet),
auth::require_principal,
))
.layer(CorsLayer::permissive())
.layer(TraceLayer::new_for_http())
.with_state(fleet)

View File

@@ -1,121 +0,0 @@
//! Per-request token metering (#51).
//!
//! Captures the real `(prompt, completion)` usage of every request and feeds
//! it to two places: the [`EntitlementProvider`] spend ledger (via
//! reserve→settle) and per-principal Prometheus counters. The principal is
//! reconstructed from the internal headers the auth middleware stamped (#49),
//! so this works uniformly across every proxy path without threading the
//! typed principal through each handler.
//!
//! The reserve→settle lifecycle is established here but, in this phase,
//! reserves **zero** tokens — metering only, no enforcement. Budget
//! enforcement (#52) flips the reserved amount to the real
//! `prompt + max_output` and handles the [`BudgetError`] rejection; the
//! settle/release plumbing is identical, so that change is localized.
//!
//! [`ReservationGuard`] makes leaks impossible: settling records actual
//! spend and releases the unused remainder; dropping a guard that was never
//! settled releases the whole reservation. So an early return, error path,
//! or dropped stream can't strand a reservation.
use axum::http::HeaderMap;
use cortex_core::entitlements::{EntitlementProvider, HEADER_ACCOUNT_ID, HEADER_KEY_ID, Principal};
use std::sync::Arc;
/// Invoked exactly once at request completion with best-effort
/// `(prompt_tokens, completion_tokens)`. When no usage could be observed
/// (e.g. a pre-dispatch failure or a dropped stream) it is dropped unused —
/// which releases the held reservation via [`ReservationGuard`]'s `Drop`.
pub type UsageSink = Box<dyn FnOnce(u64, u64) + Send>;
/// Rebuild the [`Principal`] from the internal account/key headers.
///
/// Returns `None` when either header is absent or is not valid visible
/// ASCII (`HeaderValue::to_str` fails). Per the module docs the auth
/// middleware stamps these headers, so a `None` here corresponds to an
/// anonymous (unauthenticated) request.
pub fn principal_from_headers(headers: &HeaderMap) -> Option<Principal> {
    let account = headers
        .get(HEADER_ACCOUNT_ID)
        .and_then(|value| value.to_str().ok())?;
    let key = headers
        .get(HEADER_KEY_ID)
        .and_then(|value| value.to_str().ok())?;
    Some(Principal {
        account_id: account.to_owned(),
        key_id: key.to_owned(),
    })
}
/// Emit per-principal spend counters (#51). Labelled by account/key only —
/// both are operator-bounded, so cardinality is controlled.
///
/// Increments three counters: the combined total, the prompt tokens, and
/// the completion tokens. The combined total uses a saturating add so a
/// pathological pair of counts cannot overflow (which would panic in debug
/// builds and wrap in release builds).
pub fn record_spend(principal: &Principal, prompt: u64, completion: u64) {
    let labels = [
        ("account", principal.account_id.clone()),
        ("key", principal.key_id.clone()),
    ];
    let total = prompt.saturating_add(completion);
    metrics::counter!("cortex_spend_tokens_total", &labels).increment(total);
    metrics::counter!("cortex_spend_prompt_tokens_total", &labels).increment(prompt);
    metrics::counter!("cortex_spend_completion_tokens_total", &labels).increment(completion);
}
/// Holds a budget reservation for the life of a request. [`settle`] records
/// actual spend and releases the remainder; an un-settled guard releases the
/// whole reservation when dropped. Anonymous requests carry an empty guard,
/// where every operation is a no-op.
///
/// [`settle`]: ReservationGuard::settle
pub struct ReservationGuard {
/// Provider the reservation was taken from; `settle` and the `Drop` impl
/// call back into it to resolve the reservation.
provider: Arc<dyn EntitlementProvider>,
/// The outstanding reservation, if any. `None` for anonymous guards, when
/// `reserve` failed, or once `settle`/`drop` has taken it.
reservation: Option<cortex_core::entitlements::Reservation>,
}
impl ReservationGuard {
    /// Build a guard that holds no reservation, for anonymous requests.
    /// Settling or dropping it does nothing.
    pub fn anonymous(provider: Arc<dyn EntitlementProvider>) -> Self {
        Self {
            reservation: None,
            provider,
        }
    }

    /// Ask `provider` to reserve `max_tokens` for `principal` and wrap the
    /// outcome in a guard.
    ///
    /// A failed reservation is not surfaced: the error is discarded and the
    /// guard is simply empty. In this phase callers pass `0` (metering
    /// only); #52 passes the real cap and surfaces the
    /// [`cortex_core::entitlements::BudgetError`] instead.
    pub async fn reserve(
        provider: Arc<dyn EntitlementProvider>,
        principal: &Principal,
        max_tokens: u64,
    ) -> Self {
        let outcome = provider.reserve(principal, max_tokens).await;
        Self {
            reservation: outcome.ok(),
            provider,
        }
    }

    /// Record `actual_tokens` against the held reservation.
    ///
    /// The reservation is taken out of the guard first, so the `Drop` impl
    /// that runs when `self` goes out of scope finds nothing to release.
    /// The provider call runs on a spawned task, which lets synchronous
    /// callers (such as a stream-completion callback) settle without
    /// awaiting. An empty guard is a no-op.
    pub fn settle(mut self, actual_tokens: u64) {
        let Some(held) = self.reservation.take() else {
            return;
        };
        let ledger = Arc::clone(&self.provider);
        tokio::spawn(async move {
            ledger.settle(held, actual_tokens).await;
        });
    }
}
impl Drop for ReservationGuard {
fn drop(&mut self) {
if let Some(reservation) = self.reservation.take() {
let provider = Arc::clone(&self.provider);
tokio::spawn(async move {
provider.release(reservation).await;
});
}
}
}
/// Build the completion sink for an authenticated request: record spend and
/// settle the reservation with the observed total. Dropping it unused (no
/// usage observed) releases the reservation via the guard.
///
/// The settled total is `prompt + completion`, computed with a saturating
/// add so an implausibly large pair of counts cannot overflow (panic in
/// debug builds, wrap in release builds).
pub fn usage_sink(principal: Principal, guard: ReservationGuard) -> UsageSink {
    Box::new(move |prompt: u64, completion: u64| {
        record_spend(&principal, prompt, completion);
        guard.settle(prompt.saturating_add(completion));
    })
}

View File

@@ -46,14 +46,6 @@ fn describe_metrics() {
"Generation throughput in tokens per second"
);
metrics::describe_counter!("cortex_requests_total", "Total number of proxied requests");
metrics::describe_counter!(
"cortex_prompt_tokens_total",
"Total prompt tokens reported by upstream usage objects"
);
metrics::describe_counter!(
"cortex_completion_tokens_total",
"Total completion tokens reported by upstream usage objects"
);
metrics::describe_counter!(
"cortex_request_errors_total",
"Total number of failed proxy requests"
@@ -63,16 +55,4 @@ fn describe_metrics() {
"cortex_cold_starts_total",
"Total number of cold-start model loads"
);
metrics::describe_counter!(
"cortex_spend_tokens_total",
"Total metered tokens (prompt + completion) per principal, labelled by account/key (#51)"
);
metrics::describe_counter!(
"cortex_spend_prompt_tokens_total",
"Metered prompt tokens per principal, labelled by account/key (#51)"
);
metrics::describe_counter!(
"cortex_spend_completion_tokens_total",
"Metered completion tokens per principal, labelled by account/key (#51)"
);
}

View File

@@ -3,7 +3,6 @@
use crate::state::CortexState;
use chrono::Utc;
use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
use cortex_core::harness::ModelInfo;
use cortex_core::node::{ModelEntry, ModelStatus};
use std::sync::Arc;
@@ -26,68 +25,7 @@ pub async fn poll_once(fleet: &CortexState) {
}
}
/// Fetch `GET /discovery` and cache it on the NodeState — topology is
/// invariant for a given neuron process, so a successful fetch is kept.
/// Re-polled only while `max_prompt_tokens` is still unknown (0): on a
/// rolling deploy cortex can win the race and cache a neuron's discovery
/// before that neuron reports the field (it deserialises to 0). Re-polling
/// until a real cap arrives self-heals that without periodic polling.
async fn maybe_poll_discovery(fleet: &CortexState, name: &str, endpoint: &str) {
{
let nodes = fleet.nodes.read().await;
match nodes.get(name) {
Some(n)
if n.discovery
.as_ref()
.is_some_and(|d| d.max_prompt_tokens > 0) =>
{
return;
}
_ => {}
}
}
let url = format!("{endpoint}/discovery");
let resp = match fleet
.http_client
.get(&url)
.timeout(Duration::from_secs(5))
.send()
.await
{
Ok(r) if r.status().is_success() => r,
Ok(r) => {
tracing::debug!(node = name, status = %r.status(), "discovery probe non-success");
return;
}
Err(e) => {
tracing::debug!(node = name, error = %e, "discovery probe unreachable");
return;
}
};
match resp.json::<DiscoveryResponse>().await {
Ok(d) => {
let mut nodes = fleet.nodes.write().await;
if let Some(node) = nodes.get_mut(name) {
tracing::info!(
node = name,
hostname = %d.hostname,
devices = d.devices.len(),
"discovery cached"
);
node.discovery = Some(d);
}
}
Err(e) => {
tracing::warn!(node = name, error = %e, "failed to parse /discovery response");
}
}
}
async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
// Topology first — cheap once cached, and the router needs it to
// route requests against catalogue entries that aren't loaded yet.
maybe_poll_discovery(fleet, name, endpoint).await;
let url = format!("{endpoint}/models");
let result = fleet
@@ -116,22 +54,12 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
.and_modify(|e| {
e.status = status;
e.vram_estimate_mb = upstream.vram_used_mb;
e.capabilities = upstream.capabilities.clone();
e.tool_call = upstream.tool_call;
e.reasoning = upstream.reasoning;
// Neuron's self-derived limit (#67) — the
// authoritative source the gateway advertises.
e.limit = upstream.limit.clone();
})
.or_insert_with(|| ModelEntry {
id: upstream.id.clone(),
status,
last_accessed: None,
vram_estimate_mb: upstream.vram_used_mb,
capabilities: upstream.capabilities.clone(),
tool_call: upstream.tool_call,
reasoning: upstream.reasoning,
limit: upstream.limit.clone(),
});
}
@@ -161,51 +89,6 @@ async fn poll_neuron(fleet: &CortexState, name: &str, endpoint: &str) {
node.healthy = false;
}
}
// Release the write lock before the next HTTP call.
drop(nodes);
// Poll /health for the activation snapshot. We don't want this to
// flip the node to unhealthy on its own — a neuron that's serving
// /models fine is still operational even if /health is briefly
// unavailable — so failures are debug-level and leave the existing
// activation reading in place.
poll_health(fleet, name, endpoint).await;
}
/// Fetch `/health` and stash the activation snapshot on NodeState.
/// Decoupled from the /models poll so a /health glitch doesn't mark
/// the neuron unhealthy or evict the model list.
async fn poll_health(fleet: &CortexState, name: &str, endpoint: &str) {
let url = format!("{endpoint}/health");
let resp = match fleet
.http_client
.get(&url)
.timeout(Duration::from_secs(5))
.send()
.await
{
Ok(r) if r.status().is_success() => r,
Ok(r) => {
tracing::debug!(node = name, status = %r.status(), "/health probe non-success");
return;
}
Err(e) => {
tracing::debug!(node = name, error = %e, "/health probe failed");
return;
}
};
match resp.json::<HealthResponse>().await {
Ok(h) => {
let mut nodes = fleet.nodes.write().await;
if let Some(node) = nodes.get_mut(name) {
node.activation = Some(h.activation);
}
}
Err(e) => {
tracing::debug!(node = name, error = %e, "failed to parse /health response");
}
}
}
fn parse_status(s: &str) -> ModelStatus {
@@ -213,8 +96,6 @@ fn parse_status(s: &str) -> ModelStatus {
"loaded" => ModelStatus::Loaded,
"unloaded" => ModelStatus::Unloaded,
"reloading" => ModelStatus::Reloading,
"loading" => ModelStatus::Loading,
"recovering" => ModelStatus::Recovering,
_ => ModelStatus::Loaded,
}
}

View File

@@ -1,4 +1,4 @@
//! Streaming HTTP reverse proxy to neuron backends.
//! Streaming HTTP reverse proxy to mistral.rs backends.
//!
//! For streaming requests, SSE chunks are forwarded as they arrive.
//! The proxy captures timing information for metrics but does not
@@ -9,31 +9,16 @@ use anyhow::Result;
use axum::body::Body;
use axum::http::{HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use futures::Stream;
use futures::stream::BoxStream;
use reqwest::Client;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Instant;
/// Proxy a request body to the resolved backend node and stream the response.
///
/// Logging contract: every call emits exactly one structured event at
/// info / warn level for operator visibility, regardless of outcome.
/// Network-level failures and non-2xx upstream statuses are warn'd here
/// (closest to the wire); the user-facing response carries only the
/// status code and a generic message — implementation detail (body,
/// error chain) lives in the log, never in the API surface.
pub async fn forward_request(
client: &Client,
route: &RouteDecision,
path: &str,
headers: HeaderMap,
body: bytes::Bytes,
model_id: &str,
usage_sink: Option<crate::metering::UsageSink>,
) -> Result<Response, ProxyError> {
let request_start = Instant::now();
let url = format!("{}{}", route.endpoint, path);
tracing::info!(
node = %route.node_name,
@@ -52,39 +37,13 @@ pub async fn forward_request(
req_builder = req_builder.header(key, value);
}
let upstream_resp = match req_builder.send().await {
Ok(r) => r,
Err(e) => {
tracing::warn!(
node = %route.node_name,
url = %url,
error = %e,
"proxy: upstream request failed (network)"
);
return Err(ProxyError::Upstream(e));
}
};
let upstream_resp = req_builder.send().await.map_err(ProxyError::Upstream)?;
let upstream_status = upstream_resp.status();
if !upstream_status.is_success() {
// Streaming body — can't snippet without breaking the stream
// pass-through. Log status + URL; the client still gets the
// upstream status, just without the leaked body.
tracing::warn!(
node = %route.node_name,
url = %url,
status = upstream_status.as_u16(),
"proxy: upstream returned non-2xx"
);
}
let status = StatusCode::from_u16(upstream_status.as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let status =
StatusCode::from_u16(upstream_resp.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let resp_headers = upstream_resp.headers().clone();
let stream = TokenMetricsStream::new(
Box::pin(upstream_resp.bytes_stream()),
TokenMetrics::new(model_id, &route.node_name, request_start, usage_sink),
);
let stream = upstream_resp.bytes_stream();
let body = Body::from_stream(stream);
@@ -93,284 +52,31 @@ pub async fn forward_request(
response = response.header(key, value);
}
response.body(body).map_err(|e| {
tracing::warn!(
node = %route.node_name,
url = %url,
error = %e,
"proxy: failed to build response"
);
ProxyError::ResponseBuild(e.to_string())
})
response
.body(body)
.map_err(|e| ProxyError::ResponseBuild(e.to_string()))
}
#[derive(Debug, thiserror::Error)]
pub enum ProxyError {
#[error("upstream request failed")]
#[error("upstream request failed: {0}")]
Upstream(reqwest::Error),
#[error("failed to build response")]
#[error("failed to build response: {0}")]
ResponseBuild(String),
}
impl IntoResponse for ProxyError {
fn into_response(self) -> Response {
let (status, code, message) = match &self {
ProxyError::Upstream(_) => (
StatusCode::BAD_GATEWAY,
"upstream_connection_error",
"upstream request failed",
),
ProxyError::ResponseBuild(_) => (
StatusCode::INTERNAL_SERVER_ERROR,
"internal_server_error",
"failed to build response",
),
let status = match &self {
ProxyError::Upstream(_) => StatusCode::BAD_GATEWAY,
ProxyError::ResponseBuild(_) => StatusCode::INTERNAL_SERVER_ERROR,
};
crate::error::envelope_response(cortex_core::error_envelope::OpenAiError::new(
status.as_u16(),
"api_error",
code,
message,
))
}
}
// ── Per-request token metrics (#21) ─────────────────────────────────
//
// The proxy never buffers or re-serialises the upstream body — chunks
// are forwarded verbatim. For metrics it observes each chunk's arrival
// time and keeps a bounded tail of the body text, from which the final
// OpenAI `usage` object (present on the last SSE chunk and on
// non-streaming JSON bodies alike) yields engine-truth token counts.
//
// Emitted per request, labelled {model, node}:
// cortex_time_to_first_token_seconds (histogram) — first body chunk
// cortex_tokens_per_second (histogram) — completion tokens
// over the decode window (first→last chunk); falls back to the
// full request duration for single-chunk (non-streaming) bodies
// cortex_prompt_tokens_total / cortex_completion_tokens_total (counters)
/// Cap on the retained body tail. The usage object rides on the final
/// chunk, so a generous tail is plenty; the cap bounds memory on huge
/// non-streaming bodies.
const TAIL_CAP_BYTES: usize = 64 * 1024;
/// Return the integer following the last `"key":` in `tail` that is
/// actually followed by digits.
///
/// Occurrences are scanned front to back. One that is not followed by a
/// colon, or whose value does not start with a run of ASCII digits that
/// fits in a `u64`, is skipped rather than resetting the result — so an
/// earlier numeric occurrence survives a later `null`. Because the needle
/// includes both quotes, `completion_tokens` never matches
/// `completion_tokens_details`.
pub(crate) fn last_count_for(tail: &str, key: &str) -> Option<u64> {
    let needle = format!("\"{key}\"");
    tail.match_indices(needle.as_str())
        .filter_map(|(start, matched)| {
            let after_key = tail[start + matched.len()..].trim_start();
            let value = after_key.strip_prefix(':')?.trim_start();
            // Length of the leading ASCII-digit run (possibly zero).
            let digit_len = value
                .find(|c: char| !c.is_ascii_digit())
                .unwrap_or(value.len());
            value[..digit_len].parse::<u64>().ok()
        })
        .last()
}
/// Per-request accumulator for the token metrics: chunk arrival times plus
/// a bounded tail of the response body from which the usage counts are
/// read when the request finishes.
struct TokenMetrics {
/// Metric labels: `("model", …)` and `("node", …)`.
labels: [(&'static str, String); 2],
/// When the request started; the reference point for time-to-first-token.
request_start: Instant,
/// Arrival time of the first body chunk; `None` until a chunk is observed.
first_chunk: Option<Instant>,
/// Arrival time of the most recent body chunk.
last_chunk: Option<Instant>,
/// Lossily UTF-8-decoded body text, trimmed once it exceeds
/// `TAIL_CAP_BYTES`.
tail: String,
/// Set by `finish` so the metrics are emitted only once.
finished: bool,
/// Per-principal metering hook (#51). Invoked exactly once in `finish`
/// with the observed `(prompt, completion)` so the reservation can be
/// settled and spend recorded. `None` for anonymous requests.
usage_sink: Option<crate::metering::UsageSink>,
}
impl TokenMetrics {
fn new(
model_id: &str,
node_name: &str,
request_start: Instant,
usage_sink: Option<crate::metering::UsageSink>,
) -> Self {
Self {
labels: [
("model", model_id.to_string()),
("node", node_name.to_string()),
],
request_start,
first_chunk: None,
last_chunk: None,
tail: String::new(),
finished: false,
usage_sink,
}
}
fn observe(&mut self, chunk: &[u8]) {
let now = Instant::now();
self.first_chunk.get_or_insert(now);
self.last_chunk = Some(now);
self.tail.push_str(&String::from_utf8_lossy(chunk));
if self.tail.len() > TAIL_CAP_BYTES {
// Keep the newest half; the usage object is always at the
// very end of the body. Split at a char boundary.
let mut cut = self.tail.len() - TAIL_CAP_BYTES / 2;
while !self.tail.is_char_boundary(cut) {
cut += 1;
let body = serde_json::json!({
"error": {
"message": self.to_string(),
"type": "proxy_error",
}
self.tail.drain(..cut);
}
}
/// Emit the metrics exactly once — called on clean stream end and
/// from Drop (client disconnect mid-stream still records what we
/// saw).
fn finish(&mut self) {
if self.finished {
return;
}
self.finished = true;
let prompt = last_count_for(&self.tail, "prompt_tokens");
let completion = last_count_for(&self.tail, "completion_tokens");
// Per-model metrics — only when body chunks actually arrived.
if let Some(first) = self.first_chunk {
let ttft = first.duration_since(self.request_start).as_secs_f64();
metrics::histogram!("cortex_time_to_first_token_seconds", &self.labels).record(ttft);
if let Some(prompt) = prompt {
metrics::counter!("cortex_prompt_tokens_total", &self.labels).increment(prompt);
}
if let Some(completion) = completion.filter(|c| *c > 0) {
metrics::counter!("cortex_completion_tokens_total", &self.labels)
.increment(completion);
let last = self.last_chunk.unwrap_or(first);
let decode_window = last.duration_since(first).as_secs_f64();
// Streaming: rate over the decode window (first→last chunk).
// Non-streaming bodies arrive as ~one chunk (window ≈ 0),
// where the only honest denominator is the full request
// duration.
let secs = if decode_window >= 0.1 {
decode_window
} else {
last.duration_since(self.request_start).as_secs_f64()
};
if secs > 0.0 {
metrics::histogram!("cortex_tokens_per_second", &self.labels)
.record(completion as f64 / secs);
}
}
}
// Per-principal metering + reservation settle (#51). Always runs so
// the reservation is resolved even when no usage/body was observed
// (sink with (0, 0) → settle 0 → release).
if let Some(sink) = self.usage_sink.take() {
sink(prompt.unwrap_or(0), completion.unwrap_or(0));
}
}
}
/// Pass-through stream wrapper that feeds [`TokenMetrics`]. Emits on
/// clean end-of-stream; the Drop impl covers client disconnects.
struct TokenMetricsStream {
/// The upstream byte stream being forwarded.
inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
/// Accumulator fed with every successfully received chunk.
metrics: TokenMetrics,
}
impl TokenMetricsStream {
/// Wrap `inner` so that each chunk it yields is also observed by `metrics`.
fn new(
inner: BoxStream<'static, Result<bytes::Bytes, reqwest::Error>>,
metrics: TokenMetrics,
) -> Self {
Self { inner, metrics }
}
}
impl Stream for TokenMetricsStream {
    type Item = Result<bytes::Bytes, reqwest::Error>;

    /// Poll the wrapped stream and hand its result back unchanged.
    ///
    /// A successful chunk is observed by the metrics accumulator before it
    /// is forwarded; end-of-stream triggers `finish`. Errors and `Pending`
    /// pass through without touching the metrics.
    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let stream = self.get_mut();
        let polled = stream.inner.as_mut().poll_next(cx);
        match &polled {
            Poll::Ready(Some(Ok(bytes))) => stream.metrics.observe(bytes),
            Poll::Ready(None) => stream.metrics.finish(),
            Poll::Ready(Some(Err(_))) | Poll::Pending => {}
        }
        polled
    }
}
impl Drop for TokenMetricsStream {
/// Finish the metrics when the stream is dropped. `finish` guards against
/// running twice, so this is a no-op after a clean end-of-stream.
fn drop(&mut self) {
self.metrics.finish();
}
}
#[cfg(test)]
mod tests {
use super::last_count_for;
#[test]
fn extracts_counts_from_final_sse_usage_chunk() {
let tail = concat!(
"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n",
"data: {\"choices\":[],\"usage\":{\"prompt_tokens\":225,",
"\"completion_tokens\":42,\"total_tokens\":267}}\n\n",
"data: [DONE]\n\n"
);
assert_eq!(last_count_for(tail, "prompt_tokens"), Some(225));
assert_eq!(last_count_for(tail, "completion_tokens"), Some(42));
}
#[test]
fn extracts_counts_from_non_streaming_body() {
let tail = "{\"choices\":[{\"message\":{\"content\":\"hi\"}}],\
\"usage\":{\"prompt_tokens\": 12, \"completion_tokens\": 7}}";
assert_eq!(last_count_for(tail, "prompt_tokens"), Some(12));
assert_eq!(last_count_for(tail, "completion_tokens"), Some(7));
}
#[test]
fn ignores_details_variants_and_takes_last_occurrence() {
// completion_tokens_details must not shadow completion_tokens,
// and the LAST usage object wins (matters when content echoes
// a usage-shaped string earlier in the stream).
let tail = concat!(
"data: {\"usage\":{\"completion_tokens\":1}}\n\n",
"data: {\"usage\":{\"completion_tokens\":99,",
"\"completion_tokens_details\":{\"reasoning_tokens\":3}}}\n\n"
);
assert_eq!(last_count_for(tail, "completion_tokens"), Some(99));
}
#[test]
fn absent_keys_yield_none() {
assert_eq!(
last_count_for("data: [DONE]\n\n", "completion_tokens"),
None
);
assert_eq!(last_count_for("", "prompt_tokens"), None);
// key present but non-numeric value
assert_eq!(
last_count_for("\"completion_tokens\": null", "completion_tokens"),
None
);
});
(status, axum::Json(body)).into_response()
}
}

View File

@@ -2,21 +2,13 @@
//!
//! Given a model ID from an inbound request, determine which node should
//! handle it. Priority:
//! 1. Node where the model is currently `Loaded` → use it.
//! 2. Node where the model is `Unloaded` → use it; neuron's existing
//! lazy-load behaviour will reload before serving the request.
//! 3. Model is in the catalogue → pick a feasible neuron, call
//! `POST /models/load`, wait for the load to complete, then
//! proxy. First-request cold-load latency is acceptable per the
//! unified-endpoint contract.
//! 4. Not in catalogue, not loaded anywhere → 404.
//! 1. Node where the model is currently `Loaded`
//! 2. Node where the model is `Unloaded` (will lazy-load on request)
//! 3. Error: model not found on any node
use crate::state::CortexState;
use cortex_core::catalogue::ModelProfile;
use cortex_core::harness::ModelSpec;
use cortex_core::node::ModelStatus;
use std::sync::Arc;
use std::time::Duration;
/// The routing decision: which node endpoint to proxy the request to.
#[derive(Debug, Clone)]
@@ -24,385 +16,62 @@ pub struct RouteDecision {
pub node_name: String,
/// The inference endpoint to proxy to (from neuron's /models/{id}/endpoint).
pub endpoint: String,
/// Whether the model will need to load (cold start). Set to true
/// when we proxied to an `Unloaded` node (lazy load on neuron) or
/// when we just triggered an explicit cold-load via the catalogue
/// path.
/// Whether the model will need to load (cold start).
pub cold_start: bool,
/// The concrete model id we actually routed to. Equal to the
/// caller's requested id unless an alias was resolved (e.g. caller
/// asked for `helexa/small`, this carries `Qwen/Qwen3-1.7B`). The
/// handler uses this to rewrite the request body's `model` field
/// before proxying — neurons reject requests where the body's
/// model name doesn't match a loaded model.
pub resolved_model_id: String,
}
#[derive(Debug, thiserror::Error)]
pub enum RouteError {
#[error("model '{0}' not found on any node and not in catalogue")]
#[error("model '{0}' not found on any node")]
ModelNotFound(String),
#[error("no healthy nodes available")]
NoHealthyNodes,
#[error("failed to resolve inference endpoint for model '{0}' on node '{1}'")]
EndpointResolveFailed(String, String),
#[error(
"model '{model_id}' is in the catalogue but no healthy neuron's topology satisfies its constraints"
)]
NoFeasibleNeuron { model_id: String },
#[error("cold-load of '{model_id}' on '{node}' failed: {message}")]
ColdLoadFailed {
model_id: String,
node: String,
message: String,
},
#[error(
"model '{model_id}' is recovering on node '{node}' (device context rebuild in progress) — retry shortly"
)]
ModelRecovering { model_id: String, node: String },
}
impl RouteError {
    /// HTTP status for this error: 503 for the two transient variants
    /// (`NoHealthyNodes`, `ModelRecovering`), which are safe to retry
    /// unchanged, and 404 for every other variant.
    pub fn http_status(&self) -> u16 {
        match self {
            RouteError::NoHealthyNodes | RouteError::ModelRecovering { .. } => 503,
            RouteError::ModelNotFound(_)
            | RouteError::EndpointResolveFailed(_, _)
            | RouteError::NoFeasibleNeuron { .. }
            | RouteError::ColdLoadFailed { .. } => 404,
        }
    }

    /// Broad OpenAI error category placed in the JSON envelope's type field.
    pub fn broad_type(&self) -> &'static str {
        match self {
            RouteError::NoHealthyNodes
            | RouteError::EndpointResolveFailed(_, _)
            | RouteError::NoFeasibleNeuron { .. }
            | RouteError::ColdLoadFailed { .. }
            | RouteError::ModelRecovering { .. } => "api_error",
            RouteError::ModelNotFound(_) => "invalid_request_error",
        }
    }

    /// Machine-readable error code: `model_not_found` for an unknown model,
    /// `service_unavailable` for everything else.
    pub fn code(&self) -> &'static str {
        match self {
            RouteError::ModelNotFound(_) => "model_not_found",
            RouteError::NoHealthyNodes
            | RouteError::EndpointResolveFailed(_, _)
            | RouteError::NoFeasibleNeuron { .. }
            | RouteError::ColdLoadFailed { .. }
            | RouteError::ModelRecovering { .. } => "service_unavailable",
        }
    }

    /// `Retry-After` hint in seconds for the transient variants (#63):
    /// 2 for `ModelRecovering`, 5 for `NoHealthyNodes`. All other variants
    /// carry no hint.
    pub fn retry_after_secs(&self) -> Option<u64> {
        match self {
            RouteError::NoHealthyNodes => Some(5),
            RouteError::ModelRecovering { .. } => Some(2),
            RouteError::ModelNotFound(_)
            | RouteError::EndpointResolveFailed(_, _)
            | RouteError::NoFeasibleNeuron { .. }
            | RouteError::ColdLoadFailed { .. } => None,
        }
    }
}
/// Resolve which node should serve a request for the given model.
/// Asks the neuron for the inference endpoint after selecting a node.
pub async fn resolve(
fleet: &Arc<CortexState>,
requested_model_id: &str,
model_id: &str,
) -> Result<RouteDecision, RouteError> {
// Alias resolution first — swap `helexa/small` (etc.) for the
// concrete id before any node lookups so the rest of routing,
// loading, and metrics deal in concrete ids only. `resolve_alias`
// returns the input verbatim when it isn't an alias.
let model_id = fleet.catalogue.resolve_alias(requested_model_id);
if model_id != requested_model_id {
tracing::debug!(
requested = requested_model_id,
resolved = model_id,
"alias resolved"
);
}
// Snapshot loaded / unloaded / recovering state from the poller cache.
let (loaded_route, unloaded_route, recovering_node, any_healthy) = {
let (node_name, neuron_endpoint, cold_start) = {
let nodes = fleet.nodes.read().await;
let mut loaded_route = None;
let mut unloaded_route = None;
let mut recovering_node = None;
let mut any_healthy = false;
let mut loaded_candidate = None;
let mut unloaded_candidate = None;
for node in nodes.values() {
if !node.healthy {
continue;
}
any_healthy = true;
if let Some(entry) = node.models.get(model_id) {
match entry.status {
ModelStatus::Loaded | ModelStatus::Reloading => {
loaded_route = Some((node.name.clone(), node.endpoint.clone(), false));
loaded_candidate = Some((node.name.clone(), node.endpoint.clone(), false));
break;
}
ModelStatus::Unloaded => {
if unloaded_route.is_none() {
unloaded_route = Some((node.name.clone(), node.endpoint.clone(), true));
if unloaded_candidate.is_none() {
unloaded_candidate =
Some((node.name.clone(), node.endpoint.clone(), true));
}
}
// Auto-recovering (#17/#20): the model is rebuilding
// its device context on this node. Hold the route —
// answer "retry shortly" rather than 404, and do NOT
// fall through to the catalogue cold-load, which
// would race a second placement (and a second copy's
// worth of VRAM) against the in-flight recovery.
ModelStatus::Recovering => {
if recovering_node.is_none() {
recovering_node = Some(node.name.clone());
}
}
// Loading is gateway-synthesised from neuron's
// activation snapshot; it never appears on the
// wire from neuron's `/models`. Skip — the model
// isn't actually servable yet. The pre-existing
// race (catalogue cold_load fires a parallel
// /models/load against the in-flight load) is no
// worse than before; fixing it needs neuron-side
// in-flight tracking on /models/load itself.
ModelStatus::Loading => {}
}
}
}
(loaded_route, unloaded_route, recovering_node, any_healthy)
};
if !any_healthy {
return Err(RouteError::NoHealthyNodes);
}
// Priority 1: already loaded.
if let Some((node_name, neuron_endpoint, cold_start)) = loaded_route {
return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
}
// Priority 2: recovering somewhere — transient hold, not a reroute.
if let Some(node) = recovering_node {
return Err(RouteError::ModelRecovering {
model_id: model_id.to_string(),
node,
});
}
// Priority 3: known to neuron but unloaded (neuron's lazy load).
if let Some((node_name, neuron_endpoint, cold_start)) = unloaded_route {
return finish(fleet, &node_name, &neuron_endpoint, model_id, cold_start).await;
}
// Priority 4: catalogue × topology cold-load.
if let Some(profile) = fleet.catalogue.get(model_id) {
let (node_name, neuron_endpoint) = pick_feasible_neuron(fleet, profile).await?;
cold_load(fleet, &node_name, &neuron_endpoint, profile).await?;
return finish(fleet, &node_name, &neuron_endpoint, model_id, true).await;
}
Err(RouteError::ModelNotFound(model_id.to_string()))
}
/// Choose the neuron that should cold-load `profile`.
///
/// Only healthy nodes with a cached discovery whose devices satisfy
/// `profile.is_feasible_on` are considered. Among those, nodes named in
/// `profile.pinned_on` win over unpinned ones, and ties are broken by
/// ascending node name.
///
/// Returns `(node_name, endpoint)`, or `RouteError::NoFeasibleNeuron` when
/// no node qualifies.
async fn pick_feasible_neuron(
    fleet: &Arc<CortexState>,
    profile: &ModelProfile,
) -> Result<(String, String), RouteError> {
    let nodes = fleet.nodes.read().await;
    nodes
        .values()
        .filter(|node| node.healthy)
        .filter(|node| {
            node.discovery
                .as_ref()
                .is_some_and(|disc| profile.is_feasible_on(&node.name, &disc.devices))
        })
        .map(|node| {
            let pinned = profile.pinned_on.iter().any(|n| n == &node.name);
            (pinned, node)
        })
        // Minimum under "pinned first (true > false), then name ascending".
        .min_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.name.cmp(&b.1.name)))
        .map(|(_, node)| (node.name.clone(), node.endpoint.clone()))
        .ok_or_else(|| RouteError::NoFeasibleNeuron {
            model_id: profile.id.clone(),
        })
}
/// Ask a neuron to load `profile` via `POST {neuron_endpoint}/models/load`
/// and wait for the response (the existing contract, per the original
/// notes, is that neuron answers only once the load has finished).
///
/// A non-success response whose body contains `already loaded` is treated
/// as success: concurrent requests racing the same model end up in the
/// same state. After a successful load, a `Loaded` entry is written into
/// the cached node state so the caller's follow-up lookup sees the model
/// without waiting for the next poll.
///
/// # Errors
/// `RouteError::ColdLoadFailed` when the HTTP request itself fails or the
/// neuron answers with a non-success status other than the
/// "already loaded" case.
async fn cold_load(
    fleet: &Arc<CortexState>,
    node_name: &str,
    neuron_endpoint: &str,
    profile: &ModelProfile,
) -> Result<(), RouteError> {
    // Both failure paths report the same (model, node) pair.
    let failure = |message: String| RouteError::ColdLoadFailed {
        model_id: profile.id.clone(),
        node: node_name.to_string(),
        message,
    };

    let spec = profile_to_spec(fleet, node_name, profile).await;
    let url = format!("{neuron_endpoint}/models/load");
    tracing::info!(model = %profile.id, node = node_name, "cold-loading via /models/load");

    // Downloading and materialising a large model can take many minutes
    // on a slow link, so this one request gets its own 30-minute bound.
    let resp = fleet
        .http_client
        .post(&url)
        .timeout(Duration::from_secs(1800))
        .json(&spec)
        .send()
        .await
        .map_err(|e| failure(format!("HTTP request failed: {e}")))?;

    let status = resp.status();
    if status.is_success() {
        tracing::info!(model = %profile.id, node = node_name, "cold-load returned 200");
    } else {
        let body = resp.text().await.unwrap_or_default();
        if !body.contains("already loaded") {
            return Err(failure(format!("HTTP {status}: {body}")));
        }
        // Another request won the race; the model is where we want it.
        tracing::info!(
            model = %profile.id,
            node = node_name,
            "cold-load saw 'already loaded' — treating as success"
        );
    }

    // Warm the cache so the next resolve() finds the model immediately.
    let mut nodes = fleet.nodes.write().await;
    if let Some(node) = nodes.get_mut(node_name) {
        let entry = cortex_core::node::ModelEntry {
            id: profile.id.clone(),
            status: ModelStatus::Loaded,
            last_accessed: Some(chrono::Utc::now()),
            vram_estimate_mb: profile.vram_mb,
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        node.models.insert(profile.id.clone(), entry);
    }
    Ok(())
}
/// Translate a `ModelProfile` to a `ModelSpec` neuron's /models/load
/// accepts. Devices are picked from the neuron's discovered topology —
/// the first `min_devices` indices that meet `min_device_vram_mb`.
async fn profile_to_spec(
fleet: &Arc<CortexState>,
node_name: &str,
profile: &ModelProfile,
) -> ModelSpec {
let devices = {
let nodes = fleet.nodes.read().await;
let mut picked: Vec<u32> = Vec::new();
if let Some(node) = nodes.get(node_name)
&& let Some(disc) = &node.discovery
{
let min_vram = profile.min_device_vram_mb.unwrap_or(0);
for d in &disc.devices {
if d.vram_total_mb >= min_vram {
picked.push(d.index);
if picked.len() as u32 >= profile.min_devices {
break;
}
}
}
}
if picked.is_empty() {
// Fall back to a 0..min_devices default; pick_feasible_neuron
// already verified the topology satisfies the constraints,
// so this only fires if discovery raced or was lost.
(0..profile.min_devices).collect()
} else {
picked
}
loaded_candidate.or(unloaded_candidate).ok_or_else(|| {
if nodes.values().any(|n| n.healthy) {
RouteError::ModelNotFound(model_id.to_string())
} else {
RouteError::NoHealthyNodes
}
})?
};
let tensor_parallel = if profile.min_devices > 1 {
Some(profile.min_devices)
} else {
None
};
ModelSpec {
model_id: qualified_model_id(profile),
harness: profile.harness.clone(),
quant: profile.quant.clone(),
tensor_parallel,
devices: Some(devices),
}
}
/// Build the model id that is sent to neuron for this profile.
///
/// When the profile declares a non-empty `source`, the result is
/// `"{source}:{id}"` so neuron resolves the load against that registry
/// rather than its own default. A missing or empty `source` yields the
/// bare catalogue id, which keeps the older contract intact.
///
/// Kept as a free function at module scope so unit tests can call it
/// without constructing any fleet state.
fn qualified_model_id(profile: &ModelProfile) -> String {
    let scheme = profile.source.as_deref().unwrap_or_default();
    if scheme.is_empty() {
        profile.id.clone()
    } else {
        format!("{scheme}:{}", profile.id)
    }
}
/// Resolve neuron's `/models/{id}/endpoint` to its inference URL and
/// build the final `RouteDecision`. Shared by all three priority
/// branches above.
async fn finish(
fleet: &Arc<CortexState>,
node_name: &str,
neuron_endpoint: &str,
model_id: &str,
cold_start: bool,
) -> Result<RouteDecision, RouteError> {
// Ask the neuron for the inference endpoint for this model.
let endpoint_url = format!(
"{}/models/{}/endpoint",
neuron_endpoint,
@@ -420,122 +89,13 @@ async fn finish(
_ => None,
};
let raw = inference_endpoint.ok_or_else(|| {
RouteError::EndpointResolveFailed(model_id.to_string(), node_name.to_string())
let endpoint = inference_endpoint.ok_or_else(|| {
RouteError::EndpointResolveFailed(model_id.to_string(), node_name.clone())
})?;
// Rewrite loopback inference URLs to use the configured neuron host.
// Neuron's default bind_url is `http://localhost:13131` (it can't
// reliably know its own externally-resolvable name). Cortex sees a
// URL that's only meaningful from the neuron host's own perspective;
// proxying directly to localhost from a different cortex host would
// hit nothing. Keep neuron's port and path (a future harness could
// serve inference on a different port than the management API), but
// swap the host for the one in cortex.toml.
let endpoint = rewrite_loopback_host(&raw, neuron_endpoint).unwrap_or(raw);
Ok(RouteDecision {
node_name: node_name.to_string(),
node_name,
endpoint,
cold_start,
resolved_model_id: model_id.to_string(),
})
}
/// Swap a loopback host in `inference_url` for the host of
/// `neuron_endpoint`, keeping the inference URL's scheme, port and path.
///
/// Hosts treated as loopback: `localhost`, `127.0.0.1`, `0.0.0.0` and the
/// IPv6 loopback `::1` (written `[::1]` inside a URL).
///
/// Returns `None` — and the caller falls back to the inference URL as-is —
/// when the inference URL does not parse, has no host, is not loopback, or
/// when `neuron_endpoint` does not parse / has no host / cannot be set as
/// the new host.
///
/// The returned string has any trailing `/` removed, because callers
/// append paths with `format!("{endpoint}/v1/...")`.
fn rewrite_loopback_host(inference_url: &str, neuron_endpoint: &str) -> Option<String> {
    let inf = url::Url::parse(inference_url).ok()?;
    let inf_host = inf.host_str()?;
    // `host_str` returns the host exactly as serialised in the URL, so an
    // IPv6 literal arrives with its brackets ("[::1]"). The bare "::1"
    // form is kept as well; it is harmless if it never matches.
    let is_loopback = matches!(
        inf_host,
        "localhost" | "127.0.0.1" | "0.0.0.0" | "::1" | "[::1]"
    );
    if !is_loopback {
        return None;
    }
    let neuron = url::Url::parse(neuron_endpoint).ok()?;
    let new_host = neuron.host_str()?;
    // `inf_host` is no longer used, so the parsed URL can be reused
    // directly instead of cloned.
    let mut out = inf;
    out.set_host(Some(new_host)).ok()?;
    // url::Url::to_string normalises an empty path to "/", which would
    // produce a double slash once the caller appends "/v1/...". The
    // result is used as a base string, so strip the trailing slash.
    let s = out.to_string();
    Some(s.trim_end_matches('/').to_string())
}
#[cfg(test)]
mod tests {
    use super::{ModelProfile, qualified_model_id, rewrite_loopback_host};

    /// Single-device `candle` profile with everything optional left
    /// unset; only `id` and `source` vary between tests.
    fn bare_profile(id: &str, source: Option<&str>) -> ModelProfile {
        ModelProfile {
            id: id.into(),
            harness: "candle".into(),
            quant: None,
            vram_mb: None,
            min_devices: 1,
            min_device_vram_mb: None,
            pinned_on: Vec::new(),
            source: source.map(str::to_owned),
            limit: None,
            cost: None,
            capabilities: Vec::new(),
        }
    }

    #[test]
    fn qualified_id_passes_through_when_source_absent() {
        let profile = bare_profile("Qwen/Qwen3-30B", None);
        let qualified = qualified_model_id(&profile);
        assert_eq!(qualified, "Qwen/Qwen3-30B");
    }

    #[test]
    fn qualified_id_prefixes_when_source_set() {
        let profile = bare_profile("Helexa/Qwen3.6-27B-Uncensored", Some("helexa"));
        let qualified = qualified_model_id(&profile);
        assert_eq!(qualified, "helexa:Helexa/Qwen3.6-27B-Uncensored");
    }

    #[test]
    fn qualified_id_passes_through_when_source_is_empty_string() {
        // An empty scheme counts as "no scheme": the bare id goes out and
        // neuron applies its own default source.
        let profile = bare_profile("Qwen/Qwen3-30B", Some(""));
        let qualified = qualified_model_id(&profile);
        assert_eq!(qualified, "Qwen/Qwen3-30B");
    }

    #[test]
    fn rewrites_localhost_keeps_port_and_path() {
        let inference = "http://localhost:13131";
        let neuron = "http://beast.hanzalova.internal:13131";
        let rewritten = rewrite_loopback_host(inference, neuron);
        assert_eq!(
            rewritten,
            Some(String::from("http://beast.hanzalova.internal:13131"))
        );
    }

    #[test]
    fn rewrites_loopback_with_distinct_inference_port() {
        let rewritten = rewrite_loopback_host("http://127.0.0.1:8080", "http://beast.lan:13131");
        assert_eq!(rewritten, Some(String::from("http://beast.lan:8080")));
    }

    #[test]
    fn leaves_non_loopback_alone() {
        let rewritten = rewrite_loopback_host("http://other.host:1234", "http://beast.lan:13131");
        assert!(rewritten.is_none());
    }

    #[test]
    fn malformed_inference_url_returns_none() {
        let rewritten = rewrite_loopback_host("not a url", "http://beast.lan:13131");
        assert!(rewritten.is_none());
    }
}

View File

@@ -1,10 +1,7 @@
use crate::entitlements_local::LocalEntitlementProvider;
use cortex_core::catalogue::ModelCatalogue;
use cortex_core::config::{EvictionSettings, GatewayConfig, NeuronEndpoint};
use cortex_core::entitlements::EntitlementProvider;
use cortex_core::node::NodeState;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
/// Shared fleet state, protected by a RwLock for concurrent reader access.
@@ -14,12 +11,6 @@ pub struct CortexState {
pub eviction: EvictionSettings,
pub catalogue: ModelCatalogue,
pub http_client: reqwest::Client,
/// Resolves bearer keys to principals and enforces token budgets (#47).
/// A local/static provider today (#50); the upstream client later (#57).
pub entitlements: Arc<dyn EntitlementProvider>,
/// Whether to reject unauthenticated requests (#49). Read by the auth
/// middleware once it lands.
pub require_auth: bool,
}
impl CortexState {
@@ -35,17 +26,12 @@ impl CortexState {
models: HashMap::new(),
lifecycle_cycles: 0,
last_poll: None,
discovery: None,
activation: None,
},
);
}
let catalogue = ModelCatalogue::load(&config.models_config);
let entitlements: Arc<dyn EntitlementProvider> =
Arc::new(LocalEntitlementProvider::from_config(&config.entitlements));
Self {
nodes: RwLock::new(nodes),
neuron_configs: config.neurons.clone(),
@@ -55,8 +41,6 @@ impl CortexState {
.timeout(std::time::Duration::from_secs(300))
.build()
.expect("failed to build HTTP client"),
entitlements,
require_auth: config.entitlements.require_auth,
}
}
}

View File

@@ -1,280 +0,0 @@
//! Alias resolution: a client request with `model: "helexa/small"`
//! routes to the concrete model id (e.g. `Qwen/Qwen3-1.7B`), with the
//! proxied request body rewritten so the upstream neuron sees a model
//! name that matches its loaded handle.
mod common;
use cortex_core::config::{
EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
};
use cortex_core::node::{ModelEntry, ModelStatus};
use cortex_gateway::state::CortexState;
use serde_json::json;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::net::TcpListener;
/// Write a `models.toml` holding a single alias (`alias` → `target`) to a
/// fresh file under [`std::env::temp_dir`] and return its path.
///
/// The file is not cleaned up by this helper; it stays in the temp
/// directory after the test process exits. Building the name by hand
/// avoids pulling in the `tempfile` crate.
///
/// The file name combines the process id, a nanosecond timestamp and a
/// per-process counter. The counter guarantees distinct names for tests
/// running in parallel threads of the same process even when the system
/// clock's resolution is too coarse to tell two calls apart.
///
/// # Panics
/// Panics if the system clock reads earlier than the Unix epoch or the
/// file cannot be written.
fn write_models_toml(alias: &str, target: &str) -> PathBuf {
    // Only uniqueness matters here, not ordering relative to other memory
    // operations, so a relaxed increment is sufficient.
    static NEXT_FILE_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
    let seq = NEXT_FILE_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

    let contents = format!(
        r#"
[aliases]
"{alias}" = "{target}"
"#
    );
    let pid = std::process::id();
    let now = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos();
    let path = std::env::temp_dir().join(format!("cortex-test-models-{pid}-{now}-{seq}.toml"));
    std::fs::write(&path, contents).expect("write temp models.toml");
    path
}
/// A chat completion addressed to an alias must reach the backend with
/// the alias replaced by its target model id.
#[tokio::test]
async fn test_alias_resolves_in_chat_completions() {
    const ALIAS: &str = "helexa/small";
    const TARGET: &str = "test-model";

    let mock_url = common::spawn_mock_neuron().await;
    let models_path = write_models_toml(ALIAS, TARGET);

    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: mock_url,
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: models_path.to_string_lossy().to_string(),
        entitlements: Default::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // No poller runs in this test, so mark the node healthy and register
    // the concrete model under its target id by hand.
    {
        let loaded = ModelEntry {
            id: TARGET.into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: None,
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").expect("node must exist");
        node.healthy = true;
        node.models.insert(TARGET.into(), loaded);
    }

    // Sanity check: the catalogue really loaded the alias.
    assert_eq!(
        fleet.catalogue.resolve_alias(ALIAS),
        TARGET,
        "alias should resolve to target id"
    );

    // Serve the gateway for this fleet on an ephemeral port.
    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let gateway_url = format!("http://{}", listener.local_addr().unwrap());
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    // The mock backend echoes the `model` it received, so the echoed
    // value tells us whether the body was rewritten before proxying.
    let request = json!({
        "model": ALIAS,
        "messages": [{"role": "user", "content": "hi"}],
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/chat/completions"))
        .json(&request)
        .send()
        .await
        .expect("gateway should respond");
    assert!(resp.status().is_success(), "gateway returned non-2xx");

    let body: serde_json::Value = resp.json().await.expect("response is JSON");
    let echoed = body.get("model").and_then(|m| m.as_str());
    assert_eq!(
        echoed,
        Some(TARGET),
        "mock backend should have seen the resolved model id, not the alias"
    );
}
/// `/v1/models` must list both the alias and its target, and the alias
/// entry must mirror the target's `loaded` flag and locations.
#[tokio::test]
async fn test_aliases_surface_in_v1_models() {
    const ALIAS: &str = "helexa/small";
    const TARGET: &str = "test-model";

    /// The `id` field of one `/v1/models` entry, if it is a string.
    fn id_of(entry: &serde_json::Value) -> Option<&str> {
        entry.get("id").and_then(|v| v.as_str())
    }

    let mock_url = common::spawn_mock_neuron().await;
    let models_path = write_models_toml(ALIAS, TARGET);

    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: mock_url,
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: models_path.to_string_lossy().to_string(),
        entitlements: Default::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // Register the target as loaded so the alias entry reports loaded=true.
    {
        let loaded = ModelEntry {
            id: TARGET.into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: Some(2000),
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").expect("node must exist");
        node.healthy = true;
        node.models.insert(TARGET.into(), loaded);
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let gateway_url = format!("http://{}", listener.local_addr().unwrap());
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    let resp = reqwest::get(format!("{gateway_url}/v1/models"))
        .await
        .expect("gateway should respond");
    let listing: serde_json::Value = resp.json().await.unwrap();
    let entries = listing
        .get("data")
        .and_then(|d| d.as_array())
        .expect("data array");

    // Both ids must appear in the listing.
    assert!(
        entries.iter().any(|e| id_of(e) == Some(TARGET)),
        "target should be listed"
    );
    assert!(
        entries.iter().any(|e| id_of(e) == Some(ALIAS)),
        "alias should be listed"
    );

    // The alias entry mirrors the target's load state and placement.
    let alias_entry = entries
        .iter()
        .find(|e| id_of(e) == Some(ALIAS))
        .expect("alias entry");
    assert_eq!(alias_entry.get("loaded"), Some(&json!(true)));

    let locations = alias_entry
        .get("locations")
        .and_then(|l| l.as_array())
        .expect("locations array");
    assert_eq!(locations.len(), 1);
    let first_node = locations[0].get("node").and_then(|n| n.as_str());
    assert_eq!(first_node, Some("mock-node"));
}
/// A request for a model id that is not an alias must pass through
/// untouched even though the catalogue defines an unrelated alias.
#[tokio::test]
async fn test_alias_falls_through_for_unmapped_model() {
    const MODEL: &str = "test-model";

    let mock_url = common::spawn_mock_neuron().await;
    // The only alias in the catalogue points somewhere else entirely.
    let models_path = write_models_toml("helexa/large", "definitely-not-loaded");

    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: mock_url,
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: models_path.to_string_lossy().to_string(),
        entitlements: Default::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    {
        let loaded = ModelEntry {
            id: MODEL.into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: None,
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").expect("node must exist");
        node.healthy = true;
        node.models.insert(MODEL.into(), loaded);
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let gateway_url = format!("http://{}", listener.local_addr().unwrap());
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    let request = json!({
        "model": MODEL,
        "messages": [{"role": "user", "content": "hi"}],
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/chat/completions"))
        .json(&request)
        .send()
        .await
        .unwrap();
    assert!(resp.status().is_success());

    let body: serde_json::Value = resp.json().await.unwrap();
    let echoed = body.get("model").and_then(|m| m.as_str());
    assert_eq!(echoed, Some(MODEL));
}

View File

@@ -123,212 +123,3 @@ async fn test_anthropic_invalid_request() {
assert_eq!(resp.status(), 400);
}
/// Tool round-trip through `/v1/messages`.
///
/// The request carries Anthropic-style tool definitions
/// (`{name, description, input_schema}`) plus a `tool_use` /
/// `tool_result` exchange in the history. What the upstream neuron
/// receives must be in OpenAI form: function tools with `parameters`,
/// an assistant message with `tool_calls`, and a `role:"tool"` message.
/// Guards against the failure where malformed tool definitions reached
/// the model and it improvised an unparseable `<tool_use_name>` format.
#[tokio::test]
async fn test_anthropic_tools_reshaped_for_upstream() {
    let (mock_url, captured) = common::spawn_capturing_mock_neuron().await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let request = json!({
        "model": "test-model",
        "max_tokens": 100,
        "tools": [{
            "name": "Read",
            "description": "Read a file from disk",
            "input_schema": {
                "type": "object",
                "properties": {"path": {"type": "string"}},
                "required": ["path"]
            }
        }],
        "tool_choice": {"type": "auto"},
        "messages": [
            {"role": "user", "content": "read /etc/hosts"},
            {"role": "assistant", "content": [
                {"type": "text", "text": "Reading it."},
                {"type": "tool_use", "id": "toolu_42", "name": "Read",
                 "input": {"path": "/etc/hosts"}}
            ]},
            {"role": "user", "content": [
                {"type": "tool_result", "tool_use_id": "toolu_42",
                 "content": "127.0.0.1 localhost"}
            ]}
        ]
    });

    let resp = reqwest::Client::new()
        .post(format!("{gw_url}/v1/messages"))
        .header("content-type", "application/json")
        .json(&request)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 200);

    // The most recent body the mock neuron captured is what the gateway
    // forwarded for this request.
    let forwarded = captured
        .lock()
        .unwrap()
        .last()
        .cloned()
        .expect("upstream received a request");

    // Tool definitions arrive in OpenAI function form.
    let tools = forwarded["tools"].as_array().expect("tools array");
    let read_tool = &tools[0];
    let function = &read_tool["function"];
    assert_eq!(read_tool["type"], "function");
    assert_eq!(function["name"], "Read");
    assert_eq!(function["parameters"]["properties"]["path"]["type"], "string");
    assert!(function.get("input_schema").is_none());

    // tool_choice is mapped to its OpenAI spelling.
    assert_eq!(forwarded["tool_choice"], "auto");

    // History: the assistant turn carries tool_calls.
    let msgs = forwarded["messages"].as_array().expect("messages array");
    let assistant = msgs
        .iter()
        .find(|m| m["role"] == "assistant")
        .expect("assistant turn");
    let call = &assistant["tool_calls"][0];
    assert_eq!(call["id"], "toolu_42");
    assert_eq!(call["function"]["name"], "Read");
    // `arguments` must be the parsed object rather than a JSON string:
    // the Qwen3.6 chat template iterates `tool_call.arguments | items`.
    assert_eq!(call["function"]["arguments"], json!({"path": "/etc/hosts"}));

    // History: the tool result becomes a role:"tool" message.
    let tool_msg = msgs
        .iter()
        .find(|m| m["role"] == "tool")
        .expect("tool result turn");
    assert_eq!(tool_msg["tool_call_id"], "toolu_42");
    assert_eq!(tool_msg["content"], "127.0.0.1 localhost");
}
/// #24: a streaming `/v1/messages` request must come back as an
/// Anthropic SSE stream, not as raw OpenAI chunks. Checks the content
/// type, the exact event sequence, the reassembled text, and the final
/// `message_delta`.
#[tokio::test]
async fn test_anthropic_streaming_sse_translation() {
    let chunk_delay = std::time::Duration::from_millis(20);
    let mock_url = common::spawn_streaming_mock_neuron(4, chunk_delay).await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let request = json!({
        "model": "test-model",
        "max_tokens": 64,
        "stream": true,
        "messages": [{"role": "user", "content": "Hi"}]
    });
    let resp = reqwest::Client::new()
        .post(format!("{gw_url}/v1/messages"))
        .header("content-type", "application/json")
        .json(&request)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 200);

    let content_type = resp
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("");
    assert!(
        content_type.starts_with("text/event-stream"),
        "anthropic stream must be SSE"
    );

    let body = resp.text().await.expect("stream should complete");
    assert!(
        !body.contains("chat.completion.chunk"),
        "raw OpenAI frames must not leak through:\n{body}"
    );

    let expected_events = vec![
        "message_start",
        "content_block_start",
        "content_block_delta",
        "content_block_delta",
        "content_block_delta",
        "content_block_delta",
        "content_block_stop",
        "message_delta",
        "message_stop",
    ];
    let event_names: Vec<&str> = body
        .lines()
        .filter_map(|l| l.strip_prefix("event: "))
        .collect();
    assert_eq!(
        event_names, expected_events,
        "unexpected event sequence:\n{body}"
    );

    // Every `data:` line that parses as JSON, in stream order.
    let frames: Vec<serde_json::Value> = body
        .lines()
        .filter_map(|l| l.strip_prefix("data: "))
        .filter_map(|d| serde_json::from_str(d).ok())
        .collect();

    // The mock emits token0..token3; the deltas must concatenate to that.
    let text: String = frames
        .iter()
        .filter(|f| f["type"] == "content_block_delta")
        .filter_map(|f| f["delta"]["text"].as_str())
        .collect();
    assert_eq!(text, "token0token1token2token3");

    // The mock sends no finish_reason, so stop_reason defaults to
    // end_turn and output_tokens falls back to the number of deltas.
    let message_delta = frames
        .iter()
        .find(|f| f["type"] == "message_delta")
        .expect("message_delta event present");
    assert_eq!(message_delta["delta"]["stop_reason"], "end_turn");
    assert_eq!(message_delta["usage"]["output_tokens"], 4);
}
/// #24: when the upstream stream ends with a usage frame (the
/// `stream_options.include_usage` shape), its token counts must show up
/// in the translated `message_delta` event.
#[tokio::test]
async fn test_anthropic_streaming_usage_propagation() {
    let chunk_delay = std::time::Duration::from_millis(10);
    let mock_url =
        common::spawn_streaming_mock_neuron_with_usage(3, chunk_delay, 225, 42).await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let request = json!({
        "model": "test-model",
        "max_tokens": 64,
        "stream": true,
        "messages": [{"role": "user", "content": "Hi"}]
    });
    let resp = reqwest::Client::new()
        .post(format!("{gw_url}/v1/messages"))
        .header("content-type", "application/json")
        .json(&request)
        .send()
        .await
        .expect("request should succeed");
    let body = resp.text().await.expect("stream should complete");

    let message_delta = body
        .lines()
        .filter_map(|line| line.strip_prefix("data: "))
        .filter_map(|payload| serde_json::from_str::<serde_json::Value>(payload).ok())
        .find(|frame| frame["type"] == "message_delta")
        .expect("message_delta event present");

    let usage = &message_delta["usage"];
    assert_eq!(usage["output_tokens"], 42);
    assert_eq!(usage["input_tokens"], 225);
}

View File

@@ -1,250 +0,0 @@
//! Integration tests for API-key auth + principal resolution (#49).
//!
//! Verifies the #63 rejection contract (401 invalid_api_key via the #60
//! envelope) and that an authenticated request reaches neuron carrying the
//! internal principal headers — while a client-supplied principal header is
//! stripped (anti-spoofing).
use axum::Json;
use axum::extract::Path;
use axum::http::HeaderMap;
use axum::routing::{get, post};
use cortex_core::config::{
ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
GatewaySettings, NeuronEndpoint,
};
use cortex_core::entitlements::{CapWindow, HEADER_ACCOUNT_ID, HEADER_KEY_ID};
use cortex_core::node::{ModelEntry, ModelStatus};
use cortex_gateway::state::CortexState;
use serde_json::{Value, json};
use std::sync::{Arc, Mutex};
use tokio::net::TcpListener;
/// What the mock neuron observed on the inbound `/v1/chat/completions`
/// request: the principal headers cortex stamped (or didn't).
///
/// Both fields start as `None` (via `Default`) and are overwritten on
/// every request the mock handles, so they describe the most recent one.
#[derive(Default)]
struct Seen {
/// Value of the `HEADER_ACCOUNT_ID` header, if present and valid UTF-8.
account_id: Option<String>,
/// Value of the `HEADER_KEY_ID` header, if present and valid UTF-8.
key_id: Option<String>,
}
/// Spawn a mock neuron that records the principal headers it receives and
/// returns a trivial chat completion. Returns (base_url, observed).
///
/// The server binds an ephemeral port on 127.0.0.1 and runs on a spawned
/// task. It serves two routes:
/// - `GET /models/{model_id}/endpoint` — reports the server's own base URL
///   as the inference URL, for any model id.
/// - `POST /v1/chat/completions` — stores the principal headers into the
///   shared `Seen`, then answers with a fixed completion that echoes the
///   request's `model` (or `"m"` when the field is absent).
async fn spawn_capturing_neuron() -> (String, Arc<Mutex<Seen>>) {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let base_url = format!("http://{addr}");
// Inference is served by this same mock, so the endpoint lookup hands
// back the base URL.
let inference_url = base_url.clone();
let seen: Arc<Mutex<Seen>> = Arc::new(Mutex::new(Seen::default()));
// Handle moved into the request handler; `seen` goes back to the caller.
let sink = Arc::clone(&seen);
let app = axum::Router::new()
.route(
"/models/{model_id}/endpoint",
get(move |Path(_): Path<String>| {
let url = inference_url.clone();
async move { Json(json!({ "url": url })) }
}),
)
.route(
"/v1/chat/completions",
post(move |headers: HeaderMap, Json(body): Json<Value>| {
let sink = Arc::clone(&sink);
async move {
// Inner scope so the mutex guard is released before the
// response is built.
{
let mut s = sink.lock().unwrap();
s.account_id = headers
.get(HEADER_ACCOUNT_ID)
.and_then(|v| v.to_str().ok())
.map(str::to_string);
s.key_id = headers
.get(HEADER_KEY_ID)
.and_then(|v| v.to_str().ok())
.map(str::to_string);
}
let model = body.get("model").and_then(Value::as_str).unwrap_or("m");
Json(json!({
"id": "chatcmpl-auth-001",
"object": "chat.completion",
"created": 1700000000_u64,
"model": model,
"choices": [{
"index": 0,
"message": {"role": "assistant", "content": "ok"},
"finish_reason": "stop"
}],
"usage": {"prompt_tokens": 3, "completion_tokens": 1, "total_tokens": 4}
}))
}
}),
)
.with_state(());
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
(base_url, seen)
}
/// Start a gateway on an ephemeral local port and return its base URL.
///
/// The gateway is configured with the supplied `entitlements` and a
/// single neuron named `mock-node` at `neuron_url`. Because `build_app`
/// spawns no poller, the node is marked healthy and `test-model` is
/// seeded as loaded by hand.
async fn spawn_gateway(neuron_url: &str, entitlements: EntitlementsConfig) -> String {
    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: neuron_url.to_string(),
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: "/dev/null".into(),
        entitlements,
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    {
        let seeded = ModelEntry {
            id: "test-model".into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: Some(8000),
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").unwrap();
        node.healthy = true;
        node.models.insert("test-model".into(), seeded);
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let base_url = format!("http://{}", listener.local_addr().unwrap());
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });
    base_url
}
/// Entitlements config with exactly one API key: `sk-good`, belonging to
/// account `acct-1` with key id `key-1`, no hard cap, balance window.
/// `require_auth` is passed through unchanged.
fn one_key_config(require_auth: bool) -> EntitlementsConfig {
    let good_key = ApiKeyConfig {
        key: "sk-good".into(),
        account_id: "acct-1".into(),
        key_id: Some("key-1".into()),
        hard_cap: None,
        window: CapWindow::Balance,
    };
    EntitlementsConfig {
        require_auth,
        keys: vec![good_key],
    }
}
/// Minimal chat-completion request body: one user message ("hi")
/// addressed to `test-model`.
fn chat_body() -> Value {
    let messages = json!([{"role": "user", "content": "hi"}]);
    json!({
        "model": "test-model",
        "messages": messages
    })
}
/// With auth required, a request without any credential is rejected with
/// 401 and the `invalid_api_key` / `invalid_request_error` envelope.
#[tokio::test]
async fn missing_key_when_required_is_401_invalid_api_key() {
    let (neuron, _seen) = spawn_capturing_neuron().await;
    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;

    let url = format!("{gateway}/v1/chat/completions");
    let http = reqwest::Client::new();
    let resp = http.post(url).json(&chat_body()).send().await.unwrap();

    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
    let body: Value = resp.json().await.unwrap();
    let error = &body["error"];
    assert_eq!(error["code"], "invalid_api_key");
    assert_eq!(error["type"], "invalid_request_error");
}
/// A credential that is present but wrong is rejected with 401 even when
/// auth is optional, and the request never reaches the neuron.
#[tokio::test]
async fn invalid_key_is_401_even_when_auth_not_required() {
    let (neuron, seen) = spawn_capturing_neuron().await;
    // Auth is optional here; a bad key must still be an error.
    let gateway = spawn_gateway(&neuron, one_key_config(false)).await;

    let url = format!("{gateway}/v1/chat/completions");
    let http = reqwest::Client::new();
    let resp = http
        .post(url)
        .bearer_auth("sk-wrong")
        .json(&chat_body())
        .send()
        .await
        .unwrap();

    assert_eq!(resp.status(), reqwest::StatusCode::UNAUTHORIZED);
    let body: Value = resp.json().await.unwrap();
    assert_eq!(body["error"]["code"], "invalid_api_key");

    // Rejected before dispatch: the mock neuron recorded nothing.
    let observed = seen.lock().unwrap();
    assert!(observed.account_id.is_none());
}
/// A valid key is accepted and the neuron sees the principal headers for
/// that key; a principal header supplied by the client is replaced, not
/// forwarded.
#[tokio::test]
async fn valid_key_reaches_neuron_with_principal_headers() {
    let (neuron, seen) = spawn_capturing_neuron().await;
    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;

    let url = format!("{gateway}/v1/chat/completions");
    let http = reqwest::Client::new();
    let resp = http
        .post(url)
        .bearer_auth("sk-good")
        // Spoofing attempt: this value must not survive to the neuron.
        .header(HEADER_ACCOUNT_ID, "attacker")
        .json(&chat_body())
        .send()
        .await
        .unwrap();
    assert_eq!(resp.status(), reqwest::StatusCode::OK);

    let observed = seen.lock().unwrap();
    assert_eq!(observed.account_id.as_deref(), Some("acct-1"));
    assert_eq!(observed.key_id.as_deref(), Some("key-1"));
}
/// With the default entitlements config, a request without credentials
/// succeeds and reaches the neuron without any principal headers.
#[tokio::test]
async fn anonymous_allowed_when_auth_not_required() {
    let (neuron, seen) = spawn_capturing_neuron().await;
    let gateway = spawn_gateway(&neuron, EntitlementsConfig::default()).await;

    let url = format!("{gateway}/v1/chat/completions");
    let http = reqwest::Client::new();
    let resp = http.post(url).json(&chat_body()).send().await.unwrap();
    assert_eq!(resp.status(), reqwest::StatusCode::OK);

    // No principal was resolved, so nothing was stamped on the request.
    let observed = seen.lock().unwrap();
    assert!(observed.account_id.is_none());
    assert!(observed.key_id.is_none());
}
/// `/health` answers 200 without credentials even when auth is required.
#[tokio::test]
async fn health_is_public_even_when_auth_required() {
    let (neuron, _seen) = spawn_capturing_neuron().await;
    let gateway = spawn_gateway(&neuron, one_key_config(true)).await;

    let url = format!("{gateway}/health");
    let http = reqwest::Client::new();
    let resp = http.get(url).send().await.unwrap();

    assert_eq!(resp.status(), reqwest::StatusCode::OK);
}

View File

@@ -22,7 +22,6 @@ use tokio::net::TcpListener;
/// - GET /models/:id/endpoint (returns the inference URL)
/// - POST /models/unload (accepts unload requests)
/// - GET /v1/chat/completions + POST /v1/chat/completions (inference)
///
/// Returns the neuron base URL.
pub async fn spawn_mock_neuron() -> String {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
@@ -44,7 +43,6 @@ pub async fn spawn_mock_neuron() -> String {
post(|Json(_body): Json<Value>| async { Json(json!({"status": "unloaded"})) }),
)
.route("/v1/chat/completions", post(mock_chat_completions))
.route("/v1/responses", post(mock_responses))
.route("/v1/models", get(mock_v1_models));
tokio::spawn(async move {
@@ -54,64 +52,9 @@ pub async fn spawn_mock_neuron() -> String {
base_url
}
/// Like [`spawn_mock_neuron`] but captures the JSON body of every
/// `POST /v1/chat/completions` it receives into the returned handle, so
/// a test can assert what the gateway *actually forwarded upstream*
/// (e.g. that Anthropic-shaped tools were reshaped to OpenAI form).
pub async fn spawn_capturing_mock_neuron() -> (String, Arc<std::sync::Mutex<Vec<Value>>>) {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let base_url = format!("http://{addr}");
let inference_url = base_url.clone();
let captured: Arc<std::sync::Mutex<Vec<Value>>> = Arc::new(std::sync::Mutex::new(Vec::new()));
let sink = captured.clone();
let app = Router::new()
.route("/models", get(mock_neuron_list_models))
.route(
"/models/{model_id}/endpoint",
get(move |Path(_): Path<String>| {
let url = inference_url.clone();
async move { Json(json!({"url": url})) }
}),
)
.route(
"/v1/chat/completions",
post(move |Json(body): Json<Value>| {
let sink = sink.clone();
async move {
let model = body
.get("model")
.and_then(|v| v.as_str())
.unwrap_or("unknown");
let resp = json!({
"id": "chatcmpl-capture-001",
"object": "chat.completion",
"created": 1700000000_u64,
"model": model,
"choices": [{
"index": 0,
"message": {"role": "assistant", "content": "Hello from mock backend"},
"finish_reason": "stop"
}],
"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
});
sink.lock().unwrap().push(body);
Json(resp)
}
}),
);
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
(base_url, captured)
}
async fn mock_neuron_list_models() -> Json<Value> {
Json(json!([
{"id": "test-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000, "capabilities": ["text"], "tool_call": false, "reasoning": false}
{"id": "test-model", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
]))
}
@@ -149,39 +92,6 @@ async fn mock_chat_completions(Json(body): Json<Value>) -> Json<Value> {
}))
}
async fn mock_responses(Json(body): Json<Value>) -> Json<Value> {
let model = body
.get("model")
.and_then(|v| v.as_str())
.unwrap_or("unknown");
// Echo the model field back and synthesise a tiny ResponsesResponse.
// Mirrors the shape neuron's /v1/responses handler emits so the
// gateway test only needs to assert the proxy round-tripped it.
Json(json!({
"id": "resp-test-001",
"object": "response",
"created_at": 1700000000_u64,
"status": "completed",
"model": model,
"output": [{
"type": "message",
"id": "msg-test-001",
"role": "assistant",
"content": [{
"type": "output_text",
"text": "Hello from mock backend",
"annotations": []
}],
"status": "completed"
}],
"usage": {
"input_tokens": 5,
"output_tokens": 5,
"total_tokens": 10
}
}))
}
/// Spawns a mock neuron that returns SSE streaming responses for chat completions.
pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Duration) -> String {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
@@ -251,120 +161,8 @@ pub async fn spawn_streaming_mock_neuron(chunk_count: usize, chunk_delay: Durati
base_url
}
/// Like `spawn_streaming_mock_neuron`, but the stream ends with an
/// OpenAI `stream_options.include_usage`-style final chunk (empty
/// choices + usage object) before `[DONE]` — the shape the gateway's
/// token metrics (#21) extract counts from.
pub async fn spawn_streaming_mock_neuron_with_usage(
chunk_count: usize,
chunk_delay: Duration,
prompt_tokens: u64,
completion_tokens: u64,
) -> String {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let base_url = format!("http://{addr}");
let inference_url = base_url.clone();
let app = Router::new()
.route("/models", get(mock_neuron_list_models))
.route(
"/models/{model_id}/endpoint",
get(move |Path(_model_id): Path<String>| {
let url = inference_url.clone();
async move { Json(json!({"url": url})) }
}),
)
.route(
"/v1/chat/completions",
post(move |Json(body): Json<Value>| async move {
let model = body
.get("model")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_string();
let mut chunks: Vec<String> = (0..chunk_count)
.map(|i| {
let chunk = json!({
"id": "chatcmpl-stream-002",
"object": "chat.completion.chunk",
"created": 1700000000_u64,
"model": model,
"choices": [{
"index": 0,
"delta": { "content": format!("token{i}") },
"finish_reason": null
}]
});
format!("data: {chunk}\n\n")
})
.collect();
let usage_chunk = json!({
"id": "chatcmpl-stream-002",
"object": "chat.completion.chunk",
"created": 1700000000_u64,
"model": model,
"choices": [],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
});
chunks.push(format!("data: {usage_chunk}\n\n"));
chunks.push("data: [DONE]\n\n".to_string());
let delay = chunk_delay;
let stream = stream::iter(chunks).then(move |chunk| async move {
tokio::time::sleep(delay).await;
Ok::<_, std::convert::Infallible>(chunk)
});
Response::builder()
.header(header::CONTENT_TYPE, "text/event-stream")
.header(header::CACHE_CONTROL, "no-cache")
.body(Body::from_stream(stream))
.unwrap()
}),
);
tokio::spawn(async move {
axum::serve(listener, app).await.unwrap();
});
base_url
}
/// Spawns a mock neuron serving a caller-supplied models list, paired with
/// the default `/health` body from [`default_health_response`].
pub async fn spawn_mock_neuron_with_models(models_response: Value) -> String {
    let health_response = default_health_response();
    spawn_mock_neuron_with_models_and_health(models_response, health_response).await
}
/// `/health` body for mocks that are indifferent to the activation field:
/// zero uptime, no devices, nothing pending or in progress, state `ready`.
pub fn default_health_response() -> Value {
    let activation = json!({
        "state": "ready",
        "pending": [],
        "in_progress": null,
        "completed": [],
        "failed": []
    });

    json!({
        "uptime_secs": 0,
        "devices": [],
        "activation": activation
    })
}
/// Variant of `spawn_mock_neuron_with_models` that also serves a
/// `/health` body. Used by tests that drive the gateway's activation
/// surface (poller reading /health, /v1/models synthesising Loading
/// locations from in_progress / pending).
pub async fn spawn_mock_neuron_with_models_and_health(
models_response: Value,
health_response: Value,
) -> String {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap();
let base_url = format!("http://{addr}");
@@ -378,13 +176,6 @@ pub async fn spawn_mock_neuron_with_models_and_health(
async move { Json(resp) }
}),
)
.route(
"/health",
get(move || {
let resp = health_response.clone();
async move { Json(resp) }
}),
)
.route(
"/models/{model_id}/endpoint",
get(move |Path(_model_id): Path<String>| {
@@ -429,7 +220,6 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
endpoint: mock_url.to_string(),
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = Arc::new(CortexState::from_config(&config));
@@ -446,10 +236,6 @@ pub async fn spawn_gateway_with_state(mock_url: &str) -> (Arc<CortexState>, Stri
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: Some(8000),
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
}

View File

@@ -1,140 +0,0 @@
mod common;
use serde_json::json;
/// A chat completion for a model the mock neuron has not loaded is
/// answered with 404 and an error object coded `model_not_found`.
#[tokio::test]
async fn error_response_model_not_found() {
    let neuron_url = common::spawn_mock_neuron().await;
    let gateway_url = common::spawn_gateway(&neuron_url).await;

    // Request a model that isn't loaded on the mock neuron.
    let payload = json!({
        "model": "nonexistent-model",
        "messages": [{"role": "user", "content": "hi"}]
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/chat/completions"))
        .header("Content-Type", "application/json")
        .json(&payload)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), axum::http::StatusCode::NOT_FOUND);

    let body: serde_json::Value = resp.json().await.expect("valid json");
    let err = body.get("error").expect("response has error object");

    // Broad type categorization
    let kind = err.get("type").unwrap();
    assert_eq!(kind, "invalid_request_error");

    // Specific machine-readable code
    let code = err.get("code").unwrap().as_str().unwrap();
    assert_eq!(code, "model_not_found");

    // param is always null
    let param = err.get("param").unwrap();
    assert!(param.is_null());
}
/// A chat completion body lacking `model` is rejected with 400 and an
/// error object coded `missing_model_field`.
#[tokio::test]
async fn error_response_missing_model_field() {
    let neuron_url = common::spawn_mock_neuron().await;
    let gateway_url = common::spawn_gateway(&neuron_url).await;

    // Request without the required `model` field.
    let payload = json!({
        "messages": [{"role": "user", "content": "hi"}]
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/chat/completions"))
        .header("Content-Type", "application/json")
        .json(&payload)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), axum::http::StatusCode::BAD_REQUEST);

    let body: serde_json::Value = resp.json().await.expect("valid json");
    let err = body.get("error").expect("response has error object");

    let kind = err.get("type").unwrap();
    assert_eq!(kind, "invalid_request_error");

    let code = err.get("code").unwrap().as_str().unwrap();
    assert_eq!(code, "missing_model_field");

    let param = err.get("param").unwrap();
    assert!(param.is_null());
}
/// With a single neuron configured at an unreachable address, a chat
/// completion is answered with 503, `Retry-After: 5`, and an error object
/// coded `service_unavailable`.
#[tokio::test]
async fn error_response_no_healthy_nodes() {
    use cortex_core::config::{EvictionSettings, GatewayConfig, GatewaySettings, NeuronEndpoint};
    use std::sync::Arc;

    // The only neuron points at an unreachable port so no node is ever healthy.
    let dead_node = NeuronEndpoint {
        name: "dead-node".into(),
        endpoint: "http://127.0.0.1:1".into(),
    };
    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
            metrics_listen: "127.0.0.1:0".into(),
        },
        eviction: EvictionSettings {
            strategy: cortex_core::config::EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
        neurons: vec![dead_node],
        models_config: "/dev/null".into(),
        entitlements: Default::default(),
    };

    let fleet = Arc::new(cortex_gateway::state::CortexState::from_config(&config));
    let app = cortex_gateway::build_app(fleet);
    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    // Allow the poller a moment to mark the node unhealthy.
    tokio::time::sleep(std::time::Duration::from_millis(200)).await;

    let payload = json!({
        "model": "any-model",
        "messages": [{"role": "user", "content": "hi"}]
    });
    let resp = reqwest::Client::new()
        .post(format!("http://{addr}/v1/chat/completions"))
        .header("Content-Type", "application/json")
        .json(&payload)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), axum::http::StatusCode::SERVICE_UNAVAILABLE);

    // Transient 503 — the gateway advertises Retry-After so OpenAI-compatible
    // clients back off and retry rather than surfacing an opaque error (#63).
    // The header is copied out before `resp.json()` consumes the response.
    let retry_header = resp
        .headers()
        .get(reqwest::header::RETRY_AFTER)
        .expect("transient 503 must carry Retry-After");
    let retry_after = retry_header.to_str().unwrap().to_string();
    assert_eq!(retry_after, "5");

    let body: serde_json::Value = resp.json().await.expect("valid json");
    let err = body.get("error").expect("response has error object");

    let kind = err.get("type").unwrap();
    assert_eq!(kind, "api_error");

    let code = err.get("code").unwrap().as_str().unwrap();
    assert_eq!(code, "service_unavailable");

    let param = err.get("param").unwrap();
    assert!(param.is_null());
}

View File

@@ -71,7 +71,6 @@ fn make_fleet(endpoint: &str, defrag_after: u32) -> Arc<CortexState> {
endpoint: endpoint.to_string(),
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
Arc::new(CortexState::from_config(&config))
}
@@ -92,10 +91,6 @@ async fn test_evict_lru_model() {
status: ModelStatus::Loaded,
last_accessed: Some(Utc::now() - chrono::Duration::hours(2)),
vram_estimate_mb: Some(8000),
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
node.models.insert(
@@ -105,10 +100,6 @@ async fn test_evict_lru_model() {
status: ModelStatus::Loaded,
last_accessed: Some(Utc::now()),
vram_estimate_mb: Some(8000),
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
}
@@ -172,10 +163,6 @@ async fn test_eviction_increments_lifecycle_cycles() {
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: None,
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
}

View File

@@ -1,207 +0,0 @@
//! Integration tests for per-request token metering (#51).
//!
//! Drives authenticated requests through the gateway to a mock neuron that
//! reports a fixed `usage` object, then asserts the EntitlementProvider's
//! spend ledger reflects cumulative per-key spend and that reservations
//! settle to actual (no outstanding reserved tokens once requests complete).
mod common;
use cortex_core::config::{
ApiKeyConfig, EntitlementsConfig, EvictionSettings, EvictionStrategy, GatewayConfig,
GatewaySettings, NeuronEndpoint,
};
use cortex_core::entitlements::{CapWindow, Principal};
use cortex_core::node::{ModelEntry, ModelStatus};
use cortex_gateway::state::CortexState;
use serde_json::json;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpListener;
const ACCOUNT: &str = "acct-meter";
const KEY_ID: &str = "key-meter";
const BEARER: &str = "sk-meter";
/// The mock neuron (common::spawn_mock_neuron) reports this fixed usage on
/// every chat completion.
const PROMPT_PER_REQ: u64 = 10;
const COMPLETION_PER_REQ: u64 = 5;
/// Builds and serves a gateway that requires auth and knows exactly one
/// API key (`BEARER` → `ACCOUNT` / `KEY_ID`, hard cap 1_000_000, balance
/// window), with `test-model` pre-seeded as loaded on a healthy
/// `mock-node` pointing at `neuron_url`.
///
/// Returns the shared state (for ledger inspection) and the gateway base URL.
async fn spawn_metered_gateway(neuron_url: &str) -> (Arc<CortexState>, String) {
    let metered_key = ApiKeyConfig {
        key: BEARER.into(),
        account_id: ACCOUNT.into(),
        key_id: Some(KEY_ID.into()),
        hard_cap: Some(1_000_000),
        window: CapWindow::Balance,
    };
    let mock_node = NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: neuron_url.to_string(),
    };
    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
            metrics_listen: "127.0.0.1:0".into(),
        },
        eviction: EvictionSettings {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
        neurons: vec![mock_node],
        models_config: "/dev/null".into(),
        entitlements: EntitlementsConfig {
            require_auth: true,
            keys: vec![metered_key],
        },
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // Seed the node state directly; the write guard is released at the end
    // of this scope, before the server starts.
    {
        let seeded_model = ModelEntry {
            id: "test-model".into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: Some(8000),
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").unwrap();
        node.healthy = true;
        node.models.insert("test-model".into(), seeded_model);
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let gateway_url = format!("http://{}", listener.local_addr().unwrap());
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    (fleet, gateway_url)
}
/// The principal matching the single key configured by
/// `spawn_metered_gateway` (`ACCOUNT` / `KEY_ID`).
fn principal() -> Principal {
    let account_id = ACCOUNT.into();
    let key_id = KEY_ID.into();
    Principal { account_id, key_id }
}
/// Polls the entitlement ledger for the test principal until its settled
/// spend is at least `expected`, checking up to 100 times with a 20 ms
/// pause after each miss (settle runs in a spawned task after the response
/// stream finishes). If the target is never reached, returns whatever a
/// final snapshot reports.
async fn await_spent(fleet: &CortexState, expected: u64) -> u64 {
    let who = principal();
    let mut attempts_left = 100;
    while attempts_left > 0 {
        let spent = fleet.entitlements.snapshot(&who).await.unwrap().spent;
        if spent >= expected {
            return spent;
        }
        tokio::time::sleep(Duration::from_millis(20)).await;
        attempts_left -= 1;
    }
    // Timed out: report the latest value so the caller's assertion shows it.
    let last = fleet.entitlements.snapshot(&who).await.unwrap();
    last.spent
}
/// Sends three authenticated chat completions and checks that the ledger
/// shows their combined prompt + completion tokens as spent, with no
/// reservation left outstanding and the configured hard cap intact.
#[tokio::test]
async fn cumulative_spend_is_metered_per_key() {
    const N: u64 = 3;

    let neuron = common::spawn_mock_neuron().await;
    let (fleet, gateway) = spawn_metered_gateway(&neuron).await;

    let client = reqwest::Client::new();
    let url = format!("{gateway}/v1/chat/completions");
    let payload = json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]});

    for _ in 0..N {
        let resp = client
            .post(url.as_str())
            .bearer_auth(BEARER)
            .json(&payload)
            .send()
            .await
            .unwrap();
        assert_eq!(resp.status(), reqwest::StatusCode::OK);
        // Drain the body so the response stream finishes and metering settles.
        let _ = resp.bytes().await.unwrap();
    }

    let per_request = PROMPT_PER_REQ + COMPLETION_PER_REQ;
    let expected = N * per_request;
    let spent = await_spent(&fleet, expected).await;
    assert_eq!(
        spent, expected,
        "ledger must reflect cumulative per-key spend"
    );

    // Reservations settled to actual — nothing left outstanding.
    let who = principal();
    let snap = fleet.entitlements.snapshot(&who).await.unwrap();
    assert_eq!(snap.reserved, 0, "all reservations must settle/release");
    assert_eq!(snap.hard_cap, Some(1_000_000));
}
/// An unauthenticated request served under the default entitlements config
/// leaves the ledger untouched: the snapshot for an unconfigured principal
/// still reports zero spend.
#[tokio::test]
async fn anonymous_request_records_no_spend() {
    // require_auth=false so the unauthenticated request is served, but with
    // no principal it must not touch any ledger.
    let neuron = common::spawn_mock_neuron().await;
    let mock_node = NeuronEndpoint {
        name: "mock-node".into(),
        endpoint: neuron.clone(),
    };
    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
            metrics_listen: "127.0.0.1:0".into(),
        },
        eviction: EvictionSettings {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
        neurons: vec![mock_node],
        models_config: "/dev/null".into(),
        entitlements: EntitlementsConfig::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // Seed the node state directly; the write guard is released at the end
    // of this scope, before the server starts.
    {
        let seeded_model = ModelEntry {
            id: "test-model".into(),
            status: ModelStatus::Loaded,
            last_accessed: None,
            vram_estimate_mb: Some(8000),
            capabilities: Vec::new(),
            tool_call: false,
            reasoning: false,
            limit: None,
        };
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").unwrap();
        node.healthy = true;
        node.models.insert("test-model".into(), seeded_model);
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    let payload = json!({"model": "test-model", "messages": [{"role": "user", "content": "hi"}]});
    let resp = reqwest::Client::new()
        .post(format!("http://{addr}/v1/chat/completions"))
        .json(&payload)
        .send()
        .await
        .unwrap();
    assert_eq!(resp.status(), reqwest::StatusCode::OK);
    let _ = resp.bytes().await.unwrap();

    // An unconfigured principal has a zeroed snapshot — nothing was metered.
    let nobody = Principal {
        account_id: "nobody".into(),
        key_id: "nobody".into(),
    };
    let snap = fleet.entitlements.snapshot(&nobody).await.unwrap();
    assert_eq!(snap.spent, 0);
}

View File

@@ -1,26 +1,20 @@
mod common;
use serde_json::json;
use std::sync::OnceLock;
/// The metrics recorder is a process-wide global; both tests in this
/// binary run against one shared install. Assertions must therefore be
/// order-independent (presence of names / monotonic counters, not
/// "empty before").
fn recorder() -> &'static metrics_exporter_prometheus::PrometheusHandle {
static HANDLE: OnceLock<metrics_exporter_prometheus::PrometheusHandle> = OnceLock::new();
HANDLE.get_or_init(|| {
cortex_gateway::metrics::install_test_recorder().expect("recorder should install")
})
}
#[tokio::test]
async fn test_metrics_emitted_after_proxy() {
let handle = recorder();
let handle = cortex_gateway::metrics::install_test_recorder().expect("recorder should install");
let mock_url = common::spawn_mock_neuron().await;
let gw_url = common::spawn_gateway(&mock_url).await;
let before = handle.render();
assert!(
!before.contains("cortex_requests_total"),
"no request metrics before any requests"
);
let client = reqwest::Client::new();
let resp = client
.post(format!("{gw_url}/v1/chat/completions"))
@@ -50,72 +44,3 @@ async fn test_metrics_emitted_after_proxy() {
"no errors expected for a successful request"
);
}
/// #21: a streamed chat completion that ends with a usage chunk must
/// produce the TTFT and tok/s metrics plus prompt/completion token
/// counters for the model.
#[tokio::test]
async fn test_token_metrics_emitted_for_streamed_request() {
    /// Value of the first rendered sample line that starts with `name` and
    /// carries the `test-model` label; panics if none is found or parses.
    fn counter_value(rendered: &str, name: &str) -> u64 {
        let sample = rendered
            .lines()
            .find(|l| l.starts_with(name) && l.contains(r#"model="test-model""#));
        let parsed = sample
            .and_then(|l| l.rsplit(' ').next())
            .and_then(|v| v.parse::<u64>().ok());
        match parsed {
            Some(value) => value,
            None => panic!("{name} should be present.\nMetrics:\n{rendered}"),
        }
    }

    // The recorder is global per process and shared with
    // test_metrics_emitted_after_proxy in this same test binary, so
    // whichever test touches it first, both render from the same recorder.
    // Assert on metric presence and lower bounds, never on exact totals.
    let handle = recorder();

    let mock_url = common::spawn_streaming_mock_neuron_with_usage(
        5,
        std::time::Duration::from_millis(40),
        225,
        42,
    )
    .await;
    let gw_url = common::spawn_gateway(&mock_url).await;

    let payload = json!({
        "model": "test-model",
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": true
    });
    let resp = reqwest::Client::new()
        .post(format!("{gw_url}/v1/chat/completions"))
        .header("content-type", "application/json")
        .json(&payload)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 200);

    // Reading the full body waits for the stream to run to completion.
    let body = resp.text().await.expect("stream should complete");
    assert!(body.contains("[DONE]"));

    let rendered = handle.render();
    let expected_names = [
        "cortex_time_to_first_token_seconds",
        "cortex_tokens_per_second",
    ];
    for needle in expected_names {
        assert!(
            rendered.contains(needle),
            "{needle} should be present.\nMetrics:\n{rendered}"
        );
    }

    // The recorder is shared with the sibling test (same model/node
    // labels), so counters are lower bounds, not exact values: this
    // request contributed prompt=225 / completion=42.
    let prompt_total = counter_value(&rendered, "cortex_prompt_tokens_total");
    assert!(
        prompt_total >= 225,
        "prompt token counter should include this request's 225.\nMetrics:\n{rendered}"
    );
    let completion_total = counter_value(&rendered, "cortex_completion_tokens_total");
    assert!(
        completion_total >= 42,
        "completion token counter should include this request's 42.\nMetrics:\n{rendered}"
    );
}

View File

@@ -1,132 +0,0 @@
//! Issue #62 / #67: `GET /v1/models` advertises a per-model serving budget so
//! an OpenAI-compatible client (opencode's helexa provider) can size and
//! compact its context without hand-configuration.
//!
//! Asserts the composition sources land on the response:
//! - `limit` from the neuron's self-derived value (#67) — NOT the catalogue;
//! an operator-declared catalogue `limit` is deliberately ignored.
//! - `cost` from the catalogue profile (operator-set pricing).
//! - `tool_call` / `reasoning` from the neuron's runtime detection (OR-ed in)
//!
//! Also a regression guard for the removal of `max_model_len` — the misnamed,
//! unconsumed vLLM-ism that this contract replaces.
use cortex_core::config::{
EvictionSettings, EvictionStrategy, GatewayConfig, GatewaySettings, NeuronEndpoint,
};
use cortex_core::harness::ModelLimit;
use cortex_core::node::{ModelEntry, ModelStatus};
use cortex_gateway::state::CortexState;
use std::sync::Arc;
use tokio::net::TcpListener;
/// `GET /v1/models` must report the neuron's self-derived `limit`, the
/// catalogue's `cost`, and the runtime-detected `tool_call` / `reasoning`
/// flags for a loaded model, and must not advertise `max_model_len`.
#[tokio::test]
async fn v1_models_surfaces_limit_cost_and_capability_flags() {
    /// Removes the wrapped file on drop, so the temporary catalogue is
    /// cleaned up even when an assertion below panics.
    struct TempCatalogue(std::path::PathBuf);

    impl Drop for TempCatalogue {
        fn drop(&mut self) {
            // Best-effort cleanup: a missing file is not an error here.
            let _ = std::fs::remove_file(&self.0);
        }
    }

    // Catalogue declares pricing + an operator `limit` that must be IGNORED
    // (#67): the neuron's self-derived limit is authoritative.
    let models_toml = r#"
[[models]]
id = "test-model"
harness = "candle"
limit.context = 999999
limit.input = 999999
limit.output = 999999
cost.input = 0.0
cost.output = 0.0
capabilities = ["text"]
"#;
    // The process id keeps concurrent runs of this test binary from sharing
    // (and deleting) each other's catalogue file in the common temp dir.
    let catalogue = TempCatalogue(std::env::temp_dir().join(format!(
        "cortex_test_issue62_models_{}.toml",
        std::process::id()
    )));
    std::fs::write(&catalogue.0, models_toml).unwrap();

    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
            metrics_listen: "127.0.0.1:0".into(),
        },
        eviction: EvictionSettings {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
        neurons: vec![NeuronEndpoint {
            name: "mock-node".into(),
            // Never contacted: build_app does not spawn the poller, so the
            // seeded state below is authoritative for /v1/models.
            endpoint: "http://127.0.0.1:1".into(),
        }],
        models_config: catalogue.0.to_string_lossy().into_owned(),
        entitlements: Default::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // Seed the model as loaded on the node with runtime-detected flags set —
    // these must OR into the catalogue entry, not be lost.
    {
        let mut nodes = fleet.nodes.write().await;
        let node = nodes.get_mut("mock-node").expect("node exists");
        node.healthy = true;
        node.models.insert(
            "test-model".into(),
            ModelEntry {
                id: "test-model".into(),
                status: ModelStatus::Loaded,
                last_accessed: None,
                vram_estimate_mb: Some(8000),
                capabilities: vec!["text".into()],
                tool_call: true,
                reasoning: true,
                // Neuron's self-derived limit (#67) — the authoritative
                // source. Distinct from the catalogue's (ignored) values.
                limit: Some(ModelLimit {
                    context: 49152,
                    input: Some(40960),
                    output: 8192,
                }),
            },
        );
    }

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    let body: serde_json::Value = reqwest::Client::new()
        .get(format!("http://{addr}/v1/models"))
        .send()
        .await
        .unwrap()
        .json()
        .await
        .unwrap();
    let entry = body["data"]
        .as_array()
        .expect("data is an array")
        .iter()
        .find(|m| m["id"] == "test-model")
        .expect("test-model present in /v1/models");

    // `limit` is the neuron's self-derived value (#67), NOT the catalogue's
    // (which declared 999999 and must be ignored). `cost` still flows from
    // the catalogue.
    assert_eq!(entry["limit"]["context"], 49152);
    assert_eq!(entry["limit"]["input"], 40960);
    assert_eq!(entry["limit"]["output"], 8192);
    assert_eq!(entry["cost"]["input"], 0.0);
    assert_eq!(entry["cost"]["output"], 0.0);

    // Runtime-detected capability flags OR-ed in from the neuron's ModelEntry.
    assert_eq!(entry["tool_call"], true);
    assert_eq!(entry["reasoning"], true);

    // Regression guard: the removed, unconsumed vLLM-ism must not reappear.
    assert!(
        entry.get("max_model_len").is_none(),
        "max_model_len was removed; /v1/models must not advertise it"
    );
    // `catalogue` is dropped at the end of scope, removing the temp file.
}

View File

@@ -12,8 +12,8 @@ use std::sync::Arc;
async fn test_poller_discovers_models() {
// Mock neuron reports 2 models via /models endpoint (neuron format).
let mock_url = common::spawn_mock_neuron_with_models(json!([
{"id": "model-a", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
{"id": "model-b", "harness": "candle", "status": "unloaded", "devices": [], "vram_used_mb": null}
{"id": "model-a", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
{"id": "model-b", "harness": "mistralrs", "status": "unloaded", "devices": [], "vram_used_mb": null}
]))
.await;
@@ -31,7 +31,6 @@ async fn test_poller_discovers_models() {
endpoint: mock_url,
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = Arc::new(CortexState::from_config(&config));
@@ -64,8 +63,8 @@ async fn test_poller_discovers_models() {
#[tokio::test]
async fn test_poller_updates_gateway_models_endpoint() {
let mock_url = common::spawn_mock_neuron_with_models(json!([
{"id": "model-x", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
{"id": "model-y", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null}
{"id": "model-x", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
{"id": "model-y", "harness": "mistralrs", "status": "loaded", "devices": [1], "vram_used_mb": null}
]))
.await;
@@ -83,7 +82,6 @@ async fn test_poller_updates_gateway_models_endpoint() {
endpoint: mock_url,
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = Arc::new(CortexState::from_config(&config));
@@ -120,88 +118,6 @@ async fn test_poller_updates_gateway_models_endpoint() {
}
}
/// C3: when two neurons hold the same model with different capability
/// sets, `/v1/models` must list the de-duplicated union of capabilities
/// and one location per node.
#[tokio::test]
async fn test_models_endpoint_unions_capabilities_across_nodes() {
    // A model loaded text-only on one node and text+vision on another is
    // vision-capable to the fleet.
    let text_only = json!([
        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null, "capabilities": ["text"]}
    ]);
    let text_and_vision = json!([
        {"id": "shared-model", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null, "capabilities": ["text", "vision"]}
    ]);
    let node_a = common::spawn_mock_neuron_with_models(text_only).await;
    let node_b = common::spawn_mock_neuron_with_models(text_and_vision).await;

    let neurons = vec![
        NeuronEndpoint {
            name: "node-a".into(),
            endpoint: node_a,
        },
        NeuronEndpoint {
            name: "node-b".into(),
            endpoint: node_b,
        },
    ];
    let config = GatewayConfig {
        gateway: GatewaySettings {
            listen: "127.0.0.1:0".into(),
            metrics_listen: "127.0.0.1:0".into(),
        },
        eviction: EvictionSettings {
            strategy: EvictionStrategy::Lru,
            defrag_after_cycles: 0,
        },
        neurons,
        models_config: "/dev/null".into(),
        entitlements: Default::default(),
    };
    let fleet = Arc::new(CortexState::from_config(&config));

    // One explicit poll populates gateway state from both mocks.
    cortex_gateway::poller::poll_once(&fleet).await;

    let app = cortex_gateway::build_app(Arc::clone(&fleet));
    let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    tokio::spawn(async move {
        axum::serve(listener, app).await.unwrap();
    });

    let resp = reqwest::Client::new()
        .get(format!("http://{addr}/v1/models"))
        .send()
        .await
        .expect("request should succeed");
    let body: serde_json::Value = resp.json().await.unwrap();

    let listed = body["data"].as_array().expect("data array");
    let model = listed
        .iter()
        .find(|m| m["id"] == "shared-model")
        .expect("shared-model should be present");

    let caps: Vec<&str> = model["capabilities"]
        .as_array()
        .expect("capabilities array")
        .iter()
        .filter_map(|c| c.as_str())
        .collect();
    assert!(caps.contains(&"text"), "union must include text: {caps:?}");
    assert!(
        caps.contains(&"vision"),
        "union must include vision: {caps:?}"
    );
    assert_eq!(caps.len(), 2, "union must not duplicate text: {caps:?}");

    // Both nodes hold the model, so two locations regardless of caps.
    let locations = model["locations"].as_array().unwrap();
    assert_eq!(locations.len(), 2);
}
#[tokio::test]
async fn test_poller_marks_unreachable_node_unhealthy() {
let config = GatewayConfig {
@@ -218,7 +134,6 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
endpoint: "http://127.0.0.1:1".into(),
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = Arc::new(CortexState::from_config(&config));
@@ -237,8 +152,8 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
#[tokio::test]
async fn test_poller_removes_stale_models() {
let mock_url = common::spawn_mock_neuron_with_models(json!([
{"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
{"id": "drop-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
{"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
{"id": "drop-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
]))
.await;
@@ -256,7 +171,6 @@ async fn test_poller_removes_stale_models() {
endpoint: mock_url,
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = Arc::new(CortexState::from_config(&config));
@@ -269,7 +183,7 @@ async fn test_poller_removes_stale_models() {
// New mock with only one model.
let new_mock_url = common::spawn_mock_neuron_with_models(json!([
{"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
{"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
]))
.await;
@@ -287,7 +201,6 @@ async fn test_poller_removes_stale_models() {
endpoint: new_mock_url,
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet2 = Arc::new(CortexState::from_config(&config2));
@@ -303,10 +216,6 @@ async fn test_poller_removes_stale_models() {
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: None,
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
node.models.insert(
@@ -316,10 +225,6 @@ async fn test_poller_removes_stale_models() {
status: ModelStatus::Loaded,
last_accessed: None,
vram_estimate_mb: None,
capabilities: Vec::new(),
tool_call: false,
reasoning: false,
limit: None,
},
);
}
@@ -332,96 +237,3 @@ async fn test_poller_removes_stale_models() {
assert!(node.models.contains_key("keep-me"));
assert!(!node.models.contains_key("drop-me"));
}
#[tokio::test]
async fn test_poller_captures_activation_from_health() {
    // Scenario: the mock neuron is in the middle of pre-warming. Its
    // /models payload is empty (the loading model is not yet in the
    // harness map), while the /health payload carries an activation
    // block saying model-x is in progress with model-y queued behind it.
    let models_payload = json!([]);
    let health_payload = json!({
        "uptime_secs": 30,
        "devices": [],
        "activation": {
            "state": "pre_warming",
            "pending": ["Qwen/model-y"],
            "in_progress": "Qwen/model-x",
            "completed": [],
            "failed": []
        }
    });
    let neuron_url =
        common::spawn_mock_neuron_with_models_and_health(models_payload, health_payload).await;

    // Single-node gateway config pointing at the mock.
    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "prewarm-node".into(),
        endpoint: neuron_url,
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: "/dev/null".into(),
        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
    cortex_gateway::poller::poll_once(&fleet).await;

    let node_map = fleet.nodes.read().await;
    let node = node_map.get("prewarm-node").unwrap();
    assert!(node.healthy);

    // Empty /models response => nothing in the per-node model map.
    assert!(node.models.is_empty());

    // The activation block from /health must nevertheless be recorded.
    let activation = node
        .activation
        .as_ref()
        .expect("activation should be populated after /health poll");
    assert_eq!(activation.in_progress.as_deref(), Some("Qwen/model-x"));
    assert_eq!(activation.pending, vec!["Qwen/model-y".to_string()]);
}
#[tokio::test]
async fn test_poller_parses_recovering_status() {
    // #20: when a neuron reports a model with status "recovering"
    // (the poisoned → unload → reload auto-recovery from #17), the
    // gateway state must hold the dedicated Recovering status rather
    // than letting the parser's catch-all turn it into Loaded.
    let reported_models = json!([
        {"id": "model-r", "harness": "candle", "status": "recovering", "devices": [0, 1], "vram_used_mb": null}
    ]);
    let neuron_url = common::spawn_mock_neuron_with_models(reported_models).await;

    // Single-node gateway config pointing at the mock.
    let gateway = GatewaySettings {
        listen: "127.0.0.1:0".into(),
        metrics_listen: "127.0.0.1:0".into(),
    };
    let eviction = EvictionSettings {
        strategy: EvictionStrategy::Lru,
        defrag_after_cycles: 0,
    };
    let neurons = vec![NeuronEndpoint {
        name: "test-node".into(),
        endpoint: neuron_url,
    }];
    let config = GatewayConfig {
        gateway,
        eviction,
        neurons,
        models_config: "/dev/null".into(),
        entitlements: Default::default(),
    };

    let fleet = Arc::new(CortexState::from_config(&config));
    cortex_gateway::poller::poll_once(&fleet).await;

    let node_map = fleet.nodes.read().await;
    let node = node_map.get("test-node").unwrap();
    let entry = node.models.get("model-r").expect("model-r should exist");
    assert_eq!(entry.status, ModelStatus::Recovering);
}

View File

@@ -117,7 +117,6 @@ async fn test_no_healthy_nodes() {
endpoint: "http://127.0.0.1:1".into(),
}],
models_config: "/dev/null".into(),
entitlements: Default::default(),
};
let fleet = std::sync::Arc::new(cortex_gateway::state::CortexState::from_config(&config));
@@ -140,7 +139,7 @@ async fn test_no_healthy_nodes() {
.await
.expect("request should succeed");
assert_eq!(resp.status(), 503);
assert_eq!(resp.status(), 404);
let body: serde_json::Value = resp.json().await.unwrap();
assert!(
@@ -172,67 +171,3 @@ async fn test_missing_model_field() {
let body: serde_json::Value = resp.json().await.unwrap();
assert!(body["error"]["message"].as_str().unwrap().contains("model"));
}
#[tokio::test]
async fn test_recovering_model_returns_503_and_stays_listed() {
    // #20: a model that is auto-recovering on a neuron must keep its
    // route. Requests get a transient 503 ("retry shortly") instead of
    // the 404 "not found on any node" that would make it look evicted,
    // and the model stays visible on /v1/models.
    let neuron_url = common::spawn_mock_neuron().await;
    let (fleet, gw_url) = common::spawn_gateway_with_state(&neuron_url).await;

    // Seed the gateway state with a model in the Recovering status.
    let recovering_entry = cortex_core::node::ModelEntry {
        id: "recovering-model".into(),
        status: cortex_core::node::ModelStatus::Recovering,
        last_accessed: None,
        vram_estimate_mb: Some(8000),
        capabilities: Vec::new(),
        tool_call: false,
        reasoning: false,
        limit: None,
    };
    {
        // Scoped so the write guard is released before any request is sent.
        let mut node_map = fleet.nodes.write().await;
        node_map
            .get_mut("mock-node")
            .expect("node must exist")
            .models
            .insert("recovering-model".into(), recovering_entry);
    }

    let client = reqwest::Client::new();

    // Chat completion against the recovering model: expect 503.
    let chat_request = json!({
        "model": "recovering-model",
        "messages": [{"role": "user", "content": "Hi"}]
    });
    let resp = client
        .post(format!("{gw_url}/v1/chat/completions"))
        .header("content-type", "application/json")
        .json(&chat_request)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 503);

    let body: serde_json::Value = resp.json().await.unwrap();
    let message = body["error"]["message"].as_str().unwrap();
    assert!(
        message.contains("recovering") && message.contains("retry"),
        "503 body must say recovering/retry, got: {message}"
    );

    // The unified models endpoint must still list the model.
    let models_resp = client
        .get(format!("{gw_url}/v1/models"))
        .send()
        .await
        .expect("models request should succeed");
    let models: serde_json::Value = models_resp.json().await.unwrap();
    let entries = models["data"].as_array().unwrap();
    let listed = entries.iter().any(|m| m["id"] == "recovering-model");
    assert!(listed, "recovering model must stay listed on /v1/models");
}

View File

@@ -1,91 +0,0 @@
//! Integration tests for the `/v1/responses` proxy route.
//!
//! The gateway forwards the request body to whichever neuron has the
//! model loaded. These tests exercise the routing decision (200 on a
//! known model, 404 on an unknown model, 400 on a missing model
//! field) and confirm the response body round-trips verbatim.
mod common;
use serde_json::json;
/// Happy-path check for the `/v1/responses` route: a request naming a
/// model the mock neuron serves must come back with status 200 and
/// with the mock backend's response fields intact.
#[tokio::test]
async fn test_responses_proxy() {
    let neuron_url = common::spawn_mock_neuron().await;
    let gateway_url = common::spawn_gateway(&neuron_url).await;

    let request_body = json!({
        "model": "test-model",
        "input": "Hi"
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/responses"))
        .header("content-type", "application/json")
        .json(&request_body)
        .send()
        .await
        .expect("request should succeed");
    assert_eq!(resp.status(), 200);

    let body: serde_json::Value = resp.json().await.expect("valid JSON response");

    // Top-level fields as produced by the mock backend.
    assert_eq!(body["id"], "resp-test-001");
    assert_eq!(body["object"], "response");
    assert_eq!(body["model"], "test-model");
    assert_eq!(body["status"], "completed");
    assert_eq!(
        body["output"][0]["content"][0]["text"],
        "Hello from mock backend"
    );

    // The Responses API reports usage as input/output_tokens, whereas
    // chat completions uses prompt/completion_tokens. Seeing
    // `input_tokens` here shows the request was not routed through
    // the chat-completions handler by mistake.
    assert_eq!(body["usage"]["total_tokens"], 10);
    assert!(body["usage"].get("input_tokens").is_some());
}
/// Requesting a model that is absent from the catalogue must yield a
/// 404 from the router. The chat-completions handler answers the same
/// way (same error path, same status code), which lets clients reuse
/// one retry policy for both routes.
#[tokio::test]
async fn test_responses_model_not_found() {
    let neuron_url = common::spawn_mock_neuron().await;
    let gateway_url = common::spawn_gateway(&neuron_url).await;

    let request_body = json!({
        "model": "not-in-catalogue",
        "input": "Hi"
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/responses"))
        .json(&request_body)
        .send()
        .await
        .unwrap();

    assert_eq!(resp.status(), 404);
}
/// With no `model` field in the body there is nothing to route on, so
/// the gateway must answer 400 without contacting a backend. The
/// chat-completions handler behaves the same way; both rely on the
/// shared `extract_model` helper.
#[tokio::test]
async fn test_responses_missing_model_field() {
    let neuron_url = common::spawn_mock_neuron().await;
    let gateway_url = common::spawn_gateway(&neuron_url).await;

    let request_body = json!({
        "input": "Hi"
    });
    let resp = reqwest::Client::new()
        .post(format!("{gateway_url}/v1/responses"))
        .json(&request_body)
        .send()
        .await
        .unwrap();

    assert_eq!(resp.status(), 400);
}

View File

@@ -51,18 +51,18 @@ async fn test_streaming_sse_passthrough() {
}
assert!(
chunks.len() > chunk_count,
"expected more than {} chunks (got {}): {:?}",
chunk_count,
chunks.len() >= chunk_count + 1,
"expected at least {} chunks (got {}): {:?}",
chunk_count + 1,
chunks.len(),
chunks,
);
assert_eq!(chunks.last().unwrap(), "[DONE]");
for (i, chunk) in chunks.iter().enumerate().take(chunk_count) {
for i in 0..chunk_count {
let chunk_json: serde_json::Value =
serde_json::from_str(chunk).expect("chunk should be valid JSON");
serde_json::from_str(&chunks[i]).expect("chunk should be valid JSON");
assert_eq!(
chunk_json["choices"][0]["delta"]["content"],
format!("token{i}")

View File

@@ -1,48 +0,0 @@
[package]
name = "helexa-acp"
version = "0.1.16"
edition = "2024"
license = "Apache-2.0"
repository = "https://git.lair.cafe/helexa/helexa"
description = """
Agent Client Protocol bridge for the helexa self-hosted LLM stack.
Speaks ACP to ACP-compatible editor clients (Zed, etc.) and forwards
the conversation to any OpenAI-compatible HTTP endpoint — defaulting
to cortex (helexa's reverse-proxy / fleet gateway).
"""
# This crate is intentionally self-contained — no dependencies on other
# workspace crates (cortex-core, cortex-gateway, neuron). The goal is
# a painless migration to a dedicated GitHub repo in the future if the
# project grows beyond helexa's needs. All deps are crates.io.
[dependencies]
# `unstable_session_model` flips on the SessionModelState type and the
# session/set_model RPC the model-picker dropdown in Zed needs. The
# feature is upstream-marked unstable; we accept that risk because the
# model picker is core UX and the alternative (rolling our own
# extension method) drifts further from spec each time it moves.
agent-client-protocol = { version = "0.12", features = ["unstable_session_model"] }
tokio = { version = "1", features = ["rt-multi-thread", "macros", "sync", "io-util", "process", "signal"] }
reqwest = { version = "0.12", features = ["json", "stream", "rustls-tls"], default-features = false }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
anyhow = "1"
thiserror = "2"
async-trait = "0.1"
futures = "0.3"
tokio-stream = "0.1"
tokio-util = { version = "0.7", features = ["rt"] }
eventsource-stream = "0.2"
async-stream = "0.3"
url = { version = "2", features = ["serde"] }
# Already transitively pulled via the ACP SDK; declared directly so we
# can format ISO 8601 timestamps for `SessionInfo.updated_at` in the
# session/list response.
chrono = { version = "0.4", default-features = false, features = ["std"] }
[[bin]]
name = "helexa-acp"
path = "src/main.rs"

View File

@@ -1,546 +0,0 @@
# helexa-acp
ACP (Agent Client Protocol) bridge for editors like
[Zed](https://zed.dev). Lets you point your editor's agent panel at
**any combination** of OpenAI-compatible, OpenAI Responses, and
Anthropic Messages endpoints — public APIs, private LAN deployments,
local Ollama / LM Studio — and switch between them per session via a
model dropdown.
The "missing ACP binary" for users who don't want to be locked into
one vendor's agent client.
```
 ┌───────────────────────────────────┐
 │  Zed (or any ACP editor client)   │
 └────────────┬──────────────────────┘
              │  stdio JSON-RPC (ACP)
     ┌─────────────────┐
     │   helexa-acp    │  ← one binary, multi-endpoint
     └─────┬───────────┘
           │  HTTP / SSE
  ┌────────┼─────────────┬──────────────┬──────────────┐
  ▼        ▼             ▼              ▼              ▼
cortex/  OpenAI      Anthropic     OpenRouter      LM Studio
neuron   Responses   Messages
(self-   (gpt-5,…)   (Claude)
hosted)
## What it does
- **Speaks ACP** over stdio to editor clients (Zed today; any future
ACP client tomorrow).
- **Multi-endpoint** — one config file lists every LLM endpoint
you want available; pick one per session via the model dropdown
(`endpoint:model` selector).
- **Three wire formats**: `openai-chat` (the broadly compatible
default), `openai-responses` (newer OpenAI surface), and
`anthropic-messages` (Claude). Each is a separate provider impl
in `src/provider/`; adding a fourth (Gemini, Ollama native, …) is
one file plus a `WireApi` enum variant.
- **Built-in tools**: `read_file`, `write_file`, `edit_file`,
`list_dir`, `bash`. Permission-gated by default; the editor user
approves writes/shell per-call.
- **Three session modes**: Default (gated), Bypass Permissions
(auto-allow), and Plan (write-only-to-plan-dir, no shell).
- **Vision** — drag-drop images into the agent panel against any
vision-capable model.
- **Session resume** — multi-day conversations survive editor
restarts via on-disk transcript persistence.
- **Context compaction** — rolling history stays inside the model's
context window automatically so long sessions on small-context
local models don't fall over.
## Install
### From source
```sh
git clone https://git.lair.cafe/helexa/helexa.git
cd helexa
cargo install --path crates/helexa-acp
# Binary lands at ~/.cargo/bin/helexa-acp
```
### Pre-built RPM (Fedora 43)
```sh
dnf copr enable helexa/helexa
dnf install helexa-acp
```
The COPR project bundles helexa-acp alongside the cortex gateway
and helexa-neuron flavours; install only the package(s) you need.
## Quick start
The fastest path: env-var single-endpoint config.
```sh
export HELEXA_ACP_BASE_URL=http://hanzalova.internal:31313/v1
export HELEXA_ACP_MODEL=Qwen/Qwen3.6-27B
helexa-acp # speaks ACP over stdin/stdout; not interactive
```
Then in Zed (`~/.config/zed/settings.json`):
```jsonc
{
"agent_servers": {
"helexa": {
"command": "helexa-acp",
"args": []
}
}
}
```
Restart Zed → open the agent panel → pick "helexa" → start
chatting. Tool calls (file reads, writes, bash) prompt for
permission per-call in Default mode.
That's the minimum. The full config story below is what unlocks
the multi-endpoint dropdown.
## Multi-endpoint config
Copy `helexa-acp.example.toml` from this repo to
`$XDG_CONFIG_HOME/helexa-acp/config.toml` (typically
`~/.config/helexa-acp/config.toml`) and edit:
```toml
default_endpoint = "helexa"
[[endpoints]]
name = "helexa"
base_url = "http://hanzalova.internal:31313/v1"
wire_api = "openai-chat"
default_model = "Qwen/Qwen3.6-27B"
max_tokens = 8192
context_window = 32768
[[endpoints]]
name = "openrouter"
base_url = "https://openrouter.ai/api/v1"
wire_api = "openai-chat"
api_key_env = "OPENROUTER_API_KEY"
default_model = "anthropic/claude-opus-4"
[[endpoints]]
name = "anthropic"
base_url = "https://api.anthropic.com/v1"
wire_api = "anthropic-messages"
api_key_env = "ANTHROPIC_API_KEY"
default_model = "claude-opus-4"
```
Restart Zed. The model dropdown lists every model from every
configured endpoint with the `endpoint:model` selector
(`helexa:Qwen/Qwen3.6-27B`, `openrouter:anthropic/claude-opus-4`,
…). Switch mid-session; the next prompt routes to the new endpoint.
When only one endpoint is configured the prefix is dropped (model
ids appear bare).
### Selector syntax
The `model` field on every internal request is parsed as
`<endpoint>:<model>`:
- `openrouter:gpt-4o` → routes to the `openrouter` endpoint,
model `gpt-4o`.
- `helexa/large` → no colon → falls through to whichever endpoint
is named in `default_endpoint`, model `helexa/large`.
- `:gpt-5` → leading colon → also falls through to default.
## Endpoint cookbook
Copy-pasteable blocks. Mix and match.
### cortex / neuron (self-hosted)
```toml
[[endpoints]]
name = "helexa"
base_url = "http://hanzalova.internal:31313/v1"
wire_api = "openai-chat"
default_model = "Qwen/Qwen3.6-27B"
max_tokens = 8192
context_window = 32768
```
Use `openai-responses` instead of `openai-chat` once cortex 0.1.16+
is deployed and you want the Responses API surface (vision item
shape, structured reasoning items, etc.).
### OpenAI directly
```toml
[[endpoints]]
name = "openai"
base_url = "https://api.openai.com/v1"
wire_api = "openai-responses"
api_key_env = "OPENAI_API_KEY"
default_model = "gpt-5"
```
`openai-responses` is the right choice for current OpenAI models;
`openai-chat` works against legacy GPT-3.5/4 deployments and
anything labelled "chat completions".
### Anthropic directly
```toml
[[endpoints]]
name = "anthropic"
base_url = "https://api.anthropic.com/v1"
wire_api = "anthropic-messages"
api_key_env = "ANTHROPIC_API_KEY"
default_model = "claude-opus-4"
```
helexa-acp sends `x-api-key` + `anthropic-version: 2023-06-01`
automatically. The `api_key_env` indirection keeps your key out of
the config file.
### OpenRouter (multi-vendor proxy)
```toml
[[endpoints]]
name = "openrouter"
base_url = "https://openrouter.ai/api/v1"
wire_api = "openai-chat"
api_key_env = "OPENROUTER_API_KEY"
default_model = "anthropic/claude-opus-4"
```
OpenRouter speaks OpenAI-compat for every model it fronts, so
`openai-chat` is the right wire format regardless of the
underlying vendor.
### LM Studio (local)
```toml
[[endpoints]]
name = "lmstudio"
base_url = "http://localhost:1234/v1"
wire_api = "openai-chat"
default_model = "auto"
```
LM Studio's "auto" model id picks whatever's loaded. Same shape
works for Ollama in compat mode (`http://localhost:11434/v1`) and
vLLM.
### Multiple cortex deployments
```toml
[[endpoints]]
name = "lan"
base_url = "http://hanzalova.internal:31313/v1"
wire_api = "openai-chat"
default_model = "Qwen/Qwen3.6-27B"
[[endpoints]]
name = "cloud"
base_url = "https://cortex.example.com/v1"
wire_api = "openai-chat"
api_key_env = "CLOUD_CORTEX_KEY"
default_model = "Qwen/Qwen3-VL-8B"
```
Use the `endpoint:model` selector to switch between them mid-session.
## Zed setup
`~/.config/zed/settings.json`:
```jsonc
{
"agent_servers": {
"helexa": {
"command": "helexa-acp"
}
}
}
```
Optional environment overrides for the binary:
```jsonc
{
"agent_servers": {
"helexa": {
"command": "helexa-acp",
"env": {
"HELEXA_ACP_LOG_FILE": "/tmp/helexa-acp.log",
"RUST_LOG": "helexa_acp=debug"
}
}
}
}
```
`HELEXA_ACP_LOG_FILE` is the one you actually want — Zed doesn't
surface the agent's stderr, so without that env var debug output is
invisible. Point it at a file you can `tail -f`.
After restarting Zed: ⌘+? (or wherever your "Open Agent Panel"
binding is) → select "helexa" → the model dropdown populates from
your config → start prompting.
## Modes
Three session modes ship; the user picks via Zed's mode dropdown
on the agent panel.
| Mode | Reads | Writes | Bash | Permission prompts |
|------|-------|--------|------|--------------------|
| **Default** | ✓ | with prompt | with prompt | per call |
| **Bypass Permissions** | ✓ | ✓ | ✓ | never |
| **Plan** | ✓ | only into plan dir | disabled | never (plan-dir writes auto-allow) |
### Default
Reads are always allowed (`read_file`, `list_dir` are
unrestricted). Writes and shell commands prompt the user before
running. The intended baseline for any session where the agent
might do something you'd rather review first.
### Bypass Permissions
Auto-allow every tool call. Use for agentic loops you trust — bulk
edits across many files, scripted workflows, prepared session
templates. Never for code the agent hasn't seen before.
### Plan
The "draft an implementation plan before you write code" mode.
Available tools:
- `read_file`, `list_dir`: unrestricted (read the codebase).
- `write_file`, `edit_file`: allowed *only* under
`$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`. Any path
outside that returns "plan mode: writes are restricted to …"
back to the model so it self-corrects.
- `bash`: disabled outright. Returns "plan mode: shell execution
is disabled" if attempted.
When the plan is complete, the model presents a 3-option menu:
1. **Bypass Permissions** — implement the plan now, no prompts.
2. **Default** — implement now with per-tool prompts.
3. **Plan** (stay here) — refine the plan with more guidance.
Switch the mode dropdown to your preference and reply to proceed.
## Tools
Five tools, defined in `src/tools.rs`:
| Tool | Args | Gated in Default? |
|------|------|-------------------|
| `read_file` | `path`, `line?`, `limit?` | no |
| `list_dir` | `path` | no |
| `write_file` | `path`, `content` | yes |
| `edit_file` | `path`, `old_text`, `new_text` | yes |
| `bash` | `command`, `cwd?` | yes |
### Path handling
`~`, `~/`, `$HOME`, and `$HOME/` are expanded server-side before
the path reaches ACP or local fs. Lets the model emit
`~/git/repo/file.rs` and have it Just Work.
`read_file` first tries the editor's filesystem (ACP's
`fs/read_text_file` — respects open buffers, workspace overlays,
etc.). If that fails — typically because the path is outside Zed's
workspace boundary — it falls back to `std::fs::read_to_string`.
This lets the agent pull in shared material like
`~/git/architecture/generic.md` from a different project's
session.
The fallback is logged at warn level so you can see when it kicks
in.
### Tool dispatch
Tool descriptions reach the model through a Qwen3 Hermes-format
`# Tools` block injected into the system prompt — cortex/neuron
pass the OpenAI `tools` request field through to the encoder
unread, so we prompt the model into emitting `<tool_call>{json}</tool_call>`
markers, which helexa-acp then parses out of the content stream. This applies to
the helexa wire format; OpenAI / Anthropic endpoints with native
tool support would use their own paths once they're wired in.
The parser is tolerant: malformed JSON (trailing braces, missing
`name`, name nested in `arguments`) gets a repair pass; if that
fails the call surfaces as a "Malformed tool call" card in Zed and
the model gets a synthetic error result so it can self-correct.
## Session resume
helexa-acp persists every session to
`$XDG_DATA_HOME/helexa-acp/sessions/<id>.json`. Zed's `session/list`
RPC asks helexa-acp to enumerate them on workspace open;
`session/load` rehydrates and replays the transcript as
`session/update` notifications so the agent panel renders the
prior conversation.
Behaviour:
- Persisted per-round, so a mid-turn agent stall (long bash, wedged
ACP roundtrip) doesn't lose earlier rounds.
- Survives editor restart and the helexa-acp binary upgrading
between versions.
- Project-scoped: only sessions whose `cwd` matches the workspace
are listed.
To wipe history: `rm -rf $XDG_DATA_HOME/helexa-acp/sessions/`.
## Context compaction
When an endpoint sets `context_window`, helexa-acp projects the
rolling history into a token budget before each request — old
`ToolResult` content (read_file payloads are the worst offenders)
gets elided to one-line markers, preserving `tool_call_id` pairing
so the wire schema stays valid.
System prompts, user turns, and the most recent ~4 messages are
never elided. The full history stays on disk; compaction is a
per-request projection, not a destructive edit.
Set `context_window = 32768` for a 32 K Qwen3, `131072` for a
modern Claude, etc. With `max_tokens` also set, the budget is
`context_window - max_tokens - 512_safety`.
## Troubleshooting
### "default endpoint 'helexa' has no usable provider — check config"
The named default endpoint failed to construct. Usually:
- `api_key_env` references a variable that isn't set in the env
Zed launched helexa-acp with.
- The TOML's `wire_api` is misspelled (only `openai-chat`,
`openai-responses`, `anthropic-messages` are accepted).
Test by running `helexa-acp` directly from a shell — startup
errors land on stderr.
### Model dropdown is empty
Each provider's `list_models` failed at startup. Look at
`HELEXA_ACP_LOG_FILE` for "list_models failed; this endpoint's
models won't appear in the picker". Likely the endpoint URL is
wrong, the API key is invalid, or the upstream `/v1/models`
endpoint isn't responding.
The agent still works against `default_model` even when the
dropdown is empty — list-models is for picking, not routing.
### "prompt_too_long" / agent stalls mid-conversation
You hit the model's context window. Set `context_window` on the
endpoint and helexa-acp will compact before sending. The log line
`context compaction applied` confirms it's running; if it fires
but the upstream still rejects, the compaction heuristic
under-counted and the budget needs tuning down.
### Reading files outside the workspace returns "not found"
Zed's `fs/read_text_file` is workspace-scoped. helexa-acp falls
back to local `std::fs` automatically when that fails — look for
`fs/read_text_file failed; falling back to local std::fs` in the
log. If even local read fails, the file genuinely doesn't exist
or the user process lacks permissions.
### Tool calls render as text instead of structured cards
The model is emitting `<tool_call>` markers that the parser can't
decode. Two common causes:
1. The system prompt isn't reaching the model (cortex/neuron's
tool-block injection didn't fire). Confirm with
`RUST_LOG=helexa_acp=debug` and look at the outgoing
`POST /chat/completions` body.
2. The model itself is too small / undertrained to follow the
Hermes format reliably. helexa-acp has shape-based name
inference and JSON repair, but there's a floor below which
nothing helps.
### Plan-mode writes refused even inside the plan dir
The path comparison is byte-for-byte. If the model emits a path
with `~` and the plan_dir has the expanded form, expansion runs
*before* the comparison — but resolved-vs-symlinked-path
mismatches can still bite. The error message names the attempted
path and the expected prefix so you can compare directly.
## Architecture
Source layout under `crates/helexa-acp/src/`:
| File | Responsibility |
|------|----------------|
| `main.rs` | tokio + Stdio transport. Builds providers, hands off to `agent::Agent` |
| `config.rs` | TOML + env-fallback config, endpoint resolver |
| `agent.rs` | ACP handlers (initialize, session/new, session/prompt, session/cancel, session/set_mode, session/set_model, session/load, session/list), prompt loop with tool-call recursion |
| `session.rs` | Per-session state map (Arc<RwLock<HashMap<…>>>) |
| `store.rs` | On-disk session persistence, plan-dir resolution |
| `prompt.rs` | System-prompt assembly, plan-mode addendum |
| `tools.rs` | Tool schemas + shape-based name inference |
| `tool_runner.rs` | Dispatch a single tool call through ACP client RPCs; permission gate |
| `qwen3.rs` | Qwen3 Hermes tool-format parser (`<tool_call>` / `<think>` markers) |
| `compaction.rs` | Token-budget compaction for the rolling history |
| `path_util.rs` | `~` / `$HOME` expansion shared across every path-taking tool |
| `provider/openai_chat.rs` | OpenAI chat completions provider |
| `provider/openai_responses.rs` | OpenAI Responses API provider |
| `provider/anthropic_messages.rs` | Anthropic Messages API provider |
### Adding a new wire format
1. New file under `src/provider/` implementing the `Provider`
trait (encoder + SSE decoder).
2. Add a `WireApi` variant in `config.rs`.
3. Wire it into `build_provider` in `main.rs`.
4. Done — every other module is wire-format-agnostic.
### Concurrency
- `Arc<RwLock<HashMap<SessionId, Arc<Mutex<SessionState>>>>>`: a
per-session mutex so concurrent requests across sessions don't
contend; the map's RwLock is read-mostly.
- Every tool call dispatched serially within a session (parallel
dispatch would require Zed to handle interleaved permission
prompts).
- Provider streams are back-pressured by the consumer (bounded
mpsc channels).
### Self-contained
The crate has no workspace-internal dependencies (no
`cortex-core`, no `cortex-gateway`). Migration to a dedicated
GitHub repo for cross-platform CI / cargo-dist binaries is
Cargo.toml-only.
## Status
- Stages 1–6 shipped: scaffold, agent loop, tools, modes, session
resume, image input, model picker, three wire formats.
- Stage 8 (RPM + multi-platform CI) tracked in the canonical plan;
Linux x86_64 RPM ships today via the cortex monorepo's Gitea
Actions.
## Contributing
Repository: https://git.lair.cafe/helexa/helexa (`crates/helexa-acp/`).
Issues / PRs welcome. The canonical staged plan is in
`~/.claude/plans/plan-the-per-device-worker-abstract-micali.md` on
the maintainer's machine; the substages 3a–3e and 6a/6b that the
canonical plan didn't anticipate are documented in commit messages.
CI: `cargo fmt --check --all`, `cargo clippy --workspace -- -D
warnings`, `cargo test --workspace` must all pass before merge.

File diff suppressed because it is too large Load Diff

View File

@@ -1,425 +0,0 @@
//! Rolling-conversation compaction for small-context local models.
//!
//! The tool-call loop in [`crate::agent`] grows the message vec it
//! sends upstream every round. On a frontier model that's fine; on a
//! 32 K Qwen3 the first few `read_file` results can push the prompt
//! past the model's context window, at which point cortex/neuron
//! refuses with `prompt_too_long` and the whole turn dies. Long-form
//! local agents are unusable without something here.
//!
//! Strategy (intentionally simple — no LLM-summarization round-trip,
//! no tokenizer dependency):
//!
//! 1. **Protect** the things the model cannot reason without:
//! - The system prompt (idx 0).
//! - Every `Role::User` turn (the user's intent — irreplaceable).
//! - The last [`KEEP_TAIL`] messages (most recent rounds stay
//! verbatim so the model can keep working on what it just
//! observed).
//! 2. **Elide** older `Role::Assistant` prose and older `Role::Tool`
//! result content. The structure stays — `tool_call_id`s, tool
//! names, and argument JSON survive intact — so OpenAI's strict
//! `tool_calls` ↔ `tool` pairing schema remains satisfied. Only
//! the *payload* shrinks to a one-line marker.
//! 3. Walk oldest→newest, recomputing the budget after each elision.
//! Stop as soon as we fit; we don't compact more than necessary.
//! 4. If we still exceed budget after eliding everything we're
//! allowed to, return what we have. The upstream will surface a
//! `prompt_too_long` error and the user can intervene; that's
//! better than silently dropping content the model needs.
//!
//! Token estimation uses a `chars / 3.5` heuristic — conservative
//! (over-estimates tokens slightly) so we compact a touch early
//! rather than a touch late.
use crate::provider::{Message, MessageContent, MessagePart, Role};
/// Most-recent N messages that are never elided. Roughly "the
/// current tool round in flight" — assistant turn that called the
/// tools + each tool result + a bit of slack.
const KEEP_TAIL: usize = 4;
/// Below this content size we don't bother eliding — the savings
/// don't outweigh the loss of detail. Roughly 60–80 tokens.
const ELIDE_MIN_CHARS: usize = 256;
/// Roughly characters-per-token for English + code mixed in (the
/// estimator *divides* a character count by this value). The actual
/// per-tokenizer ratio varies (GPT-4o ≈ 4 chars/token on English
/// prose, ≈ 3 chars/token on code-heavy text). We pick a value
/// toward the conservative end so the budget check tends to fire
/// *before* the upstream tokenizer says no.
const CHARS_PER_TOKEN: f32 = 3.5;
/// Per-message envelope overhead (role + JSON framing). Comes out
/// to a few tokens; tiny but it adds up across long histories.
const ENVELOPE_TOKENS: usize = 8;
/// Rough per-image token cost used by the budget estimator. Real
/// vision tokenizers vary widely (256–1024 tokens for typical
/// resolutions on Qwen3-VL, OpenAI's `low`/`high` detail toggles
/// pick between ~85 and ~1000+). 512 is a defensible middle that
/// keeps compaction from treating images as free.
const IMAGE_TOKENS_APPROX: usize = 512;
/// Summary of one [`compact_to_budget`] pass, handed back so the
/// caller can log it. Every count here is derived from
/// [`estimate_tokens`] and is therefore an approximation — do not
/// treat it as interchangeable with token counts reported upstream.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CompactionStats {
    /// Estimated token total of the messages as they were passed in.
    pub original_tokens: usize,
    /// Estimated token total once compaction finished. Identical to
    /// `original_tokens` when nothing had to be compacted.
    pub final_tokens: usize,
    /// How many messages had their content elided. Zero on the hot
    /// path, where there was nothing to do.
    pub elided_messages: usize,
}

impl CompactionStats {
    /// Stats for a pass that left the input untouched: both token
    /// totals equal `tokens` and no message was elided.
    fn unchanged(tokens: usize) -> Self {
        Self {
            original_tokens: tokens,
            final_tokens: tokens,
            // Remaining field (`elided_messages`) takes its derived
            // `Default`, i.e. zero.
            ..Self::default()
        }
    }
}
/// Approximate token count for one message.
///
/// Sums the byte lengths of the message's textual payload (byte
/// length is never smaller than the character count, so non-ASCII
/// text errs towards over-estimation), divides by
/// [`CHARS_PER_TOKEN`], and adds [`ENVELOPE_TOKENS`] for the
/// per-message framing. Each image part contributes
/// [`IMAGE_TOKENS_APPROX`] tokens.
///
/// Cheap (no allocation), so safe to call once per message per round.
pub fn estimate_tokens(msg: &Message) -> usize {
    // Character-equivalent weight of one image, chosen so the final
    // division by `CHARS_PER_TOKEN` lands on `IMAGE_TOKENS_APPROX`
    // tokens. The multiplication must happen in f32: writing
    // `IMAGE_TOKENS_APPROX * CHARS_PER_TOKEN as usize` casts the ratio
    // to an integer first (3.5 -> 3) and under-counts every image.
    let image_chars = (IMAGE_TOKENS_APPROX as f32 * CHARS_PER_TOKEN).ceil() as usize;
    let chars = match &msg.content {
        MessageContent::Text { text } => text.len(),
        MessageContent::MultiPart { parts } => parts
            .iter()
            .map(|part| match part {
                MessagePart::Text { text } => text.len(),
                // The upstream tokenizer decides the real image cost
                // and it varies by model; a fixed middle estimate
                // keeps the budget tracker from treating images as
                // free.
                MessagePart::Image(_) => image_chars,
            })
            .sum(),
        MessageContent::ToolCalls { text, calls } => {
            let prose = text.as_deref().map_or(0, str::len);
            let calls_size: usize = calls
                .iter()
                .map(|c| c.name.len() + c.arguments.len() + c.id.len())
                .sum();
            prose + calls_size
        }
        MessageContent::ToolResult {
            tool_call_id,
            content,
        } => tool_call_id.len() + content.len(),
    };
    // Truncating division, then the fixed envelope cost.
    ((chars as f32 / CHARS_PER_TOKEN) as usize) + ENVELOPE_TOKENS
}
/// Sum of [`estimate_tokens`] across all messages.
pub fn total_tokens(messages: &[Message]) -> usize {
messages.iter().map(estimate_tokens).sum()
}
/// Project `messages` into a vec whose estimated token count fits in
/// `budget` tokens. Returns the projection plus stats about what
/// was done. When the input already fits, the projection is a clone
/// of the input and stats report zero elisions.
///
/// See module docs for the strategy and protected set.
pub fn compact_to_budget(messages: &[Message], budget: usize) -> (Vec<Message>, CompactionStats) {
let original = total_tokens(messages);
if original <= budget {
return (messages.to_vec(), CompactionStats::unchanged(original));
}
let mut out = messages.to_vec();
let len = out.len();
let tail_start = len.saturating_sub(KEEP_TAIL);
let mut elided = 0usize;
// Two passes. First pass: ToolResult contents (largest savings
// per elision — read_file payloads land here). Second pass: long
// Assistant prose. We don't interleave because eliding a long
// assistant turn before a really old read_file would do less
// good per elision; oldest-first ordering is enforced *within*
// each pass instead.
for pass in 0..2 {
for i in 1..tail_start {
if matches!(out[i].role, Role::User) {
continue;
}
let target_pass_2 = matches!(
&out[i].content,
MessageContent::Text { .. } | MessageContent::ToolCalls { .. }
);
let target_pass_1 = matches!(&out[i].content, MessageContent::ToolResult { .. });
let in_pass = (pass == 0 && target_pass_1) || (pass == 1 && target_pass_2);
if !in_pass {
continue;
}
if elide_in_place(&mut out[i]) {
elided += 1;
if total_tokens(&out) <= budget {
let final_tokens = total_tokens(&out);
return (
out,
CompactionStats {
original_tokens: original,
final_tokens,
elided_messages: elided,
},
);
}
}
}
}
let final_tokens = total_tokens(&out);
(
out,
CompactionStats {
original_tokens: original,
final_tokens,
elided_messages: elided,
},
)
}
/// Replace one message's bulky payload with a short marker, leaving
/// the message's variant (and therefore any `tool_call_id` pairing)
/// intact. Returns `true` when the message was modified.
///
/// - `ToolResult.content` → `(elided: N bytes of tool result)`
/// - `ToolCalls.text` → `(elided: N bytes of assistant prose)`
/// - `Text` → `(elided: N bytes of assistant prose)`
///
/// Payloads shorter than [`ELIDE_MIN_CHARS`] bytes are left alone:
/// swapping a tiny string for the marker would make it longer.
fn elide_in_place(msg: &mut Message) -> bool {
    // Find the string to overwrite and the noun used in its marker.
    let (slot, label): (&mut String, &str) = match &mut msg.content {
        MessageContent::ToolResult { content, .. } => (content, "tool result"),
        MessageContent::ToolCalls {
            text: Some(prose), ..
        } => (prose, "assistant prose"),
        MessageContent::Text { text } => (text, "assistant prose"),
        // A tool call without prose has nothing to shrink. MultiPart
        // is left untouched: per the original note it only appears on
        // User turns today, which the caller already protects.
        MessageContent::ToolCalls { text: None, .. } | MessageContent::MultiPart { .. } => {
            return false;
        }
    };
    if slot.len() < ELIDE_MIN_CHARS {
        return false;
    }
    *slot = format!("(elided: {} bytes of {label})", slot.len());
    true
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::provider::ToolCall;

    /// System message with plain text content.
    fn sys(text: &str) -> Message {
        Message {
            role: Role::System,
            content: MessageContent::Text { text: text.into() },
        }
    }

    /// User message with plain text content.
    fn user(text: &str) -> Message {
        Message {
            role: Role::User,
            content: MessageContent::Text { text: text.into() },
        }
    }

    /// Assistant message with plain text content.
    fn assistant_text(text: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::Text { text: text.into() },
        }
    }

    /// Assistant message carrying one tool call plus optional prose.
    fn assistant_calls(text: Option<&str>, name: &str, args: &str, id: &str) -> Message {
        Message {
            role: Role::Assistant,
            content: MessageContent::ToolCalls {
                text: text.map(|s| s.to_string()),
                calls: vec![ToolCall {
                    id: id.into(),
                    name: name.into(),
                    arguments: args.into(),
                }],
            },
        }
    }

    /// Tool message answering the call with id `id`.
    fn tool_result(id: &str, body: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult {
                tool_call_id: id.into(),
                content: body.into(),
            },
        }
    }

    #[test]
    fn under_budget_is_a_no_op_clone() {
        let msgs = vec![sys("you are an agent"), user("hi"), assistant_text("hello")];
        let (out, stats) = compact_to_budget(&msgs, 10_000);
        assert_eq!(stats.elided_messages, 0);
        assert_eq!(stats.original_tokens, stats.final_tokens);
        assert_eq!(out.len(), msgs.len());
        // Strings unchanged.
        match &out[2].content {
            MessageContent::Text { text } => assert_eq!(text, "hello"),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn elides_old_tool_result_before_old_assistant_prose() {
        // History: sys, user, assistant_calls, big_tool_result,
        // assistant_with_big_text, user, assistant_calls,
        // small_tool_result.
        // KEEP_TAIL=4 protects the last four; the big tool result
        // sits in the prunable range and should go first because
        // pass 0 (tool results) runs before pass 1 (prose).
        let big_result = "X".repeat(4096);
        let big_prose = "Y".repeat(2048);
        let msgs = vec![
            sys("preamble"),
            user("first ask"),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "c0"),
            tool_result("c0", &big_result),
            assistant_text(&big_prose),
            user("follow up"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "c1"),
            tool_result("c1", "short result body"),
        ];
        let before = total_tokens(&msgs);
        // Half the original estimate: eliding the tool result alone
        // is enough to get under it, so the prose must survive.
        let budget = before / 2;
        let (out, stats) = compact_to_budget(&msgs, budget);
        assert_eq!(
            stats.elided_messages, 1,
            "expected exactly one elision, got {stats:?}"
        );
        assert_eq!(stats.original_tokens, before);
        assert_eq!(stats.final_tokens, total_tokens(&out));
        assert!(stats.final_tokens <= budget, "still over budget: {stats:?}");
        assert_eq!(out.len(), msgs.len());
        // The big tool result must be elided (oldest fat target).
        match &out[3].content {
            MessageContent::ToolResult { content, .. } => {
                assert!(
                    content.starts_with("(elided:"),
                    "tool result not elided: {content:?}"
                );
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
        // The prose is a pass-1 target and must not have been reached.
        match &out[4].content {
            MessageContent::Text { text } => assert_eq!(text, &big_prose),
            other => panic!("expected Text, got {other:?}"),
        }
        // The final (tail) tool result must be untouched.
        assert!(matches!(
            &out[out.len() - 1].content,
            MessageContent::ToolResult { content, .. } if content == "short result body"
        ));
    }

    #[test]
    fn never_elides_system_or_user_turns() {
        let big_user = "U".repeat(8192);
        let msgs = vec![sys("preamble"), user(&big_user), assistant_text("ok")];
        let budget = 10; // way below — forces all possible elision
        let (out, _stats) = compact_to_budget(&msgs, budget);
        // System unchanged.
        match &out[0].content {
            MessageContent::Text { text } => assert_eq!(text, "preamble"),
            other => panic!("expected Text, got {other:?}"),
        }
        // User unchanged even though it's huge.
        match &out[1].content {
            MessageContent::Text { text } => assert_eq!(text.len(), big_user.len()),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn preserves_tool_call_id_pairing_after_elision() {
        // OpenAI strict mode rejects a tool-result whose tool_call_id
        // doesn't match a preceding assistant tool_call. Elision
        // must not break that linkage.
        let big = "Z".repeat(4096);
        let msgs = vec![
            sys("preamble"),
            user("first"),
            assistant_calls(None, "read_file", r#"{"path":"/a"}"#, "call_42"),
            tool_result("call_42", &big),
            // Tail messages.
            user("next"),
            assistant_calls(None, "read_file", r#"{"path":"/b"}"#, "call_43"),
            tool_result("call_43", "ok"),
            assistant_text("done"),
        ];
        let budget = total_tokens(&msgs) / 3;
        let (out, _stats) = compact_to_budget(&msgs, budget);
        // The assistant call and its result both carry call_42.
        let call_id = match &out[2].content {
            MessageContent::ToolCalls { calls, .. } => calls[0].id.clone(),
            other => panic!("expected ToolCalls, got {other:?}"),
        };
        match &out[3].content {
            MessageContent::ToolResult { tool_call_id, .. } => {
                assert_eq!(tool_call_id, &call_id, "pairing broken");
            }
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }

    #[test]
    fn estimate_tokens_grows_with_content() {
        let small = sys("hi");
        let large = sys(&"x".repeat(10_000));
        assert!(estimate_tokens(&large) > estimate_tokens(&small) * 100);
    }

    #[test]
    fn elide_in_place_skips_short_content() {
        let mut m = tool_result("c0", "tiny");
        assert!(!elide_in_place(&mut m));
        match m.content {
            MessageContent::ToolResult { content, .. } => assert_eq!(content, "tiny"),
            other => panic!("expected ToolResult, got {other:?}"),
        }
    }

    #[test]
    fn returns_best_effort_when_budget_unmeetable() {
        // Single huge user message that cannot be elided. Budget 10.
        // We don't error — we return what we have and let upstream
        // refuse the prompt with its own error.
        let big_user = "U".repeat(100_000);
        let msgs = vec![sys("preamble"), user(&big_user)];
        let (out, stats) = compact_to_budget(&msgs, 10);
        assert_eq!(out.len(), msgs.len());
        assert!(stats.final_tokens > 10, "still over budget by design");
    }
}

View File

@@ -1,424 +0,0 @@
//! Configuration for the helexa-acp bridge.
//!
//! Loaded from `$XDG_CONFIG_HOME/helexa-acp/config.toml` (or
//! `~/.config/helexa-acp/config.toml` as a fallback). If no config file
//! exists, falls back to building a single anonymous endpoint from env
//! vars — that keeps "just point at one cortex" frictionless without
//! requiring a config file on disk.
//!
//! The design goal is "the missing ACP binary for users with multiple
//! API endpoints (possibly on a private LAN, possibly mixing wire
//! types)". Hence: every endpoint is named, has its own wire API, and
//! has its own default model. The agent's selected model id can be
//! prefixed `endpoint:model` to route across endpoints; a bare
//! `model` falls through to the configured `default_endpoint`.
//!
//! ### Example TOML
//!
//! ```toml
//! default_endpoint = "helexa"
//!
//! [[endpoints]]
//! name = "helexa"
//! base_url = "http://hanzalova.internal:31313/v1"
//! wire_api = "openai-chat"
//! default_model = "helexa/large"
//!
//! [[endpoints]]
//! name = "openrouter"
//! base_url = "https://openrouter.ai/api/v1"
//! wire_api = "openai-chat"
//! api_key_env = "OPENROUTER_API_KEY"
//! default_model = "anthropic/claude-opus-4"
//!
//! [[endpoints]]
//! name = "lmstudio"
//! base_url = "http://localhost:1234/v1"
//! wire_api = "openai-chat"
//! default_model = "auto"
//! ```
use anyhow::{Context, anyhow};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use url::Url;
const DEFAULT_BASE_URL: &str = "http://hanzalova.internal:31313/v1";
const DEFAULT_MODEL: &str = "helexa/large";
const DEFAULT_ENDPOINT_NAME: &str = "default";
/// Top-level configuration: the endpoint list plus settings that are
/// not tied to a single endpoint. Built either by deserializing TOML
/// (`from_file`) or from environment variables (`from_env`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Config {
/// Name of the endpoint used when a request doesn't pick one
/// explicitly. Must reference an entry in `endpoints`. When left
/// unset in the file, `validate` fills it in with the name of the
/// first declared endpoint.
#[serde(default)]
pub default_endpoint: Option<String>,
/// Per-endpoint configuration. At least one entry is required;
/// `validate` rejects an empty list.
#[serde(default)]
pub endpoints: Vec<EndpointConfig>,
/// Optional path to a system-prompt file. When unset, the built-in
/// default prompt from `prompt.rs` is used.
#[serde(default)]
pub system_prompt_path: Option<PathBuf>,
}
/// Settings for one named upstream endpoint. Only `name` and
/// `base_url` are mandatory; every other field falls back to its
/// serde default when omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EndpointConfig {
/// Short identifier used in `endpoint:model` routing and in logs.
/// `Config::validate` rejects empty names and names containing `:`.
pub name: String,
/// Base URL of the OpenAI-compatible API. Must include the `/v1`
/// (or equivalent) suffix — paths like `chat/completions` and
/// `models` are joined onto this.
pub base_url: Url,
/// Wire protocol the endpoint speaks. Phase 1 supports
/// [`WireApi::OpenAiChat`] only; `openai-responses` and
/// `anthropic-messages` land later behind their own providers.
/// NOTE(review): `build_provider` in `main.rs` constructs a provider
/// for every variant, so the "Phase 1" remark may be stale — confirm.
#[serde(default)]
pub wire_api: WireApi,
/// Model to use when the client hasn't picked one via
/// `session/set_model`.
#[serde(default)]
pub default_model: Option<String>,
/// Static API key to send as `Authorization: Bearer …`. Prefer
/// `api_key_env` for anything sensitive — keys in plain TOML are a
/// liability. Takes precedence over `api_key_env` when both are set
/// (see `resolve_api_key`).
#[serde(default)]
pub api_key: Option<String>,
/// Env var name to read for the API key. Resolved at startup so a
/// missing env var yields a clear error rather than silent
/// unauthenticated calls.
#[serde(default)]
pub api_key_env: Option<String>,
/// Cap on the model's output tokens per turn. `None` lets the
/// upstream pick its own default (cortex/neuron's default is
/// often small enough to trip Zed's "Output Limit Reached" on
/// long responses). Set to e.g. `32768` to let the model
/// produce longer turns. Goes into the OpenAI `max_tokens`
/// request field.
#[serde(default)]
pub max_tokens: Option<u64>,
/// Model context window in tokens (prompt + response). When set,
/// the agent compacts conversation history before each completion
/// so the prompt fits within `context_window - max_tokens - safety`
/// tokens — long sessions on small-context local models (Qwen3 at
/// 32 K) survive past the first few tool-call rounds rather than
/// dying with `prompt_too_long`. `None` disables compaction.
#[serde(default)]
pub context_window: Option<usize>,
}
/// Wire protocol an endpoint speaks. (De)serialized under the
/// kebab-case names given by the `serde(rename)` attributes below;
/// the default is [`WireApi::OpenAiChat`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
pub enum WireApi {
/// `POST {base}/chat/completions` returning OpenAI-format SSE.
/// Compatible with cortex, LM Studio, Ollama (compat mode),
/// OpenRouter, OpenAI itself.
#[default]
#[serde(rename = "openai-chat")]
OpenAiChat,
/// `POST {base}/responses` — OpenAI's newer Responses API. Not
/// implemented yet; the variant is reserved so endpoint configs
/// can be authored ahead of provider support.
/// NOTE(review): `build_provider` in `main.rs` constructs an
/// `OpenAIResponsesProvider` for this variant, so "not implemented
/// yet" may be stale — confirm.
#[serde(rename = "openai-responses")]
OpenAiResponses,
/// `POST {base}/messages` — Anthropic format. Reserved.
/// NOTE(review): `build_provider` in `main.rs` constructs an
/// `AnthropicMessagesProvider` for this variant — confirm whether
/// "Reserved" still applies.
#[serde(rename = "anthropic-messages")]
AnthropicMessages,
}
impl EndpointConfig {
    /// Work out which API key, if any, this endpoint should send.
    ///
    /// A literal `api_key` wins; otherwise the variable named by
    /// `api_key_env` is read from the environment.
    ///
    /// # Errors
    ///
    /// Fails when `api_key_env` is set but the variable cannot be
    /// read. Returns `Ok(None)` when neither field is configured.
    pub fn resolve_api_key(&self) -> anyhow::Result<Option<String>> {
        match (&self.api_key, &self.api_key_env) {
            (Some(literal), _) => Ok(Some(literal.clone())),
            (None, Some(var)) => {
                let key = std::env::var(var).with_context(|| {
                    format!(
                        "endpoint '{}' references missing env var {}",
                        self.name, var
                    )
                })?;
                Ok(Some(key))
            }
            (None, None) => Ok(None),
        }
    }

    /// `{base_url}/chat/completions`.
    pub fn chat_completions_url(&self) -> Url {
        let segments = ["chat", "completions"];
        join_segments(&self.base_url, &segments)
    }

    /// `{base_url}/responses` — OpenAI Responses API endpoint.
    pub fn responses_url(&self) -> Url {
        let segments = ["responses"];
        join_segments(&self.base_url, &segments)
    }

    /// `{base_url}/models`. Called from `Provider::list_models`, which
    /// Stage 4 wires into the model-picker dropdown; until then it's
    /// reachable code with no in-tree callers.
    #[allow(dead_code)]
    pub fn models_url(&self) -> Url {
        let segments = ["models"];
        join_segments(&self.base_url, &segments)
    }
}
impl Config {
    /// Load from TOML at the standard config path, or build from env
    /// vars if no file exists there. Env-fallback yields a single
    /// endpoint named `"default"`.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Config::from_file`] or
    /// [`Config::from_env`].
    pub fn load() -> anyhow::Result<Self> {
        match config_path() {
            Some(path) if path.exists() => Self::from_file(&path),
            _ => Self::from_env(),
        }
    }

    /// Read env var `name`, treating "unset", "not valid Unicode" and
    /// "set to the empty string" all as absent.
    fn env_non_empty(name: &str) -> Option<String> {
        std::env::var(name).ok().filter(|value| !value.is_empty())
    }

    /// Single-endpoint config constructed from `HELEXA_ACP_BASE_URL`,
    /// `HELEXA_ACP_MODEL`, `HELEXA_ACP_API_KEY`,
    /// `HELEXA_ACP_SYSTEM_PROMPT_PATH`, `HELEXA_ACP_MAX_TOKENS` and
    /// `HELEXA_ACP_CONTEXT_WINDOW`.
    ///
    /// Every variable is optional, and an empty value counts as unset
    /// for all of them. The base URL and model fall back to the
    /// built-in defaults.
    ///
    /// # Errors
    ///
    /// Fails when the base URL does not parse or when one of the
    /// numeric variables is not an unsigned integer.
    pub fn from_env() -> anyhow::Result<Self> {
        let base_url =
            Self::env_non_empty("HELEXA_ACP_BASE_URL").unwrap_or_else(|| DEFAULT_BASE_URL.into());
        let base_url = Url::parse(&base_url)
            .with_context(|| format!("HELEXA_ACP_BASE_URL is not a valid URL ({base_url})"))?;
        let default_model =
            Self::env_non_empty("HELEXA_ACP_MODEL").unwrap_or_else(|| DEFAULT_MODEL.into());
        let api_key = Self::env_non_empty("HELEXA_ACP_API_KEY");
        let system_prompt_path =
            Self::env_non_empty("HELEXA_ACP_SYSTEM_PROMPT_PATH").map(PathBuf::from);
        let max_tokens = Self::env_non_empty("HELEXA_ACP_MAX_TOKENS")
            .map(|s| {
                s.parse::<u64>().with_context(|| {
                    format!("HELEXA_ACP_MAX_TOKENS is not a positive integer ({s})")
                })
            })
            .transpose()?;
        let context_window = Self::env_non_empty("HELEXA_ACP_CONTEXT_WINDOW")
            .map(|s| {
                s.parse::<usize>().with_context(|| {
                    format!("HELEXA_ACP_CONTEXT_WINDOW is not a positive integer ({s})")
                })
            })
            .transpose()?;
        Ok(Self {
            default_endpoint: Some(DEFAULT_ENDPOINT_NAME.into()),
            endpoints: vec![EndpointConfig {
                name: DEFAULT_ENDPOINT_NAME.into(),
                base_url,
                wire_api: WireApi::OpenAiChat,
                default_model: Some(default_model),
                api_key,
                api_key_env: None,
                max_tokens,
                context_window,
            }],
            system_prompt_path,
        })
    }

    /// Read and parse the TOML file at `path`, then validate it.
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be read, is not valid TOML for this
    /// schema, or does not pass [`Config::validate`].
    pub fn from_file(path: &Path) -> anyhow::Result<Self> {
        let text = std::fs::read_to_string(path)
            .with_context(|| format!("read config {}", path.display()))?;
        let mut cfg: Self =
            toml::from_str(&text).with_context(|| format!("parse config {}", path.display()))?;
        cfg.validate()?;
        Ok(cfg)
    }

    /// Check structural invariants and fill in `default_endpoint`
    /// when the file left it out.
    ///
    /// # Errors
    ///
    /// Fails when there are no endpoints, when an endpoint name is
    /// empty or contains `:`, or when `default_endpoint` names an
    /// endpoint that is not declared.
    fn validate(&mut self) -> anyhow::Result<()> {
        if self.endpoints.is_empty() {
            return Err(anyhow!("config has no [[endpoints]] entries"));
        }
        for (i, ep) in self.endpoints.iter().enumerate() {
            if ep.name.is_empty() {
                return Err(anyhow!("endpoints[{i}] has empty name"));
            }
            if ep.name.contains(':') {
                return Err(anyhow!(
                    "endpoints[{i}].name '{}' contains ':' which would clash \
                     with the endpoint:model selector syntax",
                    ep.name
                ));
            }
        }
        // Fall back to the first endpoint if none was named. The list
        // is known to be non-empty here, so indexing cannot panic.
        let default_name = self
            .default_endpoint
            .clone()
            .unwrap_or_else(|| self.endpoints[0].name.clone());
        if !self.endpoints.iter().any(|e| e.name == default_name) {
            return Err(anyhow!(
                "default_endpoint '{default_name}' is not declared in [[endpoints]]"
            ));
        }
        self.default_endpoint = Some(default_name);
        Ok(())
    }

    /// Look up an endpoint by name. Returns `None` if not configured.
    /// The first declared endpoint with that name is returned.
    pub fn endpoint(&self, name: &str) -> Option<&EndpointConfig> {
        self.endpoints.iter().find(|e| e.name == name)
    }

    /// The default endpoint (guaranteed to exist after `validate`).
    ///
    /// # Panics
    ///
    /// Panics if `default_endpoint` is unset or names an undeclared
    /// endpoint, which can only happen on a `Config` that was built
    /// by hand without going through `validate` or `from_env`.
    pub fn default_endpoint(&self) -> &EndpointConfig {
        let name = self
            .default_endpoint
            .as_deref()
            .expect("default_endpoint set by validate");
        self.endpoint(name)
            .expect("default_endpoint resolves after validate")
    }
}
/// Split an ACP-side `model` string into an optional endpoint name
/// and the raw model id.
///
/// `helexa:helexa/large` → (`Some("helexa")`, `"helexa/large"`).
/// `helexa/large` → (`None`, `"helexa/large"`).
///
/// Only the FIRST colon acts as the separator, so a model id that
/// itself contains `:` still routes correctly as long as it is
/// written with an explicit endpoint prefix (`a:b:c` → endpoint `a`,
/// model `b:c`). When either side of that colon is empty the whole
/// input is treated as a bare model id.
pub fn parse_model_selector(input: &str) -> (Option<&str>, &str) {
    let Some(colon) = input.find(':') else {
        return (None, input);
    };
    let endpoint = &input[..colon];
    let model = &input[colon + 1..];
    if endpoint.is_empty() || model.is_empty() {
        (None, input)
    } else {
        (Some(endpoint), model)
    }
}
/// Where the config file is expected to live.
///
/// Resolution order:
/// 1. `HELEXA_ACP_CONFIG_PATH`, used verbatim whenever it is set.
/// 2. `$XDG_CONFIG_HOME/helexa-acp/config.toml` when that variable is
///    set and non-empty.
/// 3. `$HOME/.config/helexa-acp/config.toml`.
///
/// Returns `None` when none of the variables yields a base directory.
fn config_path() -> Option<PathBuf> {
    use std::env::var;

    match var("HELEXA_ACP_CONFIG_PATH") {
        Ok(explicit) => Some(PathBuf::from(explicit)),
        Err(_) => {
            let config_home = match var("XDG_CONFIG_HOME") {
                Ok(dir) if !dir.is_empty() => PathBuf::from(dir),
                _ => PathBuf::from(var("HOME").ok()?).join(".config"),
            };
            Some(config_home.join("helexa-acp").join("config.toml"))
        }
    }
}
/// Append `segments` to the path of `base` and return the result as
/// a new URL. A trailing empty path segment (i.e. a trailing `/` on
/// `base`) is dropped first so no doubled slash appears. If the URL
/// has no editable path segments, an unmodified copy is returned.
fn join_segments(base: &Url, segments: &[&str]) -> Url {
    let mut joined = base.clone();
    match joined.path_segments_mut() {
        Ok(mut path) => {
            path.pop_if_empty().extend(segments.iter().copied());
        }
        // Nothing to append to; fall through with the plain clone.
        Err(_) => {}
    }
    joined
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal endpoint rooted at `base_url`; everything optional unset.
    fn endpoint_at(base_url: &str) -> EndpointConfig {
        EndpointConfig {
            name: "x".into(),
            base_url: Url::parse(base_url).unwrap(),
            wire_api: WireApi::OpenAiChat,
            default_model: None,
            api_key: None,
            api_key_env: None,
            max_tokens: None,
            context_window: None,
        }
    }

    #[test]
    fn url_join_handles_trailing_slash() {
        // Same expected URLs with and without a trailing slash on the
        // base — the trailing-slash form is what the test name is about.
        for base in ["http://h.internal:31313/v1", "http://h.internal:31313/v1/"] {
            let ep = endpoint_at(base);
            assert_eq!(
                ep.chat_completions_url().as_str(),
                "http://h.internal:31313/v1/chat/completions",
                "base {base}"
            );
            assert_eq!(
                ep.models_url().as_str(),
                "http://h.internal:31313/v1/models",
                "base {base}"
            );
            assert_eq!(
                ep.responses_url().as_str(),
                "http://h.internal:31313/v1/responses",
                "base {base}"
            );
        }
    }

    #[test]
    fn parses_model_selector() {
        assert_eq!(
            parse_model_selector("helexa:helexa/large"),
            (Some("helexa"), "helexa/large")
        );
        assert_eq!(parse_model_selector("helexa/large"), (None, "helexa/large"));
        assert_eq!(parse_model_selector("gpt-5"), (None, "gpt-5"));
        // Edge case: a leading colon → no endpoint.
        assert_eq!(parse_model_selector(":gpt-5"), (None, ":gpt-5"));
    }

    #[test]
    fn env_fallback_builds_single_endpoint() {
        // Clear every variable `from_env` reads so a value inherited
        // from the developer's shell cannot change (or fail) the
        // default path this test is checking.
        //
        // SAFETY: env mutation is process-global.
        // NOTE(review): no other test visible here reads or writes
        // these HELEXA_ACP_* variables — confirm before adding one.
        unsafe {
            std::env::remove_var("HELEXA_ACP_BASE_URL");
            std::env::remove_var("HELEXA_ACP_MODEL");
            std::env::remove_var("HELEXA_ACP_API_KEY");
            std::env::remove_var("HELEXA_ACP_SYSTEM_PROMPT_PATH");
            std::env::remove_var("HELEXA_ACP_MAX_TOKENS");
            std::env::remove_var("HELEXA_ACP_CONTEXT_WINDOW");
        }
        let cfg = Config::from_env().unwrap();
        assert_eq!(cfg.endpoints.len(), 1);
        assert_eq!(cfg.endpoints[0].name, "default");
        assert_eq!(cfg.endpoints[0].base_url.as_str(), DEFAULT_BASE_URL);
        assert_eq!(
            cfg.endpoints[0].default_model.as_deref(),
            Some(DEFAULT_MODEL)
        );
        assert_eq!(cfg.endpoints[0].max_tokens, None);
        assert_eq!(cfg.endpoints[0].context_window, None);
        assert_eq!(cfg.default_endpoint().name, "default");
    }

    #[test]
    fn toml_parses_multi_endpoint() {
        let toml_text = r#"
default_endpoint = "helexa"
[[endpoints]]
name = "helexa"
base_url = "http://hanzalova.internal:31313/v1"
default_model = "helexa/large"
[[endpoints]]
name = "openrouter"
base_url = "https://openrouter.ai/api/v1"
wire_api = "openai-chat"
api_key_env = "OPENROUTER_API_KEY"
default_model = "anthropic/claude-opus-4"
"#;
        let mut cfg: Config = toml::from_str(toml_text).unwrap();
        cfg.validate().unwrap();
        assert_eq!(cfg.endpoints.len(), 2);
        assert_eq!(cfg.default_endpoint().name, "helexa");
        assert_eq!(cfg.endpoints[0].wire_api, WireApi::OpenAiChat);
        assert_eq!(
            cfg.endpoints[1].api_key_env.as_deref(),
            Some("OPENROUTER_API_KEY")
        );
    }

    #[test]
    fn validate_rejects_colon_in_endpoint_name() {
        let toml_text = r#"
[[endpoints]]
name = "bad:name"
base_url = "http://x/v1"
"#;
        let mut cfg: Config = toml::from_str(toml_text).unwrap();
        let err = cfg.validate().unwrap_err();
        assert!(format!("{err}").contains("clash"));
    }
}

View File

@@ -1,145 +0,0 @@
//! helexa-acp — Agent Client Protocol bridge for multi-endpoint LLM
//! setups (helexa, LM Studio, Ollama, OpenRouter, OpenAI, Anthropic,
//! …) with a clean per-endpoint wire-format selector.
//!
//! Speaks ACP over stdio to an editor client (Zed today). Every
//! configured endpoint produces a wire-format-specific
//! [`provider::Provider`] implementation; the agent loop in
//! [`agent::Agent`] is provider-agnostic, so adding e.g. an Anthropic
//! /v1/messages provider doesn't touch `agent.rs`.
//!
//! Config: `$XDG_CONFIG_HOME/helexa-acp/config.toml` for the multi-
//! endpoint case; env vars (`HELEXA_ACP_BASE_URL`, etc.) for the
//! single-endpoint case when no config file exists.
use agent_client_protocol::{Result, Stdio};
use std::sync::Arc;
mod agent;
mod compaction;
mod config;
mod path_util;
mod prompt;
mod provider;
mod qwen3;
mod session;
mod store;
mod tool_runner;
mod tools;
use agent::Agent;
use config::{Config, EndpointConfig, WireApi};
use provider::{
Provider, anthropic_messages::AnthropicMessagesProvider, openai_chat::OpenAIChatProvider,
openai_responses::OpenAIResponsesProvider,
};
/// Install the global tracing subscriber.
///
/// Output goes to stderr unless `HELEXA_ACP_LOG_FILE` names a file,
/// in which case log lines are appended there instead — stdout is
/// reserved for the JSON-RPC stream, and editors such as Zed do not
/// surface the agent's stderr. If the file cannot be opened, logging
/// falls back to stderr and a warning is emitted.
///
/// Levels come from `RUST_LOG` (e.g. `helexa_acp=debug`), defaulting
/// to `info`. ANSI colours are disabled for file output so the log
/// is plain text.
fn init_tracing() {
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
    let requested = std::env::var("HELEXA_ACP_LOG_FILE")
        .ok()
        .filter(|s| !s.is_empty());

    // No log file requested: plain stderr logging.
    let Some(path) = requested else {
        tracing_subscriber::fmt()
            .with_writer(std::io::stderr)
            .with_env_filter(env_filter)
            .init();
        return;
    };

    let opened = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&path);
    match opened {
        Ok(file) => {
            tracing_subscriber::fmt()
                .with_writer(std::sync::Mutex::new(file))
                .with_env_filter(env_filter)
                .with_ansi(false)
                .init();
        }
        Err(e) => {
            // A typo'd log path must not silence the agent: log to
            // stderr instead and say why.
            tracing_subscriber::fmt()
                .with_writer(std::io::stderr)
                .with_env_filter(env_filter)
                .init();
            tracing::warn!(
                path = %path,
                error = %e,
                "HELEXA_ACP_LOG_FILE could not be opened; using stderr"
            );
        }
    }
}
/// Construct the provider matching `endpoint.wire_api`. This is the
/// single place where a wire type is mapped to its implementation,
/// so new wire types slot in here without changing the caller.
///
/// # Errors
///
/// Propagates whatever the chosen provider's constructor reports.
fn build_provider(endpoint: EndpointConfig) -> anyhow::Result<Arc<dyn Provider>> {
    let provider: Arc<dyn Provider> = match endpoint.wire_api {
        WireApi::OpenAiChat => Arc::new(OpenAIChatProvider::new(endpoint)?),
        WireApi::OpenAiResponses => Arc::new(OpenAIResponsesProvider::new(endpoint)?),
        WireApi::AnthropicMessages => Arc::new(AnthropicMessagesProvider::new(endpoint)?),
    };
    Ok(provider)
}
/// Entry point: set up logging, load the config, build one provider
/// per endpoint, then serve ACP over stdio.
#[tokio::main]
async fn main() -> Result<()> {
    init_tracing();
    let cfg = Config::load()
        .map_err(|e| agent_client_protocol::util::internal_error(format!("config: {e:#}")))?;
    tracing::info!(
        endpoints = cfg.endpoints.len(),
        default_endpoint = %cfg.default_endpoint().name,
        default_model = ?cfg.default_endpoint().default_model,
        "helexa-acp starting"
    );
    // Providers are built eagerly, one per configured endpoint. An
    // endpoint whose provider cannot be constructed is logged and
    // skipped rather than aborting startup, so the mistake shows up
    // in the log before the editor sends its first request.
    let mut providers: Vec<Arc<dyn Provider>> = Vec::with_capacity(cfg.endpoints.len());
    for endpoint in &cfg.endpoints {
        let provider = match build_provider(endpoint.clone()) {
            Ok(provider) => provider,
            Err(e) => {
                tracing::warn!(
                    endpoint = %endpoint.name,
                    error = %format!("{e:#}"),
                    "skipping endpoint with invalid config"
                );
                continue;
            }
        };
        tracing::info!(
            endpoint = %endpoint.name,
            base_url = %endpoint.base_url,
            wire_api = ?endpoint.wire_api,
            "registered provider"
        );
        providers.push(provider);
    }
    let agent = Agent::new(&cfg, providers)
        .await
        .map_err(|e| agent_client_protocol::util::internal_error(format!("agent: {e:#}")))?;
    agent.serve(Stdio::new()).await
}

View File

@@ -1,192 +0,0 @@
//! Path expansion shared across every tool that takes a path.
//!
//! Models often emit shell-style paths like `~/git/repo/file.rs` or
//! `$HOME/notes.md`. ACP's `fs/read_text_file` and friends — and our
//! own local `std::fs` reads — both want a real absolute path; the
//! `~` / `$HOME` forms reach them as literal strings and the open
//! fails. The tool schemas already document "absolute path" but in
//! practice the model slips up often enough that handling it
//! server-side is the difference between "works" and "the agent is
//! brittle".
//!
//! Scope is deliberately small:
//!
//! - `~` and `~/` (current user only — `~user` lookups would require
//! pulling in passwd parsing).
//! - `$HOME` and `$HOME/`.
//!
//! Any other shell variable (`$PWD`, `${HOME}`, …) passes through
//! unchanged. The shell already expands them inside `bash` tool
//! commands; for the file-tool argument fields, we deliberately
//! limit the set so the behaviour is predictable.
//!
//! Falls back to the input path verbatim when `HOME` is unset
//! (stripped-down container env). That preserves the "no surprise
//! mutations" rule — never invent a path the caller didn't ask for.
use std::path::{Path, PathBuf};
/// Process-global lock for tests that mutate `HOME`. Anyone in the
/// crate touching `HOME` must hold this for the duration of the
/// read-modify-restore window — otherwise concurrent `cargo test`
/// workers race and flake.
///
/// Only built into the test binaries. Production code never mutates
/// env vars.
#[cfg(test)]
pub(crate) static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
/// Expand `~`, `~/`, `$HOME`, and `$HOME/` prefixes against the
/// current user's home directory. All other inputs pass through
/// unchanged.
///
/// Returns the input verbatim when it is not valid UTF-8, or when
/// `HOME` is unset or empty — an empty `HOME` would otherwise turn
/// `~/x` into the relative path `x`, a path the caller never asked
/// for. `HOME` itself may contain non-UTF-8 bytes.
///
/// Extra slashes after the prefix (`~//etc`) are ignored so the
/// result always stays below the home directory; `Path::join` would
/// otherwise treat the remainder as absolute and discard the home
/// prefix.
pub fn expand_path(input: &Path) -> PathBuf {
    let Some(s) = input.to_str() else {
        return input.to_path_buf();
    };
    // `None` = the input is exactly the home directory; `Some(rest)` =
    // a path below it. Anything else is not ours to expand.
    let below_home = if s == "~" || s == "$HOME" {
        None
    } else if let Some(rest) = s.strip_prefix("~/").or_else(|| s.strip_prefix("$HOME/")) {
        Some(rest.trim_start_matches('/'))
    } else {
        return input.to_path_buf();
    };
    // `var_os` rather than `var`: a home directory need not be valid
    // UTF-8.
    let Some(home) = std::env::var_os("HOME").filter(|h| !h.is_empty()) else {
        return input.to_path_buf();
    };
    let home = PathBuf::from(home);
    match below_home {
        None => home,
        Some(rest) => home.join(rest),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Holds [`ENV_LOCK`] and a snapshot of `HOME`; restores `HOME`
    /// when dropped. Restoring in `Drop` means a failing assertion
    /// inside a test body no longer leaks a modified `HOME` into the
    /// tests that run afterwards.
    struct HomeGuard {
        prior: Option<std::ffi::OsString>,
        // Dropped after `Drop::drop` has run, so `HOME` is restored
        // while the lock is still held.
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl HomeGuard {
        /// Take the lock, then set `HOME` to `home` (`None` removes it).
        fn set(home: Option<&str>) -> Self {
            // A test that panicked while holding the lock poisons it.
            // The guarded data is `()`, so the poison carries no
            // meaning — recover instead of failing every later test.
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let prior = std::env::var_os("HOME");
            // SAFETY: tests touch process-global env. ENV_LOCK
            // serialises access; sub-threads in other test modules
            // touching HOME aren't expected (none in this crate).
            unsafe {
                match home {
                    Some(h) => std::env::set_var("HOME", h),
                    None => std::env::remove_var("HOME"),
                }
            }
            Self { prior, _lock: lock }
        }
    }

    impl Drop for HomeGuard {
        fn drop(&mut self) {
            // SAFETY: still holding ENV_LOCK (see `_lock`).
            unsafe {
                match self.prior.take() {
                    Some(p) => std::env::set_var("HOME", p),
                    None => std::env::remove_var("HOME"),
                }
            }
        }
    }

    /// Set HOME for the duration of the test. Tests using this run
    /// serially under the crate-wide [`ENV_LOCK`] because env
    /// mutation isn't thread-safe — `cargo test` parallel workers
    /// would race without it.
    fn with_home<F: FnOnce()>(home: &str, body: F) {
        let _guard = HomeGuard::set(Some(home));
        body();
    }

    #[test]
    fn expands_tilde_slash() {
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("~/git/repo/file.rs")),
                PathBuf::from("/home/me/git/repo/file.rs")
            );
        });
    }

    #[test]
    fn expands_bare_tilde() {
        with_home("/home/me", || {
            assert_eq!(expand_path(Path::new("~")), PathBuf::from("/home/me"));
        });
    }

    #[test]
    fn expands_dollar_home_slash() {
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("$HOME/notes.md")),
                PathBuf::from("/home/me/notes.md")
            );
        });
    }

    #[test]
    fn expands_bare_dollar_home() {
        with_home("/home/me", || {
            assert_eq!(expand_path(Path::new("$HOME")), PathBuf::from("/home/me"));
        });
    }

    #[test]
    fn absolute_path_passes_through() {
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("/etc/hostname")),
                PathBuf::from("/etc/hostname")
            );
        });
    }

    #[test]
    fn relative_path_passes_through() {
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("src/main.rs")),
                PathBuf::from("src/main.rs")
            );
        });
    }

    #[test]
    fn tilde_user_form_not_expanded() {
        // ~other is shell sugar for /home/other and would require
        // passwd parsing to resolve. Out of scope — pass it
        // through and let the open fail with a clear error.
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("~other/x")),
                PathBuf::from("~other/x")
            );
        });
    }

    #[test]
    fn no_home_env_passes_through() {
        // Same crate-wide lock as `with_home` — otherwise a parallel
        // test setting HOME races this clear-and-assert window.
        let _guard = HomeGuard::set(None);
        assert_eq!(
            expand_path(Path::new("~/git/repo")),
            PathBuf::from("~/git/repo")
        );
    }

    #[test]
    fn dollar_other_var_not_expanded() {
        with_home("/home/me", || {
            assert_eq!(
                expand_path(Path::new("$PWD/file")),
                PathBuf::from("$PWD/file")
            );
            assert_eq!(
                expand_path(Path::new("${HOME}/file")),
                PathBuf::from("${HOME}/file")
            );
        });
    }
}

View File

@@ -1,274 +0,0 @@
//! System prompt assembly.
//!
//! The system message has two parts:
//!
//! 1. A short human-readable preamble (working directory, style
//! instructions). Either the built-in [`DEFAULT_PROMPT`] or a
//! user-supplied file at `HELEXA_ACP_SYSTEM_PROMPT_PATH` /
//! `system_prompt_path`. `{cwd}` is substituted in both.
//! 2. A `# Tools` block in Qwen3 Hermes format (see [`crate::qwen3`])
//! describing the available functions. This is what makes the
//! model actually call them — neuron/cortex don't honour the
//! OpenAI `tools` API field, so the tool list has to live in the
//! prompt itself.
use agent_client_protocol::schema::SessionModeId;
use anyhow::Context;
use std::path::Path;
use crate::provider::ToolSpec;
use crate::qwen3;
use crate::session::MODE_PLAN;
/// Built-in preamble used by [`build_system_prompt`] when no override
/// template path is supplied. `{cwd}` is a placeholder that
/// [`build_system_prompt`] replaces with the session working directory.
/// The leading `\` only suppresses the newline right after the opening
/// quote; every other line break below is part of the prompt text.
const DEFAULT_PROMPT: &str = "\
You are helexa-acp, a coding assistant working inside an editor.
Working directory: {cwd}
Use the tools described below whenever the user's request involves
looking at or modifying files, or running commands. Do not ask the
user to paste file contents you could read yourself. All file paths
must be absolute. Writes and shell commands may prompt the user for
permission depending on the session mode.
Be concise; the user is reading your output in an editor pane.";
/// Assemble the full system prompt for a session.
///
/// The result is built in three steps: the preamble template with
/// `{cwd}` substituted, then the `# Tools` block, then (plan mode
/// only) the plan-mode addendum.
///
/// - `cwd`: session working directory; replaces every `{cwd}` in the
///   preamble, whether that is the built-in default or a user template.
/// - `override_path`: path to a user-supplied template, already
///   resolved by [`crate::config::Config`]. The `# Tools` block is
///   appended *after* it so a custom preamble still carries the tool
///   descriptions the model needs.
/// - `tools`: tools to advertise. An empty slice means no `# Tools`
///   block is appended at all.
/// - `mode`: current session mode. For [`MODE_PLAN`] the plan-mode
///   addendum (restrictions plus completion menu) goes *after* the
///   `# Tools` block so it is the last thing the model reads before
///   user input.
/// - `plan_dir`: resolved plan directory for the cwd, consulted only
///   in plan mode. `None` means it could not be resolved (no `HOME` /
///   `XDG_DATA_HOME`); the addendum still renders, with a placeholder
///   telling the model to surface the problem rather than guess a path.
///
/// # Errors
///
/// Fails if `override_path` is given and the file cannot be read.
pub fn build_system_prompt(
    cwd: &Path,
    override_path: Option<&Path>,
    tools: &[ToolSpec],
    mode: &SessionModeId,
    plan_dir: Option<&Path>,
) -> anyhow::Result<String> {
    let template = if let Some(path) = override_path {
        std::fs::read_to_string(path)
            .with_context(|| format!("read system prompt from {}", path.display()))?
    } else {
        DEFAULT_PROMPT.to_string()
    };

    let cwd_text = cwd.display().to_string();
    let mut prompt = template.replace("{cwd}", &cwd_text);

    let tool_block = qwen3::render_tool_block(tools);
    prompt.push_str(&tool_block);

    let in_plan_mode = mode.0.as_ref() == MODE_PLAN;
    if in_plan_mode {
        let addendum = render_plan_mode_block(plan_dir);
        prompt.push_str(&addendum);
    }
    Ok(prompt)
}
/// Render the plan-mode addendum appended to the system prompt.
///
/// The block tells the model three things:
///
/// 1. Where it may write — only inside `plan_dir`.
/// 2. What it may *not* do — bash is disabled, and writes outside
///    `plan_dir` are refused by the runtime.
/// 3. How to finish — by emitting the 3-option menu so the user can
///    switch modes and either start implementation (with or without
///    permission prompts) or keep iterating on the plan.
///
/// When `plan_dir` is `None`, a placeholder asking the model to tell
/// the user is interpolated in place of the directory path.
fn render_plan_mode_block(plan_dir: Option<&Path>) -> String {
    let plan_path = match plan_dir {
        Some(dir) => dir.display().to_string(),
        None => String::from("<plan directory could not be resolved — tell the user>"),
    };
    // Each source line ends in `\n\`: an explicit newline followed by a
    // line continuation, so the indentation below never reaches the output.
    format!(
        "\n\n# Plan mode\n\
         \n\
         You are in **plan mode**. Your task is to draft a written\n\
         implementation plan for the user; you must NOT modify any\n\
         project files or run shell commands.\n\
         \n\
         Rules in plan mode:\n\
         \n\
         - `read_file` and `list_dir` are unrestricted — use them to\n\
         explore the codebase as needed.\n\
         - `write_file` and `edit_file` are allowed ONLY under the\n\
         plan directory: `{plan_path}`. The runtime will refuse any\n\
         write outside it.\n\
         - `bash` is disabled. Do not call it.\n\
         \n\
         Write the plan as one or more Markdown files under\n\
         `{plan_path}`. Use descriptive filenames\n\
         (`01-overview.md`, `02-data-model.md`, etc.). It is fine to\n\
         iterate — overwrite the file when you refine a section.\n\
         \n\
         When the plan is complete, do NOT begin implementation.\n\
         Instead, end your turn with this menu, verbatim, so the\n\
         user can choose how to proceed:\n\
         \n\
         ---\n\
         **Plan complete.** To proceed, switch the session mode in\n\
         the agent dropdown and send a follow-up message:\n\
         \n\
         1. **Bypass Permissions** — implement the plan now, skipping\n\
         per-tool permission prompts.\n\
         2. **Default** — implement the plan now, prompting before\n\
         each write or shell command.\n\
         3. **Plan** (stay here) — refine the plan; reply with the\n\
         change you want and I will revise it.\n\
         ---\n"
    )
}
#[cfg(test)]
mod tests {
    //! Unit tests for system-prompt assembly: `{cwd}` substitution,
    //! tool-block rendering, override templates, and the plan-mode
    //! addendum.
    use super::*;
    use crate::session::{MODE_DEFAULT, MODE_PLAN};
    use std::io::Write;

    /// Session mode id for the default (non-plan) mode.
    fn default_mode() -> SessionModeId {
        SessionModeId::new(MODE_DEFAULT)
    }

    /// Session mode id for plan mode.
    fn plan_mode() -> SessionModeId {
        SessionModeId::new(MODE_PLAN)
    }

    #[test]
    fn default_prompt_substitutes_cwd() {
        let prompt =
            build_system_prompt(Path::new("/home/me/proj"), None, &[], &default_mode(), None)
                .unwrap();
        assert!(
            prompt.contains("/home/me/proj"),
            "cwd not interpolated: {prompt}"
        );
        assert!(prompt.contains("helexa-acp"));
        assert!(
            !prompt.contains("{cwd}"),
            "left-over placeholder in default prompt"
        );
        // With no tools, the # Tools block is absent.
        assert!(!prompt.contains("# Tools"));
        // Default mode does not get the plan-mode addendum.
        assert!(!prompt.contains("# Plan mode"));
    }

    #[test]
    fn tools_are_appended_in_hermes_format() {
        let spec = ToolSpec {
            name: "read_file".into(),
            description: "Read a file.".into(),
            parameters: serde_json::json!({"type":"object","properties":{}, "required":[]}),
        };
        let prompt =
            build_system_prompt(Path::new("/x"), None, &[spec], &default_mode(), None).unwrap();
        assert!(prompt.contains("# Tools"));
        assert!(prompt.contains("<tools>"));
        assert!(prompt.contains("\"name\":\"read_file\""));
        assert!(prompt.contains("<tool_call>"));
    }

    #[test]
    fn override_path_is_read_and_templated() {
        let mut tmp = tempfile_in_target("prompt.txt");
        tmp.write_all(b"custom prompt for {cwd} only").unwrap();
        tmp.flush().unwrap();
        let path = tmp.path().to_path_buf();
        drop(tmp);
        let result = build_system_prompt(
            Path::new("/etc"),
            Some(path.as_path()),
            &[],
            &default_mode(),
            None,
        );
        // Delete the fixture before asserting: a failed assertion
        // unwinds immediately and would otherwise leave the file behind.
        let _ = std::fs::remove_file(&path);
        assert_eq!(
            result.expect("read override"),
            "custom prompt for /etc only"
        );
    }

    #[test]
    fn missing_override_path_errors() {
        let err = build_system_prompt(
            Path::new("/tmp"),
            Some(Path::new("/definitely/not/a/real/path")),
            &[],
            &default_mode(),
            None,
        )
        .unwrap_err();
        assert!(format!("{err:#}").contains("read system prompt"));
    }

    #[test]
    fn plan_mode_addendum_includes_plan_dir_and_menu() {
        let plan_dir = Path::new("/home/me/.local/share/helexa-acp/plans/proj-deadbeef");
        let prompt = build_system_prompt(
            Path::new("/home/me/proj"),
            None,
            &[],
            &plan_mode(),
            Some(plan_dir),
        )
        .unwrap();
        assert!(prompt.contains("# Plan mode"));
        assert!(
            prompt.contains(plan_dir.to_str().unwrap()),
            "plan dir not interpolated: {prompt}"
        );
        // The 3-option menu must be present so the model emits it verbatim.
        assert!(prompt.contains("Bypass Permissions"));
        assert!(prompt.contains("**Default**"));
        assert!(prompt.contains("3. **Plan**"));
        // Bash disabled instruction must be present.
        assert!(prompt.contains("`bash` is disabled"));
    }

    #[test]
    fn plan_mode_addendum_handles_unresolved_plan_dir() {
        let prompt =
            build_system_prompt(Path::new("/home/me/proj"), None, &[], &plan_mode(), None).unwrap();
        assert!(prompt.contains("# Plan mode"));
        assert!(prompt.contains("could not be resolved"));
    }

    /// Tiny temp-file helper that doesn't pull in the `tempfile` crate.
    ///
    /// The file is created in the directory named by the
    /// `CARGO_TARGET_TMPDIR` environment variable when that variable is
    /// set at run time, and in the system temp directory otherwise. The
    /// file name embeds the process id so concurrent test processes do
    /// not collide. Nothing is deleted automatically — the caller
    /// removes the file when done.
    fn tempfile_in_target(name: &str) -> TempHandle {
        let base = std::env::var("CARGO_TARGET_TMPDIR")
            .ok()
            .map(std::path::PathBuf::from)
            .unwrap_or_else(std::env::temp_dir);
        // Best effort: if the directory cannot be created, the
        // `File::create` below reports the failure.
        let _ = std::fs::create_dir_all(&base);
        let pid = std::process::id();
        let path = base.join(format!("helexa-acp-{pid}-{name}"));
        let file = std::fs::File::create(&path).expect("create temp file");
        TempHandle { file, path }
    }

    /// An open temp file together with the path it was created at.
    struct TempHandle {
        file: std::fs::File,
        path: std::path::PathBuf,
    }

    impl TempHandle {
        /// Location of the file on disk.
        fn path(&self) -> &Path {
            &self.path
        }
    }

    /// Writes are forwarded straight to the underlying file.
    impl Write for TempHandle {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            self.file.write(buf)
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.file.flush()
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,230 +0,0 @@
//! Provider trait — the seam between the ACP-side agent loop and
//! whatever wire protocol an endpoint actually speaks.
//!
//! Every concrete provider (OpenAI chat completions, OpenAI Responses,
//! Anthropic /v1/messages, Ollama native, …) implements
//! [`Provider`]. The agent constructs a [`CompletionRequest`] using
//! provider-agnostic types and consumes a stream of
//! [`CompletionEvent`]s — neither end knows which wire format is on
//! the other side of the trait.
//!
//! Day-1 provider: [`openai_chat::OpenAIChatProvider`]. Day-N
//! providers slot in without touching `agent.rs`.
use async_trait::async_trait;
use futures::stream::BoxStream;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use tokio_util::sync::CancellationToken;
pub mod anthropic_messages;
pub mod openai_chat;
pub mod openai_responses;
/// Provider-agnostic LLM endpoint. Implementations translate between
/// [`CompletionRequest`] / [`CompletionEvent`] and whatever wire
/// format their endpoint speaks.
///
/// Implementors must be `Send + Sync` (supertrait bound), so a single
/// provider value can be shared between threads.
#[async_trait]
pub trait Provider: Send + Sync {
/// Endpoint name as configured by the user (e.g. `"helexa"`,
/// `"openrouter"`). Used in logs and in the `endpoint:model`
/// selector.
fn name(&self) -> &str;
/// List models available at this endpoint. Used to build the
/// model-picker dropdown in editor clients (Stage 4). Should
/// return quickly (cache if necessary).
// `dead_code` is allowed because nothing calls this until Stage 4.
#[allow(dead_code)]
async fn list_models(&self) -> anyhow::Result<Vec<ModelInfo>>;
/// Run a chat completion. Returns a stream of provider-agnostic
/// events. The stream stops when the upstream finishes, when
/// `cancel` is fired, or when the stream is dropped.
///
/// There are two error layers: the outer `Result` covers failures
/// before any stream exists, and each stream item is itself a
/// `Result` for failures that occur mid-stream.
async fn complete(
&self,
request: CompletionRequest,
cancel: CancellationToken,
) -> anyhow::Result<BoxStream<'static, anyhow::Result<CompletionEvent>>>;
}
/// One model exposed by a provider. Constructed by `list_models` —
/// Stage 4 is when the agent loop starts consuming it for the
/// model-picker dropdown.
// `dead_code` is allowed until Stage 4 starts reading the fields.
#[allow(dead_code)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelInfo {
/// Model identifier as reported by the endpoint.
pub id: String,
/// Human-friendly name, if the endpoint exposes one. Otherwise
/// `id` is used as the display name.
// `serde(default)`: a missing field deserialises to `None`.
#[serde(default)]
pub display_name: Option<String>,
}
/// Inputs to a completion. Provider-agnostic — concrete providers
/// translate this into their wire format.
#[derive(Debug, Clone)]
pub struct CompletionRequest {
/// Endpoint-local model id (without the `endpoint:` prefix).
pub model: String,
/// Conversation history, in order.
pub messages: Vec<Message>,
/// Tools the model is allowed to call. Empty list means no tool
/// support advertised.
pub tools: Vec<ToolSpec>,
/// Sampling temperature; `None` leaves the choice to the provider.
pub temperature: Option<f64>,
/// Nucleus-sampling parameter; `None` leaves the choice to the provider.
pub top_p: Option<f64>,
/// Cap on the response length in tokens; `None` means no explicit cap
/// is requested.
pub max_tokens: Option<u64>,
}
/// One turn of conversation history: who spoke ([`Role`]) and what was
/// said ([`MessageContent`]). Serialisable so history can be persisted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Message {
/// Author of the turn.
pub role: Role,
/// Payload of the turn.
pub content: MessageContent,
}
/// Author of a [`Message`]. Serialised in snake_case (`"system"`,
/// `"user"`, `"assistant"`, `"tool"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Role {
/// System prompt / instructions.
System,
/// Input from the human user.
User,
/// Output produced by the model.
Assistant,
/// Tool result message. Provider impls turn this into whatever
/// shape the upstream wire format wants (OpenAI uses
/// `role: "tool"` + `tool_call_id`; Anthropic uses content blocks).
/// Stage 3 (tools) constructs this; Stage 2 never does.
Tool,
}
/// Payload of a [`Message`].
///
/// Serialised with serde internal tagging: each value is a JSON object
/// whose `type` field holds the snake_case variant name (`"text"`,
/// `"multi_part"`, `"tool_calls"`, `"tool_result"`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MessageContent {
/// Plain text turn (system / user / assistant). Struct variant
/// rather than newtype so the persisted JSON has an explicit
/// `text` field — that lets us use internal tagging on the
/// enum, which is incompatible with newtype-of-primitive
/// variants.
Text { text: String },
/// Mixed text + image user turn. Stage 5 introduces this when
/// Zed sends an `ImageContent` block alongside the user's prompt.
/// Providers that don't support vision should down-convert by
/// dropping image parts and concatenating text parts.
MultiPart { parts: Vec<MessagePart> },
/// Assistant turn that called one or more tools. Stage 3 starts
/// constructing this when the provider stream yields a
/// `ToolCallStart` / `ToolCallArgsDelta` sequence.
ToolCalls {
/// Optional text the assistant said alongside the tool calls.
text: Option<String>,
/// The tool invocations made in this turn.
calls: Vec<ToolCall>,
},
/// Tool result. `tool_call_id` matches the assistant's call id.
/// Stage 3 constructs this after the tool runner finishes.
ToolResult {
/// Id of the [`ToolCall`] this result answers.
tool_call_id: String,
/// Textual output of the tool.
content: String,
},
}
/// One part of a [`MessageContent::MultiPart`] message.
///
/// Internally tagged like [`MessageContent`]: the JSON object carries a
/// `type` field of `"text"` or `"image"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MessagePart {
/// A run of plain text.
Text { text: String },
/// An inline image attachment.
Image(ImageData),
}
/// Inline image attachment. `data` is base64-encoded raw image
/// bytes; the encoder constructs an `image_url` data URI from it
/// at request time. `uri` carries any pointer the client supplied
/// (e.g. `file:///tmp/x.png`) — we keep it on the message for
/// debugging / future providers but the OpenAI encoder ignores it
/// when `data` is present (data wins, since it round-trips through
/// every wire format).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageData {
/// MIME type of the image; it becomes the media-type part of the
/// `data:` URI.
pub mime_type: String,
/// Base64-encoded image bytes (no `data:` prefix, no padding
/// stripped — exactly what `ImageContent.data` carried).
pub data: String,
/// Optional client-supplied pointer to the image. A missing field
/// deserialises to `None`, and `None` is left out when serialising.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub uri: Option<String>,
}
/// One tool invocation requested by the assistant, as stored in
/// [`MessageContent::ToolCalls`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCall {
/// Provider-assigned id that ties the call to its result. The
/// Qwen3 wire format we use today doesn't carry this on the
/// model side (calls and results are matched positionally inside
/// a turn), so the field looks unused in the prod build — but it
/// flows through to `MessageContent::ToolResult.tool_call_id` for
/// history bookkeeping and a future strict-OpenAI backend will
/// consume it directly.
#[allow(dead_code)]
pub id: String,
/// Name of the tool to run.
pub name: String,
/// JSON-encoded arguments. Kept as a string because providers
/// stream argument bytes incrementally and only validate at the
/// end; the agent decodes once the call is complete.
pub arguments: String,
}
/// Description of one tool that can be advertised to the model
/// (see [`CompletionRequest::tools`]).
#[derive(Debug, Clone)]
pub struct ToolSpec {
/// Tool name, e.g. `"read_file"`.
pub name: String,
/// Human-readable explanation of what the tool does.
pub description: String,
/// JSON Schema of the arguments object.
pub parameters: Value,
}
/// Events emitted by a provider during a streaming completion.
///
/// Tool calls arrive as one [`Self::ToolCallStart`] followed by zero or
/// more [`Self::ToolCallArgsDelta`] events sharing the same `index`.
#[derive(Debug, Clone)]
pub enum CompletionEvent {
/// Incremental visible text from the assistant.
TextDelta(String),
/// Incremental "reasoning" / thought text, if the model emits one
/// (e.g. Qwen3 with `<think>` tags surfaced as a separate stream,
/// or OpenAI reasoning models).
ReasoningDelta(String),
/// A new tool call has started. Stage 2 ignores the payload; the
/// agent loop in Stage 3 reads `index` to correlate with
/// [`Self::ToolCallArgsDelta`], `id` for the eventual tool-result
/// turn, and `name` to dispatch the runner.
#[allow(dead_code)]
ToolCallStart {
index: usize,
id: String,
name: String,
},
/// More argument bytes for a tool call already announced via
/// [`Self::ToolCallStart`]. Stage 2 ignores; Stage 3 accumulates
/// the bytes by `index` until the call's arguments are complete.
#[allow(dead_code)]
ToolCallArgsDelta { index: usize, args_delta: String },
/// A `<tool_call>` block whose JSON couldn't be parsed even with
/// the qwen3 module's repair attempts. The agent surfaces this
/// as a Failed `SessionUpdate::ToolCall` card with the raw body
/// visible (so the editor renders structured failure UI rather
/// than dumping the body inline in the message pane), and feeds
/// a synthetic tool-error message back into history so the
/// model can self-correct on the next round.
MalformedToolCall { raw: String },
/// Stream finished. Carries the upstream `finish_reason` if it
/// gave one (`"stop"`, `"length"`, `"tool_calls"`, …).
Finish { reason: Option<String> },
/// Final usage stats, if the provider supplied them. Stage 2
/// matches the variant to drop it; Stage 6b (token metrics) is
/// when the payload starts being read.
#[allow(dead_code)]
Usage(UsageStats),
}
/// Token accounting reported by the provider at the end of a stream.
/// Stage 2 doesn't surface usage anywhere — the stable `PromptResponse`
/// has no usage field, and the unstable variant is gated. Stage 6b
/// turns these on with Prometheus metrics.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, Default)]
pub struct UsageStats {
/// Tokens consumed by the prompt / input side.
pub prompt_tokens: u64,
/// Tokens generated in the response.
pub completion_tokens: u64,
/// Total as reported by the provider (not recomputed here).
pub total_tokens: u64,
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,987 +0,0 @@
//! OpenAI Responses API (`POST /v1/responses`) provider.
//!
//! Mirror image of [`super::openai_chat`]: same `Provider` trait
//! impl, same back-pressured SSE decoder, but speaking OpenAI's
//! newer Responses surface instead of chat completions.
//!
//! Differences from the chat provider, all contained in this file:
//!
//! - **Request encoding**: history flattens into an `input` array
//! of typed items (`message`, `function_call`, `function_call_output`)
//! plus a top-level `instructions` field for the system prompt.
//! Multi-part user content stays in the same `[{type:"input_text"},
//! {type:"input_image"}]` shape neuron's `request_to_chat` already
//! accepts.
//! - **Streaming decoder**: events are named (`response.created`,
//! `response.output_text.delta`, `response.completed`, …) carried
//! on the SSE `event:` line. The chat path's `[DONE]` terminator
//! doesn't apply; the stream ends after `response.completed`.
//! - **Tool calls** plumb through the `response.output_item.added`
//! (item type `function_call`) → `response.function_call_arguments.delta`
//! → `response.function_call_arguments.done` event sequence. The
//! neuron candle harness doesn't synthesize these yet (tracked as
//! issue #6), but the decoder is wired so the day the upstream
//! does, downstream `CompletionEvent::ToolCall*` plumbing just
//! works.
//!
//! Tool-name handling: the model knows its tool descriptions via
//! the [`crate::qwen3`] system-prompt block exactly the way the chat
//! provider does. We don't echo them in the request body because
//! neuron currently ignores `tools` on /v1/responses (same as on
//! /v1/chat/completions). Once neuron honours request-side tool
//! definitions, both providers add them in the same place.
use async_trait::async_trait;
use eventsource_stream::Eventsource;
use futures::{Stream, StreamExt, stream::BoxStream};
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use std::collections::HashMap;
use tokio_util::sync::CancellationToken;
use super::{
CompletionEvent, CompletionRequest, Message, MessageContent, MessagePart, ModelInfo, Provider,
Role, UsageStats,
};
use crate::config::EndpointConfig;
/// [`Provider`] implementation that talks to an OpenAI Responses API
/// endpoint (`POST /v1/responses`).
pub struct OpenAIResponsesProvider {
/// Endpoint configuration: supplies the name and the request URLs.
endpoint: EndpointConfig,
/// Bearer token resolved from the endpoint config at construction;
/// `None` means requests are sent without an `Authorization` header.
#[allow(dead_code)] // Read in `complete()`'s HTTP path; tests don't stand up a server.
api_key: Option<String>,
/// HTTP client reused for every request made by this provider.
#[allow(dead_code)]
http: reqwest::Client,
}
impl OpenAIResponsesProvider {
    /// Create a provider for `endpoint`.
    ///
    /// Resolves the API key from the endpoint config and builds the
    /// HTTP client that is reused for every request.
    ///
    /// # Errors
    ///
    /// Fails if the API key cannot be resolved or the HTTP client
    /// cannot be constructed.
    pub fn new(endpoint: EndpointConfig) -> anyhow::Result<Self> {
        // Deliberately generous, matching the chat provider: cortex may
        // have to cold-load a model before the first chunk arrives,
        // which can take tens of seconds. Early termination is the job
        // of cancellation, not of this timeout.
        let request_timeout = std::time::Duration::from_secs(600);

        let api_key = endpoint.resolve_api_key()?;
        let http = reqwest::Client::builder()
            .timeout(request_timeout)
            .build()?;

        let provider = Self {
            endpoint,
            api_key,
            http,
        };
        Ok(provider)
    }
}
#[async_trait]
impl Provider for OpenAIResponsesProvider {
    /// Name of the configured endpoint.
    fn name(&self) -> &str {
        &self.endpoint.name
    }

    /// Fetch the endpoint's model list and map each entry to a
    /// [`ModelInfo`] with no display name.
    ///
    /// # Errors
    ///
    /// Fails if the request cannot be sent, the endpoint answers with a
    /// non-success status (the response body is included in the error),
    /// or the body cannot be decoded.
    async fn list_models(&self) -> anyhow::Result<Vec<ModelInfo>> {
        let request = self.http.get(self.endpoint.models_url());
        let request = match &self.api_key {
            Some(key) => request.bearer_auth(key),
            None => request,
        };

        let response = request
            .send()
            .await
            .map_err(|e| anyhow::anyhow!("{} list_models: {e}", self.endpoint.name))?;

        let status = response.status();
        if !status.is_success() {
            // Best effort: an unreadable body just yields an empty detail.
            let detail = response.text().await.unwrap_or_default();
            anyhow::bail!(
                "{} list_models returned {}: {}",
                self.endpoint.name,
                status,
                detail
            );
        }

        let listing: WireModelsResponse = response.json().await?;
        let mut models = Vec::with_capacity(listing.data.len());
        for entry in listing.data {
            models.push(ModelInfo {
                id: entry.id,
                display_name: None,
            });
        }
        Ok(models)
    }

    /// POST the encoded request to the responses URL and return the
    /// decoded SSE event stream.
    ///
    /// # Errors
    ///
    /// The outer `Err` is returned if the request cannot be sent or the
    /// endpoint answers with a non-success status; failures after that
    /// point arrive as `Err` items inside the stream.
    async fn complete(
        &self,
        request: CompletionRequest,
        cancel: CancellationToken,
    ) -> anyhow::Result<BoxStream<'static, anyhow::Result<CompletionEvent>>> {
        let payload = encode_request(&request);
        tracing::debug!(
            endpoint = %self.endpoint.name,
            url = %self.endpoint.responses_url(),
            body = %serde_json::to_string(&payload).unwrap_or_else(|_| "<unserializable>".into()),
            "POST /responses"
        );

        let builder = self
            .http
            .post(self.endpoint.responses_url())
            .json(&payload);
        let builder = match &self.api_key {
            Some(key) => builder.bearer_auth(key),
            None => builder,
        };

        let response = builder
            .send()
            .await
            .map_err(|e| anyhow::anyhow!("{} responses send: {e}", self.endpoint.name))?;

        let status = response.status();
        if !status.is_success() {
            let detail = response.text().await.unwrap_or_default();
            anyhow::bail!(
                "{} responses returned {}: {}",
                self.endpoint.name,
                status,
                detail
            );
        }

        let events = response.bytes_stream().eventsource();
        let decoded = decode_stream(events, cancel);
        Ok(Box::pin(decoded))
    }
}
// ── Request encoding ─────────────────────────────────────────────────
fn encode_request(req: &CompletionRequest) -> Value {
// Pull the system messages out of history into a single
// `instructions` string — the Responses API expects them there,
// not inline as an `input` item. Multiple system messages
// concatenate with blank lines so we don't lose ordering.
let mut instructions: Vec<String> = Vec::new();
let mut input_items: Vec<Value> = Vec::new();
for msg in &req.messages {
if msg.role == Role::System
&& let MessageContent::Text { text } = &msg.content
{
instructions.push(text.clone());
continue;
}
if let Some(item) = encode_message_as_input_item(msg) {
input_items.push(item);
}
}
let mut body = json!({
"model": req.model,
"input": input_items,
"stream": true,
});
if let Value::Object(map) = &mut body {
if !instructions.is_empty() {
map.insert(
"instructions".into(),
Value::String(instructions.join("\n\n")),
);
}
if let Some(t) = req.temperature {
map.insert("temperature".into(), json!(t));
}
if let Some(p) = req.top_p {
map.insert("top_p".into(), json!(p));
}
if let Some(m) = req.max_tokens {
// Responses calls it `max_output_tokens`; preserve the
// semantic (response cap) when we translate.
map.insert("max_output_tokens".into(), json!(m));
}
}
body
}
fn encode_message_as_input_item(msg: &Message) -> Option<Value> {
match (msg.role, &msg.content) {
(Role::System, _) => None, // handled out-of-band as `instructions`
(Role::User, MessageContent::Text { text }) => Some(json!({
"type": "message",
"role": "user",
"content": text,
})),
(Role::User, MessageContent::MultiPart { parts }) => Some(json!({
"type": "message",
"role": "user",
"content": encode_user_parts(parts),
})),
(Role::Assistant, MessageContent::Text { text }) => Some(json!({
"type": "message",
"role": "assistant",
"content": [{
"type": "output_text",
"text": text,
"annotations": [],
}],
})),
(Role::Assistant, MessageContent::ToolCalls { text, calls }) => {
// Assistant turns that called tools become a sequence of
// items: an optional `message` (any prose alongside the
// call) followed by one `function_call` per call. Mirrors
// OpenAI Responses' "each item is one structural slot"
// shape.
//
// We can't return multiple items from one call site, so
// we encode this by side-stuffing additional items into a
// single composite value and have the caller flatten —
// but that complicates the API. Easier: build the array
// ourselves in the caller path. For now, emit just the
// function_calls (the assistant's prose lives in the next
// turn's chat history anyway because the model isn't
// looking back at its own previous narration). If the
// text is non-empty AND we have calls, we lose the text;
// qwen3 rarely emits prose alongside tool calls so this
// is a deliberate simplification — revisit if it bites.
let _ = text;
// Take the first call only for the moment; multi-call
// turns would need the caller-flattening above.
let call = calls.first()?;
Some(json!({
"type": "function_call",
"call_id": call.id,
"name": call.name,
"arguments": call.arguments,
}))
}
(
Role::Tool,
MessageContent::ToolResult {
tool_call_id,
content,
},
) => Some(json!({
"type": "function_call_output",
"call_id": tool_call_id,
"output": content,
})),
(role, content) => {
tracing::warn!(
?role,
?content,
"openai_responses: unexpected (role, content) shape"
);
None
}
}
}
/// Encode the parts of a multi-part user message as a JSON array.
///
/// Text parts become `input_text` items. Image parts become
/// `input_image` items whose `image_url` is a `data:` URI built from
/// the image's MIME type and base64 payload.
fn encode_user_parts(parts: &[MessagePart]) -> Value {
    let mut encoded: Vec<Value> = Vec::with_capacity(parts.len());
    for part in parts {
        let item = match part {
            MessagePart::Text { text } => json!({"type": "input_text", "text": text}),
            MessagePart::Image(image) => {
                let data_uri = format!("data:{};base64,{}", image.mime_type, image.data);
                json!({
                    "type": "input_image",
                    "image_url": data_uri,
                })
            }
        };
        encoded.push(item);
    }
    Value::Array(encoded)
}
// ── Wire types ──────────────────────────────────────────────────────
/// Body of the models-listing response: only the `data` array of model
/// objects is modelled.
#[allow(dead_code)] // fields read only when list_models runs against a real endpoint
#[derive(Debug, Deserialize)]
struct WireModelsResponse {
/// One entry per model offered by the endpoint.
data: Vec<WireModelObject>,
}
/// One entry of [`WireModelsResponse::data`]; only the `id` field is
/// modelled.
#[allow(dead_code)]
#[derive(Debug, Deserialize)]
struct WireModelObject {
/// Model identifier; becomes `ModelInfo::id`.
id: String,
}
// SSE event payload shapes. We only model the fields we care about;
// `#[serde(default)]` + `Option` everywhere else lets the upstream
// add optional fields without breaking deserialise.
/// Payload of a `response.output_item.added` SSE event.
#[derive(Debug, Deserialize, Serialize)]
struct OutputItemAddedEvent {
/// Position of the item in the response's output list; defaults to 0
/// when absent. The decoder uses it to correlate later argument
/// deltas with this item.
#[serde(default)]
output_index: u32,
/// The item that was added.
item: OutputItem,
}
/// The `item` object of an `output_item.added` event, discriminated by
/// its `type` field (snake_case: `"message"`, `"function_call"`; any
/// other value deserialises as [`OutputItem::Unknown`]).
#[derive(Debug, Deserialize, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum OutputItem {
/// An assistant message item.
Message {
#[serde(default)]
id: Option<String>,
},
/// A tool invocation. All fields are optional on the wire.
FunctionCall {
/// Internal item id.
#[serde(default)]
id: Option<String>,
/// Id that pairs the call with its tool result; the decoder
/// prefers it over `id` when both are present.
#[serde(default)]
call_id: Option<String>,
/// Name of the function being called.
#[serde(default)]
name: Option<String>,
/// Some upstreams populate `arguments` already on the
/// `output_item.added` event for a fully-buffered tool call
/// (i.e. when the model finalised the call before the SSE
/// flush). Capture it so we can emit a single args delta.
#[serde(default)]
arguments: Option<String>,
},
/// `reasoning`, `web_search_call`, etc. We capture-and-ignore
/// any item we don't model; the decoder still emits the
/// outer events correctly.
#[serde(other)]
Unknown,
}
/// Payload of a `response.output_text.delta` SSE event. Every field is
/// defaulted, so a payload that lacks any of them still deserialises.
#[derive(Debug, Deserialize, Serialize)]
struct OutputTextDeltaEvent {
/// Id of the item the text belongs to, if given.
#[serde(default)]
item_id: Option<String>,
/// Output-list position of that item (0 when absent).
#[serde(default)]
output_index: u32,
/// The new text fragment; the decoder skips empty fragments.
#[serde(default)]
delta: String,
}
/// Payload of a `response.function_call_arguments.delta` SSE event.
/// Every field is defaulted, so a payload that lacks any of them still
/// deserialises.
#[derive(Debug, Deserialize, Serialize)]
struct FunctionCallArgumentsDeltaEvent {
/// Id of the function-call item, if given.
#[serde(default)]
item_id: Option<String>,
/// Output-list position of the function-call item (0 when absent);
/// the decoder looks the tool-call slot up by this value.
#[serde(default)]
output_index: u32,
/// The new fragment of the JSON-encoded arguments; the decoder skips
/// empty fragments.
#[serde(default)]
delta: String,
}
/// Payload of the terminal `response.completed` SSE event.
#[derive(Debug, Deserialize, Serialize)]
struct ResponseCompletedEvent {
/// The final response object (only the fields we read are modelled).
response: ResponseShell,
}
/// The subset of the final response object that the decoder reads.
#[derive(Debug, Deserialize, Serialize)]
struct ResponseShell {
/// Final status; the decoder maps `"incomplete"` to a `"length"`
/// finish reason and everything else to `"stop"`.
#[serde(default)]
status: Option<String>,
/// Token accounting, if the upstream reported it.
#[serde(default)]
usage: Option<WireUsage>,
}
/// Token accounting as it appears on the wire. Each count defaults to 0
/// when absent.
#[derive(Debug, Deserialize, Serialize)]
struct WireUsage {
/// Becomes `UsageStats::prompt_tokens`.
#[serde(default)]
input_tokens: u64,
/// Becomes `UsageStats::completion_tokens`.
#[serde(default)]
output_tokens: u64,
/// Becomes `UsageStats::total_tokens`.
#[serde(default)]
total_tokens: u64,
}
// ── Streaming decoder ───────────────────────────────────────────────
/// Translate the named-event Responses SSE into the provider-agnostic
/// [`CompletionEvent`] stream the agent loop expects. The decoder
/// holds per-stream state — output_index → tool-call-index plus
/// the next available tool-call slot — so it can fire
/// `ToolCallStart` exactly once per item.
///
/// Event handling:
///
/// - `response.output_text.delta` → [`CompletionEvent::TextDelta`]
///   (empty deltas are skipped).
/// - `response.output_item.added` with a `function_call` item →
///   [`CompletionEvent::ToolCallStart`], plus one
///   [`CompletionEvent::ToolCallArgsDelta`] if the item already carries
///   non-empty arguments.
/// - `response.function_call_arguments.delta` →
///   [`CompletionEvent::ToolCallArgsDelta`] for the matching slot.
/// - `response.completed` / `response.incomplete` → optional
///   [`CompletionEvent::Usage`], then [`CompletionEvent::Finish`]; the
///   stream ends.
/// - `response.failed` / `error` → one `Err` item; the stream ends.
/// - Anything else is logged at trace level and ignored.
///
/// A payload that fails to parse is logged at warn level and skipped
/// (for the terminal events the stream still finishes). The stream
/// also ends when `cancel` fires, when the upstream closes, or after a
/// transport error (which is yielded as an `Err` item first).
fn decode_stream<S>(
    sse: S,
    cancel: CancellationToken,
) -> impl Stream<Item = anyhow::Result<CompletionEvent>>
where
    S: Stream<
            Item = Result<
                eventsource_stream::Event,
                eventsource_stream::EventStreamError<reqwest::Error>,
            >,
        > + Send
        + 'static,
{
    async_stream::stream! {
        let mut sse = Box::pin(sse);
        // Maps an output_index that's a function_call to the tool-call
        // slot we hand downstream. Lets us correlate later
        // `function_call_arguments.delta` events back to the index
        // we already announced on `output_item.added`.
        let mut tool_index_by_output: HashMap<u32, usize> = HashMap::new();
        let mut next_tool_index: usize = 0;
        loop {
            tokio::select! {
                // `biased` polls the branches in order, so a pending
                // cancellation always wins over a ready SSE event.
                biased;
                _ = cancel.cancelled() => {
                    tracing::debug!("openai_responses: cancellation requested, ending stream");
                    break;
                }
                next = sse.next() => {
                    let Some(event) = next else { break };
                    let event = match event {
                        Ok(e) => e,
                        Err(e) => {
                            yield Err(anyhow::anyhow!("SSE transport: {e}"));
                            break;
                        }
                    };
                    // Event name lives on `event.event`; data is JSON.
                    let event_name = event.event.as_str();
                    let data = event.data.as_str();
                    match event_name {
                        "response.output_text.delta" => {
                            match serde_json::from_str::<OutputTextDeltaEvent>(data) {
                                Ok(d) if !d.delta.is_empty() => {
                                    yield Ok(CompletionEvent::TextDelta(d.delta));
                                }
                                Ok(_) => {}
                                Err(e) => {
                                    tracing::warn!(
                                        error = %e,
                                        raw = %data,
                                        "openai_responses: failed to parse output_text.delta; skipping"
                                    );
                                }
                            }
                        }
                        "response.output_item.added" => {
                            match serde_json::from_str::<OutputItemAddedEvent>(data) {
                                Ok(ev) => {
                                    if let OutputItem::FunctionCall {
                                        id,
                                        call_id,
                                        name,
                                        arguments,
                                    } = ev.item
                                    {
                                        let idx = next_tool_index;
                                        next_tool_index += 1;
                                        tool_index_by_output.insert(ev.output_index, idx);
                                        // Prefer the user-facing
                                        // `call_id` (what gets paired
                                        // with tool results) over the
                                        // internal item `id` when
                                        // both are present. Falls
                                        // back to a synthetic id so
                                        // history bookkeeping never
                                        // breaks.
                                        let final_id = call_id
                                            .or(id)
                                            .unwrap_or_else(|| format!("call_{idx}"));
                                        let final_name = name.unwrap_or_default();
                                        yield Ok(CompletionEvent::ToolCallStart {
                                            index: idx,
                                            id: final_id,
                                            name: final_name,
                                        });
                                        // Some upstreams attach the
                                        // fully-buffered arguments on
                                        // the `output_item.added`
                                        // event itself (rare; happens
                                        // when the model finalised
                                        // before the SSE flush).
                                        // Emit as a single args
                                        // delta if present.
                                        if let Some(args) = arguments
                                            && !args.is_empty()
                                        {
                                            yield Ok(CompletionEvent::ToolCallArgsDelta {
                                                index: idx,
                                                args_delta: args,
                                            });
                                        }
                                    }
                                }
                                Err(e) => {
                                    tracing::warn!(
                                        error = %e,
                                        raw = %data,
                                        "openai_responses: failed to parse output_item.added; skipping"
                                    );
                                }
                            }
                        }
                        "response.function_call_arguments.delta" => {
                            match serde_json::from_str::<FunctionCallArgumentsDeltaEvent>(data) {
                                Ok(ev) => {
                                    let Some(&idx) = tool_index_by_output.get(&ev.output_index)
                                    else {
                                        // Args delta for an item we
                                        // never saw an `output_item.added`
                                        // for. Could happen if the
                                        // upstream reordered events;
                                        // log + skip.
                                        tracing::warn!(
                                            output_index = ev.output_index,
                                            "openai_responses: function_call_arguments.delta for unknown output_index"
                                        );
                                        continue;
                                    };
                                    if !ev.delta.is_empty() {
                                        yield Ok(CompletionEvent::ToolCallArgsDelta {
                                            index: idx,
                                            args_delta: ev.delta,
                                        });
                                    }
                                }
                                Err(e) => {
                                    tracing::warn!(
                                        error = %e,
                                        raw = %data,
                                        "openai_responses: failed to parse function_call_arguments.delta; skipping"
                                    );
                                }
                            }
                        }
                        "response.completed" | "response.incomplete" => {
                            // Terminal event. Both names carry the
                            // same `{ "response": { … } }` payload, so
                            // usage + status come off the response
                            // shell either way. The turn counts as a
                            // length stop when the event itself is
                            // `response.incomplete` or the shell's
                            // status says "incomplete"; otherwise it is
                            // a normal stop (caller treats as EndTurn).
                            let incomplete_event = event_name == "response.incomplete";
                            let (reason, usage) =
                                match serde_json::from_str::<ResponseCompletedEvent>(data) {
                                    Ok(ev) => {
                                        let incomplete = incomplete_event
                                            || ev.response.status.as_deref() == Some("incomplete");
                                        let reason = if incomplete { "length" } else { "stop" };
                                        let usage = ev.response.usage.map(|u| UsageStats {
                                            prompt_tokens: u.input_tokens,
                                            completion_tokens: u.output_tokens,
                                            total_tokens: u.total_tokens,
                                        });
                                        (Some(reason.to_string()), usage)
                                    }
                                    Err(e) => {
                                        tracing::warn!(
                                            error = %e,
                                            raw = %data,
                                            "openai_responses: failed to parse response.completed; ending stream with EndTurn"
                                        );
                                        let reason = if incomplete_event { "length" } else { "stop" };
                                        (Some(reason.to_string()), None)
                                    }
                                };
                            if let Some(u) = usage {
                                yield Ok(CompletionEvent::Usage(u));
                            }
                            yield Ok(CompletionEvent::Finish { reason });
                            break;
                        }
                        "response.failed" | "error" => {
                            // The upstream reported that the response
                            // failed. Surface it as a stream error
                            // instead of letting the stream end
                            // silently with no `Finish`.
                            yield Err(anyhow::anyhow!(
                                "openai_responses: upstream reported {event_name}: {data}"
                            ));
                            break;
                        }
                        // Bookkeeping events we don't need to surface:
                        // response.created, response.in_progress,
                        // response.content_part.added/.done,
                        // response.output_text.done,
                        // response.output_item.done,
                        // response.function_call_arguments.done,
                        // response.reasoning_*. Logged at trace for
                        // wire-tracing.
                        other => {
                            tracing::trace!(
                                event = other,
                                "openai_responses: bookkeeping event"
                            );
                        }
                    }
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::provider::ToolCall;
use crate::provider::{ImageData, MessagePart};
use futures::stream;
use url::Url;
fn ep() -> EndpointConfig {
EndpointConfig {
name: "test".into(),
base_url: Url::parse("http://localhost:9999/v1").unwrap(),
wire_api: crate::config::WireApi::OpenAiResponses,
default_model: None,
api_key: None,
api_key_env: None,
max_tokens: None,
context_window: None,
}
}
// ── encode_request ──────────────────────────────────────────────
#[test]
fn system_messages_collapse_to_instructions() {
let req = CompletionRequest {
model: "m".into(),
messages: vec![
Message {
role: Role::System,
content: MessageContent::Text {
text: "you are helpful".into(),
},
},
Message {
role: Role::User,
content: MessageContent::Text { text: "hi".into() },
},
],
tools: vec![],
temperature: Some(0.7),
top_p: None,
max_tokens: Some(256),
};
let body = encode_request(&req);
assert_eq!(body["model"], "m");
assert_eq!(body["instructions"], "you are helpful");
assert_eq!(body["stream"], true);
assert_eq!(body["max_output_tokens"], 256);
assert_eq!(body["temperature"], 0.7);
let input = body["input"].as_array().unwrap();
// System message NOT echoed in input — it's only in
// instructions.
assert_eq!(input.len(), 1);
assert_eq!(input[0]["type"], "message");
assert_eq!(input[0]["role"], "user");
assert_eq!(input[0]["content"], "hi");
}
#[test]
fn multiple_system_messages_concatenate() {
let req = CompletionRequest {
model: "m".into(),
messages: vec![
Message {
role: Role::System,
content: MessageContent::Text {
text: "first".into(),
},
},
Message {
role: Role::System,
content: MessageContent::Text {
text: "second".into(),
},
},
Message {
role: Role::User,
content: MessageContent::Text { text: "hi".into() },
},
],
tools: vec![],
temperature: None,
top_p: None,
max_tokens: None,
};
let body = encode_request(&req);
assert_eq!(body["instructions"], "first\n\nsecond");
}
/// A user message made of several parts must be encoded as an array of
/// typed content parts: text becomes `input_text`, and an inline image
/// becomes `input_image` carrying a `data:` URL.
#[test]
fn user_multipart_becomes_input_parts_array() {
    let req = CompletionRequest {
        model: "vl".into(),
        messages: vec![Message {
            role: Role::User,
            content: MessageContent::MultiPart {
                parts: vec![
                    MessagePart::Text {
                        text: "what's in this?".into(),
                    },
                    MessagePart::Image(ImageData {
                        mime_type: "image/png".into(),
                        data: "AAA=".into(),
                        uri: None,
                    }),
                ],
            },
        }],
        tools: vec![],
        temperature: None,
        top_p: None,
        max_tokens: None,
    };
    let body = encode_request(&req);
    // Borrow the parts array straight out of `body`; there is no need to
    // clone it just to read it.
    let content = body["input"][0]["content"]
        .as_array()
        .expect("multipart user content must encode as an array of parts");
    assert_eq!(content.len(), 2);
    assert_eq!(content[0]["type"], "input_text");
    assert_eq!(content[0]["text"], "what's in this?");
    assert_eq!(content[1]["type"], "input_image");
    assert_eq!(content[1]["image_url"], "data:image/png;base64,AAA=");
}
#[test]
fn assistant_text_becomes_output_text_content_part() {
let req = CompletionRequest {
model: "m".into(),
messages: vec![
Message {
role: Role::User,
content: MessageContent::Text { text: "hi".into() },
},
Message {
role: Role::Assistant,
content: MessageContent::Text {
text: "hello there".into(),
},
},
Message {
role: Role::User,
content: MessageContent::Text {
text: "more".into(),
},
},
],
tools: vec![],
temperature: None,
top_p: None,
max_tokens: None,
};
let body = encode_request(&req);
let input = body["input"].as_array().unwrap();
assert_eq!(input.len(), 3);
assert_eq!(input[1]["type"], "message");
assert_eq!(input[1]["role"], "assistant");
assert_eq!(input[1]["content"][0]["type"], "output_text");
assert_eq!(input[1]["content"][0]["text"], "hello there");
}
#[test]
fn tool_calls_and_results_round_trip_via_function_call_items() {
let req = CompletionRequest {
model: "m".into(),
messages: vec![
Message {
role: Role::Assistant,
content: MessageContent::ToolCalls {
text: None,
calls: vec![ToolCall {
id: "call_42".into(),
name: "read_file".into(),
arguments: r#"{"path":"/etc/hostname"}"#.into(),
}],
},
},
Message {
role: Role::Tool,
content: MessageContent::ToolResult {
tool_call_id: "call_42".into(),
content: "host".into(),
},
},
],
tools: vec![],
temperature: None,
top_p: None,
max_tokens: None,
};
let body = encode_request(&req);
let input = body["input"].as_array().unwrap();
assert_eq!(input.len(), 2);
assert_eq!(input[0]["type"], "function_call");
assert_eq!(input[0]["call_id"], "call_42");
assert_eq!(input[0]["name"], "read_file");
assert_eq!(input[0]["arguments"], r#"{"path":"/etc/hostname"}"#);
assert_eq!(input[1]["type"], "function_call_output");
assert_eq!(input[1]["call_id"], "call_42");
assert_eq!(input[1]["output"], "host");
}
// ── decode_stream ───────────────────────────────────────────────
/// Builds a synthetic SSE event with the given event name and data
/// payload. The `id` is left empty and no `retry` hint is set.
fn sse_event(name: &str, data: &str) -> eventsource_stream::Event {
    let event = name.to_owned();
    let data = data.to_owned();
    eventsource_stream::Event {
        event,
        data,
        id: String::new(),
        retry: None,
    }
}
async fn collect_events(
items: Vec<eventsource_stream::Event>,
) -> Vec<anyhow::Result<CompletionEvent>> {
let sse = stream::iter(
items
.into_iter()
.map(Ok::<_, eventsource_stream::EventStreamError<reqwest::Error>>),
);
let decoded = decode_stream(sse, CancellationToken::new());
decoded.collect().await
}
#[tokio::test]
async fn decodes_text_then_finish() {
let events = collect_events(vec![
sse_event("response.created", "{}"),
sse_event(
"response.output_text.delta",
r#"{"item_id":"msg_1","output_index":0,"delta":"hel"}"#,
),
sse_event(
"response.output_text.delta",
r#"{"item_id":"msg_1","output_index":0,"delta":"lo"}"#,
),
sse_event(
"response.completed",
r#"{"response":{"status":"completed","usage":{"input_tokens":3,"output_tokens":2,"total_tokens":5}}}"#,
),
])
.await;
let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
let mut iter = events.into_iter();
assert!(matches!(iter.next(), Some(CompletionEvent::TextDelta(t)) if t == "hel"));
assert!(matches!(iter.next(), Some(CompletionEvent::TextDelta(t)) if t == "lo"));
assert!(matches!(iter.next(), Some(CompletionEvent::Usage(u)) if u.total_tokens == 5));
assert!(matches!(
iter.next(),
Some(CompletionEvent::Finish { reason: Some(r) }) if r == "stop"
));
assert!(iter.next().is_none());
}
/// A text delta whose `delta` is the empty string must not surface as
/// an event: the first thing the decoder yields is the `Finish`.
#[tokio::test]
async fn empty_delta_is_dropped() {
    let empty_delta = sse_event(
        "response.output_text.delta",
        r#"{"item_id":"m","output_index":0,"delta":""}"#,
    );
    let completed = sse_event(
        "response.completed",
        r#"{"response":{"status":"completed"}}"#,
    );
    let results = collect_events(vec![empty_delta, completed]).await;
    // Only the first result is inspected (and unwrapped).
    let first = results.into_iter().next().map(|r| r.unwrap());
    assert!(matches!(first, Some(CompletionEvent::Finish { .. })));
}
/// A `response.completed` event whose status is `"incomplete"` must end
/// the stream with a `"length"` finish reason.
#[tokio::test]
async fn incomplete_status_maps_to_length_finish_reason() {
    let completed = sse_event(
        "response.completed",
        r#"{"response":{"status":"incomplete"}}"#,
    );
    let results = collect_events(vec![completed]).await;
    let mut events: Vec<CompletionEvent> = Vec::with_capacity(results.len());
    for result in results {
        events.push(result.unwrap());
    }
    let last = events.last();
    assert!(matches!(
        last,
        Some(CompletionEvent::Finish { reason: Some(r) }) if r == "length"
    ));
}
#[tokio::test]
async fn function_call_items_emit_toolcall_events() {
let events = collect_events(vec![
sse_event(
"response.output_item.added",
r#"{"output_index":0,"item":{"type":"function_call","id":"item_1","call_id":"call_xyz","name":"read_file"}}"#,
),
sse_event(
"response.function_call_arguments.delta",
r#"{"item_id":"item_1","output_index":0,"delta":"{\"path"}"#,
),
sse_event(
"response.function_call_arguments.delta",
r#"{"item_id":"item_1","output_index":0,"delta":"\":\"/etc/hostname\"}"}"#,
),
sse_event("response.completed", r#"{"response":{"status":"completed"}}"#),
])
.await;
let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
let mut iter = events.into_iter();
assert!(matches!(
iter.next(),
Some(CompletionEvent::ToolCallStart { index: 0, ref id, ref name })
if id == "call_xyz" && name == "read_file"
));
assert!(matches!(
iter.next(),
Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
if args_delta == r#"{"path"#
));
assert!(matches!(
iter.next(),
Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
if args_delta == r#"":"/etc/hostname"}"#
));
assert!(matches!(iter.next(), Some(CompletionEvent::Finish { .. })));
}
#[tokio::test]
async fn function_call_added_with_inline_arguments_emits_single_args_delta() {
// Some upstreams (rare) include the fully-buffered arguments
// on the `output_item.added` event when the model finalised
// the call before SSE flush. Verify both ToolCallStart and a
// single args delta fire.
let events = collect_events(vec![
sse_event(
"response.output_item.added",
r#"{"output_index":0,"item":{"type":"function_call","call_id":"call_a","name":"f","arguments":"{\"x\":1}"}}"#,
),
sse_event("response.completed", r#"{"response":{"status":"completed"}}"#),
])
.await;
let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
let mut iter = events.into_iter();
assert!(matches!(
iter.next(),
Some(CompletionEvent::ToolCallStart { .. })
));
assert!(matches!(
iter.next(),
Some(CompletionEvent::ToolCallArgsDelta { index: 0, ref args_delta })
if args_delta == r#"{"x":1}"#
));
assert!(matches!(iter.next(), Some(CompletionEvent::Finish { .. })));
}
#[tokio::test]
async fn cancellation_ends_stream_promptly() {
// Hand the decoder an empty stream + a triggered cancellation
// token; it should terminate without yielding anything.
let sse = stream::iter(Vec::<
Result<eventsource_stream::Event, eventsource_stream::EventStreamError<reqwest::Error>>,
>::new());
let cancel = CancellationToken::new();
cancel.cancel();
let decoded = decode_stream(sse, cancel);
let events: Vec<_> = decoded.collect().await;
assert!(events.is_empty());
}
/// A text-delta event whose payload is not valid JSON must not break
/// the stream: the well-formed delta that follows it is still decoded.
#[tokio::test]
async fn malformed_event_payload_is_skipped() {
let events = collect_events(vec![
sse_event("response.output_text.delta", "{not valid json"),
sse_event(
"response.output_text.delta",
r#"{"item_id":"m","output_index":0,"delta":"ok"}"#,
),
sse_event(
"response.completed",
r#"{"response":{"status":"completed"}}"#,
),
])
.await;
// The `unwrap` panics if the decoder yielded any `Err`, so reaching the
// assertions below already proves the malformed payload did not
// surface as an error.
let events: Vec<CompletionEvent> = events.into_iter().map(|r| r.unwrap()).collect();
// First text delta dropped; second one fires.
assert!(
events
.iter()
.any(|e| matches!(e, CompletionEvent::TextDelta(t) if t == "ok"))
);
// No `Finish` event was emitted without a reason.
assert!(
events
.iter()
.all(|e| !matches!(e, CompletionEvent::Finish { reason: None }))
);
}
#[test]
fn provider_construction_is_cheap() {
let _ = OpenAIResponsesProvider::new(ep()).unwrap();
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,188 +0,0 @@
//! Per-session state for the ACP agent loop.
//!
//! Concurrency:
//!
//! - [`SessionStore`] is an `Arc<RwLock<HashMap<SessionId, …>>>`. The map
//! itself is read-mostly: it changes only on `session/new` and never
//! shrinks during Stage 2, so an `RwLock` keeps concurrent reads
//! contention-free.
//! - Each session is wrapped in its own `Arc<Mutex<SessionState>>`. Holding
//! one session's lock doesn't block requests against any other session,
//! which matters once a client opens multiple sessions in parallel.
//!
//! All operations hold a lock only long enough to copy out (or mutate) the
//! state they need — never across an `await` that drives the upstream
//! provider stream.
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use agent_client_protocol::schema::{SessionId, SessionModeId};
use tokio::sync::{Mutex, RwLock};
use tokio_util::sync::CancellationToken;
use crate::provider::Message;
/// Mode id advertised as the gated default. In this mode, file writes
/// and `bash` invocations prompt the user for permission via
/// `session/request_permission`.
pub const MODE_DEFAULT: &str = "default";
/// Mode id advertised as "auto-allow everything". The value matches the
/// name (`bypassPermissions`) that Zed clients tend to reference.
pub const MODE_BYPASS: &str = "bypassPermissions";
/// Mode id for read-and-plan-only operation. The model may read files
/// and list directories freely, may write *only* into the per-project
/// plan directory under `$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`,
/// and cannot run shell commands. Designed for "draft the
/// implementation plan, then I'll review and let you execute" flows.
pub const MODE_PLAN: &str = "plan";
/// State carried for a single ACP session.
///
/// Mutated under `Mutex<SessionState>`; never share a clone across
/// tasks expecting to see the same `cancel` token — clone the token
/// explicitly when handing it to the streaming task.
#[derive(Debug)]
pub struct SessionState {
/// Conversation history in chronological order (user / assistant
/// turns). The system prompt is *not* stored here — it's built
/// fresh per request so any cwd / config changes take effect.
pub history: Vec<Message>,
/// Working directory the client opened the session against. Used
/// by [`crate::prompt::build_system_prompt`] and (Stage 3) by
/// filesystem tools.
pub cwd: PathBuf,
/// Currently-selected model id. Format is either a bare model id
/// (resolved against the default endpoint) or `endpoint:model`.
/// Mutated by `session/set_model` in Stage 4; Stage 2 sets it
/// once at session creation and never changes it.
pub model_id: String,
/// Cancellation handle for the in-flight prompt, if any. A fresh
/// token is installed at the start of every `session/prompt`
/// request; `session/cancel` fires this one. Between prompts the
/// token is "spent" — firing it does nothing — which is fine,
/// `session/cancel` is a no-op when there's nothing to cancel.
pub cancel: CancellationToken,
/// Permission gating mode. Starts as [`MODE_DEFAULT`] (writes / bash
/// prompt the user); [`MODE_BYPASS`] auto-allows. Stage 3 advertises
/// these two ids in `NewSessionResponse.modes`. Mutated by
/// `session/set_mode`.
///
/// NOTE(review): [`MODE_PLAN`] is also defined in this module —
/// confirm whether it is advertised alongside the other two and
/// update the count above if so.
pub mode_id: SessionModeId,
}
impl SessionState {
    /// Creates the state for a brand-new session rooted at `cwd` and
    /// using `model_id`: the history is empty, the cancellation token is
    /// fresh, and the mode is [`MODE_DEFAULT`].
    pub fn new(cwd: PathBuf, model_id: String) -> Self {
        let mode_id = SessionModeId::new(MODE_DEFAULT);
        let cancel = CancellationToken::new();
        let history = Vec::new();
        Self {
            history,
            cwd,
            model_id,
            cancel,
            mode_id,
        }
    }
}
/// Concurrent map of live sessions.
///
/// Two layers of locking: the outer `RwLock` guards the id → session
/// map itself, and each session sits behind its own `Mutex`, so locking
/// one session's state does not require holding any other session's
/// lock.
///
/// Cloning is cheap (`Arc` bump). Pass clones into every handler that
/// needs session access; never hold a clone across an `.await` that
/// could outlive the request.
pub type SessionStore = Arc<RwLock<HashMap<SessionId, Arc<Mutex<SessionState>>>>>;
/// Builds a [`SessionStore`] with no sessions registered.
pub fn new_store() -> SessionStore {
    let sessions = HashMap::new();
    let guarded = RwLock::new(sessions);
    Arc::new(guarded)
}
/// Fetches the shared handle for session `id`, or `None` when no
/// session with that id has been registered. Only the map's read lock
/// is taken; the session's own mutex is left for the caller to lock.
pub async fn get(store: &SessionStore, id: &SessionId) -> Option<Arc<Mutex<SessionState>>> {
    let sessions = store.read().await;
    sessions.get(id).map(Arc::clone)
}
/// Registers `state` under `id`, wrapping it in its own
/// `Arc<Mutex<…>>`. An existing entry with the same id is replaced
/// (which should never happen — ids are uniquely generated by the
/// agent).
pub async fn insert(store: &SessionStore, id: SessionId, state: SessionState) {
    let entry = Arc::new(Mutex::new(state));
    let mut sessions = store.write().await;
    sessions.insert(id, entry);
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::provider::{MessageContent, Role};

    /// Shorthand for building a [`SessionId`] from a string literal.
    fn id(s: &str) -> SessionId {
        SessionId::new(s)
    }

    /// A session that was inserted can be fetched back with the fields
    /// it was created with.
    #[tokio::test]
    async fn insert_then_get_round_trip() {
        let store = new_store();
        insert(
            &store,
            id("s1"),
            SessionState::new(PathBuf::from("/tmp"), "m".into()),
        )
        .await;

        let session = get(&store, &id("s1")).await.expect("session present");
        let state = session.lock().await;
        assert_eq!(state.cwd, PathBuf::from("/tmp"));
        assert_eq!(state.model_id, "m");
        assert!(state.history.is_empty());
    }

    /// Looking up an id that was never inserted yields `None`.
    #[tokio::test]
    async fn missing_session_is_none() {
        let store = new_store();
        let found = get(&store, &id("nope")).await;
        assert!(found.is_none());
    }

    /// Appending to one session's history must not affect another's.
    #[tokio::test]
    async fn history_is_per_session() {
        let store = new_store();
        for (name, cwd) in [("a", "/a"), ("b", "/b")] {
            insert(
                &store,
                id(name),
                SessionState::new(PathBuf::from(cwd), "m".into()),
            )
            .await;
        }

        let greeting = Message {
            role: Role::User,
            content: MessageContent::Text {
                text: "hello".into(),
            },
        };
        let session_a = get(&store, &id("a")).await.unwrap();
        session_a.lock().await.history.push(greeting);

        let len_a = get(&store, &id("a"))
            .await
            .unwrap()
            .lock()
            .await
            .history
            .len();
        let len_b = get(&store, &id("b"))
            .await
            .unwrap()
            .lock()
            .await
            .history
            .len();
        assert_eq!(len_a, 1);
        assert_eq!(len_b, 0);
    }
}

View File

@@ -1,462 +0,0 @@
//! On-disk session persistence for `session/load` support.
//!
//! Storage layout:
//!
//! ```text
//! $XDG_DATA_HOME/helexa-acp/sessions/{session_id}.json
//! ```
//!
//! (Fallback to `~/.local/share/helexa-acp/sessions/` when
//! `$XDG_DATA_HOME` is unset.) One JSON file per session. Writes
//! happen at the end of every `session/prompt` round through
//! [`save`], using tempfile-plus-rename so a crash mid-write can't
//! corrupt the store. Reads happen on `session/load` via [`load`].
//!
//! No compaction, no rotation: files accumulate until the user
//! cleans them up. That's deliberate — disk is cheap, and the
//! resume-on-restart workflow matters more than tidiness. The
//! [`SESSIONS_DIRNAME`] subdirectory is created lazily on first
//! save so an unprivileged install path never errors at startup.
use std::path::PathBuf;
use std::time::SystemTime;
use agent_client_protocol::schema::SessionId;
use serde::{Deserialize, Serialize};
use crate::provider::Message;
/// Application directory created under the resolved data-home base.
const APP_DIRNAME: &str = "helexa-acp";
/// Subdirectory of [`APP_DIRNAME`] that holds one JSON file per session.
const SESSIONS_DIRNAME: &str = "sessions";
/// Subdirectory of [`APP_DIRNAME`] that holds plan-mode artefacts; a
/// sibling of [`SESSIONS_DIRNAME`].
const PLANS_DIRNAME: &str = "plans";
/// The shape persisted to disk for one session. Only what we can't
/// rebuild from the running config goes in here: the conversation
/// history, the mode toggle, the model id, and the cwd-at-creation.
///
/// `created_at` / `updated_at` are seconds-since-epoch — cheap to
/// compare, no third-party time crate, and stable across runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedSession {
/// Session identifier. [`save_to_dir`] derives the on-disk file name
/// from it after sanitising.
pub session_id: String,
/// Working directory recorded for the session. [`list_in_dir`]
/// compares it for exact equality when filtering.
pub cwd: PathBuf,
/// Model id recorded for the session.
pub model_id: String,
/// Mode id, stored as a plain string.
pub mode_id: String,
/// Conversation history.
pub history: Vec<Message>,
/// Creation time, in seconds since the Unix epoch.
pub created_at: u64,
/// Last-update time, in seconds since the Unix epoch. [`list_in_dir`]
/// sorts on this, most recent first.
pub updated_at: u64,
}
/// Resolve the directory that holds session JSON files.
///
/// Resolution order:
///
/// 1. `$XDG_DATA_HOME`, when it is set to an *absolute* path. Empty or
///    relative values are ignored, as the XDG Base Directory
///    Specification requires — a relative value would make the store's
///    location depend on the process's current directory.
/// 2. `$HOME/.local/share`, when `HOME` is set and non-empty.
///
/// Returns `None` if neither is resolvable (no `HOME` set — possible
/// in stripped-down container environments). On success the result is
/// `<base>/helexa-acp/sessions`.
pub fn sessions_dir() -> Option<PathBuf> {
    // `var_os` rather than `var`: a path does not have to be valid UTF-8.
    let xdg_data_home = std::env::var_os("XDG_DATA_HOME")
        .map(PathBuf::from)
        // An empty path is not absolute, so this also rejects `""`.
        .filter(|path| path.is_absolute());
    let base = xdg_data_home.or_else(|| {
        std::env::var_os("HOME")
            // An empty `HOME` would silently yield the relative path
            // `.local/share`; treat it as unset instead.
            .filter(|home| !home.is_empty())
            .map(|home| PathBuf::from(home).join(".local").join("share"))
    })?;
    Some(base.join(APP_DIRNAME).join(SESSIONS_DIRNAME))
}
/// Saves `session` into the default sessions directory (see
/// [`sessions_dir`]) via [`save_to_dir`]. Fails when that directory
/// cannot be resolved.
pub fn save(session: &PersistedSession) -> anyhow::Result<()> {
    match sessions_dir() {
        Some(dir) => save_to_dir(&dir, session),
        None => Err(anyhow::anyhow!(
            "can't resolve XDG_DATA_HOME or HOME for session store"
        )),
    }
}
/// Loads the session with the given id from the default sessions
/// directory (see [`sessions_dir`]) via [`load_from_dir`]. Fails when
/// that directory cannot be resolved.
pub fn load(session_id: &SessionId) -> anyhow::Result<PersistedSession> {
    if let Some(dir) = sessions_dir() {
        load_from_dir(&dir, session_id)
    } else {
        Err(anyhow::anyhow!(
            "can't resolve XDG_DATA_HOME or HOME for session store"
        ))
    }
}
/// Atomic save into an explicit directory. Writes to
/// `{id}.json.tmp`, flushes it to disk, then renames it over
/// `{id}.json`. Creates the target directory if it doesn't exist.
/// Split from [`save`] so unit tests can target a per-test scratch dir
/// without mutating process-global env vars.
///
/// The temp file is synced before the rename so that a crash or power
/// loss shortly after the rename cannot leave a truncated `{id}.json`
/// behind. If any step after creating the temp file fails, the temp
/// file is removed (best effort) instead of being left in the
/// directory.
///
/// # Errors
///
/// Returns an error if the directory cannot be created, the session
/// cannot be serialised, or the temp file cannot be written, synced or
/// renamed.
pub fn save_to_dir(dir: &std::path::Path, session: &PersistedSession) -> anyhow::Result<()> {
    use std::io::Write as _;

    std::fs::create_dir_all(dir).map_err(|e| anyhow::anyhow!("create {}: {e}", dir.display()))?;
    let safe = sanitize_id(&session.session_id);
    let final_path = dir.join(format!("{safe}.json"));
    let tmp_path = dir.join(format!("{safe}.json.tmp"));
    let json = serde_json::to_string_pretty(session)?;

    let write_and_publish = || -> anyhow::Result<()> {
        let mut file = std::fs::File::create(&tmp_path)
            .map_err(|e| anyhow::anyhow!("write {}: {e}", tmp_path.display()))?;
        file.write_all(json.as_bytes())
            .map_err(|e| anyhow::anyhow!("write {}: {e}", tmp_path.display()))?;
        // Make the bytes durable before the rename publishes them.
        file.sync_all()
            .map_err(|e| anyhow::anyhow!("sync {}: {e}", tmp_path.display()))?;
        drop(file);
        std::fs::rename(&tmp_path, &final_path)
            .map_err(|e| anyhow::anyhow!("rename → {}: {e}", final_path.display()))?;
        Ok(())
    };

    let outcome = write_and_publish();
    if outcome.is_err() {
        // Best-effort cleanup; the original error is what the caller needs.
        let _ = std::fs::remove_file(&tmp_path);
    }
    outcome
}
/// Reads and parses `{id}.json` from `dir`, where `{id}` is the
/// sanitised session id.
///
/// A missing file produces a dedicated "no persisted session" error so
/// the caller can map it to a clean ACP error response; any other read
/// failure and any parse failure are reported with the offending path.
pub fn load_from_dir(
    dir: &std::path::Path,
    session_id: &SessionId,
) -> anyhow::Result<PersistedSession> {
    let file_name = format!("{}.json", sanitize_id(session_id.0.as_ref()));
    let path = dir.join(file_name);

    let bytes = match std::fs::read(&path) {
        Ok(bytes) => bytes,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            return Err(anyhow::anyhow!(
                "no persisted session at {}",
                path.display()
            ));
        }
        Err(e) => return Err(anyhow::anyhow!("read {}: {e}", path.display())),
    };

    serde_json::from_slice::<PersistedSession>(&bytes)
        .map_err(|e| anyhow::anyhow!("parse {}: {e}", path.display()))
}
/// Lists the sessions persisted in the default sessions directory,
/// optionally restricted to one `cwd`. Used by the `session/list`
/// handler so a client (Zed) can find the session that belongs to the
/// workspace it's reopening.
///
/// With `filter_cwd = None` every session on disk is returned; with
/// `Some(path)` only sessions whose persisted `cwd` equals `path`
/// exactly.
///
/// Unparseable files are skipped with a warning instead of failing the
/// whole listing — one corrupt session shouldn't make the resume
/// picker unusable. Delegates to [`list_in_dir`].
pub fn list(filter_cwd: Option<&std::path::Path>) -> anyhow::Result<Vec<PersistedSession>> {
    let Some(dir) = sessions_dir() else {
        return Err(anyhow::anyhow!(
            "can't resolve XDG_DATA_HOME or HOME for session store"
        ));
    };
    list_in_dir(&dir, filter_cwd)
}
/// Explicit-directory variant of [`list`], mirroring [`save_to_dir`] /
/// [`load_from_dir`] so tests can point at a scratch directory.
///
/// A missing directory yields an empty list. Only entries with a
/// `.json` extension are considered. Files that cannot be read or
/// parsed are logged and skipped. The result is ordered most recently
/// updated first.
pub fn list_in_dir(
    dir: &std::path::Path,
    filter_cwd: Option<&std::path::Path>,
) -> anyhow::Result<Vec<PersistedSession>> {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
        Err(e) => return Err(anyhow::anyhow!("read_dir {}: {e}", dir.display())),
    };

    let mut sessions = Vec::new();
    for path in entries.flatten().map(|entry| entry.path()) {
        let is_json = path.extension().and_then(|ext| ext.to_str()) == Some("json");
        if !is_json {
            continue;
        }

        let parsed = std::fs::read(&path).and_then(|bytes| {
            serde_json::from_slice::<PersistedSession>(&bytes).map_err(std::io::Error::other)
        });
        match parsed {
            Ok(session) => {
                let wanted = match filter_cwd {
                    Some(want) => session.cwd == want,
                    None => true,
                };
                if wanted {
                    sessions.push(session);
                }
            }
            Err(e) => {
                tracing::warn!(
                    path = %path.display(),
                    error = %e,
                    "store: skipping unparseable session file"
                );
            }
        }
    }

    // Newest `updated_at` first; the sort is stable, so ties keep
    // directory order.
    sessions.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));
    Ok(sessions)
}
/// Current wall-clock time as whole seconds since the Unix epoch.
/// Yields 0 if the system clock reports a time before the epoch.
pub fn now_secs() -> u64 {
    match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
/// Root directory for plan-mode artefacts: the `plans` directory that
/// sits next to the one returned by [`sessions_dir`], so plans and
/// conversation transcripts are siblings rather than nested. `None`
/// whenever [`sessions_dir`] is `None`.
pub fn plans_root() -> Option<PathBuf> {
    let sessions = sessions_dir()?;
    let app_root = sessions.parent()?;
    Some(app_root.join(PLANS_DIRNAME))
}
/// Per-project plan directory:
/// `$XDG_DATA_HOME/helexa-acp/plans/<project-id>/`.
///
/// The project id is derived from the session's `cwd` (see
/// [`project_id_for`]), so every session opened against the same path
/// shares one plan directory. Two different paths to the same checkout
/// — e.g. `/home/foo/git/bar` and a symlinked `/srv/checkout/bar` —
/// get different ids; that is an accepted won't-fix corner case.
pub fn plan_dir_for(cwd: &std::path::Path) -> Option<PathBuf> {
    let root = plans_root()?;
    Some(root.join(project_id_for(cwd)))
}
/// Deterministic, human-readable project identifier of the form
/// `<basename>-<8-hex>`.
///
/// `<basename>` is the final component of `cwd` with every character
/// other than ASCII alphanumerics, `-` and `_` replaced by `_` (or
/// `unknown` when there is no final component or it is not valid
/// UTF-8). It keeps the directory skim-readable when poking around
/// `$XDG_DATA_HOME` by hand. `<8-hex>` is the 32-bit FNV-1a hash of the
/// full path (lossily converted to UTF-8) and disambiguates repos that
/// share a final path component (e.g. multiple `/.../checkout/beat`
/// checkouts).
///
/// FNV-1a is hand-rolled here because the id must be identical on every
/// run and after every toolchain upgrade. The standard library's
/// hashers don't promise that: `HashMap`'s default `RandomState` is
/// randomly seeded, and the algorithm behind `DefaultHasher` is
/// unspecified and may change between releases.
pub fn project_id_for(cwd: &std::path::Path) -> String {
    let basename = match cwd.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "unknown",
    };

    let mut readable = String::with_capacity(basename.len());
    for ch in basename.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_');
        readable.push(if keep { ch } else { '_' });
    }

    let digest = fnv1a_32(cwd.to_string_lossy().as_bytes());
    format!("{readable}-{digest:08x}")
}
/// 32-bit FNV-1a hash of `bytes`: starting from the offset basis, each
/// byte is XOR-ed in and the result multiplied (wrapping) by the FNV
/// prime. Deterministic and dependency-free; used for project ids
/// only — not cryptographic.
fn fnv1a_32(bytes: &[u8]) -> u32 {
    const OFFSET_BASIS: u32 = 0x811c_9dc5;
    const PRIME: u32 = 0x0100_0193;

    bytes.iter().fold(OFFSET_BASIS, |acc, &byte| {
        (acc ^ u32::from(byte)).wrapping_mul(PRIME)
    })
}
/// Format seconds-since-epoch as an ISO 8601 / RFC 3339 string
/// (`YYYY-MM-DDTHH:MM:SSZ`) for `SessionInfo.updated_at`. Returns
/// `None` for values outside the representable range, in which
/// case the caller should omit the field.
///
/// "Outside the representable range" includes values above
/// `i64::MAX`: they are rejected explicitly rather than cast, because
/// an `as i64` cast would wrap them to negative timestamps and format
/// them as dates before 1970.
pub fn unix_to_iso8601(secs: u64) -> Option<String> {
    use chrono::TimeZone;
    let secs = i64::try_from(secs).ok()?;
    let dt = chrono::Utc.timestamp_opt(secs, 0).single()?;
    Some(dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true))
}
/// Maps `id` to a string that is safe to use as a file name: ASCII
/// letters, ASCII digits, `-` and `_` pass through unchanged and every
/// other character is replaced by a single `_`. Because `.` and `/` are
/// replaced, a mischievous (or just unconventional) session id can't
/// escape the sessions directory.
fn sanitize_id(id: &str) -> String {
    let mut safe = String::with_capacity(id.len());
    for ch in id.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => safe.push(ch),
            _ => safe.push('_'),
        }
    }
    safe
}
#[cfg(test)]
mod tests {
use super::*;
use crate::provider::{MessageContent, Role};
/// Unique scratch dir per test invocation. We use this dir
/// directly with the `*_to_dir` / `*_from_dir` functions so
/// the tests never mutate `$XDG_DATA_HOME` — that env var
/// would race across the parallel test harness.
///
/// The name combines the process id, the clock's sub-second
/// nanoseconds and a process-wide sequence number. The sequence number
/// is what guarantees uniqueness: tests in one binary run in parallel
/// under the same pid, and on a coarse clock two of them can read
/// identical nanoseconds, which would let one test's cleanup delete
/// another test's directory.
fn unique_dir() -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};
    static NEXT_SEQ: AtomicU64 = AtomicU64::new(0);

    let base = std::env::var("CARGO_TARGET_TMPDIR")
        .ok()
        .map(PathBuf::from)
        .unwrap_or_else(std::env::temp_dir);
    let pid = std::process::id();
    let nanos = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .map(|d| d.subsec_nanos())
        .unwrap_or(0);
    let seq = NEXT_SEQ.fetch_add(1, Ordering::Relaxed);
    let dir = base.join(format!("helexa-acp-store-test-{pid}-{nanos}-{seq}"));
    std::fs::create_dir_all(&dir).expect("create test dir");
    dir
}
fn sample(id: &str) -> PersistedSession {
PersistedSession {
session_id: id.into(),
cwd: PathBuf::from("/home/me/proj"),
model_id: "Qwen/Qwen3.6-27B".into(),
mode_id: "default".into(),
history: vec![
Message {
role: Role::User,
content: MessageContent::Text {
text: "hello".into(),
},
},
Message {
role: Role::Assistant,
content: MessageContent::Text { text: "hi".into() },
},
],
created_at: 1_700_000_000,
updated_at: 1_700_000_001,
}
}
#[test]
fn round_trip_save_then_load() {
let dir = unique_dir();
save_to_dir(&dir, &sample("hxa-1")).expect("save");
let loaded = load_from_dir(&dir, &SessionId::new("hxa-1")).expect("load");
assert_eq!(loaded.session_id, "hxa-1");
assert_eq!(loaded.cwd, PathBuf::from("/home/me/proj"));
assert_eq!(loaded.history.len(), 2);
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn load_missing_session_errors_with_not_found_message() {
let dir = unique_dir();
let err = load_from_dir(&dir, &SessionId::new("nope")).unwrap_err();
let msg = format!("{err}");
assert!(
msg.contains("no persisted session"),
"want NotFound, got: {msg}"
);
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn save_overwrites_existing_atomically() {
let dir = unique_dir();
save_to_dir(&dir, &sample("hxa-1")).expect("save");
let mut updated = sample("hxa-1");
updated.history.push(Message {
role: Role::User,
content: MessageContent::Text {
text: "third turn".into(),
},
});
updated.updated_at = 1_700_000_500;
save_to_dir(&dir, &updated).expect("re-save");
let loaded = load_from_dir(&dir, &SessionId::new("hxa-1")).expect("load");
assert_eq!(loaded.history.len(), 3);
assert_eq!(loaded.updated_at, 1_700_000_500);
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn save_then_load_preserves_tool_calls_and_results() {
use crate::provider::ToolCall;
let dir = unique_dir();
let mut session = sample("hxa-2");
session.history.push(Message {
role: Role::Assistant,
content: MessageContent::ToolCalls {
text: Some("calling".into()),
calls: vec![ToolCall {
id: "call_0".into(),
name: "read_file".into(),
arguments: r#"{"path":"/etc/hostname"}"#.into(),
}],
},
});
session.history.push(Message {
role: Role::Tool,
content: MessageContent::ToolResult {
tool_call_id: "call_0".into(),
content: "host".into(),
},
});
save_to_dir(&dir, &session).expect("save");
let loaded = load_from_dir(&dir, &SessionId::new("hxa-2")).expect("load");
assert_eq!(loaded.history.len(), 4);
match &loaded.history[2].content {
MessageContent::ToolCalls { calls, .. } => {
assert_eq!(calls[0].name, "read_file");
}
other => panic!("expected ToolCalls, got {other:?}"),
}
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn list_filters_by_cwd_and_sorts_recent_first() {
let dir = unique_dir();
let mut a = sample("a");
a.cwd = PathBuf::from("/home/me/proj-x");
a.updated_at = 1_700_000_010;
let mut b = sample("b");
b.cwd = PathBuf::from("/home/me/proj-x");
b.updated_at = 1_700_000_020;
let mut c = sample("c");
c.cwd = PathBuf::from("/home/me/elsewhere");
c.updated_at = 1_700_000_030;
save_to_dir(&dir, &a).unwrap();
save_to_dir(&dir, &b).unwrap();
save_to_dir(&dir, &c).unwrap();
let proj_x = PathBuf::from("/home/me/proj-x");
let list = list_in_dir(&dir, Some(&proj_x)).unwrap();
let ids: Vec<&str> = list.iter().map(|s| s.session_id.as_str()).collect();
// Filtered to proj-x; b before a because b is more recent.
assert_eq!(ids, vec!["b", "a"]);
let all = list_in_dir(&dir, None).unwrap();
assert_eq!(all.len(), 3);
// Global list still sorted recent-first across all cwds.
assert_eq!(all[0].session_id, "c");
let _ = std::fs::remove_dir_all(&dir);
}
/// Listing a directory that does not exist is not an error; it yields
/// an empty list.
#[test]
fn list_returns_empty_for_missing_dir() {
    // `unique_dir` creates the scratch root; only the child is missing.
    let root = unique_dir();
    let dir = root.join("does-not-exist");
    let list = list_in_dir(&dir, None).unwrap();
    assert!(list.is_empty());
    // Remove the scratch root so the test leaves nothing behind.
    let _ = std::fs::remove_dir_all(&root);
}
#[test]
fn list_skips_unparseable_files() {
let dir = unique_dir();
save_to_dir(&dir, &sample("good")).unwrap();
std::fs::write(dir.join("garbage.json"), b"{not valid json").unwrap();
let list = list_in_dir(&dir, None).unwrap();
// Garbage skipped; good survives.
assert_eq!(list.len(), 1);
assert_eq!(list[0].session_id, "good");
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn iso8601_formats_unix_seconds() {
// 2024-01-01T00:00:00Z is 1704067200 unix seconds.
assert_eq!(
unix_to_iso8601(1_704_067_200),
Some("2024-01-01T00:00:00Z".into())
);
assert_eq!(unix_to_iso8601(0), Some("1970-01-01T00:00:00Z".into()));
}
#[test]
fn sanitize_id_rejects_path_traversal() {
// `../../etc/passwd` has six disallowed characters before `etc`
// (`.`, `.`, `/`, `.`, `.`, `/`) and one (`/`) between `etc` and
// `passwd`. Each disallowed character becomes its own `_`; runs
// are not collapsed. An id made only of allowed characters
// passes through unchanged.
assert_eq!(sanitize_id("../../etc/passwd"), "______etc_passwd");
assert_eq!(sanitize_id("ok-name_42"), "ok-name_42");
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,300 +0,0 @@
//! Tool schemas sent to the upstream model on every completion.
//!
//! These are the OpenAI-function-style declarations the LLM sees in
//! `CompletionRequest.tools`; the runtime dispatch happens in
//! [`crate::tool_runner`]. Keeping declarations and execution in
//! separate modules makes it easy to add a tool without touching the
//! runner, and vice versa.
//!
//! Stage 3 ships five: filesystem read / write / edit, directory
//! listing, and `bash`. Image generation, web fetch, MCP-derived
//! tools, etc. are out of scope here.
use serde_json::json;
use crate::provider::ToolSpec;
/// Name of the file-reading tool declared in [`all_tools`].
pub const READ_FILE: &str = "read_file";
/// Name of the whole-file write tool declared in [`all_tools`].
pub const WRITE_FILE: &str = "write_file";
/// Name of the single-substring edit tool declared in [`all_tools`].
pub const EDIT_FILE: &str = "edit_file";
/// Name of the directory-listing tool declared in [`all_tools`].
pub const LIST_DIR: &str = "list_dir";
/// Name of the shell-command tool declared in [`all_tools`].
pub const BASH: &str = "bash";
/// Build the static tool list passed to the model on every prompt.
/// Cheap — the JSON Schema fragments are constructed each call but
/// the bodies are small constants. If this ever shows up in a
/// profile we can `OnceLock` the Vec.
pub fn all_tools() -> Vec<ToolSpec> {
vec![
ToolSpec {
name: READ_FILE.to_string(),
description: "Read the contents of a text file. Returns the file's text.".to_string(),
parameters: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Absolute path to the file."
},
"line": {
"type": "integer",
"description": "Optional 1-based line number to start reading from.",
"minimum": 1
},
"limit": {
"type": "integer",
"description": "Optional maximum number of lines to read.",
"minimum": 1
}
},
"required": ["path"],
"additionalProperties": false
}),
},
ToolSpec {
name: WRITE_FILE.to_string(),
description: "Write text content to a file, replacing any existing contents. \
Creates the file (and parent directories) if needed."
.to_string(),
parameters: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Absolute path to the file."
},
"content": {
"type": "string",
"description": "Full new contents of the file."
}
},
"required": ["path", "content"],
"additionalProperties": false
}),
},
ToolSpec {
name: EDIT_FILE.to_string(),
description: "Replace one exact substring in a file with another. \
Fails if `old_text` does not appear in the file, or appears more than once. \
Use multiple edit_file calls for multiple edits."
.to_string(),
parameters: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Absolute path to the file."
},
"old_text": {
"type": "string",
"description": "Exact text fragment to replace. Must be unique within the file."
},
"new_text": {
"type": "string",
"description": "Replacement text."
}
},
"required": ["path", "old_text", "new_text"],
"additionalProperties": false
}),
},
ToolSpec {
name: LIST_DIR.to_string(),
description:
"List the entries of a directory. Returns names and a (f|d|l) kind per entry."
.to_string(),
parameters: json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Absolute path to the directory."
}
},
"required": ["path"],
"additionalProperties": false
}),
},
ToolSpec {
name: BASH.to_string(),
description: "Run a shell command via `sh -c`. \
Returns combined stdout+stderr and the exit status. \
The command runs in the session's working directory unless `cwd` is given."
.to_string(),
parameters: json!({
"type": "object",
"properties": {
"command": {
"type": "string",
"description": "Shell command line, evaluated by `sh -c`."
},
"cwd": {
"type": "string",
"description": "Optional absolute path to run the command from."
}
},
"required": ["command"],
"additionalProperties": false
}),
},
]
}
/// Guess the intended tool purely from the key set of an
/// `arguments` object.
///
/// The agent falls back to this when the model emits a
/// `<tool_call>` whose arguments are well-formed but whose
/// top-level `name` is missing or invalid — a recurring
/// Qwen3.6-27B failure mode.
///
/// A name is returned only when the keys match exactly one tool.
/// Shapes shared by several tools (`{path}` fits both
/// [`READ_FILE`] and [`LIST_DIR`]) yield `None`, so the caller can
/// surface a Failed-card and let the model retry instead of
/// dispatching to the wrong tool. A non-object `arguments` value
/// also yields `None`.
///
/// Inference table (exact key set → result):
///
/// | Keys                                              | Result               |
/// |---------------------------------------------------|----------------------|
/// | `{command}` or `{command, cwd}`                   | `Some(bash)`         |
/// | `{path, content}`                                 | `Some(write_file)`   |
/// | `{path, old_text, new_text}`                      | `Some(edit_file)`    |
/// | `{path}` / `{path, line}` / `{path, line, limit}` | `None` (*ambiguous*) |
/// | (anything else)                                   | `None`               |
pub fn infer_tool_name(arguments: &serde_json::Value) -> Option<&'static str> {
    let fields = arguments.as_object()?;

    // Object keys are unique, so sorting them gives a canonical form
    // of the key set that can be matched as a slice. Each pattern
    // below lists its keys in byte-wise sorted order.
    let mut names: Vec<&str> = fields.keys().map(|key| key.as_str()).collect();
    names.sort_unstable();

    match names.as_slice() {
        // `command` is unique to bash; only the optional `cwd` may
        // accompany it. Any unrecognised extra key falls through.
        ["command"] | ["command", "cwd"] => Some(BASH),
        // `content` is unique to write_file.
        ["content", "path"] => Some(WRITE_FILE),
        // `old_text` + `new_text` are unique to edit_file.
        ["new_text", "old_text", "path"] => Some(EDIT_FILE),
        // `{path}`, `{path, line}` and `{path, line, limit}` overlap
        // between read_file and list_dir, so there is no safe guess;
        // every other shape is unknown. Refuse in both cases.
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: run name inference over a JSON value built inline.
    fn infer(args: serde_json::Value) -> Option<&'static str> {
        infer_tool_name(&args)
    }

    #[test]
    fn all_tools_has_five_named_entries() {
        // The catalogue must expose exactly these names, in this order.
        let expected = vec![READ_FILE, WRITE_FILE, EDIT_FILE, LIST_DIR, BASH];
        let catalogue = all_tools();
        let actual: Vec<&str> = catalogue.iter().map(|spec| spec.name.as_str()).collect();
        assert_eq!(actual, expected);
    }

    #[test]
    fn infer_bash_from_command_only() {
        assert_eq!(infer(serde_json::json!({"command": "ls /tmp"})), Some(BASH));
    }

    #[test]
    fn infer_bash_from_command_and_cwd() {
        let inferred = infer(serde_json::json!({"command": "ls", "cwd": "/tmp"}));
        assert_eq!(inferred, Some(BASH));
    }

    #[test]
    fn infer_bash_from_mkdir_like_real_failure() {
        // Arguments copied verbatim from the agent failure that
        // motivated this helper (helexa-acp.log @ 10:03:11).
        let inferred = infer(serde_json::json!({
            "command": "mkdir -p /home/grenade/git/beat/beat/doc/plan/{01-discovery,02-segmentation,03-description,04-summary,05-output}"
        }));
        assert_eq!(inferred, Some(BASH));
    }

    #[test]
    fn infer_write_file() {
        let inferred = infer(serde_json::json!({"path": "/tmp/x", "content": "hi"}));
        assert_eq!(inferred, Some(WRITE_FILE));
    }

    #[test]
    fn infer_edit_file() {
        let inferred = infer(serde_json::json!({
            "path": "/tmp/x",
            "old_text": "a",
            "new_text": "b"
        }));
        assert_eq!(inferred, Some(EDIT_FILE));
    }

    #[test]
    fn refuse_ambiguous_path_only() {
        assert_eq!(infer(serde_json::json!({"path": "/tmp/x"})), None);
    }

    #[test]
    fn refuse_ambiguous_path_with_optionals() {
        // Only read_file declares `line` / `limit`, but Qwen would not
        // emit them reliably, so their presence is not treated as a
        // signal. The shape stays ambiguous and must be refused.
        let inferred = infer(serde_json::json!({"path": "/tmp/x", "line": 1, "limit": 50}));
        assert_eq!(inferred, None);
    }

    #[test]
    fn refuse_command_with_extra_unknown_keys() {
        // Defence in depth: an unknown key next to `command` means the
        // intended tool is unclear, so refuse rather than guess.
        let inferred = infer(serde_json::json!({"command": "ls", "extra": "?"}));
        assert_eq!(inferred, None);
    }

    #[test]
    fn refuse_empty_args() {
        assert_eq!(infer(serde_json::json!({})), None);
    }

    #[test]
    fn refuse_non_object_args() {
        assert_eq!(infer(serde_json::json!("not an object")), None);
    }

    #[test]
    fn every_tool_has_an_object_parameter_schema() {
        // Every schema must be an object schema that carries both a
        // `properties` map and a `required` list.
        for tool in all_tools() {
            let schema = &tool.parameters;
            let declared_type = schema.get("type").and_then(|value| value.as_str());
            assert_eq!(
                declared_type,
                Some("object"),
                "tool {} parameters.type must be \"object\"",
                tool.name
            );
            assert!(
                schema.get("properties").is_some(),
                "tool {} missing properties",
                tool.name
            );
            assert!(
                schema.get("required").is_some(),
                "tool {} missing required list",
                tool.name
            );
        }
    }
}

View File

@@ -1,41 +0,0 @@
[package]
name = "helexa-bench"
version.workspace = true
edition.workspace = true
license.workspace = true
repository.workspace = true
[[bin]]
name = "helexa-bench"
path = "src/main.rs"
[dependencies]
cortex-core = { workspace = true }
tokio = { workspace = true }
reqwest = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
figment = { workspace = true }
anyhow = { workspace = true }
async-trait = { workspace = true }
clap = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
chrono = { workspace = true }
futures = { workspace = true }
tokio-stream = { workspace = true }
eventsource-stream = { workspace = true }
# read-only JSON API (api.rs)
axum = { workspace = true }
tower-http = { workspace = true }
# SQLite system-of-record. `bundled` compiles SQLite from source so the
# binary has no libsqlite3 runtime dependency — matches the project's
# single-static-binary packaging.
rusqlite = { version = "0.32", features = ["bundled"] }
[dev-dependencies]
# Jail (isolated cwd + env) for config tests.
figment = { workspace = true, features = ["test"] }

Some files were not shown because too many files have changed in this diff Show More