Compare commits
36 Commits
v0.1.1
...
5c957d08ec
| Author | SHA1 | Date | |
|---|---|---|---|
|
5c957d08ec
|
|||
|
729317d1ef
|
|||
|
5c2bd1a1da
|
|||
|
3cccc2c56b
|
|||
|
7f797b0265
|
|||
|
5a0360c1d5
|
|||
|
472c0e8737
|
|||
|
|
b9d8e30058 | ||
|
25f75fe552
|
|||
|
3f94c50817
|
|||
|
3e1fb60076
|
|||
|
|
9bf987888c | ||
|
abe4ff7ccc
|
|||
|
7c3390a4e1
|
|||
|
2ff062da0e
|
|||
|
|
357f858a29 | ||
|
556e5293dc
|
|||
|
1d90238b01
|
|||
|
d99b25fb8a
|
|||
|
034da319f1
|
|||
|
|
7ece281617 | ||
|
3bb5b3c425
|
|||
|
|
9fa51ad874 | ||
|
9697fbae73
|
|||
|
|
2ce1060cb8 | ||
|
142e91c3f7
|
|||
|
|
52c8b4c983 | ||
|
4a9a4fc775
|
|||
|
53a3c1e157
|
|||
|
5c7d63c658
|
|||
|
|
f161412f91 | ||
|
ba5020138f
|
|||
|
209150771e
|
|||
|
|
7c60af3464 | ||
|
ada76b0153
|
|||
|
15ded3a5bd
|
319
.gitea/workflows/build-prerelease.yml
Normal file
319
.gitea/workflows/build-prerelease.yml
Normal file
@@ -0,0 +1,319 @@
|
|||||||
|
name: build-prerelease
|
||||||
|
|
||||||
|
# Manually-dispatched workflow that builds CUDA-flavoured neuron binaries
|
||||||
|
# (and a single cortex binary), packages each as a Fedora RPM, signs
|
||||||
|
# them, and publishes to the `unstable` channel at rpm.lair.cafe.
|
||||||
|
#
|
||||||
|
# Trigger from the Gitea UI: Actions → build-prerelease → Run workflow.
|
||||||
|
# Optionally provide a `ref` to build from a non-default branch.
|
||||||
|
#
|
||||||
|
# The published packages are versioned as e.g.
|
||||||
|
# helexa-neuron-blackwell-0.1.16-0.1.20260518gitabcdef0.fc43.x86_64
|
||||||
|
# so they sort BELOW the eventual 0.1.16-1 stable release.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
ref:
|
||||||
|
description: "Git ref to build (branch / tag / commit). Defaults to the workflow's branch."
|
||||||
|
required: false
|
||||||
|
default: ""
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: prerelease-build
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
env:
|
||||||
|
CARGO_INCREMENTAL: "0"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
prepare:
|
||||||
|
name: Resolve version stamps
|
||||||
|
runs-on: rust
|
||||||
|
outputs:
|
||||||
|
version: ${{ steps.info.outputs.version }}
|
||||||
|
release: ${{ steps.info.outputs.release }}
|
||||||
|
short_sha: ${{ steps.info.outputs.short_sha }}
|
||||||
|
commit_date: ${{ steps.info.outputs.commit_date }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- id: info
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
VERSION=$(awk -F\" '/^version[[:space:]]*=/ { print $2; exit }' Cargo.toml)
|
||||||
|
SHORT_SHA=$(git rev-parse --short=7 HEAD)
|
||||||
|
COMMIT_DATE=$(git log -1 --format=%cd --date=format:%Y%m%d HEAD)
|
||||||
|
# Prerelease release stamp sorts before "1" (the stable release).
|
||||||
|
RELEASE="0.1.${COMMIT_DATE}git${SHORT_SHA}"
|
||||||
|
echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "release=${RELEASE}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "commit_date=${COMMIT_DATE}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
build-cortex:
|
||||||
|
name: Build cortex binary
|
||||||
|
needs: prepare
|
||||||
|
runs-on: rust
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
|
||||||
|
- name: Install/update Rust toolchain
|
||||||
|
run: |
|
||||||
|
if command -v rustup &> /dev/null; then
|
||||||
|
rustup update stable
|
||||||
|
else
|
||||||
|
curl --proto '=https' --tlsv1.2 --silent --show-error --fail https://sh.rustup.rs | sh -s -- -y
|
||||||
|
fi
|
||||||
|
echo "${HOME}/.cargo/bin" >> "$GITHUB_PATH"
|
||||||
|
|
||||||
|
- name: Build cortex (release)
|
||||||
|
run: cargo build --release -p cortex-cli
|
||||||
|
|
||||||
|
- name: Stage binary
|
||||||
|
run: |
|
||||||
|
mkdir --parents artifacts
|
||||||
|
cp target/release/cortex artifacts/cortex
|
||||||
|
./artifacts/cortex --version || true
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: cortex-fc43
|
||||||
|
path: artifacts/cortex
|
||||||
|
retention-days: 1
|
||||||
|
|
||||||
|
build-neuron:
|
||||||
|
name: Build neuron-${{ matrix.flavour }}
|
||||||
|
needs: prepare
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- flavour: ada
|
||||||
|
compute_cap: "89"
|
||||||
|
runner: cuda-13.0
|
||||||
|
cuda_home: /usr/local/cuda-13.0
|
||||||
|
build_jobs: 8
|
||||||
|
nvcc_threads: 4
|
||||||
|
cargo_features: "cuda cudnn flash-attn"
|
||||||
|
- flavour: blackwell
|
||||||
|
compute_cap: "120"
|
||||||
|
runner: cuda-13.0
|
||||||
|
cuda_home: /usr/local/cuda-13.0
|
||||||
|
build_jobs: 8
|
||||||
|
nvcc_threads: 4
|
||||||
|
cargo_features: "cuda cudnn flash-attn"
|
||||||
|
runs-on: ${{ matrix.runner }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
|
||||||
|
- name: Install/update Rust toolchain
|
||||||
|
run: |
|
||||||
|
if command -v rustup &> /dev/null; then
|
||||||
|
rustup update stable
|
||||||
|
else
|
||||||
|
curl --proto '=https' --tlsv1.2 --silent --show-error --fail https://sh.rustup.rs | sh -s -- -y
|
||||||
|
fi
|
||||||
|
echo "${HOME}/.cargo/bin" >> "$GITHUB_PATH"
|
||||||
|
|
||||||
|
- name: Build neuron with CUDA (${{ matrix.flavour }})
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
export PATH="${{ matrix.cuda_home }}/bin:${PATH}"
|
||||||
|
export LD_LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LD_LIBRARY_PATH:-}"
|
||||||
|
export LIBRARY_PATH="${{ matrix.cuda_home }}/targets/x86_64-linux/lib:${{ matrix.cuda_home }}/lib64:${LIBRARY_PATH:-}"
|
||||||
|
cargo build --release -p neuron --features "${{ matrix.cargo_features }}"
|
||||||
|
env:
|
||||||
|
CUDA_COMPUTE_CAP: ${{ matrix.compute_cap }}
|
||||||
|
CARGO_BUILD_JOBS: ${{ matrix.build_jobs }}
|
||||||
|
NVCC_THREADS: ${{ matrix.nvcc_threads }}
|
||||||
|
|
||||||
|
- name: Stage binary
|
||||||
|
run: |
|
||||||
|
mkdir --parents artifacts
|
||||||
|
cp target/release/neuron artifacts/neuron-${{ matrix.flavour }}
|
||||||
|
file "artifacts/neuron-${{ matrix.flavour }}"
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: neuron-${{ matrix.flavour }}-fc43
|
||||||
|
path: artifacts/neuron-${{ matrix.flavour }}
|
||||||
|
retention-days: 1
|
||||||
|
|
||||||
|
package-cortex:
|
||||||
|
name: Package cortex RPM
|
||||||
|
needs: [prepare, build-cortex]
|
||||||
|
runs-on: rpm
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
|
||||||
|
- uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: cortex-fc43
|
||||||
|
path: artifacts/
|
||||||
|
|
||||||
|
- name: Build RPM
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
rm -f ~/.rpmmacros
|
||||||
|
rpmdev-setuptree
|
||||||
|
cp artifacts/cortex ~/rpmbuild/SOURCES/
|
||||||
|
cp data/cortex.service ~/rpmbuild/SOURCES/
|
||||||
|
cp data/cortex-sysusers.conf ~/rpmbuild/SOURCES/
|
||||||
|
cp data/cortex-firewalld.xml ~/rpmbuild/SOURCES/
|
||||||
|
cp cortex.example.toml ~/rpmbuild/SOURCES/
|
||||||
|
cp models.example.toml ~/rpmbuild/SOURCES/
|
||||||
|
cp LICENSE ~/rpmbuild/SOURCES/
|
||||||
|
rpmbuild -bb rpm/cortex-prerelease.spec \
|
||||||
|
--define "cortex_version ${{ needs.prepare.outputs.version }}" \
|
||||||
|
--define "cortex_prerelease ${{ needs.prepare.outputs.release }}" \
|
||||||
|
--undefine dist \
|
||||||
|
--define "dist .fc43"
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: rpm-cortex-fc43
|
||||||
|
path: ~/rpmbuild/RPMS/x86_64/*.rpm
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
package-neuron:
|
||||||
|
name: Package helexa-neuron-${{ matrix.flavour }} RPM
|
||||||
|
needs: [prepare, build-neuron]
|
||||||
|
runs-on: rpm
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- flavour: ada
|
||||||
|
- flavour: blackwell
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
|
||||||
|
- uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
name: neuron-${{ matrix.flavour }}-fc43
|
||||||
|
path: artifacts/
|
||||||
|
|
||||||
|
- name: Build RPM
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
rm -f ~/.rpmmacros
|
||||||
|
rpmdev-setuptree
|
||||||
|
cp artifacts/neuron-${{ matrix.flavour }} ~/rpmbuild/SOURCES/
|
||||||
|
cp data/neuron.service ~/rpmbuild/SOURCES/
|
||||||
|
cp data/neuron-sysusers.conf ~/rpmbuild/SOURCES/
|
||||||
|
cp data/neuron-firewalld.xml ~/rpmbuild/SOURCES/
|
||||||
|
cp neuron.example.toml ~/rpmbuild/SOURCES/
|
||||||
|
cp LICENSE ~/rpmbuild/SOURCES/
|
||||||
|
rpmbuild -bb rpm/helexa-neuron-prerelease.spec \
|
||||||
|
--define "neuron_version ${{ needs.prepare.outputs.version }}" \
|
||||||
|
--define "neuron_flavour ${{ matrix.flavour }}" \
|
||||||
|
--define "neuron_prerelease ${{ needs.prepare.outputs.release }}" \
|
||||||
|
--undefine dist \
|
||||||
|
--define "dist .fc43"
|
||||||
|
|
||||||
|
- uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: rpm-neuron-${{ matrix.flavour }}-fc43
|
||||||
|
path: ~/rpmbuild/RPMS/x86_64/*.rpm
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
publish:
|
||||||
|
name: Publish to rpm.lair.cafe (unstable)
|
||||||
|
needs: [package-cortex, package-neuron]
|
||||||
|
runs-on: rpm
|
||||||
|
concurrency:
|
||||||
|
group: rpm-publish
|
||||||
|
cancel-in-progress: false
|
||||||
|
env:
|
||||||
|
RPM_REPO_HOST: oolon.kosherinata.internal
|
||||||
|
FEDORA_VERSION: "43"
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
ref: ${{ inputs.ref }}
|
||||||
|
|
||||||
|
- name: Download all built RPMs
|
||||||
|
uses: actions/download-artifact@v3
|
||||||
|
with:
|
||||||
|
path: rpms/
|
||||||
|
pattern: rpm-*-fc43
|
||||||
|
|
||||||
|
- name: Flatten RPM artifacts
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
find rpms/ -name '*.rpm' -exec mv --target-directory=rpms/ {} +
|
||||||
|
find rpms/ -mindepth 1 -type d -empty -delete
|
||||||
|
ls -la rpms/
|
||||||
|
|
||||||
|
- name: Check for sequoia-sq
|
||||||
|
run: |
|
||||||
|
if ! command -v sq &> /dev/null; then
|
||||||
|
echo "ERROR: sequoia-sq is not installed. Install with: sudo dnf install sequoia-sq"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Import signing key
|
||||||
|
run: |
|
||||||
|
echo "${{ secrets.RPM_SIGNING_KEY }}" | gpg --batch --import
|
||||||
|
fpr=$(gpg --batch --with-colons --list-keys "${{ secrets.RPM_SIGNING_KEY_ID }}" | awk -F: '/^fpr:/ { print $10; exit }')
|
||||||
|
echo "${fpr}:6:" | gpg --batch --import-ownertrust
|
||||||
|
sed "s/@GPG_NAME@/${{ secrets.RPM_SIGNING_KEY_ID }}/" rpm/rpmmacros > ~/.rpmmacros
|
||||||
|
|
||||||
|
- name: Sign RPMs
|
||||||
|
run: |
|
||||||
|
set -eux
|
||||||
|
for rpm in rpms/*.rpm; do
|
||||||
|
echo "signing ${rpm}..."
|
||||||
|
rpm --addsign "${rpm}"
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: Set up SSH for rsync
|
||||||
|
run: |
|
||||||
|
install --directory --mode 700 ~/.ssh
|
||||||
|
echo "${RSYNC_SSH_KEY}" | install --mode 600 /dev/stdin ~/.ssh/id_ed25519
|
||||||
|
env:
|
||||||
|
RSYNC_SSH_KEY: ${{ secrets.RSYNC_SSH_KEY }}
|
||||||
|
|
||||||
|
- name: Test SSH connectivity
|
||||||
|
run: |
|
||||||
|
ssh -o StrictHostKeyChecking=accept-new "gitea_ci@${RPM_REPO_HOST}" exit
|
||||||
|
|
||||||
|
- name: Ensure unstable repo directory exists
|
||||||
|
run: |
|
||||||
|
ssh "gitea_ci@${RPM_REPO_HOST}" \
|
||||||
|
"mkdir --parents /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable"
|
||||||
|
|
||||||
|
- name: Sync RPMs to unstable repo
|
||||||
|
run: |
|
||||||
|
rsync \
|
||||||
|
--archive \
|
||||||
|
--verbose \
|
||||||
|
--chmod D755,F644 \
|
||||||
|
rpms/*.rpm \
|
||||||
|
"gitea_ci@${RPM_REPO_HOST}:/var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/"
|
||||||
|
|
||||||
|
- name: Update unstable repo metadata
|
||||||
|
run: |
|
||||||
|
ssh "gitea_ci@${RPM_REPO_HOST}" \
|
||||||
|
"cd /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable && createrepo_c --update ."
|
||||||
|
|
||||||
|
- name: Generate packages.json manifest
|
||||||
|
run: |
|
||||||
|
scp script/generate-packages-json.py "gitea_ci@${RPM_REPO_HOST}:/tmp/"
|
||||||
|
ssh "gitea_ci@${RPM_REPO_HOST}" \
|
||||||
|
"python3 /tmp/generate-packages-json.py \
|
||||||
|
--repodata-dir /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/repodata \
|
||||||
|
--output /var/www/rpm/fedora/${FEDORA_VERSION}/x86_64/unstable/packages.json \
|
||||||
|
--base-url https://rpm.lair.cafe/fedora/${FEDORA_VERSION}/x86_64/unstable"
|
||||||
@@ -8,6 +8,7 @@ on:
|
|||||||
branches: [main]
|
branches: [main]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
|
CARGO_INCREMENTAL: "0"
|
||||||
RUSTC_WRAPPER: sccache
|
RUSTC_WRAPPER: sccache
|
||||||
SCCACHE_BUCKET: sccache
|
SCCACHE_BUCKET: sccache
|
||||||
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
|
SCCACHE_ENDPOINT: http://caveman.kosherinata.internal:9000
|
||||||
@@ -17,44 +18,38 @@ env:
|
|||||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
|
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCCACHE_S3_SECRET_KEY }}
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check:
|
fmt:
|
||||||
name: Format, lint, build, test
|
name: Format
|
||||||
runs-on: fedora
|
runs-on: rust
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
- run: cargo fmt --check --all
|
||||||
|
|
||||||
- name: Ensure sccache with S3 support
|
clippy:
|
||||||
env:
|
name: Clippy
|
||||||
RUSTC_WRAPPER: ""
|
runs-on: rust
|
||||||
run: |
|
steps:
|
||||||
if sccache --version 2>/dev/null && sccache --show-stats 2>/dev/null; then
|
- uses: actions/checkout@v4
|
||||||
echo "sccache with S3 support already installed"
|
- run: cargo clippy --workspace -- -D warnings
|
||||||
else
|
- run: sccache --show-stats
|
||||||
cargo install sccache --features s3 --locked
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Check formatting
|
test:
|
||||||
run: cargo fmt --check --all
|
name: Test
|
||||||
|
runs-on: rust
|
||||||
- name: Clippy
|
steps:
|
||||||
run: cargo clippy --workspace -- -D warnings
|
- uses: actions/checkout@v4
|
||||||
|
- run: cargo test --workspace
|
||||||
- name: Build
|
- run: sccache --show-stats
|
||||||
run: cargo build --workspace
|
|
||||||
|
|
||||||
- name: Test
|
|
||||||
run: cargo test --workspace
|
|
||||||
|
|
||||||
- name: Show sccache stats
|
|
||||||
run: sccache --show-stats
|
|
||||||
|
|
||||||
srpm-cortex:
|
srpm-cortex:
|
||||||
name: Build cortex SRPM
|
name: Build cortex SRPM
|
||||||
runs-on: fedora
|
runs-on: rpm
|
||||||
needs: check
|
needs: [fmt, clippy, test]
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Determine version
|
- name: Determine version
|
||||||
id: version
|
id: version
|
||||||
@@ -68,6 +63,12 @@ jobs:
|
|||||||
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
||||||
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
|
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
|
||||||
|
|
||||||
|
- name: Generate changelog entry
|
||||||
|
uses: https://git.lair.cafe/actions/rpm-changelog@v1
|
||||||
|
with:
|
||||||
|
spec: cortex.spec
|
||||||
|
version: ${{ steps.version.outputs.VERSION }}
|
||||||
|
|
||||||
- name: Generate source tarball
|
- name: Generate source tarball
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
set -ex
|
||||||
@@ -102,11 +103,13 @@ jobs:
|
|||||||
|
|
||||||
srpm-neuron:
|
srpm-neuron:
|
||||||
name: Build neuron SRPM
|
name: Build neuron SRPM
|
||||||
runs-on: fedora
|
runs-on: rpm
|
||||||
needs: check
|
needs: [fmt, clippy, test]
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Determine version
|
- name: Determine version
|
||||||
id: version
|
id: version
|
||||||
@@ -118,31 +121,37 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
VERSION="${{ steps.version.outputs.VERSION }}"
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
||||||
sed -i "s/^Version:.*/Version: ${VERSION}/" neuron.spec
|
sed -i "s/^Version:.*/Version: ${VERSION}/" helexa-neuron.spec
|
||||||
|
|
||||||
|
- name: Generate changelog entry
|
||||||
|
uses: https://git.lair.cafe/actions/rpm-changelog@v1
|
||||||
|
with:
|
||||||
|
spec: helexa-neuron.spec
|
||||||
|
version: ${{ steps.version.outputs.VERSION }}
|
||||||
|
|
||||||
- name: Generate source tarball
|
- name: Generate source tarball
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
set -ex
|
||||||
VERSION="${{ steps.version.outputs.VERSION }}"
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
tar czf /tmp/neuron-${VERSION}.tar.gz \
|
tar czf /tmp/helexa-neuron-${VERSION}.tar.gz \
|
||||||
--transform "s,^\.,neuron-${VERSION}," \
|
--transform "s,^\.,helexa-neuron-${VERSION}," \
|
||||||
--exclude='./target' \
|
--exclude='./target' \
|
||||||
--exclude='./.git' \
|
--exclude='./.git' \
|
||||||
--exclude='*.tar.gz' \
|
--exclude='*.tar.gz' \
|
||||||
--exclude='*.src.rpm' \
|
--exclude='*.src.rpm' \
|
||||||
.
|
.
|
||||||
mv /tmp/neuron-${VERSION}.tar.gz .
|
mv /tmp/helexa-neuron-${VERSION}.tar.gz .
|
||||||
|
|
||||||
- name: Vendor Rust dependencies
|
- name: Vendor Rust dependencies
|
||||||
run: |
|
run: |
|
||||||
VERSION="${{ steps.version.outputs.VERSION }}"
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
cargo vendor vendor/
|
cargo vendor vendor/
|
||||||
tar czf neuron-${VERSION}-vendor.tar.gz vendor/
|
tar czf helexa-neuron-${VERSION}-vendor.tar.gz vendor/
|
||||||
rm -rf vendor/
|
rm -rf vendor/
|
||||||
|
|
||||||
- name: Build SRPM
|
- name: Build SRPM
|
||||||
run: |
|
run: |
|
||||||
rpmbuild -bs neuron.spec \
|
rpmbuild -bs helexa-neuron.spec \
|
||||||
--define "_sourcedir $(pwd)" \
|
--define "_sourcedir $(pwd)" \
|
||||||
--define "_srcrpmdir $(pwd)"
|
--define "_srcrpmdir $(pwd)"
|
||||||
|
|
||||||
@@ -154,7 +163,7 @@ jobs:
|
|||||||
|
|
||||||
copr-cortex:
|
copr-cortex:
|
||||||
name: Publish cortex to COPR
|
name: Publish cortex to COPR
|
||||||
runs-on: fedora
|
runs-on: fedora-43
|
||||||
needs: srpm-cortex
|
needs: srpm-cortex
|
||||||
steps:
|
steps:
|
||||||
- name: Download SRPM
|
- name: Download SRPM
|
||||||
@@ -162,17 +171,16 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: srpm-cortex
|
name: srpm-cortex
|
||||||
|
|
||||||
- name: Configure copr-cli
|
- name: Publish to COPR
|
||||||
run: |
|
uses: https://git.lair.cafe/actions/copr-publish@v1
|
||||||
mkdir -p ~/.config
|
with:
|
||||||
echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
|
project: helexa/helexa
|
||||||
|
srpm: "*.src.rpm"
|
||||||
- name: Submit build to COPR
|
copr-config: ${{ secrets.COPR_CONFIG }}
|
||||||
run: copr-cli build helexa/cortex *.src.rpm
|
|
||||||
|
|
||||||
copr-neuron:
|
copr-neuron:
|
||||||
name: Publish neuron to COPR
|
name: Publish neuron to COPR
|
||||||
runs-on: fedora
|
runs-on: fedora-43
|
||||||
needs: srpm-neuron
|
needs: srpm-neuron
|
||||||
steps:
|
steps:
|
||||||
- name: Download SRPM
|
- name: Download SRPM
|
||||||
@@ -180,35 +188,56 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
name: srpm-neuron
|
name: srpm-neuron
|
||||||
|
|
||||||
- name: Configure copr-cli
|
- name: Publish to COPR
|
||||||
run: |
|
uses: https://git.lair.cafe/actions/copr-publish@v1
|
||||||
mkdir -p ~/.config
|
with:
|
||||||
echo "${{ secrets.COPR_CONFIG }}" > ~/.config/copr
|
project: helexa/helexa
|
||||||
|
srpm: "*.src.rpm"
|
||||||
- name: Submit build to COPR
|
copr-config: ${{ secrets.COPR_CONFIG }}
|
||||||
run: copr-cli build helexa/neuron *.src.rpm
|
|
||||||
|
|
||||||
bump-version:
|
bump-version:
|
||||||
name: Bump version in source
|
name: Bump version in source
|
||||||
runs-on: fedora
|
runs-on: rust
|
||||||
needs: [copr-cortex, copr-neuron]
|
needs: [copr-cortex, copr-neuron]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Stamp version and push
|
- name: Determine version
|
||||||
|
id: version
|
||||||
|
run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Stamp version
|
||||||
|
run: |
|
||||||
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
|
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
||||||
|
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
|
||||||
|
sed -i "s/^Version:.*/Version: ${VERSION}/" helexa-neuron.spec
|
||||||
|
cargo check --workspace 2>/dev/null || true
|
||||||
|
|
||||||
|
- name: Generate cortex changelog entry
|
||||||
|
uses: https://git.lair.cafe/actions/rpm-changelog@v1
|
||||||
|
with:
|
||||||
|
spec: cortex.spec
|
||||||
|
version: ${{ steps.version.outputs.VERSION }}
|
||||||
|
|
||||||
|
- name: Generate helexa-neuron changelog entry
|
||||||
|
uses: https://git.lair.cafe/actions/rpm-changelog@v1
|
||||||
|
with:
|
||||||
|
spec: helexa-neuron.spec
|
||||||
|
version: ${{ steps.version.outputs.VERSION }}
|
||||||
|
|
||||||
|
- name: Commit and push
|
||||||
env:
|
env:
|
||||||
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
|
GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
VERSION="${GITHUB_REF#refs/tags/v}"
|
VERSION="${{ steps.version.outputs.VERSION }}"
|
||||||
sed -i '/\[workspace\.package\]/,/\[/{ s/^version = ".*"/version = "'"${VERSION}"'"/ }' Cargo.toml
|
|
||||||
sed -i "s/^Version:.*/Version: ${VERSION}/" cortex.spec
|
|
||||||
sed -i "s/^Version:.*/Version: ${VERSION}/" neuron.spec
|
|
||||||
cargo check --workspace 2>/dev/null || true
|
|
||||||
git config user.name "Gitea Actions"
|
git config user.name "Gitea Actions"
|
||||||
git config user.email "actions@git.lair.cafe"
|
git config user.email "actions@git.lair.cafe"
|
||||||
git add Cargo.toml Cargo.lock cortex.spec neuron.spec
|
git add Cargo.toml Cargo.lock cortex.spec helexa-neuron.spec
|
||||||
if git diff --cached --quiet; then
|
if git diff --cached --quiet; then
|
||||||
echo "Version already at ${VERSION}"
|
echo "Nothing to commit for ${VERSION}"
|
||||||
else
|
else
|
||||||
git commit -m "chore: bump version to ${VERSION}"
|
git commit -m "chore: bump version to ${VERSION}"
|
||||||
git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
|
git remote set-url origin "https://gitea-actions:${GITEA_TOKEN}@git.lair.cafe/helexa/cortex.git"
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -5,3 +5,4 @@
|
|||||||
.vscode/
|
.vscode/
|
||||||
cortex.toml
|
cortex.toml
|
||||||
doc/plan/*
|
doc/plan/*
|
||||||
|
script/deploy.sh
|
||||||
|
|||||||
116
CLAUDE.md
116
CLAUDE.md
@@ -125,7 +125,8 @@ automatically. Clippy warnings must be resolved, not suppressed with
|
|||||||
- One or more GPU nodes running mistral.rs on port 8080
|
- One or more GPU nodes running mistral.rs on port 8080
|
||||||
- Optionally a metrics-only node (no GPU) for Prometheus/Grafana
|
- Optionally a metrics-only node (no GPU) for Prometheus/Grafana
|
||||||
- Each node runs `mistralrs serve` on port 8080
|
- Each node runs `mistralrs serve` on port 8080
|
||||||
- Gateway listens on port 8000 (API) and 9100 (metrics)
|
- Gateway listens on port 31313 (API) and 31314 (metrics)
|
||||||
|
- neuron listens on port 13131 on each GPU host
|
||||||
- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
|
- TLS terminated at gateway or via nginx; internal traffic is plaintext over WireGuard
|
||||||
|
|
||||||
## Conventions
|
## Conventions
|
||||||
@@ -380,7 +381,7 @@ processes (one process per loaded model, each on its own port).
|
|||||||
|
|
||||||
## neuron API
|
## neuron API
|
||||||
|
|
||||||
neuron exposes an HTTP API on port 9090 that cortex polls and calls.
|
neuron exposes an HTTP API on port 13131 that cortex polls and calls.
|
||||||
|
|
||||||
```
|
```
|
||||||
GET /discovery
|
GET /discovery
|
||||||
@@ -424,8 +425,8 @@ endpoint. cortex.toml shrinks to:
|
|||||||
|
|
||||||
```toml
|
```toml
|
||||||
[gateway]
|
[gateway]
|
||||||
listen = "0.0.0.0:8000"
|
listen = "0.0.0.0:31313"
|
||||||
metrics_listen = "0.0.0.0:9100"
|
metrics_listen = "0.0.0.0:31314"
|
||||||
|
|
||||||
[eviction]
|
[eviction]
|
||||||
strategy = "lru"
|
strategy = "lru"
|
||||||
@@ -433,15 +434,15 @@ defrag_after_cycles = 50
|
|||||||
|
|
||||||
[[neurons]]
|
[[neurons]]
|
||||||
name = "beast"
|
name = "beast"
|
||||||
endpoint = "http://beast.hanzalova.internal:9090"
|
endpoint = "http://beast.hanzalova.internal:13131"
|
||||||
|
|
||||||
[[neurons]]
|
[[neurons]]
|
||||||
name = "benjy"
|
name = "benjy"
|
||||||
endpoint = "http://benjy.kosherinata.internal:9090"
|
endpoint = "http://benjy.hanzalova.internal:13131"
|
||||||
|
|
||||||
[[neurons]]
|
[[neurons]]
|
||||||
name = "quadbrat"
|
name = "quadbrat"
|
||||||
endpoint = "http://quadbrat.hanzalova.internal:9090"
|
endpoint = "http://quadbrat.hanzalova.internal:13131"
|
||||||
```
|
```
|
||||||
|
|
||||||
On startup and periodically, cortex calls `GET /discovery` and
|
On startup and periodically, cortex calls `GET /discovery` and
|
||||||
@@ -521,7 +522,7 @@ cortex/
|
|||||||
│ │ └── metrics.rs # prometheus exporter (unchanged)
|
│ │ └── metrics.rs # prometheus exporter (unchanged)
|
||||||
│ ├── neuron/ # node plane (replaces cortex-agent)
|
│ ├── neuron/ # node plane (replaces cortex-agent)
|
||||||
│ │ └── src/
|
│ │ └── src/
|
||||||
│ │ ├── main.rs # binary entrypoint, axum server on :9090
|
│ │ ├── main.rs # binary entrypoint, axum server on :13131
|
||||||
│ │ ├── discovery.rs # nvidia-smi, device enumeration
|
│ │ ├── discovery.rs # nvidia-smi, device enumeration
|
||||||
│ │ ├── health.rs # runtime GPU polling
|
│ │ ├── health.rs # runtime GPU polling
|
||||||
│ │ ├── api.rs # HTTP handlers for /discovery, /models, etc.
|
│ │ ├── api.rs # HTTP handlers for /discovery, /models, etc.
|
||||||
@@ -595,70 +596,65 @@ placement matching can be added incrementally.
|
|||||||
Completed. Both packages have RPM specs, systemd units, and example configs.
|
Completed. Both packages have RPM specs, systemd units, and example configs.
|
||||||
CI builds parallel SRPMs on tag push and publishes to separate COPR repos.
|
CI builds parallel SRPMs on tag push and publishes to separate COPR repos.
|
||||||
|
|
||||||
- `cortex.spec` → `helexa/cortex` COPR: binary, systemd unit, config files
|
- `cortex.spec` — installs the `cortex` binary. Package name keeps the
|
||||||
- `neuron.spec` → `helexa/neuron` COPR: binary, systemd unit, config
|
short `cortex` because no Fedora package collides with it.
|
||||||
|
- `helexa-neuron.spec` — installs the `neuron` binary under package name
|
||||||
|
`helexa-neuron`. Renamed from bare `neuron` to avoid collision with
|
||||||
|
Fedora's NEURON neural-simulation package
|
||||||
|
(https://src.fedoraproject.org/rpms/neuron); binary, systemd unit,
|
||||||
|
system user, and config dir all stay named `neuron` since those are
|
||||||
|
project-local contexts.
|
||||||
- `data/cortex.service`, `data/neuron.service` — systemd units
|
- `data/cortex.service`, `data/neuron.service` — systemd units
|
||||||
- `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
|
- `cortex.example.toml`, `neuron.example.toml`, `models.example.toml`
|
||||||
- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR publish
|
- CI: parallel `srpm-cortex` + `srpm-neuron` jobs, then parallel COPR
|
||||||
|
publish to a single project `helexa/helexa` hosting both packages.
|
||||||
|
|
||||||
Install:
|
Install:
|
||||||
```sh
|
```sh
|
||||||
dnf copr enable helexa/cortex && dnf install cortex # gateway host
|
dnf copr enable helexa/helexa
|
||||||
dnf copr enable helexa/neuron && dnf install neuron # GPU nodes
|
dnf install cortex # gateway host
|
||||||
|
dnf install helexa-neuron # GPU nodes
|
||||||
```
|
```
|
||||||
|
|
||||||
### Phase 11: llama.cpp harness stub
|
## 2026-05-18 addendum: candle-native pivot
|
||||||
|
|
||||||
**Goal:** Prove the harness abstraction works with a second engine.
|
Phases 11 (llama.cpp harness) and 12 (mistral.rs COPR) below are
|
||||||
|
**superseded**. The project no longer treats mistral.rs or llama.cpp as
|
||||||
|
dependencies — both are conceptually out of scope. neuron becomes a
|
||||||
|
candle-native inference daemon, with `Harness` retained as an
|
||||||
|
internal seam for adding future engines (vision/audio/diffusion) but
|
||||||
|
its only implementation being in-process candle.
|
||||||
|
|
||||||
**Steps:**
|
The full staged plan for this pivot lives at
|
||||||
1. `crates/neuron/src/harness/llamacpp.rs` — implement the `Harness`
|
`~/.claude/plans/create-a-more-aggressive-calm-naur.md`. Summary:
|
||||||
trait for llama.cpp's `llama-server`.
|
|
||||||
- `start()` — launch `llama-server` with the correct model path,
|
|
||||||
`--port`, `--n-gpu-layers`, `--tensor-split` args. Track the
|
|
||||||
child process.
|
|
||||||
- `stop()` — send SIGTERM to the child process.
|
|
||||||
- `list_models()` — llama-server serves one model per process, so
|
|
||||||
return a single-element list.
|
|
||||||
- `load_model()` — start a new llama-server process for this model.
|
|
||||||
- `unload_model()` — stop the process.
|
|
||||||
- `inference_endpoint()` — return `http://localhost:{assigned_port}`.
|
|
||||||
2. Port allocation: neuron assigns ports from a range (e.g. 8100-8199)
|
|
||||||
to llama-server instances.
|
|
||||||
3. Register in `HarnessRegistry` when configured:
|
|
||||||
```toml
|
|
||||||
[[harnesses]]
|
|
||||||
name = "llamacpp"
|
|
||||||
binary = "/usr/local/bin/llama-server"
|
|
||||||
port_range = [8100, 8199]
|
|
||||||
```
|
|
||||||
4. Tests: mock llama-server (simple HTTP server returning canned
|
|
||||||
responses), test load/unload/endpoint lifecycle.
|
|
||||||
|
|
||||||
**Done when:** A model with `harness = "llamacpp"` in `models.toml` can
|
- **Stage 1 (this commit):** delete `mistralrs.rs` and `llamacpp.rs`,
|
||||||
be loaded and served through cortex. Tests pass with mock llama-server.
|
scaffold inert `CandleHarness`, drop `endpoint`/`systemd_unit` from
|
||||||
|
`HarnessConfig`, default no-op `start`/`stop` on the `Harness` trait.
|
||||||
|
- **Stages 2–4:** wire up candle model load/unload (quantized Qwen3
|
||||||
|
first), add OpenAI-compatible inference endpoint in neuron, then SSE
|
||||||
|
streaming.
|
||||||
|
- **Stages 5–6:** load-on-activation (default models in config) and
|
||||||
|
unload-on-deactivation (graceful shutdown).
|
||||||
|
- **Stages 7–8:** multi-GPU tensor parallelism and broader model/quant
|
||||||
|
coverage.
|
||||||
|
|
||||||
### Phase 12 (lower priority): mistral.rs COPR packaging
|
Sections of this document that describe mistral.rs HTTP behaviour
|
||||||
|
("mistral.rs API gotchas") are retained as historical context for
|
||||||
|
Phases 1–10 — they document what was true while the project depended
|
||||||
|
on mistral.rs. They do not describe current behaviour.
|
||||||
|
|
||||||
**Goal:** Fedora RPMs for mistral.rs built against specific CUDA versions.
|
---
|
||||||
|
|
||||||
**Steps:**
|
### Phase 11 (superseded): llama.cpp harness stub
|
||||||
1. `mistralrs-cuda.spec` — RPM spec that clones a pinned mistral.rs git
|
|
||||||
tag, builds with `--features cuda`, links against the system CUDA
|
|
||||||
toolkit. Produces `mistralrs-cuda13-server` (CUDA 13.x / sm_120) and
|
|
||||||
`mistralrs-cuda12-server` (CUDA 12.x / sm_89). Install binary to
|
|
||||||
`/usr/local/bin/mistralrs`.
|
|
||||||
2. COPR build config: enable the NVIDIA CUDA repo as a build dependency.
|
|
||||||
Pin the CUDA toolkit version in `BuildRequires`.
|
|
||||||
3. Gitea Actions or manual workflow: bump the mistral.rs tag in the spec,
|
|
||||||
trigger COPR rebuild.
|
|
||||||
4. neuron's mistralrs harness config references which binary/package
|
|
||||||
provides the mistral.rs binary. neuron could warn at startup if the
|
|
||||||
installed mistral.rs CUDA version doesn't match the discovered driver.
|
|
||||||
|
|
||||||
**Done when:** `dnf install mistralrs-cuda13-server` on beast provides a
|
~~Originally planned as a second engine to prove the harness
|
||||||
working `mistralrs` binary built for Blackwell GPUs. `dnf install
|
abstraction.~~ Replaced by the candle harness work in the 2026-05-18
|
||||||
mistralrs-cuda12-server` on benjy provides one built for Ada GPUs.
|
addendum above. llama.cpp's any-model/any-hardware breadth is no
|
||||||
|
longer in scope for helexa.
|
||||||
|
|
||||||
This is a separate repo/spec — not part of the cortex workspace — but
|
### Phase 12 (superseded): mistral.rs COPR packaging
|
||||||
tightly coupled operationally. Track it as a sibling project.
|
|
||||||
|
~~Originally planned to ship CUDA-versioned mistral.rs RPMs.~~ Replaced
|
||||||
|
by the candle harness work in the 2026-05-18 addendum above. With
|
||||||
|
mistral.rs out of the dependency tree, there is nothing to package.
|
||||||
|
|||||||
1609
Cargo.lock
generated
1609
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ members = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "0.1.0"
|
version = "0.1.16"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
repository = "https://git.lair.cafe/helexa/cortex"
|
repository = "https://git.lair.cafe/helexa/cortex"
|
||||||
@@ -27,7 +27,7 @@ serde = { version = "1", features = ["derive"] }
|
|||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
toml = "0.8"
|
toml = "0.8"
|
||||||
|
|
||||||
# http client (for proxying to mistralrs backends)
|
# http client (for proxying to neuron backends)
|
||||||
reqwest = { version = "0.12", features = ["json", "stream"] }
|
reqwest = { version = "0.12", features = ["json", "stream"] }
|
||||||
|
|
||||||
# observability
|
# observability
|
||||||
|
|||||||
105
README.md
105
README.md
@@ -1,22 +1,23 @@
|
|||||||
# cortex
|
# cortex
|
||||||
|
|
||||||
A Rust reverse-proxy and fleet management layer for multi-node
|
A Rust reverse-proxy and fleet management layer for multi-node GPU inference
|
||||||
[mistral.rs](https://github.com/EricLBuehler/mistral.rs) inference clusters.
|
clusters. Cortex sits in front of one or more `neuron` daemons (each running
|
||||||
|
candle-based inference on a local GPU host) and presents a unified OpenAI +
|
||||||
|
Anthropic compatible API surface.
|
||||||
|
|
||||||
## Problem
|
## Problem
|
||||||
|
|
||||||
Running local LLMs across multiple GPU nodes (different VRAM tiers, different
|
Running local LLMs across multiple GPU nodes (different VRAM tiers, different
|
||||||
model affinities) requires a unified API surface that:
|
model affinities) requires a unified API surface that:
|
||||||
|
|
||||||
- Presents a **single `/v1/models` catalogue** merging every model across every
|
- Presents a **single `/v1/models` catalogue** merging every model that can be
|
||||||
node.
|
served by any neuron in the fleet.
|
||||||
- **Routes requests** to the correct node based on where a model is loaded (or
|
- **Routes requests** to the correct node based on where a model is loaded
|
||||||
*can* be loaded).
|
(or can be loaded), handling cold-load and eviction transparently.
|
||||||
- Manages **model lifecycle** — unload cold models, reload on demand, pin
|
- Manages **model lifecycle** — load on demand, unload cold models, pin
|
||||||
critical ones — using the mistral.rs
|
critical ones — by calling each neuron's `/models/{load,unload}` API.
|
||||||
`/v1/models/{unload,reload,status}` HTTP API (PR #1828+).
|
|
||||||
- Translates between **OpenAI and Anthropic** request/response envelopes so
|
- Translates between **OpenAI and Anthropic** request/response envelopes so
|
||||||
every client in the homelab speaks whichever dialect it prefers.
|
every client speaks whichever dialect it prefers.
|
||||||
- Captures **per-request metrics** (tokens, tok/s, TTFT, latency) and exposes
|
- Captures **per-request metrics** (tokens, tok/s, TTFT, latency) and exposes
|
||||||
them as Prometheus counters/histograms.
|
them as Prometheus counters/histograms.
|
||||||
|
|
||||||
@@ -30,18 +31,17 @@ model affinities) requires a unified API surface that:
|
|||||||
└────────────────┴──────┬───────┴───────────────┘
|
└────────────────┴──────┬───────┴───────────────┘
|
||||||
│
|
│
|
||||||
┌──────────▼──────────┐
|
┌──────────▼──────────┐
|
||||||
│ cortex │
|
│ cortex │
|
||||||
│ (cortex-gateway) │
|
│ (cortex-gateway) │
|
||||||
│ │
|
│ │
|
||||||
│ Router · Metrics │
|
│ Router · Metrics │
|
||||||
│ Evictor · Translate│
|
│ Evictor · Translate│
|
||||||
└──┬──────┬────────┬──┘
|
└──┬──────┬────────┬──┘
|
||||||
│ │ │
|
│ │ │
|
||||||
┌──────────▼┐ ┌──▼─────┐ ┌▼──────────┐
|
┌──────────▼┐ ┌──▼─────┐ ┌▼──────────┐
|
||||||
│ gpu-large │ │gpu-med │ │ gpu-small │
|
│ neuron │ │ neuron │ │ neuron │
|
||||||
│ mistralrs │ │mistral │ │ mistralrs │
|
│ :13131 │ │ :13131 │ │ :13131 │
|
||||||
│ serve │ │rs serve│ │ serve │
|
│ candle │ │ candle │ │ candle │
|
||||||
│ :8080 │ │ :8080 │ │ :8080 │
|
|
||||||
└───────────┘ └────────┘ └───────────┘
|
└───────────┘ └────────┘ └───────────┘
|
||||||
private network (.internal)
|
private network (.internal)
|
||||||
```
|
```
|
||||||
@@ -50,70 +50,48 @@ model affinities) requires a unified API surface that:
|
|||||||
|
|
||||||
| Crate | Purpose |
|
| Crate | Purpose |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic request/response envelopes |
|
| `cortex-core` | Shared types: config, node/model state, metrics, OpenAI/Anthropic envelopes, harness trait, discovery types |
|
||||||
| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, metrics exporter |
|
| `cortex-gateway` | Axum HTTP server: proxy, router, evictor, poller, metrics exporter |
|
||||||
| `cortex-agent` | Per-node sidecar: polls local mistralrs, reports to gateway, handles restart/defrag |
|
| `neuron` | Per-node daemon: GPU discovery, in-process candle inference, model lifecycle API |
|
||||||
| `cortex-cli` | CLI entrypoint (`cortex serve`, `cortex status`, etc.) |
|
| `cortex-cli` | CLI entrypoint (`cortex serve`, `cortex status`, etc.) |
|
||||||
|
|
||||||
## Node setup
|
## Node setup
|
||||||
|
|
||||||
Each GPU node runs `mistralrs serve` with a multi-model config. Models are
|
Each GPU node runs `neuron` (listening on `:13131`). Neuron uses
|
||||||
declared but start **unloaded** — mistral.rs lazy-loads on first request and
|
huggingface/candle for in-process inference — there is no external
|
||||||
the gateway can explicitly unload/reload via the HTTP API.
|
inference subprocess to manage.
|
||||||
|
|
||||||
Example node systemd unit:
|
The neuron RPM (`helexa-neuron`) ships a systemd unit:
|
||||||
|
|
||||||
```ini
|
```sh
|
||||||
# /etc/systemd/system/mistralrs.service
|
dnf copr enable helexa/helexa
|
||||||
[Unit]
|
dnf install helexa-neuron
|
||||||
Description=mistral.rs inference server
|
systemctl enable --now neuron
|
||||||
After=network-online.target
|
|
||||||
Wants=network-online.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
ExecStart=/usr/local/bin/mistralrs serve \
|
|
||||||
--from-config /etc/mistralrs/config.toml \
|
|
||||||
--port 8080
|
|
||||||
Restart=on-failure
|
|
||||||
RestartSec=5
|
|
||||||
Environment=CUDA_VISIBLE_DEVICES=0,1
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Gateway config
|
## Gateway config
|
||||||
|
|
||||||
```toml
|
```toml
|
||||||
# cortex.toml
|
# /etc/cortex/cortex.toml
|
||||||
[gateway]
|
[gateway]
|
||||||
listen = "0.0.0.0:8000"
|
listen = "0.0.0.0:31313"
|
||||||
metrics_listen = "0.0.0.0:9100"
|
metrics_listen = "0.0.0.0:31314"
|
||||||
|
|
||||||
[eviction]
|
[eviction]
|
||||||
strategy = "lru" # lru | priority
|
strategy = "lru" # lru | priority
|
||||||
defrag_after_cycles = 50
|
defrag_after_cycles = 50
|
||||||
|
|
||||||
[[nodes]]
|
[[neurons]]
|
||||||
name = "gpu-large"
|
name = "beast"
|
||||||
endpoint = "http://gpu-large.internal:8080"
|
endpoint = "http://beast.internal:13131"
|
||||||
vram_mb = 49_152 # e.g. 2x RTX 4090
|
|
||||||
pinned = ["your-org/large-model"]
|
|
||||||
|
|
||||||
[[nodes]]
|
[[neurons]]
|
||||||
name = "gpu-medium"
|
name = "benjy"
|
||||||
endpoint = "http://gpu-medium.internal:8080"
|
endpoint = "http://benjy.internal:13131"
|
||||||
vram_mb = 24_576 # e.g. RTX 4090
|
|
||||||
pinned = ["your-org/medium-model"]
|
|
||||||
|
|
||||||
[[nodes]]
|
|
||||||
name = "gpu-small"
|
|
||||||
endpoint = "http://gpu-small.internal:8080"
|
|
||||||
vram_mb = 12_288 # e.g. RTX 3060
|
|
||||||
pinned = ["your-org/embedding-model"]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Model placement profiles live in `models.toml` — see `models.example.toml`.
|
||||||
|
|
||||||
## Building
|
## Building
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
@@ -131,19 +109,20 @@ cargo clippy --workspace -- -D warnings # warnings are errors
|
|||||||
cargo test --workspace # all tests must pass
|
cargo test --workspace # all tests must pass
|
||||||
```
|
```
|
||||||
|
|
||||||
Tagged releases (`v*`) additionally build an SRPM and publish to COPR.
|
Tagged releases (`v*`) additionally build SRPMs for both `cortex` and
|
||||||
|
`helexa-neuron` and publish to COPR.
|
||||||
|
|
||||||
## Running
|
## Running
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# start the gateway
|
# start the gateway
|
||||||
cortex serve --config cortex.toml
|
cortex serve --config /etc/cortex/cortex.toml
|
||||||
|
|
||||||
# check fleet status
|
# check fleet status
|
||||||
cortex status
|
cortex status
|
||||||
|
|
||||||
# list all models across nodes
|
# list all models across nodes
|
||||||
curl http://localhost:8000/v1/models
|
curl http://localhost:31313/v1/models
|
||||||
```
|
```
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|||||||
@@ -3,22 +3,22 @@
|
|||||||
# Copy to cortex.toml and adjust for your environment.
|
# Copy to cortex.toml and adjust for your environment.
|
||||||
#
|
#
|
||||||
# Environment variable overrides use CORTEX_ prefix with __ separators:
|
# Environment variable overrides use CORTEX_ prefix with __ separators:
|
||||||
# CORTEX_GATEWAY__LISTEN=0.0.0.0:9000
|
# CORTEX_GATEWAY__LISTEN=0.0.0.0:31313
|
||||||
|
|
||||||
[gateway]
|
[gateway]
|
||||||
listen = "0.0.0.0:8000"
|
listen = "0.0.0.0:31313"
|
||||||
metrics_listen = "0.0.0.0:9100"
|
metrics_listen = "0.0.0.0:31314"
|
||||||
|
|
||||||
[eviction]
|
[eviction]
|
||||||
strategy = "lru"
|
strategy = "lru"
|
||||||
# Restart mistralrs after this many load/unload cycles to defragment VRAM.
|
# Restart neurons after this many load/unload cycles to defragment VRAM.
|
||||||
# Set to 0 to disable.
|
# Set to 0 to disable.
|
||||||
defrag_after_cycles = 50
|
defrag_after_cycles = 50
|
||||||
|
|
||||||
# -- Nodes ---------------------------------------------------------------
|
# -- Nodes ---------------------------------------------------------------
|
||||||
# Each [[nodes]] entry declares a mistral.rs instance in the fleet.
|
# Each [[nodes]] entry declares a neuron daemon in the fleet.
|
||||||
# Models are discovered by polling the node's /v1/models endpoint.
|
# Models are discovered by polling the neuron's /models endpoint.
|
||||||
# Pinned models are never evicted.
|
# Pinned models (see models.toml) are never evicted.
|
||||||
|
|
||||||
[[nodes]]
|
[[nodes]]
|
||||||
name = "gpu-large"
|
name = "gpu-large"
|
||||||
|
|||||||
48
cortex.spec
48
cortex.spec
@@ -1,5 +1,5 @@
|
|||||||
Name: cortex
|
Name: cortex
|
||||||
Version: 0.1.0
|
Version: 0.1.16
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Inference gateway for multi-node GPU clusters
|
Summary: Inference gateway for multi-node GPU clusters
|
||||||
|
|
||||||
@@ -13,9 +13,24 @@ ExclusiveArch: x86_64
|
|||||||
BuildRequires: rust >= 1.85
|
BuildRequires: rust >= 1.85
|
||||||
BuildRequires: cargo
|
BuildRequires: cargo
|
||||||
BuildRequires: gcc
|
BuildRequires: gcc
|
||||||
|
BuildRequires: gcc-c++
|
||||||
|
BuildRequires: cmake
|
||||||
|
BuildRequires: perl-interpreter
|
||||||
|
BuildRequires: pkgconfig(openssl)
|
||||||
BuildRequires: systemd-rpm-macros
|
BuildRequires: systemd-rpm-macros
|
||||||
|
|
||||||
Requires(pre): shadow-utils
|
Requires(pre): shadow-utils
|
||||||
|
Requires: systemd
|
||||||
|
Requires: firewalld-filesystem
|
||||||
|
|
||||||
|
# systemd-rpm-macros ships a unit dep generator that parses User=/Group=
|
||||||
|
# from our .service file and emits Requires: user(cortex)/group(cortex).
|
||||||
|
# rpm's sysusers provides-generator emits the unversioned form for groups
|
||||||
|
# but only a versioned user(cortex) = <base64> for users with GECOS/home/
|
||||||
|
# shell. Provide the unversioned user(cortex) explicitly so dnf can resolve
|
||||||
|
# the auto-generated Requires. Without this, dnf5 silently filters the
|
||||||
|
# package and reports "Nothing to do".
|
||||||
|
Provides: user(cortex)
|
||||||
|
|
||||||
%description
|
%description
|
||||||
Cortex is a Rust reverse-proxy that sits in front of multiple inference
|
Cortex is a Rust reverse-proxy that sits in front of multiple inference
|
||||||
@@ -41,13 +56,14 @@ cargo build --release -p cortex-cli
|
|||||||
%install
|
%install
|
||||||
install -Dm755 target/release/cortex %{buildroot}%{_bindir}/cortex
|
install -Dm755 target/release/cortex %{buildroot}%{_bindir}/cortex
|
||||||
install -Dm644 data/cortex.service %{buildroot}%{_unitdir}/cortex.service
|
install -Dm644 data/cortex.service %{buildroot}%{_unitdir}/cortex.service
|
||||||
install -dm750 %{buildroot}%{_sysconfdir}/cortex
|
install -Dm644 data/cortex-sysusers.conf %{buildroot}%{_sysusersdir}/cortex.conf
|
||||||
install -Dm640 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
|
install -Dm644 data/cortex-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/cortex.xml
|
||||||
install -Dm640 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
|
install -dm755 %{buildroot}%{_sysconfdir}/cortex
|
||||||
|
install -Dm644 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
|
||||||
|
install -Dm644 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
|
||||||
|
|
||||||
%pre
|
%pre
|
||||||
getent group cortex >/dev/null || groupadd -r cortex
|
%sysusers_create_compat %{_builddir}/%{name}-%{version}/data/cortex-sysusers.conf
|
||||||
getent passwd cortex >/dev/null || useradd -r -g cortex -d /var/lib/cortex -s /sbin/nologin cortex
|
|
||||||
|
|
||||||
%post
|
%post
|
||||||
%systemd_post cortex.service
|
%systemd_post cortex.service
|
||||||
@@ -63,10 +79,22 @@ getent passwd cortex >/dev/null || useradd -r -g cortex -d /var/lib/cortex -s /s
|
|||||||
%doc README.md
|
%doc README.md
|
||||||
%{_bindir}/cortex
|
%{_bindir}/cortex
|
||||||
%{_unitdir}/cortex.service
|
%{_unitdir}/cortex.service
|
||||||
%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
|
%{_sysusersdir}/cortex.conf
|
||||||
%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/cortex.toml
|
%{_prefix}/lib/firewalld/services/cortex.xml
|
||||||
%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/models.toml
|
%dir %{_sysconfdir}/cortex
|
||||||
|
%config(noreplace) %{_sysconfdir}/cortex/cortex.toml
|
||||||
|
%config(noreplace) %{_sysconfdir}/cortex/models.toml
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
* Tue Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
|
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.16-1
|
||||||
|
- chore: ignore local deploy script
|
||||||
|
- chore: move default ports out of common-collision ranges
|
||||||
|
- ci: drop actions/cache for cargo registry and target
|
||||||
|
|
||||||
|
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.14-1
|
||||||
|
- ci: publish both packages to a single helexa/helexa COPR project
|
||||||
|
- fix(rpm): rename neuron package to helexa-neuron
|
||||||
|
- ci: commit generated %changelog entries back to main
|
||||||
|
|
||||||
|
* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
|
||||||
- Initial package
|
- Initial package
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use tracing_subscriber::EnvFilter;
|
|||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
#[command(name = "cortex")]
|
#[command(name = "cortex")]
|
||||||
#[command(about = "Unified inference gateway for multi-node mistral.rs clusters")]
|
#[command(about = "Unified inference gateway for multi-node GPU clusters")]
|
||||||
#[command(version)]
|
#[command(version)]
|
||||||
struct Cli {
|
struct Cli {
|
||||||
#[command(subcommand)]
|
#[command(subcommand)]
|
||||||
@@ -23,7 +23,7 @@ enum Commands {
|
|||||||
/// Print the fleet status (models, nodes, health).
|
/// Print the fleet status (models, nodes, health).
|
||||||
Status {
|
Status {
|
||||||
/// Gateway API endpoint to query.
|
/// Gateway API endpoint to query.
|
||||||
#[arg(short, long, default_value = "http://localhost:8000")]
|
#[arg(short, long, default_value = "http://localhost:31313")]
|
||||||
endpoint: String,
|
endpoint: String,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
//!
|
//!
|
||||||
//! These mirror the `/v1/messages` format used by the Anthropic API.
|
//! These mirror the `/v1/messages` format used by the Anthropic API.
|
||||||
//! The gateway accepts these, translates to OpenAI format, proxies to
|
//! The gateway accepts these, translates to OpenAI format, proxies to
|
||||||
//! mistral.rs, then translates the response back.
|
//! the inference backend (neuron), then translates the response back.
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|||||||
@@ -22,9 +22,9 @@ fn default_models_path() -> String {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct GatewaySettings {
|
pub struct GatewaySettings {
|
||||||
/// Address to listen on for API requests (e.g. "0.0.0.0:8000")
|
/// Address to listen on for API requests (e.g. "0.0.0.0:31313")
|
||||||
pub listen: String,
|
pub listen: String,
|
||||||
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:9100")
|
/// Address to listen on for Prometheus metrics (e.g. "0.0.0.0:31314")
|
||||||
pub metrics_listen: String,
|
pub metrics_listen: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ pub enum EvictionStrategy {
|
|||||||
pub struct NeuronEndpoint {
|
pub struct NeuronEndpoint {
|
||||||
/// Human-readable node name (e.g. "beast")
|
/// Human-readable node name (e.g. "beast")
|
||||||
pub name: String,
|
pub name: String,
|
||||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090")
|
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131")
|
||||||
pub endpoint: String,
|
pub endpoint: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,8 +70,8 @@ impl Default for GatewayConfig {
|
|||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
gateway: GatewaySettings {
|
gateway: GatewaySettings {
|
||||||
listen: "0.0.0.0:8000".into(),
|
listen: "0.0.0.0:31313".into(),
|
||||||
metrics_listen: "0.0.0.0:9100".into(),
|
metrics_listen: "0.0.0.0:31314".into(),
|
||||||
},
|
},
|
||||||
eviction: EvictionSettings {
|
eviction: EvictionSettings {
|
||||||
strategy: EvictionStrategy::Lru,
|
strategy: EvictionStrategy::Lru,
|
||||||
|
|||||||
@@ -9,13 +9,13 @@ use async_trait::async_trait;
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
/// Configuration for a harness instance on a neuron.
|
/// Configuration for a harness instance on a neuron.
|
||||||
|
///
|
||||||
|
/// All current harnesses are in-process (candle); per-harness tuning
|
||||||
|
/// (cache paths, device policies, etc.) lives in dedicated config
|
||||||
|
/// blocks rather than on this struct.
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct HarnessConfig {
|
pub struct HarnessConfig {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
/// Base URL of the harness (e.g. "http://localhost:8080" for mistral.rs).
|
|
||||||
pub endpoint: Option<String>,
|
|
||||||
/// Systemd unit name, if the harness is managed via systemd.
|
|
||||||
pub systemd_unit: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Health status of a harness process.
|
/// Health status of a harness process.
|
||||||
@@ -47,16 +47,24 @@ pub struct ModelInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// What an inference harness must do, from neuron's perspective.
|
/// What an inference harness must do, from neuron's perspective.
|
||||||
|
///
|
||||||
|
/// All current harnesses are in-process — they share neuron's address
|
||||||
|
/// space and lifecycle. `start`/`stop` therefore default to no-ops; a
|
||||||
|
/// future process-supervising harness would override them.
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait Harness: Send + Sync {
|
pub trait Harness: Send + Sync {
|
||||||
/// Human-readable name (e.g. "mistralrs", "llamacpp", "comfyui").
|
/// Human-readable name (e.g. "candle").
|
||||||
fn name(&self) -> &str;
|
fn name(&self) -> &str;
|
||||||
|
|
||||||
/// Start the harness process if it is not already running.
|
/// Start the harness. Default no-op for in-process harnesses.
|
||||||
async fn start(&self, config: &HarnessConfig) -> Result<()>;
|
async fn start(&self, _config: &HarnessConfig) -> Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Stop the harness process gracefully.
|
/// Stop the harness. Default no-op for in-process harnesses.
|
||||||
async fn stop(&self) -> Result<()>;
|
async fn stop(&self) -> Result<()> {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Health check. Returns the harness process status.
|
/// Health check. Returns the harness process status.
|
||||||
async fn health(&self) -> HarnessHealth;
|
async fn health(&self) -> HarnessHealth;
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use std::collections::HashMap;
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct NodeState {
|
pub struct NodeState {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
/// Base URL of the neuron daemon (e.g. "http://beast.internal:9090").
|
/// Base URL of the neuron daemon (e.g. "http://beast.internal:13131").
|
||||||
pub endpoint: String,
|
pub endpoint: String,
|
||||||
pub healthy: bool,
|
pub healthy: bool,
|
||||||
pub models: HashMap<String, ModelEntry>,
|
pub models: HashMap<String, ModelEntry>,
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
//! These are a subset sufficient for chat completions (streaming + non-streaming).
|
//! These are a subset sufficient for chat completions (streaming + non-streaming).
|
||||||
//! Fields not relevant to proxying are captured as `serde_json::Value` via
|
//! Fields not relevant to proxying are captured as `serde_json::Value` via
|
||||||
//! `#[serde(flatten)]` so we forward them without needing to enumerate every
|
//! `#[serde(flatten)]` so we forward them without needing to enumerate every
|
||||||
//! extension field mistral.rs supports.
|
//! extension field a backend might support.
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
@@ -22,7 +22,7 @@ pub struct ChatCompletionRequest {
|
|||||||
pub max_tokens: Option<u64>,
|
pub max_tokens: Option<u64>,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub stream: Option<bool>,
|
pub stream: Option<bool>,
|
||||||
/// All other fields (tools, response_format, mistral.rs extensions, etc.)
|
/// All other fields (tools, response_format, backend extensions, etc.)
|
||||||
#[serde(flatten)]
|
#[serde(flatten)]
|
||||||
pub extra: Value,
|
pub extra: Value,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
//! Streaming HTTP reverse proxy to mistral.rs backends.
|
//! Streaming HTTP reverse proxy to neuron backends.
|
||||||
//!
|
//!
|
||||||
//! For streaming requests, SSE chunks are forwarded as they arrive.
|
//! For streaming requests, SSE chunks are forwarded as they arrive.
|
||||||
//! The proxy captures timing information for metrics but does not
|
//! The proxy captures timing information for metrics but does not
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ use tokio::net::TcpListener;
|
|||||||
/// - GET /models/:id/endpoint (returns the inference URL)
|
/// - GET /models/:id/endpoint (returns the inference URL)
|
||||||
/// - POST /models/unload (accepts unload requests)
|
/// - POST /models/unload (accepts unload requests)
|
||||||
/// - GET /v1/chat/completions + POST /v1/chat/completions (inference)
|
/// - GET /v1/chat/completions + POST /v1/chat/completions (inference)
|
||||||
|
///
|
||||||
/// Returns the neuron base URL.
|
/// Returns the neuron base URL.
|
||||||
pub async fn spawn_mock_neuron() -> String {
|
pub async fn spawn_mock_neuron() -> String {
|
||||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
@@ -54,7 +55,7 @@ pub async fn spawn_mock_neuron() -> String {
|
|||||||
|
|
||||||
async fn mock_neuron_list_models() -> Json<Value> {
|
async fn mock_neuron_list_models() -> Json<Value> {
|
||||||
Json(json!([
|
Json(json!([
|
||||||
{"id": "test-model", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
|
{"id": "test-model", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000}
|
||||||
]))
|
]))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ use std::sync::Arc;
|
|||||||
async fn test_poller_discovers_models() {
|
async fn test_poller_discovers_models() {
|
||||||
// Mock neuron reports 2 models via /models endpoint (neuron format).
|
// Mock neuron reports 2 models via /models endpoint (neuron format).
|
||||||
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
||||||
{"id": "model-a", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
|
{"id": "model-a", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": 8000},
|
||||||
{"id": "model-b", "harness": "mistralrs", "status": "unloaded", "devices": [], "vram_used_mb": null}
|
{"id": "model-b", "harness": "candle", "status": "unloaded", "devices": [], "vram_used_mb": null}
|
||||||
]))
|
]))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -63,8 +63,8 @@ async fn test_poller_discovers_models() {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_poller_updates_gateway_models_endpoint() {
|
async fn test_poller_updates_gateway_models_endpoint() {
|
||||||
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
||||||
{"id": "model-x", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
|
{"id": "model-x", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
|
||||||
{"id": "model-y", "harness": "mistralrs", "status": "loaded", "devices": [1], "vram_used_mb": null}
|
{"id": "model-y", "harness": "candle", "status": "loaded", "devices": [1], "vram_used_mb": null}
|
||||||
]))
|
]))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -152,8 +152,8 @@ async fn test_poller_marks_unreachable_node_unhealthy() {
|
|||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_poller_removes_stale_models() {
|
async fn test_poller_removes_stale_models() {
|
||||||
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
let mock_url = common::spawn_mock_neuron_with_models(json!([
|
||||||
{"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null},
|
{"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null},
|
||||||
{"id": "drop-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
|
{"id": "drop-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
|
||||||
]))
|
]))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -183,7 +183,7 @@ async fn test_poller_removes_stale_models() {
|
|||||||
|
|
||||||
// New mock with only one model.
|
// New mock with only one model.
|
||||||
let new_mock_url = common::spawn_mock_neuron_with_models(json!([
|
let new_mock_url = common::spawn_mock_neuron_with_models(json!([
|
||||||
{"id": "keep-me", "harness": "mistralrs", "status": "loaded", "devices": [0], "vram_used_mb": null}
|
{"id": "keep-me", "harness": "candle", "status": "loaded", "devices": [0], "vram_used_mb": null}
|
||||||
]))
|
]))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
|||||||
@@ -51,18 +51,18 @@ async fn test_streaming_sse_passthrough() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
assert!(
|
assert!(
|
||||||
chunks.len() >= chunk_count + 1,
|
chunks.len() > chunk_count,
|
||||||
"expected at least {} chunks (got {}): {:?}",
|
"expected more than {} chunks (got {}): {:?}",
|
||||||
chunk_count + 1,
|
chunk_count,
|
||||||
chunks.len(),
|
chunks.len(),
|
||||||
chunks,
|
chunks,
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(chunks.last().unwrap(), "[DONE]");
|
assert_eq!(chunks.last().unwrap(), "[DONE]");
|
||||||
|
|
||||||
for i in 0..chunk_count {
|
for (i, chunk) in chunks.iter().enumerate().take(chunk_count) {
|
||||||
let chunk_json: serde_json::Value =
|
let chunk_json: serde_json::Value =
|
||||||
serde_json::from_str(&chunks[i]).expect("chunk should be valid JSON");
|
serde_json::from_str(chunk).expect("chunk should be valid JSON");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
chunk_json["choices"][0]["delta"]["content"],
|
chunk_json["choices"][0]["delta"]["content"],
|
||||||
format!("token{i}")
|
format!("token{i}")
|
||||||
|
|||||||
@@ -12,6 +12,30 @@ path = "src/lib.rs"
|
|||||||
name = "neuron"
|
name = "neuron"
|
||||||
path = "src/main.rs"
|
path = "src/main.rs"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = []
|
||||||
|
# Enables CUDA acceleration in candle. Without this feature, candle
|
||||||
|
# compiles for CPU only and Device::new_cuda calls fall back to CPU.
|
||||||
|
cuda = [
|
||||||
|
"candle-core/cuda",
|
||||||
|
"candle-nn/cuda",
|
||||||
|
"candle-transformers/cuda",
|
||||||
|
]
|
||||||
|
# Use cuDNN for convolution / attention kernels. Requires CUDA.
|
||||||
|
cudnn = [
|
||||||
|
"cuda",
|
||||||
|
"candle-core/cudnn",
|
||||||
|
"candle-nn/cudnn",
|
||||||
|
"candle-transformers/cudnn",
|
||||||
|
]
|
||||||
|
# FlashAttention kernels. Requires CUDA.
|
||||||
|
flash-attn = [
|
||||||
|
"cuda",
|
||||||
|
"candle-transformers/flash-attn",
|
||||||
|
]
|
||||||
|
# Reserved for GPU-only integration tests in later stages.
|
||||||
|
cuda-integration = ["cuda"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cortex-core.workspace = true
|
cortex-core.workspace = true
|
||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
@@ -24,9 +48,19 @@ tracing-subscriber.workspace = true
|
|||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
|
thiserror.workspace = true
|
||||||
figment.workspace = true
|
figment.workspace = true
|
||||||
toml.workspace = true
|
toml.workspace = true
|
||||||
|
|
||||||
|
# candle for in-process inference. CUDA support is gated behind the
|
||||||
|
# crate's `cuda` feature (default off) so the workspace builds on
|
||||||
|
# non-CUDA hosts and CI runners.
|
||||||
|
candle-core = "0.10.2"
|
||||||
|
candle-nn = "0.10.2"
|
||||||
|
candle-transformers = "0.10.2"
|
||||||
|
tokenizers = { version = "0.22", default-features = false, features = ["onig"] }
|
||||||
|
hf-hub = { version = "0.4", features = ["tokio"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tokio = { workspace = true, features = ["test-util"] }
|
tokio = { workspace = true, features = ["test-util"] }
|
||||||
reqwest.workspace = true
|
reqwest.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
//! HTTP API handlers for the neuron daemon.
|
//! HTTP API handlers for the neuron daemon.
|
||||||
|
|
||||||
use crate::harness::HarnessRegistry;
|
use crate::harness::HarnessRegistry;
|
||||||
|
use crate::harness::candle::{CandleHarness, InferenceError};
|
||||||
use crate::health::HealthCache;
|
use crate::health::HealthCache;
|
||||||
use axum::Router;
|
use axum::Router;
|
||||||
use axum::extract::{Path, State};
|
use axum::extract::{Path, State};
|
||||||
@@ -9,6 +10,7 @@ use axum::response::{IntoResponse, Json};
|
|||||||
use axum::routing::{get, post};
|
use axum::routing::{get, post};
|
||||||
use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
|
use cortex_core::discovery::{DiscoveryResponse, HealthResponse};
|
||||||
use cortex_core::harness::ModelSpec;
|
use cortex_core::harness::ModelSpec;
|
||||||
|
use cortex_core::openai::ChatCompletionRequest;
|
||||||
use serde_json::{Value, json};
|
use serde_json::{Value, json};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
@@ -18,6 +20,10 @@ pub struct NeuronState {
|
|||||||
pub discovery: DiscoveryResponse,
|
pub discovery: DiscoveryResponse,
|
||||||
pub health_cache: Arc<HealthCache>,
|
pub health_cache: Arc<HealthCache>,
|
||||||
pub registry: RwLock<HarnessRegistry>,
|
pub registry: RwLock<HarnessRegistry>,
|
||||||
|
/// Typed handle to the candle harness for inference routes. Cached at
|
||||||
|
/// startup so `/v1/chat/completions` doesn't have to hold the registry
|
||||||
|
/// read lock or perform dyn-Trait dispatch per request.
|
||||||
|
pub candle: Option<Arc<CandleHarness>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build the neuron API router.
|
/// Build the neuron API router.
|
||||||
@@ -29,6 +35,7 @@ pub fn neuron_routes() -> Router<Arc<NeuronState>> {
|
|||||||
.route("/models/load", post(load_model))
|
.route("/models/load", post(load_model))
|
||||||
.route("/models/unload", post(unload_model))
|
.route("/models/unload", post(unload_model))
|
||||||
.route("/models/{model_id}/endpoint", get(model_endpoint))
|
.route("/models/{model_id}/endpoint", get(model_endpoint))
|
||||||
|
.route("/v1/chat/completions", post(chat_completions))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn discovery_handler(State(state): State<Arc<NeuronState>>) -> Json<DiscoveryResponse> {
|
async fn discovery_handler(State(state): State<Arc<NeuronState>>) -> Json<DiscoveryResponse> {
|
||||||
@@ -102,3 +109,40 @@ async fn model_endpoint(
|
|||||||
.into_response(),
|
.into_response(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// OpenAI-compatible chat completions. Non-streaming for Stage 3; the
|
||||||
|
/// streaming path is added in Stage 4.
|
||||||
|
async fn chat_completions(
|
||||||
|
State(state): State<Arc<NeuronState>>,
|
||||||
|
Json(req): Json<ChatCompletionRequest>,
|
||||||
|
) -> impl IntoResponse {
|
||||||
|
let Some(candle) = state.candle.as_ref().map(Arc::clone) else {
|
||||||
|
return (
|
||||||
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
|
Json(json!({"error": "candle harness not enabled on this neuron"})),
|
||||||
|
)
|
||||||
|
.into_response();
|
||||||
|
};
|
||||||
|
|
||||||
|
if req.stream.unwrap_or(false) {
|
||||||
|
return (
|
||||||
|
StatusCode::NOT_IMPLEMENTED,
|
||||||
|
Json(json!({"error": "streaming responses arrive in Stage 4"})),
|
||||||
|
)
|
||||||
|
.into_response();
|
||||||
|
}
|
||||||
|
|
||||||
|
match candle.chat_completion(req).await {
|
||||||
|
Ok(resp) => Json(resp).into_response(),
|
||||||
|
Err(InferenceError::ModelNotLoaded(id)) => (
|
||||||
|
StatusCode::NOT_FOUND,
|
||||||
|
Json(json!({"error": format!("model '{id}' not loaded on this neuron")})),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
Err(InferenceError::Other(e)) => (
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
Json(json!({"error": e.to_string()})),
|
||||||
|
)
|
||||||
|
.into_response(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use figment::{
|
|||||||
providers::{Env, Format, Toml},
|
providers::{Env, Format, Toml},
|
||||||
};
|
};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::path::Path;
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct NeuronConfig {
|
pub struct NeuronConfig {
|
||||||
@@ -14,10 +14,29 @@ pub struct NeuronConfig {
|
|||||||
pub port: u16,
|
pub port: u16,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub harnesses: Vec<HarnessConfig>,
|
pub harnesses: Vec<HarnessConfig>,
|
||||||
|
/// Per-harness configuration. Currently only `candle` is recognised.
|
||||||
|
#[serde(default)]
|
||||||
|
pub harness: HarnessSettings,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Settings for individual harness implementations. Each harness owns
|
||||||
|
/// its own sub-table so users only configure the harnesses they enable.
|
||||||
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct HarnessSettings {
|
||||||
|
#[serde(default)]
|
||||||
|
pub candle: CandleHarnessConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||||
|
pub struct CandleHarnessConfig {
|
||||||
|
/// HuggingFace cache directory for model weights.
|
||||||
|
/// When unset, defers to hf-hub's default (~/.cache/huggingface).
|
||||||
|
#[serde(default)]
|
||||||
|
pub hf_cache: Option<PathBuf>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_port() -> u16 {
|
fn default_port() -> u16 {
|
||||||
9090
|
13131
|
||||||
}
|
}
|
||||||
|
|
||||||
impl NeuronConfig {
|
impl NeuronConfig {
|
||||||
@@ -33,8 +52,9 @@ impl NeuronConfig {
|
|||||||
impl Default for NeuronConfig {
|
impl Default for NeuronConfig {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
port: 9090,
|
port: 13131,
|
||||||
harnesses: vec![],
|
harnesses: vec![],
|
||||||
|
harness: HarnessSettings::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
441
crates/neuron/src/harness/candle.rs
Normal file
441
crates/neuron/src/harness/candle.rs
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
//! Candle harness — in-process inference using huggingface/candle.
|
||||||
|
//!
|
||||||
|
//! This is the sole `Harness` implementation. Inference runs inside
|
||||||
|
//! the neuron process; there is no external subprocess.
|
||||||
|
//!
|
||||||
|
//! - Stage 2 wired GGUF (Qwen3 only) load/unload via `quantized_qwen3`.
|
||||||
|
//! - Stage 3 (this) adds `chat_completion` — a non-streaming OpenAI
|
||||||
|
//! compatible chat completion routed to the loaded model's forward
|
||||||
|
//! pass on a per-model serialised generation loop.
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use candle_core::quantized::gguf_file;
|
||||||
|
use candle_core::{Device, Tensor};
|
||||||
|
use candle_transformers::generation::{LogitsProcessor, Sampling};
|
||||||
|
use candle_transformers::models::quantized_qwen3::ModelWeights as QuantizedQwen3Weights;
|
||||||
|
use cortex_core::harness::{Harness, HarnessHealth, ModelInfo, ModelSpec};
|
||||||
|
use cortex_core::openai::{
|
||||||
|
ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, ChatMessage,
|
||||||
|
MessageContent, Usage,
|
||||||
|
};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
use tokenizers::Tokenizer;
|
||||||
|
use tokio::sync::{Mutex, RwLock};
|
||||||
|
|
||||||
|
/// In-process candle harness. Owns the loaded model registry.
|
||||||
|
pub struct CandleHarness {
|
||||||
|
models: Arc<RwLock<HashMap<String, Arc<LoadedModel>>>>,
|
||||||
|
hf_cache: Option<PathBuf>,
|
||||||
|
bind_url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A loaded model with its tokenizer, device placement, and architecture-
|
||||||
|
/// specific weights. The `arch` is `Arc<Mutex<>>` so the lock guard can be
|
||||||
|
/// moved into `spawn_blocking` for synchronous candle forward passes.
|
||||||
|
pub struct LoadedModel {
|
||||||
|
pub model_id: String,
|
||||||
|
pub arch: Arc<Mutex<ModelArch>>,
|
||||||
|
pub tokenizer: Tokenizer,
|
||||||
|
pub device: Device,
|
||||||
|
pub quant: Option<String>,
|
||||||
|
pub devices: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Architecture-specific weights. Stage 3 still supports only Qwen3
|
||||||
|
/// quantized; Stage 8 broadens this to additional families and
|
||||||
|
/// non-quantized variants.
|
||||||
|
pub enum ModelArch {
|
||||||
|
Qwen3Quantized(QuantizedQwen3Weights),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CandleHarness {
|
||||||
|
pub fn new(bind_url: String, hf_cache: Option<PathBuf>) -> Self {
|
||||||
|
Self {
|
||||||
|
models: Arc::new(RwLock::new(HashMap::new())),
|
||||||
|
hf_cache,
|
||||||
|
bind_url,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pick a candle `Device` for the requested indices. Without the
|
||||||
|
/// `cuda` feature, or if CUDA initialisation fails, falls back to CPU.
|
||||||
|
fn pick_device(devices: &[u32]) -> Result<Device> {
|
||||||
|
let _idx = devices.first().copied().unwrap_or(0) as usize;
|
||||||
|
#[cfg(feature = "cuda")]
|
||||||
|
{
|
||||||
|
match Device::new_cuda(_idx) {
|
||||||
|
Ok(d) => return Ok(d),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
device = _idx,
|
||||||
|
error = %e,
|
||||||
|
"CUDA device unavailable, falling back to CPU"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Device::Cpu)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve a model spec to local GGUF and tokenizer file paths via
|
||||||
|
/// hf-hub. Downloads on first use; subsequent calls are cached.
|
||||||
|
async fn resolve_files(&self, spec: &ModelSpec) -> Result<(PathBuf, PathBuf)> {
|
||||||
|
let mut builder = hf_hub::api::tokio::ApiBuilder::new();
|
||||||
|
if let Some(cache) = &self.hf_cache {
|
||||||
|
builder = builder.with_cache_dir(cache.clone());
|
||||||
|
}
|
||||||
|
let api = builder.build().context("build hf-hub API")?;
|
||||||
|
let repo = api.model(spec.model_id.clone());
|
||||||
|
|
||||||
|
let info = repo
|
||||||
|
.info()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("fetch HF repo info for {}", spec.model_id))?;
|
||||||
|
|
||||||
|
let quant = spec.quant.as_deref().unwrap_or("");
|
||||||
|
let quant_lc = quant.to_lowercase();
|
||||||
|
let gguf_filename = info
|
||||||
|
.siblings
|
||||||
|
.iter()
|
||||||
|
.map(|s| s.rfilename.as_str())
|
||||||
|
.filter(|name| name.to_lowercase().ends_with(".gguf"))
|
||||||
|
.find(|name| quant_lc.is_empty() || name.to_lowercase().contains(&quant_lc))
|
||||||
|
.ok_or_else(|| {
|
||||||
|
anyhow::anyhow!(
|
||||||
|
"no GGUF file matching quant {:?} in repo {}",
|
||||||
|
spec.quant,
|
||||||
|
spec.model_id
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
model = %spec.model_id,
|
||||||
|
file = %gguf_filename,
|
||||||
|
"resolving GGUF (may be cached)"
|
||||||
|
);
|
||||||
|
let gguf_path = repo
|
||||||
|
.get(&gguf_filename)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("fetch GGUF {gguf_filename}"))?;
|
||||||
|
let tokenizer_path = repo
|
||||||
|
.get("tokenizer.json")
|
||||||
|
.await
|
||||||
|
.context("fetch tokenizer.json")?;
|
||||||
|
Ok((gguf_path, tokenizer_path))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run a non-streaming chat completion against a loaded model.
|
||||||
|
///
|
||||||
|
/// Returns a typed `InferenceError` when the model isn't loaded so the
|
||||||
|
/// handler can map to an appropriate HTTP status without string-matching.
|
||||||
|
pub async fn chat_completion(
|
||||||
|
&self,
|
||||||
|
request: ChatCompletionRequest,
|
||||||
|
) -> Result<ChatCompletionResponse, InferenceError> {
|
||||||
|
let loaded = {
|
||||||
|
let models = self.models.read().await;
|
||||||
|
models.get(&request.model).cloned()
|
||||||
|
};
|
||||||
|
let loaded = loaded.ok_or_else(|| InferenceError::ModelNotLoaded(request.model.clone()))?;
|
||||||
|
|
||||||
|
let prompt = format_qwen3_prompt(&request.messages);
|
||||||
|
|
||||||
|
let encoding = loaded
|
||||||
|
.tokenizer
|
||||||
|
.encode(prompt.as_str(), true)
|
||||||
|
.map_err(|e| InferenceError::Other(anyhow::anyhow!("tokenize: {e}")))?;
|
||||||
|
let prompt_tokens: Vec<u32> = encoding.get_ids().to_vec();
|
||||||
|
let prompt_len = prompt_tokens.len();
|
||||||
|
|
||||||
|
let temperature = request.temperature.unwrap_or(0.7);
|
||||||
|
let top_p = request.top_p;
|
||||||
|
let max_new = request.max_tokens.unwrap_or(512) as usize;
|
||||||
|
let seed = unix_subsec_nanos();
|
||||||
|
|
||||||
|
let eos_id = loaded
|
||||||
|
.tokenizer
|
||||||
|
.token_to_id("<|im_end|>")
|
||||||
|
.or_else(|| loaded.tokenizer.token_to_id("<|endoftext|>"));
|
||||||
|
|
||||||
|
let arch_arc = Arc::clone(&loaded.arch);
|
||||||
|
let device = loaded.device.clone();
|
||||||
|
let model_id = request.model.clone();
|
||||||
|
|
||||||
|
let (generated_ids, finish_reason) =
|
||||||
|
tokio::task::spawn_blocking(move || -> Result<(Vec<u32>, String)> {
|
||||||
|
let mut guard = arch_arc.blocking_lock();
|
||||||
|
run_inference(
|
||||||
|
&mut guard,
|
||||||
|
&device,
|
||||||
|
&prompt_tokens,
|
||||||
|
max_new,
|
||||||
|
temperature,
|
||||||
|
top_p,
|
||||||
|
seed,
|
||||||
|
eos_id,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|e| InferenceError::Other(anyhow::anyhow!("inference task panicked: {e}")))?
|
||||||
|
.map_err(InferenceError::Other)?;
|
||||||
|
|
||||||
|
let completion_text = loaded
|
||||||
|
.tokenizer
|
||||||
|
.decode(&generated_ids, true)
|
||||||
|
.map_err(|e| InferenceError::Other(anyhow::anyhow!("detokenize: {e}")))?;
|
||||||
|
|
||||||
|
let usage = Usage {
|
||||||
|
prompt_tokens: prompt_len as u64,
|
||||||
|
completion_tokens: generated_ids.len() as u64,
|
||||||
|
total_tokens: (prompt_len + generated_ids.len()) as u64,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(ChatCompletionResponse {
|
||||||
|
id: format!("chatcmpl-{:x}", unix_subsec_nanos()),
|
||||||
|
object: "chat.completion".into(),
|
||||||
|
created: unix_now_secs(),
|
||||||
|
model: model_id,
|
||||||
|
choices: vec![ChatCompletionChoice {
|
||||||
|
index: 0,
|
||||||
|
message: ChatMessage {
|
||||||
|
role: "assistant".into(),
|
||||||
|
content: MessageContent::Text(completion_text),
|
||||||
|
extra: serde_json::Value::Object(Default::default()),
|
||||||
|
},
|
||||||
|
finish_reason: Some(finish_reason),
|
||||||
|
extra: serde_json::Value::Object(Default::default()),
|
||||||
|
}],
|
||||||
|
usage: Some(usage),
|
||||||
|
extra: serde_json::Value::Object(Default::default()),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Harness for CandleHarness {
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"candle"
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn health(&self) -> HarnessHealth {
|
||||||
|
HarnessHealth {
|
||||||
|
name: "candle".into(),
|
||||||
|
running: true,
|
||||||
|
uptime_secs: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_models(&self) -> Result<Vec<ModelInfo>> {
|
||||||
|
let models = self.models.read().await;
|
||||||
|
Ok(models
|
||||||
|
.values()
|
||||||
|
.map(|m| ModelInfo {
|
||||||
|
id: m.model_id.clone(),
|
||||||
|
harness: "candle".into(),
|
||||||
|
status: "loaded".into(),
|
||||||
|
devices: m.devices.clone(),
|
||||||
|
vram_used_mb: None,
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
|
||||||
|
if spec.harness != "candle" {
|
||||||
|
anyhow::bail!("expected harness=candle, got harness={}", spec.harness);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let models = self.models.read().await;
|
||||||
|
if models.contains_key(&spec.model_id) {
|
||||||
|
anyhow::bail!("model '{}' already loaded", spec.model_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let devices = spec.devices.clone().unwrap_or_else(|| vec![0]);
|
||||||
|
let device = Self::pick_device(&devices)?;
|
||||||
|
|
||||||
|
let (gguf_path, tokenizer_path) = self.resolve_files(spec).await?;
|
||||||
|
|
||||||
|
let tokenizer = Tokenizer::from_file(&tokenizer_path)
|
||||||
|
.map_err(|e| anyhow::anyhow!("load tokenizer: {e}"))?;
|
||||||
|
|
||||||
|
// File I/O + GGUF parsing + tensor materialisation are CPU-bound,
|
||||||
|
// so run them on a blocking task to avoid stalling the runtime.
|
||||||
|
let device_for_load = device.clone();
|
||||||
|
let gguf_path_for_load = gguf_path.clone();
|
||||||
|
let model_id_for_log = spec.model_id.clone();
|
||||||
|
let arch = tokio::task::spawn_blocking(move || -> Result<ModelArch> {
|
||||||
|
tracing::info!(model = %model_id_for_log, path = ?gguf_path_for_load, "loading GGUF");
|
||||||
|
let mut file = std::fs::File::open(&gguf_path_for_load).context("open GGUF file")?;
|
||||||
|
let content = gguf_file::Content::read(&mut file)
|
||||||
|
.map_err(|e| anyhow::anyhow!("parse GGUF: {e}"))?;
|
||||||
|
|
||||||
|
let architecture = content
|
||||||
|
.metadata
|
||||||
|
.get("general.architecture")
|
||||||
|
.and_then(|v| v.to_string().ok().cloned())
|
||||||
|
.unwrap_or_default();
|
||||||
|
tracing::info!(architecture = %architecture, "GGUF architecture");
|
||||||
|
|
||||||
|
match architecture.as_str() {
|
||||||
|
"qwen3" => {
|
||||||
|
let weights =
|
||||||
|
QuantizedQwen3Weights::from_gguf(content, &mut file, &device_for_load)
|
||||||
|
.map_err(|e| anyhow::anyhow!("from_gguf qwen3: {e}"))?;
|
||||||
|
Ok(ModelArch::Qwen3Quantized(weights))
|
||||||
|
}
|
||||||
|
other => anyhow::bail!(
|
||||||
|
"unsupported GGUF architecture '{other}'; Stage 3 only supports qwen3"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.context("blocking load task panicked")??;
|
||||||
|
|
||||||
|
let loaded = Arc::new(LoadedModel {
|
||||||
|
model_id: spec.model_id.clone(),
|
||||||
|
arch: Arc::new(Mutex::new(arch)),
|
||||||
|
tokenizer,
|
||||||
|
device,
|
||||||
|
quant: spec.quant.clone(),
|
||||||
|
devices,
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut models = self.models.write().await;
|
||||||
|
models.insert(spec.model_id.clone(), loaded);
|
||||||
|
tracing::info!(model = %spec.model_id, "model loaded");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn unload_model(&self, model_id: &str) -> Result<()> {
|
||||||
|
let mut models = self.models.write().await;
|
||||||
|
if models.remove(model_id).is_none() {
|
||||||
|
anyhow::bail!("model '{model_id}' not loaded");
|
||||||
|
}
|
||||||
|
tracing::info!(model = %model_id, "model unloaded");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn inference_endpoint(&self, model_id: &str) -> Option<String> {
|
||||||
|
let models = self.models.read().await;
|
||||||
|
models.contains_key(model_id).then(|| self.bind_url.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Errors returned by `CandleHarness::chat_completion`. The
|
||||||
|
/// `ModelNotLoaded` variant lets the HTTP handler map cleanly to 404
|
||||||
|
/// without string-matching on anyhow messages.
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum InferenceError {
|
||||||
|
#[error("model '{0}' not loaded on this neuron")]
|
||||||
|
ModelNotLoaded(String),
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply the Qwen3 chat template:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// <|im_start|>{role}\n{content}<|im_end|>\n
|
||||||
|
/// ...
|
||||||
|
/// <|im_start|>assistant\n
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// The trailing `<|im_start|>assistant\n` cues the model to begin a turn.
|
||||||
|
/// Non-text content parts (vision blocks) are joined as text only; full
|
||||||
|
/// multimodal handling is out of scope for Stage 3.
|
||||||
|
fn format_qwen3_prompt(messages: &[ChatMessage]) -> String {
|
||||||
|
let mut prompt = String::new();
|
||||||
|
for msg in messages {
|
||||||
|
let content = match &msg.content {
|
||||||
|
MessageContent::Text(s) => s.clone(),
|
||||||
|
MessageContent::Parts(parts) => parts
|
||||||
|
.iter()
|
||||||
|
.filter_map(|p| p.get("text").and_then(|v| v.as_str()))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(""),
|
||||||
|
};
|
||||||
|
prompt.push_str("<|im_start|>");
|
||||||
|
prompt.push_str(&msg.role);
|
||||||
|
prompt.push('\n');
|
||||||
|
prompt.push_str(&content);
|
||||||
|
prompt.push_str("<|im_end|>\n");
|
||||||
|
}
|
||||||
|
prompt.push_str("<|im_start|>assistant\n");
|
||||||
|
prompt
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
fn run_inference(
|
||||||
|
arch: &mut ModelArch,
|
||||||
|
device: &Device,
|
||||||
|
prompt_tokens: &[u32],
|
||||||
|
max_new: usize,
|
||||||
|
temperature: f64,
|
||||||
|
top_p: Option<f64>,
|
||||||
|
seed: u64,
|
||||||
|
eos_id: Option<u32>,
|
||||||
|
) -> Result<(Vec<u32>, String)> {
|
||||||
|
let mut logits_processor = {
|
||||||
|
let sampling = if temperature <= 0.0 {
|
||||||
|
Sampling::ArgMax
|
||||||
|
} else {
|
||||||
|
match top_p {
|
||||||
|
Some(p) => Sampling::TopP { p, temperature },
|
||||||
|
None => Sampling::All { temperature },
|
||||||
|
}
|
||||||
|
};
|
||||||
|
LogitsProcessor::from_sampling(seed, sampling)
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut generated: Vec<u32> = Vec::new();
|
||||||
|
|
||||||
|
let mut next_token = match arch {
|
||||||
|
ModelArch::Qwen3Quantized(model) => {
|
||||||
|
model.clear_kv_cache();
|
||||||
|
let input = Tensor::new(prompt_tokens, device)?.unsqueeze(0)?;
|
||||||
|
let logits = model.forward(&input, 0)?;
|
||||||
|
let logits = logits.squeeze(0)?;
|
||||||
|
logits_processor.sample(&logits)?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if Some(next_token) == eos_id {
|
||||||
|
return Ok((generated, "stop".into()));
|
||||||
|
}
|
||||||
|
generated.push(next_token);
|
||||||
|
|
||||||
|
for index in 0..max_new.saturating_sub(1) {
|
||||||
|
next_token = match arch {
|
||||||
|
ModelArch::Qwen3Quantized(model) => {
|
||||||
|
let input = Tensor::new(&[next_token], device)?.unsqueeze(0)?;
|
||||||
|
let logits = model.forward(&input, prompt_tokens.len() + index)?;
|
||||||
|
let logits = logits.squeeze(0)?;
|
||||||
|
logits_processor.sample(&logits)?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if Some(next_token) == eos_id {
|
||||||
|
return Ok((generated, "stop".into()));
|
||||||
|
}
|
||||||
|
generated.push(next_token);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((generated, "length".into()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unix_now_secs() -> u64 {
|
||||||
|
SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.map(|d| d.as_secs())
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unix_subsec_nanos() -> u64 {
|
||||||
|
SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.map(|d| d.as_nanos() as u64)
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
@@ -1 +0,0 @@
|
|||||||
// llama.cpp harness implementation — Phase 11.
|
|
||||||
@@ -1,163 +0,0 @@
|
|||||||
//! mistral.rs harness implementation.
|
|
||||||
//!
|
|
||||||
//! Wraps the mistral.rs HTTP API for model lifecycle management
|
|
||||||
//! and optionally manages the process via systemd.
|
|
||||||
|
|
||||||
use anyhow::Result;
|
|
||||||
use async_trait::async_trait;
|
|
||||||
use cortex_core::harness::{Harness, HarnessConfig, HarnessHealth, ModelInfo, ModelSpec};
|
|
||||||
use reqwest::Client;
|
|
||||||
use serde::Deserialize;
|
|
||||||
|
|
||||||
pub struct MistralRsHarness {
|
|
||||||
endpoint: String,
|
|
||||||
systemd_unit: Option<String>,
|
|
||||||
client: Client,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MistralRsHarness {
|
|
||||||
pub fn new(endpoint: String, systemd_unit: Option<String>) -> Self {
|
|
||||||
Self {
|
|
||||||
endpoint,
|
|
||||||
systemd_unit,
|
|
||||||
client: Client::builder()
|
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
|
||||||
.build()
|
|
||||||
.expect("failed to build HTTP client"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Response from mistral.rs `GET /v1/models`.
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct ModelsResponse {
|
|
||||||
data: Vec<ModelEntry>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
struct ModelEntry {
|
|
||||||
id: String,
|
|
||||||
#[serde(default)]
|
|
||||||
status: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[async_trait]
|
|
||||||
impl Harness for MistralRsHarness {
|
|
||||||
fn name(&self) -> &str {
|
|
||||||
"mistralrs"
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn start(&self, _config: &HarnessConfig) -> Result<()> {
|
|
||||||
let Some(unit) = &self.systemd_unit else {
|
|
||||||
anyhow::bail!("no systemd unit configured for mistralrs harness");
|
|
||||||
};
|
|
||||||
|
|
||||||
let output = tokio::process::Command::new("systemctl")
|
|
||||||
.args(["start", unit])
|
|
||||||
.output()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
||||||
anyhow::bail!("systemctl start {unit} failed: {stderr}");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for the health endpoint to respond (up to 30s).
|
|
||||||
let url = format!("{}/health", self.endpoint);
|
|
||||||
for _ in 0..30 {
|
|
||||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
|
||||||
if self.client.get(&url).send().await.is_ok() {
|
|
||||||
tracing::info!(unit, "mistralrs started and healthy");
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
anyhow::bail!("mistralrs started but health endpoint did not respond within 30s");
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn stop(&self) -> Result<()> {
|
|
||||||
let Some(unit) = &self.systemd_unit else {
|
|
||||||
anyhow::bail!("no systemd unit configured for mistralrs harness");
|
|
||||||
};
|
|
||||||
|
|
||||||
let output = tokio::process::Command::new("systemctl")
|
|
||||||
.args(["stop", unit])
|
|
||||||
.output()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
||||||
anyhow::bail!("systemctl stop {unit} failed: {stderr}");
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn health(&self) -> HarnessHealth {
|
|
||||||
let url = format!("{}/health", self.endpoint);
|
|
||||||
let running = self.client.get(&url).send().await.is_ok();
|
|
||||||
HarnessHealth {
|
|
||||||
name: "mistralrs".into(),
|
|
||||||
running,
|
|
||||||
uptime_secs: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn list_models(&self) -> Result<Vec<ModelInfo>> {
|
|
||||||
let url = format!("{}/v1/models", self.endpoint);
|
|
||||||
let resp = self.client.get(&url).send().await?;
|
|
||||||
|
|
||||||
if !resp.status().is_success() {
|
|
||||||
anyhow::bail!("GET /v1/models returned {}", resp.status());
|
|
||||||
}
|
|
||||||
|
|
||||||
let models_resp: ModelsResponse = resp.json().await?;
|
|
||||||
Ok(models_resp
|
|
||||||
.data
|
|
||||||
.into_iter()
|
|
||||||
.map(|m| ModelInfo {
|
|
||||||
id: m.id,
|
|
||||||
harness: "mistralrs".into(),
|
|
||||||
status: m.status.unwrap_or_else(|| "loaded".into()),
|
|
||||||
devices: vec![],
|
|
||||||
vram_used_mb: None,
|
|
||||||
})
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn load_model(&self, spec: &ModelSpec) -> Result<()> {
|
|
||||||
let url = format!("{}/v1/models/reload", self.endpoint);
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.post(&url)
|
|
||||||
.json(&serde_json::json!({ "model_id": spec.model_id }))
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if !resp.status().is_success() {
|
|
||||||
let body = resp.text().await.unwrap_or_default();
|
|
||||||
anyhow::bail!("POST /v1/models/reload failed: {body}");
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn unload_model(&self, model_id: &str) -> Result<()> {
|
|
||||||
let url = format!("{}/v1/models/unload", self.endpoint);
|
|
||||||
let resp = self
|
|
||||||
.client
|
|
||||||
.post(&url)
|
|
||||||
.json(&serde_json::json!({ "model_id": model_id }))
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if !resp.status().is_success() {
|
|
||||||
let body = resp.text().await.unwrap_or_default();
|
|
||||||
anyhow::bail!("POST /v1/models/unload failed: {body}");
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn inference_endpoint(&self, _model_id: &str) -> Option<String> {
|
|
||||||
// mistral.rs routes internally by model name in the request body,
|
|
||||||
// so the inference endpoint is always the base URL.
|
|
||||||
Some(self.endpoint.clone())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,15 +1,22 @@
|
|||||||
//! Harness registry — maps harness names to trait implementations.
|
//! Harness registry — maps harness names to trait implementations.
|
||||||
|
|
||||||
pub mod llamacpp;
|
pub mod candle;
|
||||||
pub mod mistralrs;
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use cortex_core::harness::{Harness, HarnessConfig, ModelInfo, ModelSpec};
|
use cortex_core::harness::{Harness, HarnessConfig, ModelInfo, ModelSpec};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
/// Registry of available harness implementations.
|
/// Registry of available harness implementations.
|
||||||
|
///
|
||||||
|
/// Holds an `Arc<dyn Harness>` per harness for generic lifecycle dispatch
|
||||||
|
/// (load/unload/list_models). When a candle harness is registered, a typed
|
||||||
|
/// `Arc<CandleHarness>` is also cached so inference routes can bypass the
|
||||||
|
/// dyn-Trait dispatch and reach harness-specific methods (chat completion,
|
||||||
|
/// streaming, etc.).
|
||||||
pub struct HarnessRegistry {
|
pub struct HarnessRegistry {
|
||||||
harnesses: HashMap<String, Box<dyn Harness>>,
|
harnesses: HashMap<String, Arc<dyn Harness>>,
|
||||||
|
candle: Option<Arc<candle::CandleHarness>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for HarnessRegistry {
|
impl Default for HarnessRegistry {
|
||||||
@@ -22,10 +29,11 @@ impl HarnessRegistry {
|
|||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
harnesses: HashMap::new(),
|
harnesses: HashMap::new(),
|
||||||
|
candle: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn register(&mut self, harness: Box<dyn Harness>) {
|
pub fn register(&mut self, harness: Arc<dyn Harness>) {
|
||||||
self.harnesses.insert(harness.name().to_string(), harness);
|
self.harnesses.insert(harness.name().to_string(), harness);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -34,6 +42,12 @@ impl HarnessRegistry {
|
|||||||
self.harnesses.keys().cloned().collect()
|
self.harnesses.keys().cloned().collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Typed handle to the candle harness, if registered. Used by inference
|
||||||
|
/// routes that need methods beyond the `Harness` trait surface.
|
||||||
|
pub fn candle(&self) -> Option<Arc<candle::CandleHarness>> {
|
||||||
|
self.candle.clone()
|
||||||
|
}
|
||||||
|
|
||||||
/// List models from all registered harnesses.
|
/// List models from all registered harnesses.
|
||||||
pub async fn list_all_models(&self) -> Result<Vec<ModelInfo>> {
|
pub async fn list_all_models(&self) -> Result<Vec<ModelInfo>> {
|
||||||
let mut all = Vec::new();
|
let mut all = Vec::new();
|
||||||
@@ -81,19 +95,25 @@ impl HarnessRegistry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Build a registry from harness configs.
|
/// Build a registry from harness configs.
|
||||||
pub fn from_configs(configs: &[HarnessConfig]) -> Self {
|
///
|
||||||
|
/// `bind_url` is the URL where this neuron serves inference (its own
|
||||||
|
/// listen address). In-process harnesses (currently the only kind)
|
||||||
|
/// return this URL from `inference_endpoint`.
|
||||||
|
pub fn from_configs(
|
||||||
|
configs: &[HarnessConfig],
|
||||||
|
bind_url: &str,
|
||||||
|
settings: &crate::config::HarnessSettings,
|
||||||
|
) -> Self {
|
||||||
let mut registry = Self::new();
|
let mut registry = Self::new();
|
||||||
for config in configs {
|
for config in configs {
|
||||||
match config.name.as_str() {
|
match config.name.as_str() {
|
||||||
"mistralrs" => {
|
"candle" => {
|
||||||
if let Some(endpoint) = &config.endpoint {
|
let harness = Arc::new(candle::CandleHarness::new(
|
||||||
registry.register(Box::new(mistralrs::MistralRsHarness::new(
|
bind_url.to_string(),
|
||||||
endpoint.clone(),
|
settings.candle.hf_cache.clone(),
|
||||||
config.systemd_unit.clone(),
|
));
|
||||||
)));
|
registry.candle = Some(Arc::clone(&harness));
|
||||||
} else {
|
registry.harnesses.insert("candle".into(), harness);
|
||||||
tracing::warn!("mistralrs harness missing endpoint, skipping");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
other => {
|
other => {
|
||||||
tracing::warn!(harness = other, "unknown harness type, skipping");
|
tracing::warn!(harness = other, "unknown harness type, skipping");
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ async fn main() -> Result<()> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let port = args.port.unwrap_or(cfg.port);
|
let port = args.port.unwrap_or(cfg.port);
|
||||||
|
let bind_url = format!("http://localhost:{port}");
|
||||||
let start_time = Instant::now();
|
let start_time = Instant::now();
|
||||||
|
|
||||||
tracing::info!("running hardware discovery");
|
tracing::info!("running hardware discovery");
|
||||||
@@ -47,9 +48,12 @@ async fn main() -> Result<()> {
|
|||||||
"discovery complete"
|
"discovery complete"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Build harness registry from config.
|
// Build harness registry from config. In-process harnesses (candle)
|
||||||
let registry = HarnessRegistry::from_configs(&cfg.harnesses);
|
// need to know neuron's own bind URL so they can return it from
|
||||||
|
// inference_endpoint.
|
||||||
|
let registry = HarnessRegistry::from_configs(&cfg.harnesses, &bind_url, &cfg.harness);
|
||||||
discovery_result.harnesses = registry.names();
|
discovery_result.harnesses = registry.names();
|
||||||
|
let candle = registry.candle();
|
||||||
|
|
||||||
let health_cache = Arc::new(health::HealthCache::new());
|
let health_cache = Arc::new(health::HealthCache::new());
|
||||||
health_cache
|
health_cache
|
||||||
@@ -65,6 +69,7 @@ async fn main() -> Result<()> {
|
|||||||
discovery: discovery_result,
|
discovery: discovery_result,
|
||||||
health_cache,
|
health_cache,
|
||||||
registry: RwLock::new(registry),
|
registry: RwLock::new(registry),
|
||||||
|
candle,
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = api::neuron_routes().with_state(state);
|
let app = api::neuron_routes().with_state(state);
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ async fn spawn_neuron(discovery: DiscoveryResponse) -> String {
|
|||||||
discovery,
|
discovery,
|
||||||
health_cache,
|
health_cache,
|
||||||
registry: RwLock::new(registry),
|
registry: RwLock::new(registry),
|
||||||
|
candle: None,
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = api::neuron_routes().with_state(state);
|
let app = api::neuron_routes().with_state(state);
|
||||||
@@ -135,56 +136,30 @@ async fn test_models_empty_registry() {
|
|||||||
assert!(body.as_array().unwrap().is_empty());
|
assert!(body.as_array().unwrap().is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Spawn a mock mistral.rs backend and a neuron with the mistralrs harness
|
/// Verify the candle harness registers, list is empty by default, and a
|
||||||
/// pointing at it, then test the full model lifecycle through neuron's API.
|
/// load attempt for an obviously-bogus model id returns a 4xx error
|
||||||
|
/// without crashing the daemon. Real load/unload exercising actual GGUF
|
||||||
|
/// download is covered by `tests/candle_lifecycle.rs` (cuda-integration).
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_models_via_mistralrs_harness() {
|
async fn test_candle_harness_registers_and_rejects_bogus_model() {
|
||||||
use axum::routing::{get, post};
|
|
||||||
use axum::{Json, Router};
|
|
||||||
use cortex_core::harness::HarnessConfig;
|
use cortex_core::harness::HarnessConfig;
|
||||||
use serde_json::Value;
|
use neuron::config::HarnessSettings;
|
||||||
|
|
||||||
// Mock mistral.rs backend.
|
let registry = HarnessRegistry::from_configs(
|
||||||
let mock_app = Router::new()
|
&[HarnessConfig {
|
||||||
.route(
|
name: "candle".into(),
|
||||||
"/v1/models",
|
}],
|
||||||
get(|| async {
|
"http://localhost:13131",
|
||||||
Json(json!({
|
&HarnessSettings::default(),
|
||||||
"data": [
|
);
|
||||||
{"id": "test-model", "status": "loaded"},
|
|
||||||
{"id": "other-model", "status": "unloaded"}
|
|
||||||
]
|
|
||||||
}))
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
.route(
|
|
||||||
"/v1/models/unload",
|
|
||||||
post(|Json(_body): Json<Value>| async { Json(json!({"status": "ok"})) }),
|
|
||||||
)
|
|
||||||
.route(
|
|
||||||
"/v1/models/reload",
|
|
||||||
post(|Json(_body): Json<Value>| async { Json(json!({"status": "ok"})) }),
|
|
||||||
);
|
|
||||||
|
|
||||||
let mock_listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
|
||||||
let mock_addr = mock_listener.local_addr().unwrap();
|
|
||||||
tokio::spawn(async move {
|
|
||||||
axum::serve(mock_listener, mock_app).await.unwrap();
|
|
||||||
});
|
|
||||||
let mock_url = format!("http://{mock_addr}");
|
|
||||||
|
|
||||||
// Build neuron with mistralrs harness pointing at mock.
|
|
||||||
let registry = HarnessRegistry::from_configs(&[HarnessConfig {
|
|
||||||
name: "mistralrs".into(),
|
|
||||||
endpoint: Some(mock_url.clone()),
|
|
||||||
systemd_unit: None,
|
|
||||||
}]);
|
|
||||||
|
|
||||||
|
let candle = registry.candle();
|
||||||
let health_cache = Arc::new(HealthCache::new());
|
let health_cache = Arc::new(HealthCache::new());
|
||||||
let state = Arc::new(NeuronState {
|
let state = Arc::new(NeuronState {
|
||||||
discovery: fake_discovery(),
|
discovery: fake_discovery(),
|
||||||
health_cache,
|
health_cache,
|
||||||
registry: RwLock::new(registry),
|
registry: RwLock::new(registry),
|
||||||
|
candle,
|
||||||
});
|
});
|
||||||
|
|
||||||
let app = api::neuron_routes().with_state(state);
|
let app = api::neuron_routes().with_state(state);
|
||||||
@@ -197,7 +172,6 @@ async fn test_models_via_mistralrs_harness() {
|
|||||||
|
|
||||||
let client = reqwest::Client::new();
|
let client = reqwest::Client::new();
|
||||||
|
|
||||||
// GET /models — should return models from mock mistralrs.
|
|
||||||
let resp = client
|
let resp = client
|
||||||
.get(format!("{neuron_url}/models"))
|
.get(format!("{neuron_url}/models"))
|
||||||
.send()
|
.send()
|
||||||
@@ -205,45 +179,139 @@ async fn test_models_via_mistralrs_harness() {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), 200);
|
assert_eq!(resp.status(), 200);
|
||||||
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
||||||
assert_eq!(models.len(), 2);
|
assert!(models.is_empty());
|
||||||
assert_eq!(models[0]["id"], "test-model");
|
|
||||||
assert_eq!(models[0]["harness"], "mistralrs");
|
|
||||||
assert_eq!(models[0]["status"], "loaded");
|
|
||||||
assert_eq!(models[1]["id"], "other-model");
|
|
||||||
assert_eq!(models[1]["status"], "unloaded");
|
|
||||||
|
|
||||||
// GET /models/test-model/endpoint — should return mock URL.
|
// Sending a wrong-harness spec should be rejected synchronously
|
||||||
let resp = client
|
// without touching the network or the model registry.
|
||||||
.get(format!("{neuron_url}/models/test-model/endpoint"))
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(resp.status(), 200);
|
|
||||||
let body: serde_json::Value = resp.json().await.unwrap();
|
|
||||||
assert_eq!(body["url"], mock_url);
|
|
||||||
|
|
||||||
// POST /models/unload — should succeed.
|
|
||||||
let resp = client
|
|
||||||
.post(format!("{neuron_url}/models/unload"))
|
|
||||||
.json(&json!({"model_id": "test-model"}))
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(resp.status(), 200);
|
|
||||||
let body: serde_json::Value = resp.json().await.unwrap();
|
|
||||||
assert_eq!(body["status"], "unloaded");
|
|
||||||
|
|
||||||
// POST /models/load — should succeed.
|
|
||||||
let resp = client
|
let resp = client
|
||||||
.post(format!("{neuron_url}/models/load"))
|
.post(format!("{neuron_url}/models/load"))
|
||||||
|
.json(&json!({"model_id": "definitely/not-real", "harness": "not-candle"}))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), 400);
|
||||||
|
|
||||||
|
// Registry still empty.
|
||||||
|
let resp = client
|
||||||
|
.get(format!("{neuron_url}/models"))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let models: Vec<serde_json::Value> = resp.json().await.unwrap();
|
||||||
|
assert!(models.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `/v1/chat/completions` returns 503 when no candle harness is registered.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_chat_completions_no_candle_harness() {
|
||||||
|
let registry = HarnessRegistry::new();
|
||||||
|
let health_cache = Arc::new(HealthCache::new());
|
||||||
|
let state = Arc::new(NeuronState {
|
||||||
|
discovery: fake_discovery(),
|
||||||
|
health_cache,
|
||||||
|
registry: RwLock::new(registry),
|
||||||
|
candle: None,
|
||||||
|
});
|
||||||
|
let app = api::neuron_routes().with_state(state);
|
||||||
|
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
|
let addr = listener.local_addr().unwrap();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
axum::serve(listener, app).await.unwrap();
|
||||||
|
});
|
||||||
|
let url = format!("http://{addr}");
|
||||||
|
|
||||||
|
let resp = reqwest::Client::new()
|
||||||
|
.post(format!("{url}/v1/chat/completions"))
|
||||||
.json(&json!({
|
.json(&json!({
|
||||||
"model_id": "test-model",
|
"model": "anything",
|
||||||
"harness": "mistralrs"
|
"messages": [{"role": "user", "content": "hi"}]
|
||||||
}))
|
}))
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), 200);
|
assert_eq!(resp.status(), 503);
|
||||||
let body: serde_json::Value = resp.json().await.unwrap();
|
}
|
||||||
assert_eq!(body["status"], "loaded");
|
|
||||||
|
/// `/v1/chat/completions` returns 404 when the requested model isn't loaded.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_chat_completions_model_not_loaded() {
|
||||||
|
use cortex_core::harness::HarnessConfig;
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
|
||||||
|
let registry = HarnessRegistry::from_configs(
|
||||||
|
&[HarnessConfig {
|
||||||
|
name: "candle".into(),
|
||||||
|
}],
|
||||||
|
"http://localhost:0",
|
||||||
|
&HarnessSettings::default(),
|
||||||
|
);
|
||||||
|
let candle = registry.candle();
|
||||||
|
let health_cache = Arc::new(HealthCache::new());
|
||||||
|
let state = Arc::new(NeuronState {
|
||||||
|
discovery: fake_discovery(),
|
||||||
|
health_cache,
|
||||||
|
registry: RwLock::new(registry),
|
||||||
|
candle,
|
||||||
|
});
|
||||||
|
let app = api::neuron_routes().with_state(state);
|
||||||
|
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
|
let addr = listener.local_addr().unwrap();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
axum::serve(listener, app).await.unwrap();
|
||||||
|
});
|
||||||
|
let url = format!("http://{addr}");
|
||||||
|
|
||||||
|
let resp = reqwest::Client::new()
|
||||||
|
.post(format!("{url}/v1/chat/completions"))
|
||||||
|
.json(&json!({
|
||||||
|
"model": "definitely/not-loaded",
|
||||||
|
"messages": [{"role": "user", "content": "hi"}]
|
||||||
|
}))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), 404);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `/v1/chat/completions` with `stream: true` returns 501 until Stage 4
|
||||||
|
/// wires up SSE.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_chat_completions_streaming_not_yet_implemented() {
|
||||||
|
use cortex_core::harness::HarnessConfig;
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
|
||||||
|
let registry = HarnessRegistry::from_configs(
|
||||||
|
&[HarnessConfig {
|
||||||
|
name: "candle".into(),
|
||||||
|
}],
|
||||||
|
"http://localhost:0",
|
||||||
|
&HarnessSettings::default(),
|
||||||
|
);
|
||||||
|
let candle = registry.candle();
|
||||||
|
let health_cache = Arc::new(HealthCache::new());
|
||||||
|
let state = Arc::new(NeuronState {
|
||||||
|
discovery: fake_discovery(),
|
||||||
|
health_cache,
|
||||||
|
registry: RwLock::new(registry),
|
||||||
|
candle,
|
||||||
|
});
|
||||||
|
let app = api::neuron_routes().with_state(state);
|
||||||
|
let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
|
let addr = listener.local_addr().unwrap();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
axum::serve(listener, app).await.unwrap();
|
||||||
|
});
|
||||||
|
let url = format!("http://{addr}");
|
||||||
|
|
||||||
|
let resp = reqwest::Client::new()
|
||||||
|
.post(format!("{url}/v1/chat/completions"))
|
||||||
|
.json(&json!({
|
||||||
|
"model": "anything",
|
||||||
|
"messages": [{"role": "user", "content": "hi"}],
|
||||||
|
"stream": true
|
||||||
|
}))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), 501);
|
||||||
}
|
}
|
||||||
|
|||||||
87
crates/neuron/tests/candle_lifecycle.rs
Normal file
87
crates/neuron/tests/candle_lifecycle.rs
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
//! Real model load/unload lifecycle through the candle harness.
|
||||||
|
//!
|
||||||
|
//! Gated behind the `cuda-integration` feature because it downloads a
|
||||||
|
//! real (small) GGUF from HuggingFace and materialises tensors on the
|
||||||
|
//! configured device. Run on a host with network access and either a
|
||||||
|
//! CUDA GPU (when built with `--features cuda`) or enough CPU RAM to
|
||||||
|
//! hold the model.
|
||||||
|
//!
|
||||||
|
//! Usage:
|
||||||
|
//! cargo test -p neuron --features cuda-integration --test candle_lifecycle
|
||||||
|
//!
|
||||||
|
//! Optional environment variables:
|
||||||
|
//! NEURON_TEST_MODEL_ID — HuggingFace repo to load (default: a small
|
||||||
|
//! public Qwen3 GGUF repo).
|
||||||
|
//! NEURON_TEST_QUANT — quant substring matched against GGUF
|
||||||
|
//! filenames (default: "Q4_K_M").
|
||||||
|
//! HF_HOME — HuggingFace cache directory.
|
||||||
|
|
||||||
|
#![cfg(feature = "cuda-integration")]
|
||||||
|
|
||||||
|
use cortex_core::harness::{HarnessConfig, ModelSpec};
|
||||||
|
use neuron::config::HarnessSettings;
|
||||||
|
use neuron::harness::HarnessRegistry;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_candle_qwen3_load_unload_lifecycle() {
|
||||||
|
let _ = tracing_subscriber::fmt()
|
||||||
|
.with_test_writer()
|
||||||
|
.with_env_filter("info,neuron=debug")
|
||||||
|
.try_init();
|
||||||
|
|
||||||
|
let model_id = std::env::var("NEURON_TEST_MODEL_ID")
|
||||||
|
.unwrap_or_else(|_| "Qwen/Qwen3-0.6B-GGUF".to_string());
|
||||||
|
let quant = std::env::var("NEURON_TEST_QUANT").unwrap_or_else(|_| "Q4_K_M".to_string());
|
||||||
|
|
||||||
|
let mut settings = HarnessSettings::default();
|
||||||
|
if let Ok(home) = std::env::var("HF_HOME") {
|
||||||
|
settings.candle.hf_cache = Some(PathBuf::from(home));
|
||||||
|
}
|
||||||
|
|
||||||
|
let registry = HarnessRegistry::from_configs(
|
||||||
|
&[HarnessConfig {
|
||||||
|
name: "candle".into(),
|
||||||
|
}],
|
||||||
|
"http://localhost:13131",
|
||||||
|
&settings,
|
||||||
|
);
|
||||||
|
|
||||||
|
let spec = ModelSpec {
|
||||||
|
model_id: model_id.clone(),
|
||||||
|
harness: "candle".into(),
|
||||||
|
quant: Some(quant),
|
||||||
|
tensor_parallel: None,
|
||||||
|
devices: Some(vec![0]),
|
||||||
|
};
|
||||||
|
|
||||||
|
registry
|
||||||
|
.load_model(&spec)
|
||||||
|
.await
|
||||||
|
.expect("load_model should succeed");
|
||||||
|
|
||||||
|
let models = registry.list_all_models().await.expect("list_all_models");
|
||||||
|
assert_eq!(models.len(), 1, "expected exactly one loaded model");
|
||||||
|
assert_eq!(models[0].id, model_id);
|
||||||
|
assert_eq!(models[0].harness, "candle");
|
||||||
|
assert_eq!(models[0].status, "loaded");
|
||||||
|
|
||||||
|
let url = registry.inference_endpoint(&model_id).await;
|
||||||
|
assert_eq!(url, Some("http://localhost:13131".into()));
|
||||||
|
|
||||||
|
// Re-loading the same model should be rejected.
|
||||||
|
let again = registry.load_model(&spec).await;
|
||||||
|
assert!(again.is_err(), "second load should error");
|
||||||
|
|
||||||
|
registry
|
||||||
|
.unload_model(&model_id)
|
||||||
|
.await
|
||||||
|
.expect("unload_model should succeed");
|
||||||
|
|
||||||
|
let models = registry.list_all_models().await.expect("list_all_models");
|
||||||
|
assert!(models.is_empty(), "registry should be empty after unload");
|
||||||
|
|
||||||
|
// Unloading a model that isn't loaded should error.
|
||||||
|
let err = registry.unload_model(&model_id).await;
|
||||||
|
assert!(err.is_err(), "unload of missing model should error");
|
||||||
|
}
|
||||||
7
data/cortex-firewalld.xml
Normal file
7
data/cortex-firewalld.xml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<service>
|
||||||
|
<short>cortex</short>
|
||||||
|
<description>Cortex — inference gateway for multi-node GPU clusters</description>
|
||||||
|
<port protocol="tcp" port="31313"/>
|
||||||
|
<port protocol="tcp" port="31314"/>
|
||||||
|
</service>
|
||||||
3
data/cortex-sysusers.conf
Normal file
3
data/cortex-sysusers.conf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
g cortex - -
|
||||||
|
u cortex - "Cortex inference cluster" /var/lib/cortex /sbin/nologin
|
||||||
|
m cortex cortex
|
||||||
6
data/neuron-firewalld.xml
Normal file
6
data/neuron-firewalld.xml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<service>
|
||||||
|
<short>helexa-neuron</short>
|
||||||
|
<description>Neuron — per-node GPU discovery and harness daemon for cortex</description>
|
||||||
|
<port protocol="tcp" port="13131"/>
|
||||||
|
</service>
|
||||||
3
data/neuron-sysusers.conf
Normal file
3
data/neuron-sysusers.conf
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
g neuron - -
|
||||||
|
u neuron - "Neuron GPU node daemon" /var/lib/neuron /sbin/nologin
|
||||||
|
m neuron neuron
|
||||||
@@ -5,11 +5,11 @@ Wants=network-online.target
|
|||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
ExecStart=/usr/bin/neuron --config /etc/cortex/neuron.toml
|
ExecStart=/usr/bin/neuron --config /etc/neuron/neuron.toml
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=5
|
RestartSec=5
|
||||||
User=cortex
|
User=neuron
|
||||||
Group=cortex
|
Group=neuron
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
101
helexa-neuron.spec
Normal file
101
helexa-neuron.spec
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
Name: helexa-neuron
|
||||||
|
Version: 0.1.16
|
||||||
|
Release: 1%{?dist}
|
||||||
|
Summary: Per-node GPU discovery and harness management daemon for cortex
|
||||||
|
# Package name disambiguates from Fedora's existing "neuron" package
|
||||||
|
# (NEURON neural simulation environment from Yale). Binary, systemd
|
||||||
|
# unit, and system user are still called "neuron" for brevity.
|
||||||
|
|
||||||
|
License: GPL-3.0-or-later
|
||||||
|
URL: https://git.lair.cafe/helexa/cortex
|
||||||
|
Source0: %{name}-%{version}.tar.gz
|
||||||
|
Source1: %{name}-%{version}-vendor.tar.gz
|
||||||
|
|
||||||
|
ExclusiveArch: x86_64
|
||||||
|
|
||||||
|
BuildRequires: rust >= 1.85
|
||||||
|
BuildRequires: cargo
|
||||||
|
BuildRequires: gcc
|
||||||
|
BuildRequires: gcc-c++
|
||||||
|
BuildRequires: cmake
|
||||||
|
BuildRequires: perl-interpreter
|
||||||
|
BuildRequires: pkgconfig(openssl)
|
||||||
|
BuildRequires: systemd-rpm-macros
|
||||||
|
|
||||||
|
Requires(pre): shadow-utils
|
||||||
|
Requires: systemd
|
||||||
|
Requires: firewalld-filesystem
|
||||||
|
|
||||||
|
# systemd-rpm-macros ships a unit dep generator that parses User=/Group=
|
||||||
|
# from our .service file and emits Requires: user(neuron)/group(neuron).
|
||||||
|
# rpm's sysusers provides-generator emits the unversioned form for groups
|
||||||
|
# but only a versioned user(neuron) = <base64> for users with GECOS/home/
|
||||||
|
# shell. Provide the unversioned user(neuron) explicitly so dnf can resolve
|
||||||
|
# the auto-generated Requires. Without this, dnf5 silently filters the
|
||||||
|
# package and reports "Nothing to do".
|
||||||
|
Provides: user(neuron)
|
||||||
|
|
||||||
|
%description
|
||||||
|
Neuron is a per-node daemon for cortex inference clusters. It discovers
|
||||||
|
local GPU hardware via nvidia-smi, runs in-process inference via
|
||||||
|
huggingface/candle, and exposes an HTTP API for model lifecycle
|
||||||
|
management (load, unload, list, inference endpoint).
|
||||||
|
|
||||||
|
%prep
|
||||||
|
%autosetup
|
||||||
|
tar xf %{SOURCE1}
|
||||||
|
mkdir -p .cargo
|
||||||
|
cat > .cargo/config.toml << 'EOF'
|
||||||
|
[source.crates-io]
|
||||||
|
replace-with = "vendored-sources"
|
||||||
|
|
||||||
|
[source.vendored-sources]
|
||||||
|
directory = "vendor"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
%build
|
||||||
|
cargo build --release -p neuron
|
||||||
|
|
||||||
|
%install
|
||||||
|
install -Dm755 target/release/neuron %{buildroot}%{_bindir}/neuron
|
||||||
|
install -Dm644 data/neuron.service %{buildroot}%{_unitdir}/neuron.service
|
||||||
|
install -Dm644 data/neuron-sysusers.conf %{buildroot}%{_sysusersdir}/neuron.conf
|
||||||
|
install -Dm644 data/neuron-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/helexa-neuron.xml
|
||||||
|
install -dm755 %{buildroot}%{_sysconfdir}/neuron
|
||||||
|
install -Dm644 neuron.example.toml %{buildroot}%{_sysconfdir}/neuron/neuron.toml
|
||||||
|
|
||||||
|
%pre
|
||||||
|
%sysusers_create_compat %{_builddir}/%{name}-%{version}/data/neuron-sysusers.conf
|
||||||
|
|
||||||
|
%post
|
||||||
|
%systemd_post neuron.service
|
||||||
|
|
||||||
|
%preun
|
||||||
|
%systemd_preun neuron.service
|
||||||
|
|
||||||
|
%postun
|
||||||
|
%systemd_postun_with_restart neuron.service
|
||||||
|
|
||||||
|
%files
|
||||||
|
%license LICENSE
|
||||||
|
%doc README.md
|
||||||
|
%{_bindir}/neuron
|
||||||
|
%{_unitdir}/neuron.service
|
||||||
|
%{_sysusersdir}/neuron.conf
|
||||||
|
%{_prefix}/lib/firewalld/services/helexa-neuron.xml
|
||||||
|
%dir %{_sysconfdir}/neuron
|
||||||
|
%config(noreplace) %{_sysconfdir}/neuron/neuron.toml
|
||||||
|
|
||||||
|
%changelog
|
||||||
|
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.16-1
|
||||||
|
- chore: ignore local deploy script
|
||||||
|
- chore: move default ports out of common-collision ranges
|
||||||
|
- ci: drop actions/cache for cargo registry and target
|
||||||
|
|
||||||
|
* Thu Apr 16 2026 Gitea Actions <actions@git.lair.cafe> - 0.1.14-1
|
||||||
|
- ci: publish both packages to a single helexa/helexa COPR project
|
||||||
|
- fix(rpm): rename neuron package to helexa-neuron
|
||||||
|
- ci: commit generated %changelog entries back to main
|
||||||
|
|
||||||
|
* Wed Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
|
||||||
|
- Initial package
|
||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
[[models]]
|
[[models]]
|
||||||
id = "your-org/large-model"
|
id = "your-org/large-model"
|
||||||
harness = "mistralrs"
|
harness = "candle"
|
||||||
quant = "Q4_K_M"
|
quant = "Q4_K_M"
|
||||||
vram_mb = 19000
|
vram_mb = 19000
|
||||||
min_devices = 2
|
min_devices = 2
|
||||||
@@ -15,7 +15,7 @@ pinned_on = ["gpu-large"]
|
|||||||
|
|
||||||
[[models]]
|
[[models]]
|
||||||
id = "your-org/medium-model"
|
id = "your-org/medium-model"
|
||||||
harness = "mistralrs"
|
harness = "candle"
|
||||||
quant = "Q6_K"
|
quant = "Q6_K"
|
||||||
vram_mb = 12000
|
vram_mb = 12000
|
||||||
min_devices = 1
|
min_devices = 1
|
||||||
@@ -23,7 +23,7 @@ pinned_on = ["gpu-medium"]
|
|||||||
|
|
||||||
[[models]]
|
[[models]]
|
||||||
id = "your-org/embedding-model"
|
id = "your-org/embedding-model"
|
||||||
harness = "mistralrs"
|
harness = "candle"
|
||||||
quant = "Q8_0"
|
quant = "Q8_0"
|
||||||
vram_mb = 8000
|
vram_mb = 8000
|
||||||
min_devices = 1
|
min_devices = 1
|
||||||
|
|||||||
@@ -1,16 +1,24 @@
|
|||||||
# neuron.example.toml — example configuration
|
# neuron.example.toml — example configuration
|
||||||
#
|
#
|
||||||
# Copy to /etc/cortex/neuron.toml and adjust for your environment.
|
# Copy to /etc/neuron/neuron.toml and adjust for your environment.
|
||||||
#
|
#
|
||||||
# Environment variable overrides use NEURON_ prefix with __ separators:
|
# Environment variable overrides use NEURON_ prefix with __ separators:
|
||||||
# NEURON_PORT=9090
|
# NEURON_PORT=13131
|
||||||
|
|
||||||
port = 9090
|
port = 13131
|
||||||
|
|
||||||
# -- Harnesses ---------------------------------------------------------------
|
# -- Harnesses ---------------------------------------------------------------
|
||||||
# Each [[harnesses]] entry declares an inference engine managed by neuron.
|
# Each [[harnesses]] entry enables an inference engine. Currently only
|
||||||
|
# "candle" is supported — it runs in-process and uses huggingface/candle
|
||||||
|
# for inference on local CUDA devices (or CPU when CUDA is unavailable).
|
||||||
|
|
||||||
[[harnesses]]
|
[[harnesses]]
|
||||||
name = "mistralrs"
|
name = "candle"
|
||||||
endpoint = "http://localhost:8080"
|
|
||||||
systemd_unit = "mistralrs.service"
|
# -- Candle harness settings -------------------------------------------------
|
||||||
|
# Optional tuning for the candle harness.
|
||||||
|
|
||||||
|
[harness.candle]
|
||||||
|
# HuggingFace cache directory for model weights. When unset, hf-hub's
|
||||||
|
# default (~/.cache/huggingface) is used.
|
||||||
|
# hf_cache = "/var/lib/neuron/hf-cache"
|
||||||
|
|||||||
69
neuron.spec
69
neuron.spec
@@ -1,69 +0,0 @@
|
|||||||
Name: neuron
|
|
||||||
Version: 0.1.0
|
|
||||||
Release: 1%{?dist}
|
|
||||||
Summary: Per-node GPU discovery and harness management daemon for cortex
|
|
||||||
|
|
||||||
License: GPL-3.0-or-later
|
|
||||||
URL: https://git.lair.cafe/helexa/cortex
|
|
||||||
Source0: %{name}-%{version}.tar.gz
|
|
||||||
Source1: %{name}-%{version}-vendor.tar.gz
|
|
||||||
|
|
||||||
ExclusiveArch: x86_64
|
|
||||||
|
|
||||||
BuildRequires: rust >= 1.85
|
|
||||||
BuildRequires: cargo
|
|
||||||
BuildRequires: gcc
|
|
||||||
BuildRequires: systemd-rpm-macros
|
|
||||||
|
|
||||||
Requires(pre): shadow-utils
|
|
||||||
|
|
||||||
%description
|
|
||||||
Neuron is a per-node daemon for cortex inference clusters. It discovers
|
|
||||||
local GPU hardware via nvidia-smi, manages inference harnesses (mistral.rs,
|
|
||||||
llama.cpp), and exposes an HTTP API for model lifecycle management.
|
|
||||||
|
|
||||||
%prep
|
|
||||||
%autosetup
|
|
||||||
tar xf %{SOURCE1}
|
|
||||||
mkdir -p .cargo
|
|
||||||
cat > .cargo/config.toml << 'EOF'
|
|
||||||
[source.crates-io]
|
|
||||||
replace-with = "vendored-sources"
|
|
||||||
|
|
||||||
[source.vendored-sources]
|
|
||||||
directory = "vendor"
|
|
||||||
EOF
|
|
||||||
|
|
||||||
%build
|
|
||||||
cargo build --release -p neuron
|
|
||||||
|
|
||||||
%install
|
|
||||||
install -Dm755 target/release/neuron %{buildroot}%{_bindir}/neuron
|
|
||||||
install -Dm644 data/neuron.service %{buildroot}%{_unitdir}/neuron.service
|
|
||||||
install -dm750 %{buildroot}%{_sysconfdir}/cortex
|
|
||||||
install -Dm640 neuron.example.toml %{buildroot}%{_sysconfdir}/cortex/neuron.toml
|
|
||||||
|
|
||||||
%pre
|
|
||||||
getent group cortex >/dev/null || groupadd -r cortex
|
|
||||||
getent passwd cortex >/dev/null || useradd -r -g cortex -d /var/lib/cortex -s /sbin/nologin cortex
|
|
||||||
|
|
||||||
%post
|
|
||||||
%systemd_post neuron.service
|
|
||||||
|
|
||||||
%preun
|
|
||||||
%systemd_preun neuron.service
|
|
||||||
|
|
||||||
%postun
|
|
||||||
%systemd_postun_with_restart neuron.service
|
|
||||||
|
|
||||||
%files
|
|
||||||
%license LICENSE
|
|
||||||
%doc README.md
|
|
||||||
%{_bindir}/neuron
|
|
||||||
%{_unitdir}/neuron.service
|
|
||||||
%dir %attr(750,root,cortex) %{_sysconfdir}/cortex
|
|
||||||
%config(noreplace) %attr(640,root,cortex) %{_sysconfdir}/cortex/neuron.toml
|
|
||||||
|
|
||||||
%changelog
|
|
||||||
* Tue Apr 15 2026 Rob Thijssen <grenade@rob.tn> - 0.1.0-1
|
|
||||||
- Initial package
|
|
||||||
102
rpm/cortex-prerelease.spec
Normal file
102
rpm/cortex-prerelease.spec
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
# Prebuilt-binary spec for cortex.
|
||||||
|
#
|
||||||
|
# Unlike cortex.spec (which builds from source via cargo), this spec
|
||||||
|
# wraps a pre-built `cortex` binary produced by an upstream CI job and
|
||||||
|
# packages it for rpm.lair.cafe. The %build phase is a no-op.
|
||||||
|
#
|
||||||
|
# Required defines at rpmbuild time:
|
||||||
|
# cortex_version e.g. "0.1.16"
|
||||||
|
# cortex_prerelease e.g. "0.1.20260518gitabcdef0" (used as Release)
|
||||||
|
|
||||||
|
%global _build_id_links none
|
||||||
|
%global debug_package %{nil}
|
||||||
|
%global __strip /usr/bin/true
|
||||||
|
|
||||||
|
%{!?cortex_version: %global cortex_version 0.0.0}
|
||||||
|
%if 0%{?cortex_prerelease:1}
|
||||||
|
%global cortex_release %{cortex_prerelease}
|
||||||
|
%else
|
||||||
|
%global cortex_release 1
|
||||||
|
%endif
|
||||||
|
|
||||||
|
Name: cortex
|
||||||
|
Version: %{cortex_version}
|
||||||
|
Release: %{cortex_release}%{?dist}
|
||||||
|
Summary: Inference gateway for multi-node GPU clusters (prebuilt)
|
||||||
|
|
||||||
|
License: GPL-3.0-or-later
|
||||||
|
URL: https://git.lair.cafe/helexa/cortex
|
||||||
|
|
||||||
|
Source0: cortex
|
||||||
|
Source1: cortex.service
|
||||||
|
Source2: cortex-sysusers.conf
|
||||||
|
Source3: cortex-firewalld.xml
|
||||||
|
Source4: cortex.example.toml
|
||||||
|
Source5: models.example.toml
|
||||||
|
Source6: LICENSE
|
||||||
|
|
||||||
|
ExclusiveArch: x86_64
|
||||||
|
|
||||||
|
Requires(pre): shadow-utils
|
||||||
|
Requires: systemd
|
||||||
|
Requires: firewalld-filesystem
|
||||||
|
|
||||||
|
Provides: user(cortex)
|
||||||
|
|
||||||
|
%description
|
||||||
|
Cortex is a Rust reverse-proxy that sits in front of multiple neuron
|
||||||
|
inference daemons and presents a unified OpenAI and Anthropic
|
||||||
|
compatible API surface.
|
||||||
|
|
||||||
|
This package wraps a binary built upstream in CI; the source-build
|
||||||
|
spec (cortex.spec) remains available for stable releases.
|
||||||
|
|
||||||
|
%prep
|
||||||
|
cp %{SOURCE0} ./cortex
|
||||||
|
cp %{SOURCE1} .
|
||||||
|
cp %{SOURCE2} .
|
||||||
|
cp %{SOURCE3} .
|
||||||
|
cp %{SOURCE4} .
|
||||||
|
cp %{SOURCE5} .
|
||||||
|
cp %{SOURCE6} .
|
||||||
|
|
||||||
|
%build
|
||||||
|
# Already built in the upstream CI build job.
|
||||||
|
|
||||||
|
%install
|
||||||
|
install -Dm755 cortex %{buildroot}%{_bindir}/cortex
|
||||||
|
install -Dm644 cortex.service %{buildroot}%{_unitdir}/cortex.service
|
||||||
|
install -Dm644 cortex-sysusers.conf %{buildroot}%{_sysusersdir}/cortex.conf
|
||||||
|
install -Dm644 cortex-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/cortex.xml
|
||||||
|
install -dm755 %{buildroot}%{_sysconfdir}/cortex
|
||||||
|
install -Dm644 cortex.example.toml %{buildroot}%{_sysconfdir}/cortex/cortex.toml
|
||||||
|
install -Dm644 models.example.toml %{buildroot}%{_sysconfdir}/cortex/models.toml
|
||||||
|
|
||||||
|
%pre
|
||||||
|
getent group cortex >/dev/null || groupadd -r cortex
|
||||||
|
getent passwd cortex >/dev/null || \
|
||||||
|
useradd -r -g cortex -d /var/lib/cortex -s /sbin/nologin \
|
||||||
|
-c "Cortex inference gateway" cortex
|
||||||
|
|
||||||
|
%post
|
||||||
|
%systemd_post cortex.service
|
||||||
|
|
||||||
|
%preun
|
||||||
|
%systemd_preun cortex.service
|
||||||
|
|
||||||
|
%postun
|
||||||
|
%systemd_postun_with_restart cortex.service
|
||||||
|
|
||||||
|
%files
|
||||||
|
%license LICENSE
|
||||||
|
%{_bindir}/cortex
|
||||||
|
%{_unitdir}/cortex.service
|
||||||
|
%{_sysusersdir}/cortex.conf
|
||||||
|
%{_prefix}/lib/firewalld/services/cortex.xml
|
||||||
|
%dir %{_sysconfdir}/cortex
|
||||||
|
%config(noreplace) %{_sysconfdir}/cortex/cortex.toml
|
||||||
|
%config(noreplace) %{_sysconfdir}/cortex/models.toml
|
||||||
|
|
||||||
|
%changelog
|
||||||
|
* Mon May 18 2026 Gitea Actions <actions@git.lair.cafe> - %{cortex_version}-%{cortex_release}
|
||||||
|
- Prerelease build from upstream CI binary.
|
||||||
122
rpm/helexa-neuron-prerelease.spec
Normal file
122
rpm/helexa-neuron-prerelease.spec
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
# Prebuilt-binary spec for helexa-neuron flavoured by CUDA compute capability.
|
||||||
|
#
|
||||||
|
# Unlike helexa-neuron.spec (which builds from source via cargo), this
|
||||||
|
# spec wraps a pre-built `neuron-{flavour}` binary produced by an
|
||||||
|
# upstream CI job and packages it for rpm.lair.cafe. The %build phase
|
||||||
|
# is a no-op.
|
||||||
|
#
|
||||||
|
# Required defines at rpmbuild time:
|
||||||
|
# neuron_version e.g. "0.1.16"
|
||||||
|
# neuron_flavour e.g. "ada", "blackwell" — matches the CI build
|
||||||
|
# matrix's compute_cap label.
|
||||||
|
# neuron_prerelease e.g. "0.1.20260518gitabcdef0" (used as Release)
|
||||||
|
#
|
||||||
|
# One flavour can be installed at a time on a given host; flavour
|
||||||
|
# packages Conflict with each other.
|
||||||
|
|
||||||
|
%global _build_id_links none
|
||||||
|
%global debug_package %{nil}
|
||||||
|
%global __strip /usr/bin/true
|
||||||
|
|
||||||
|
%{!?neuron_version: %global neuron_version 0.0.0}
|
||||||
|
%{!?neuron_flavour: %global neuron_flavour blackwell}
|
||||||
|
%if 0%{?neuron_prerelease:1}
|
||||||
|
%global neuron_release %{neuron_prerelease}
|
||||||
|
%else
|
||||||
|
%global neuron_release 1
|
||||||
|
%endif
|
||||||
|
|
||||||
|
Name: helexa-neuron-%{neuron_flavour}
|
||||||
|
Version: %{neuron_version}
|
||||||
|
Release: %{neuron_release}%{?dist}
|
||||||
|
Summary: Per-node GPU inference daemon (candle, %{neuron_flavour} flavour)
|
||||||
|
|
||||||
|
License: GPL-3.0-or-later
|
||||||
|
URL: https://git.lair.cafe/helexa/cortex
|
||||||
|
|
||||||
|
Source0: neuron-%{neuron_flavour}
|
||||||
|
Source1: neuron.service
|
||||||
|
Source2: neuron-sysusers.conf
|
||||||
|
Source3: neuron-firewalld.xml
|
||||||
|
Source4: neuron.example.toml
|
||||||
|
Source5: LICENSE
|
||||||
|
|
||||||
|
ExclusiveArch: x86_64
|
||||||
|
|
||||||
|
# Binary links against the CUDA runtime, cuDNN, NCCL, etc. Suppress
|
||||||
|
# auto-detected exact soname deps — users may have CUDA from various
|
||||||
|
# sources (rpmfusion, nvidia-direct) at different compatible versions;
|
||||||
|
# a runtime dlopen failure surfaces a clearer error than rpm dep
|
||||||
|
# resolution would.
|
||||||
|
%global __requires_exclude ^lib(cuda|cudart|cudnn|cublas|cublasLt|curand|nvrtc|nccl)
|
||||||
|
|
||||||
|
Requires(pre): shadow-utils
|
||||||
|
Requires: systemd
|
||||||
|
Requires: firewalld-filesystem
|
||||||
|
|
||||||
|
Provides: helexa-neuron = %{neuron_version}-%{neuron_release}
|
||||||
|
Provides: user(neuron)
|
||||||
|
|
||||||
|
# Mutual exclusion across flavours and the source-build variant.
|
||||||
|
Conflicts: helexa-neuron
|
||||||
|
Conflicts: helexa-neuron-ada
|
||||||
|
Conflicts: helexa-neuron-ampere
|
||||||
|
Conflicts: helexa-neuron-blackwell
|
||||||
|
# (The Conflicts: with self is filtered by rpm at install time.)
|
||||||
|
|
||||||
|
%description
|
||||||
|
Neuron is the per-node daemon for cortex inference clusters. It
|
||||||
|
discovers local GPU hardware via nvidia-smi, runs in-process
|
||||||
|
inference via huggingface/candle, and exposes an HTTP API for model
|
||||||
|
lifecycle management (load, unload, list, inference endpoint).
|
||||||
|
|
||||||
|
This is the %{neuron_flavour} flavour, built for that CUDA compute
|
||||||
|
capability. Install the flavour matching the GPUs on this host.
|
||||||
|
|
||||||
|
%prep
|
||||||
|
cp %{SOURCE0} ./neuron
|
||||||
|
cp %{SOURCE1} .
|
||||||
|
cp %{SOURCE2} .
|
||||||
|
cp %{SOURCE3} .
|
||||||
|
cp %{SOURCE4} .
|
||||||
|
cp %{SOURCE5} .
|
||||||
|
|
||||||
|
%build
|
||||||
|
# Already built in the upstream CI build job (with --features cuda).
|
||||||
|
|
||||||
|
%install
|
||||||
|
install -Dm755 neuron %{buildroot}%{_bindir}/neuron
|
||||||
|
install -Dm644 neuron.service %{buildroot}%{_unitdir}/neuron.service
|
||||||
|
install -Dm644 neuron-sysusers.conf %{buildroot}%{_sysusersdir}/neuron.conf
|
||||||
|
install -Dm644 neuron-firewalld.xml %{buildroot}%{_prefix}/lib/firewalld/services/helexa-neuron.xml
|
||||||
|
install -dm755 %{buildroot}%{_sysconfdir}/neuron
|
||||||
|
install -Dm644 neuron.example.toml %{buildroot}%{_sysconfdir}/neuron/neuron.toml
|
||||||
|
|
||||||
|
%pre
|
||||||
|
getent group neuron >/dev/null || groupadd -r neuron
|
||||||
|
getent passwd neuron >/dev/null || \
|
||||||
|
useradd -r -g neuron -d /var/lib/neuron -s /sbin/nologin \
|
||||||
|
-G video,render \
|
||||||
|
-c "Neuron GPU node daemon" neuron
|
||||||
|
|
||||||
|
%post
|
||||||
|
%systemd_post neuron.service
|
||||||
|
|
||||||
|
%preun
|
||||||
|
%systemd_preun neuron.service
|
||||||
|
|
||||||
|
%postun
|
||||||
|
%systemd_postun_with_restart neuron.service
|
||||||
|
|
||||||
|
%files
|
||||||
|
%license LICENSE
|
||||||
|
%{_bindir}/neuron
|
||||||
|
%{_unitdir}/neuron.service
|
||||||
|
%{_sysusersdir}/neuron.conf
|
||||||
|
%{_prefix}/lib/firewalld/services/helexa-neuron.xml
|
||||||
|
%dir %{_sysconfdir}/neuron
|
||||||
|
%config(noreplace) %{_sysconfdir}/neuron/neuron.toml
|
||||||
|
|
||||||
|
%changelog
|
||||||
|
* Mon May 18 2026 Gitea Actions <actions@git.lair.cafe> - %{neuron_version}-%{neuron_release}
|
||||||
|
- Prerelease build from upstream CI binary (%{neuron_flavour} flavour).
|
||||||
1
rpm/rpmmacros
Normal file
1
rpm/rpmmacros
Normal file
@@ -0,0 +1 @@
|
|||||||
|
%_openpgp_sign_id @GPG_NAME@
|
||||||
154
script/generate-packages-json.py
Executable file
154
script/generate-packages-json.py
Executable file
@@ -0,0 +1,154 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Parse RPM repodata and emit a packages.json manifest for the UI."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
RPM_NS = "http://linux.duke.edu/metadata/common"
|
||||||
|
OTHER_NS = "http://linux.duke.edu/metadata/other"
|
||||||
|
REPO_NS = "http://linux.duke.edu/metadata/repo"
|
||||||
|
|
||||||
|
|
||||||
|
def find_repodata_file(repodata_dir, data_type):
|
||||||
|
"""Read repomd.xml and return the path to a specific data type's file."""
|
||||||
|
repomd_path = os.path.join(repodata_dir, "repomd.xml")
|
||||||
|
tree = ET.parse(repomd_path)
|
||||||
|
root = tree.getroot()
|
||||||
|
|
||||||
|
for data in root.findall(f"{{{REPO_NS}}}data"):
|
||||||
|
if data.get("type") == data_type:
|
||||||
|
location = data.find(f"{{{REPO_NS}}}location")
|
||||||
|
if location is not None:
|
||||||
|
href = location.get("href", "")
|
||||||
|
return os.path.join(os.path.dirname(repodata_dir), href)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def open_compressed(path):
|
||||||
|
"""Open a gzip or zstd compressed file for reading."""
|
||||||
|
if path.endswith(".zst"):
|
||||||
|
result = subprocess.run(
|
||||||
|
["zstdcat", path], capture_output=True, check=True
|
||||||
|
)
|
||||||
|
import io
|
||||||
|
return io.BytesIO(result.stdout)
|
||||||
|
else:
|
||||||
|
return gzip.open(path, "rb")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_primary(repodata_dir):
|
||||||
|
"""Parse primary.xml.{gz,zst} and return package metadata."""
|
||||||
|
path = find_repodata_file(repodata_dir, "primary")
|
||||||
|
if not path:
|
||||||
|
print("error: primary metadata not found in repomd.xml", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
packages = {}
|
||||||
|
with open_compressed(path) as f:
|
||||||
|
tree = ET.parse(f)
|
||||||
|
|
||||||
|
for pkg in tree.getroot().findall(f"{{{RPM_NS}}}package"):
|
||||||
|
if pkg.get("type") != "rpm":
|
||||||
|
continue
|
||||||
|
|
||||||
|
name = pkg.findtext(f"{{{RPM_NS}}}name", "")
|
||||||
|
version_el = pkg.find(f"{{{RPM_NS}}}version")
|
||||||
|
ver = version_el.get("ver", "") if version_el is not None else ""
|
||||||
|
rel = version_el.get("rel", "") if version_el is not None else ""
|
||||||
|
arch = pkg.findtext(f"{{{RPM_NS}}}arch", "")
|
||||||
|
|
||||||
|
size_el = pkg.find(f"{{{RPM_NS}}}size")
|
||||||
|
size = int(size_el.get("package", "0")) if size_el is not None else 0
|
||||||
|
|
||||||
|
time_el = pkg.find(f"{{{RPM_NS}}}time")
|
||||||
|
build_time = int(time_el.get("build", "0")) if time_el is not None else 0
|
||||||
|
|
||||||
|
location_el = pkg.find(f"{{{RPM_NS}}}location")
|
||||||
|
filename = os.path.basename(location_el.get("href", "")) if location_el is not None else ""
|
||||||
|
|
||||||
|
key = f"{name}-{ver}-{rel}"
|
||||||
|
packages[key] = {
|
||||||
|
"name": name,
|
||||||
|
"version": ver,
|
||||||
|
"release": rel,
|
||||||
|
"arch": arch,
|
||||||
|
"summary": pkg.findtext(f"{{{RPM_NS}}}summary", ""),
|
||||||
|
"size": size,
|
||||||
|
"buildTime": build_time,
|
||||||
|
"rpmFilename": filename,
|
||||||
|
"changelog": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
return packages
|
||||||
|
|
||||||
|
|
||||||
|
def parse_other(repodata_dir, packages):
|
||||||
|
"""Parse other.xml.gz and attach changelog entries to packages."""
|
||||||
|
path = find_repodata_file(repodata_dir, "other")
|
||||||
|
if not path:
|
||||||
|
return
|
||||||
|
|
||||||
|
with open_compressed(path) as f:
|
||||||
|
tree = ET.parse(f)
|
||||||
|
|
||||||
|
for pkg in tree.getroot().findall(f"{{{OTHER_NS}}}package"):
|
||||||
|
name = pkg.get("name", "")
|
||||||
|
version_el = pkg.find(f"{{{OTHER_NS}}}version")
|
||||||
|
ver = version_el.get("ver", "") if version_el is not None else ""
|
||||||
|
rel = version_el.get("rel", "") if version_el is not None else ""
|
||||||
|
key = f"{name}-{ver}-{rel}"
|
||||||
|
|
||||||
|
if key not in packages:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for entry in pkg.findall(f"{{{OTHER_NS}}}changelog"):
|
||||||
|
packages[key]["changelog"].append({
|
||||||
|
"author": entry.get("author", ""),
|
||||||
|
"date": int(entry.get("date", "0")),
|
||||||
|
"text": (entry.text or "").strip(),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
|
parser.add_argument(
|
||||||
|
"--repodata-dir",
|
||||||
|
required=True,
|
||||||
|
help="path to the repodata/ directory",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
required=True,
|
||||||
|
help="path to write packages.json",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--base-url",
|
||||||
|
required=True,
|
||||||
|
help="public base URL for the repo (e.g. https://rpm.lair.cafe/fedora/43/x86_64)",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
packages = parse_primary(args.repodata_dir)
|
||||||
|
parse_other(args.repodata_dir, packages)
|
||||||
|
|
||||||
|
manifest = {
|
||||||
|
"generated": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"baseUrl": args.base_url,
|
||||||
|
"packages": list(packages.values()),
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(args.output, "w") as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
print(f"wrote {len(packages)} packages to {args.output}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user