fix(neuron): macro-ify CUDA single-GPU route_token so DecodeStream type stays inferred
All checks were successful
CI / CUDA type-check (push) Successful in 32s
build-prerelease / Resolve version stamps (push) Successful in 29s
CI / Format (push) Successful in 29s
CI / Clippy (push) Successful in 2m47s
build-prerelease / Build cortex binary (push) Successful in 4m27s
CI / Test (push) Successful in 5m40s
build-prerelease / Build neuron-blackwell (push) Successful in 5m47s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 8m30s
build-prerelease / Build neuron-ada (push) Successful in 5m39s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m2s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m11s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 4m1s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m5s
All checks were successful
CI / CUDA type-check (push) Successful in 32s
build-prerelease / Resolve version stamps (push) Successful in 29s
CI / Format (push) Successful in 29s
CI / Clippy (push) Successful in 2m47s
build-prerelease / Build cortex binary (push) Successful in 4m27s
CI / Test (push) Successful in 5m40s
build-prerelease / Build neuron-blackwell (push) Successful in 5m47s
CI / Build cortex SRPM (push) Has been skipped
CI / Publish cortex to COPR (push) Has been skipped
CI / Build neuron SRPM (push) Has been skipped
CI / Publish neuron to COPR (push) Has been skipped
CI / Bump version in source (push) Has been skipped
build-prerelease / Package cortex RPM (push) Successful in 1m21s
build-prerelease / Build neuron-ampere (push) Successful in 8m30s
build-prerelease / Build neuron-ada (push) Successful in 5m39s
build-prerelease / Package helexa-neuron-ada RPM (push) Successful in 3m2s
build-prerelease / Package helexa-neuron-ampere RPM (push) Successful in 3m11s
build-prerelease / Package helexa-neuron-blackwell RPM (push) Successful in 4m1s
build-prerelease / Publish to rpm.lair.cafe (unstable) (push) Successful in 1m5s
Prerelease build (run 270) failed on commit cb30383 with:
error[E0107]: struct takes 5 generic arguments but 0 generic
arguments were supplied
--> crates/neuron/src/harness/candle.rs:3554:41
|
3554 | decode_stream: &mut tokenizers::DecodeStream<'_>,
| ^^^^^^^^^^^^
The Step-2-era refactor for #6's tool-call extraction added a
nested `async fn route_token` inside `stream_inference_via_worker`
that named `tokenizers::DecodeStream<'_>` as a parameter type.
`DecodeStream` actually has a lifetime plus five generic type
parameters (`'tok, M, N, PT, PP, D`) which makes naming it explicitly
painful — the working approach the CPU path uses is a macro,
where the body expands inline at the call site and the
decoder type stays inferred.
This commit replicates the CPU-side macro for the CUDA worker
path. Same shape, just with `.await` calls inside (macros tolerate
that since they expand inline into the enclosing async context).
Control flow uses a labelled-block + `consumer_alive` flag rather
than `return` so the macro stays generic over the surrounding
return type.
The CPU build (default-feature workspace, what `clippy` and `test`
jobs exercise) doesn't compile this `#[cfg(feature = "cuda")]`
branch, which is why local CI green-lit it. The cuda-check job
should catch this category of breakage now that #cb30383+CI-fix
landed; this commit just resolves the actual breakage on the
prerelease workflow.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -3537,29 +3537,38 @@ async fn stream_inference_via_worker(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Inlined per-token routing — parallel to the TP path. Macro
|
// Per-token routing. `tokenizers::DecodeStream` carries five
|
||||||
// approach used in the CPU path doesn't translate cleanly
|
// generic parameters (`M, N, PT, PP, D`) which makes naming
|
||||||
// here because the emit is async (.await) and macros don't
|
// its type from a helper signature painful. Use a macro
|
||||||
// tolerate `.await` inside reused expansions across two
|
// instead — the body expands inline with `decode_stream`'s
|
||||||
// call sites well.
|
// concrete type inferred from the call site. The macro
|
||||||
async fn route_token(
|
// contains `.await` calls, so it can only expand inside an
|
||||||
next_token: u32,
|
// `async` context (which both call sites below are).
|
||||||
all_tokens: &mut Vec<u32>,
|
//
|
||||||
in_reasoning: &mut bool,
|
// The macro takes a single `$next_token` expression and
|
||||||
in_tool_call: &mut bool,
|
// returns control to the enclosing scope via `break 'route`
|
||||||
tool_call_buf: &mut String,
|
// (success path) — labels are needed because Rust macros can't
|
||||||
tool_call_idx: &mut usize,
|
// emit naked `return` from the caller when the caller's return
|
||||||
reasoning_tokens: Option<&ReasoningTokenPair>,
|
// type isn't `()`. Instead the macro `break`s out of a
|
||||||
tool_call_tokens: Option<&ToolCallTokenPair>,
|
// labelled block, and the surrounding `if !route_token!(..) { ... }`
|
||||||
decode_stream: &mut tokenizers::DecodeStream<'_>,
|
// checks whether the consumer hung up via the returned `consumer_alive`
|
||||||
tx: &mpsc::Sender<InferenceEvent>,
|
// flag.
|
||||||
) -> bool {
|
macro_rules! route_token {
|
||||||
all_tokens.push(next_token);
|
($next_token:expr) => {{
|
||||||
match handle_tool_call_marker(next_token, tool_call_tokens, in_tool_call, tool_call_buf) {
|
let nt = $next_token;
|
||||||
ToolCallMarker::Enter => return true,
|
all_tokens.push(nt);
|
||||||
|
let mut consumer_alive = true;
|
||||||
|
'route: {
|
||||||
|
match handle_tool_call_marker(
|
||||||
|
nt,
|
||||||
|
tool_call_tokens.as_ref(),
|
||||||
|
&mut in_tool_call,
|
||||||
|
&mut tool_call_buf,
|
||||||
|
) {
|
||||||
|
ToolCallMarker::Enter => break 'route,
|
||||||
ToolCallMarker::Exit { buffer } => {
|
ToolCallMarker::Exit { buffer } => {
|
||||||
let idx = *tool_call_idx;
|
let idx = tool_call_idx;
|
||||||
*tool_call_idx += 1;
|
tool_call_idx += 1;
|
||||||
match parse_tool_call_body(&buffer, idx) {
|
match parse_tool_call_body(&buffer, idx) {
|
||||||
Some((id, name, arguments)) => {
|
Some((id, name, arguments)) => {
|
||||||
if tx
|
if tx
|
||||||
@@ -3572,65 +3581,59 @@ async fn stream_inference_via_worker(
|
|||||||
.await
|
.await
|
||||||
.is_err()
|
.is_err()
|
||||||
{
|
{
|
||||||
return false;
|
consumer_alive = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
let open = tool_call_tokens
|
let open = tool_call_tokens
|
||||||
|
.as_ref()
|
||||||
.map(|p| p.open_text.as_str())
|
.map(|p| p.open_text.as_str())
|
||||||
.unwrap_or("<tool_call>");
|
.unwrap_or("<tool_call>");
|
||||||
let close = tool_call_tokens
|
let close = tool_call_tokens
|
||||||
|
.as_ref()
|
||||||
.map(|p| p.close_text.as_str())
|
.map(|p| p.close_text.as_str())
|
||||||
.unwrap_or("</tool_call>");
|
.unwrap_or("</tool_call>");
|
||||||
let raw = format!("{open}{buffer}{close}");
|
let raw = format!("{open}{buffer}{close}");
|
||||||
if !emit_delta(&raw, tx, *in_reasoning).await {
|
if !emit_delta(&raw, &tx, in_reasoning).await {
|
||||||
return false;
|
consumer_alive = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
break 'route;
|
||||||
}
|
}
|
||||||
ToolCallMarker::None => {}
|
ToolCallMarker::None => {}
|
||||||
}
|
}
|
||||||
if *in_tool_call {
|
if in_tool_call {
|
||||||
match decode_stream.step(next_token) {
|
match decode_stream.step(nt) {
|
||||||
Ok(Some(s)) => tool_call_buf.push_str(&s),
|
Ok(Some(s)) => tool_call_buf.push_str(&s),
|
||||||
Ok(None) => {}
|
Ok(None) => {}
|
||||||
Err(e) => tracing::warn!(error = %e, "decode_stream step failed (in tool_call)"),
|
Err(e) => tracing::warn!(
|
||||||
|
error = %e,
|
||||||
|
"decode_stream step failed (in tool_call)"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
return true;
|
break 'route;
|
||||||
}
|
}
|
||||||
if handle_reasoning_marker(next_token, reasoning_tokens, in_reasoning) {
|
if handle_reasoning_marker(nt, reasoning_tokens.as_ref(), &mut in_reasoning) {
|
||||||
return true;
|
break 'route;
|
||||||
}
|
}
|
||||||
match decode_stream.step(next_token) {
|
match decode_stream.step(nt) {
|
||||||
Ok(Some(delta)) => {
|
Ok(Some(delta)) => {
|
||||||
if !emit_delta(&delta, tx, *in_reasoning).await {
|
if !emit_delta(&delta, &tx, in_reasoning).await {
|
||||||
return false;
|
consumer_alive = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(None) => {}
|
Ok(None) => {}
|
||||||
Err(e) => tracing::warn!(error = %e, "decode_stream step failed"),
|
Err(e) => tracing::warn!(error = %e, "decode_stream step failed"),
|
||||||
}
|
}
|
||||||
true
|
}
|
||||||
|
consumer_alive
|
||||||
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
if Some(next_token) == eos_id {
|
if Some(next_token) == eos_id {
|
||||||
finish_reason = FinishReason::Stop;
|
finish_reason = FinishReason::Stop;
|
||||||
} else if !route_token(
|
} else if !route_token!(next_token) {
|
||||||
next_token,
|
|
||||||
&mut all_tokens,
|
|
||||||
&mut in_reasoning,
|
|
||||||
&mut in_tool_call,
|
|
||||||
&mut tool_call_buf,
|
|
||||||
&mut tool_call_idx,
|
|
||||||
reasoning_tokens.as_ref(),
|
|
||||||
tool_call_tokens.as_ref(),
|
|
||||||
&mut decode_stream,
|
|
||||||
&tx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
return Ok(finish_reason.as_openai_str().to_string());
|
return Ok(finish_reason.as_openai_str().to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3656,20 +3659,7 @@ async fn stream_inference_via_worker(
|
|||||||
finish_reason = FinishReason::Stop;
|
finish_reason = FinishReason::Stop;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if !route_token(
|
if !route_token!(next_token) {
|
||||||
next_token,
|
|
||||||
&mut all_tokens,
|
|
||||||
&mut in_reasoning,
|
|
||||||
&mut in_tool_call,
|
|
||||||
&mut tool_call_buf,
|
|
||||||
&mut tool_call_idx,
|
|
||||||
reasoning_tokens.as_ref(),
|
|
||||||
tool_call_tokens.as_ref(),
|
|
||||||
&mut decode_stream,
|
|
||||||
&tx,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
return Ok(finish_reason.as_openai_str().to_string());
|
return Ok(finish_reason.as_openai_str().to_string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user