Mirror of https://github.com/danny-avila/LibreChat.git (synced 2026-05-13 16:07:30 +00:00)
🧠 fix: charge Gemini reasoning tokens in agent usage accounting (#13014)
* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

  Resolves #13006. `usage.ts` previously billed `usage.output_tokens` directly. For Vertex AI Gemini thinking models, `@langchain/google-common`'s streaming path emits `output_tokens = candidatesTokenCount` only, dropping `thoughtsTokenCount`. Reasoning was billed at zero and the `total_tokens === input_tokens + output_tokens` invariant was broken.

  The fix lives in agents (danny-avila/agents#157), but this change is also a defense-in-depth backstop in case agents misses a path or another provider exhibits the same shape. `resolveCompletionTokens(usage)` adds `output_token_details.reasoning` back when (and only when) the gap is present (`total - input > output`), so providers that already include reasoning in `output_tokens` (OpenAI o-series, Anthropic, the Google-API wrapper) are no-ops; there is no double-counting.

  - `SplitUsage` gains a `completion` field; all four billing call sites in `processUsageGroup` use it instead of `usage.output_tokens`.
  - `total_output_tokens` in the result also reflects the corrected count.
  - The `UsageMetadata` interface in `IJobStore.ts` adds the `output_token_details` field for type safety.
  - 4 new tests in `usage.spec.ts` cover: the Vertex undercount fix, the OpenAI no-double-count case, the structured spend path with cache + reasoning, and the no-op when no details are present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

  The initial fix gated the correction on `output_token_details.reasoning > 0`, which doesn't help in the live failure case: when google-common's stream emits the buggy fallback usage_metadata, `output_token_details` is empty (`{}`) and the gate exits early. Live debugging showed the reliable signal is the documented invariant itself: `total_tokens === input_tokens + output_tokens`. When buggy streams undercount output, total exceeds input + output by exactly the unbilled reasoning, so `total - input` is used as the corrected output. This is provider-agnostic and stays a no-op for compliant providers (OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the gap is zero.

  Live verified end-to-end against gemini-3-flash-preview:
  - With the agents fix in place: output_tokens=437 → billed 437 (no-op)
  - Backstop only (no agents fix, buggy input): raw 135, billed 297 (= total 309 - input 12, matching the actual API charge)

  Updated tests to cover both scenarios.
parent: 93c4ef4ba8
commit: 0fe203aaca
3 changed files with 171 additions and 6 deletions
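As a minimal sketch of the correction described in the commit message (the real implementation is `resolveCompletionTokens` in `usage.ts`, shown in the diff below; the `billableOutput` name and the trimmed `Usage` shape here are illustrative only, not code from the repository):

  // Trimmed usage shape for illustration; the real UsageMetadata lives in IJobStore.ts.
  interface Usage {
    input_tokens?: number;
    output_tokens?: number;
    total_tokens?: number;
  }

  // Invariant-based gap check: when total_tokens exceeds input_tokens + output_tokens,
  // the difference is the unbilled reasoning, so bill total - input instead of the
  // undercounted output. A zero gap leaves compliant providers untouched.
  function billableOutput(usage: Usage): number {
    const output = Number(usage.output_tokens) || 0;
    const total = Number(usage.total_tokens) || 0;
    const input = Number(usage.input_tokens) || 0;
    return total > input + output ? total - input : output;
  }

  billableOutput({ input_tokens: 12, output_tokens: 135, total_tokens: 309 });  // 297, the live-verified backstop case
  billableOutput({ input_tokens: 100, output_tokens: 500, total_tokens: 600 }); // 500, invariant holds, no-op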
usage.spec.ts

@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('reasoning token handling - issue #13006', () => {
+    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 80657, completionTokens: 2608 },
+      );
+      expect(result?.output_tokens).toBe(2608);
+    });
+
+    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 12,
+          output_tokens: 135,
+          total_tokens: 309,
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 12, completionTokens: 297 },
+      );
+      expect(result?.output_tokens).toBe(297);
+    });
+
+    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 600,
+          output_token_details: { reasoning: 200 },
+          model: 'o1-preview',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'o1-preview' }),
+        { promptTokens: 100, completionTokens: 500 },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('routes correction through structured spend when cache tokens are present', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          input_token_details: { cache_read: 30000 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 50657, write: 0, read: 30000 },
+          completionTokens: 2608,
+        },
+      );
+    });
+
+    it('no-op when total_tokens is absent or zero', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.anything(),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(result?.output_tokens).toBe(50);
+    });
+  });
+
   describe('mixed cache and non-cache entries', () => {
     it('should handle mixed entries correctly', async () => {
       const collectedUsage: UsageMetadata[] = [
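For reference, the expected values in the tests above follow directly from the invariant-based correction:

- Vertex undercount: 83265 (total) - 80657 (input) = 2608 billed output tokens, i.e. the 766 raw output plus the 1842 reasoning tokens.
- Raw google-common path: 309 - 12 = 297.
- OpenAI o-series: 600 - 100 = 500 = output_tokens, so the invariant already holds and nothing changes.
- Structured spend: non-cached input is 80657 - 30000 (cache_read) = 50657, and the output is corrected to 2608 as above.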
usage.ts

@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
   return provider != null && SUBSET_PROVIDERS.has(provider);
 }
 
+/**
+ * Resolves `completionTokens` for billing, repairing providers whose
+ * `usage_metadata.output_tokens` undercounts.
+ *
+ * The documented `UsageMetadata` contract (`@langchain/core`) is
+ * `total_tokens === input_tokens + output_tokens`. Compliant providers
+ * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
+ * include any reasoning/thinking tokens inside `output_tokens` already,
+ * so the invariant holds.
+ *
+ * Vertex AI Gemini through `@langchain/google-common`'s streaming path
+ * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * leaving `total - input > output`. When that gap shows up we use the
+ * invariant to recover the correct billable output (`total - input`).
+ * Compliant providers have a zero gap, so this is a no-op for them.
+ *
+ * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ */
+function resolveCompletionTokens(usage: UsageMetadata): number {
+  const output = Number(usage.output_tokens) || 0;
+  const total = Number(usage.total_tokens) || 0;
+  const input = Number(usage.input_tokens) || 0;
+  if (total > input + output) {
+    return total - input;
+  }
+  return output;
+}
+
 interface SplitUsage {
   /** Non-cached input portion — what gets billed at the standard input rate */
   inputOnly: number;
@@ -57,6 +85,8 @@ interface SplitUsage {
   cacheRead: number;
   /** Total prompt tokens including cached portion */
   totalInput: number;
+  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
+  completion: number;
 }
 
 function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
   const cacheRead =
     Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
   const rawInput = Number(usage.input_tokens) || 0;
+  const completion = resolveCompletionTokens(usage);
   if (inputTokensIncludesCache(usage.provider)) {
     return {
       inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
       cacheCreation,
       cacheRead,
       totalInput: rawInput,
+      completion,
     };
   }
   return {
@@ -80,6 +112,7 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
     cacheCreation,
     cacheRead,
     totalInput: rawInput + cacheCreation + cacheRead,
+    completion,
   };
 }
 
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+    const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);
 
-    total_output_tokens += Number(usage.output_tokens) || 0;
+    total_output_tokens += completion;
 
     const txMetadata: TxMetadata = {
       user,
@@ -187,7 +220,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       )
@@ -195,7 +228,7 @@
         txMetadata,
         {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       );
@@ -211,7 +244,7 @@
           write: cacheCreation,
           read: cacheRead,
         },
-        completionTokens: usage.output_tokens,
+        completionTokens: completion,
       })
       .catch((err) => {
         logger.error(
@@ -225,7 +258,7 @@
     deps
       .spendTokens(txMetadata, {
         promptTokens: inputOnly,
-        completionTokens: usage.output_tokens,
+        completionTokens: completion,
      })
      .catch((err) => {
        logger.error(
IJobStore.ts

@@ -98,6 +98,16 @@ export interface UsageMetadata {
    * Present for Claude models. Mutually exclusive with input_token_details.
    */
   cache_read_input_tokens?: number;
+  /**
+   * Breakdown of output token counts. Per the LangChain core contract,
+   * `output_tokens` is the sum of all output token types — these fields
+   * are subsets of `output_tokens`, *not* additional charges.
+   */
+  output_token_details?: {
+    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
+    reasoning?: number;
+    audio?: number;
+  };
 }
 
 /**
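To make the subset semantics concrete, using the values from the o1-preview test above (a sketch, not code from the repository):

  const usage: UsageMetadata = {
    input_tokens: 100,
    output_tokens: 500, // already includes the 200 reasoning tokens
    total_tokens: 600,  // 100 + 500, so the invariant holds
    output_token_details: { reasoning: 200 },
  };
  // Billable output stays 500; adding reasoning on top (700) would double-count.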