🧠 fix: charge Gemini reasoning tokens in agent usage accounting (#13014)

* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

Resolves #13006.

`usage.ts` previously billed `usage.output_tokens` directly. For Vertex
AI Gemini thinking models, `@langchain/google-common`'s streaming path
emits `output_tokens = candidatesTokenCount` only, dropping
`thoughtsTokenCount`. Reasoning was billed at zero and the
`total_tokens === input_tokens + output_tokens` invariant was broken.
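
For concreteness, a minimal sketch of the shape the streaming path hands back in the failing case. The token counts are taken from the live repro further down; the mapping of `input_tokens` to `promptTokenCount` is the usual Gemini API correspondence and is assumed here, not quoted from the diff:

```ts
// Illustrative usage_metadata as emitted by the buggy google-common stream path
const usage = {
  input_tokens: 12, // promptTokenCount
  output_tokens: 135, // candidatesTokenCount only; thoughtsTokenCount dropped
  total_tokens: 309, // still includes the thinking tokens
  output_token_details: {}, // reasoning is not surfaced here either
};
// Invariant check: 12 + 135 = 147, but total_tokens is 309.
// The 162-token gap (309 - 12 - 135) is unbilled reasoning.
```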

The primary fix lives in agents (danny-avila/agents#157); this change is a
defense-in-depth backstop in case agents misses a path or another
provider exhibits the same shape. `resolveCompletionTokens(usage)` adds
`output_token_details.reasoning` back when (and only when) the gap is
present (`total - input > output`), so for providers that already include
reasoning in `output_tokens` (OpenAI o-series, Anthropic, the
Google-API wrapper) the correction is a no-op and nothing is double-counted.

- `SplitUsage` gains a `completion` field; all four billing call sites
  in `processUsageGroup` use it instead of `usage.output_tokens`.
- `total_output_tokens` in the result also reflects the corrected
  count.
- `UsageMetadata` interface in `IJobStore.ts` adds the
  `output_token_details` field for type safety.
- 4 new tests in `usage.spec.ts` cover: Vertex undercount fix, OpenAI
  no-double-count, structured spend path with cache + reasoning, no-op
  when no details present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

The initial fix gated the correction on `output_token_details.reasoning > 0`,
which doesn't help in the live failure case: when google-common's stream
emits the buggy fallback `usage_metadata`, `output_token_details` is empty
(`{}`) and the gate exits early.

Live debugging showed the reliable signal is the documented invariant
itself: `total_tokens === input_tokens + output_tokens`. When buggy
streams undercount output, total exceeds input + output by exactly the
unbilled reasoning. Use `total - input` as the corrected output.

This is provider-agnostic and stays a no-op for compliant providers
(OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the
gap is zero.

Verified live end-to-end against gemini-3-flash-preview:
- With the agents fix in place: output_tokens=437 → billed 437 (no-op)
- Backstop only (no agents fix, buggy input): raw 135, billed 297
  (= total 309 - input 12, matching the actual API charge)
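
As a sketch, the gap check applied to both cases; the compliant-provider numbers are taken from the new o1-preview test since the live no-op run only logged output_tokens:

```ts
// Minimal standalone version of the correction (mirrors resolveCompletionTokens below)
const corrected = (u: { input_tokens: number; output_tokens: number; total_tokens: number }) =>
  u.total_tokens > u.input_tokens + u.output_tokens
    ? u.total_tokens - u.input_tokens // buggy stream: bill the full gap
    : u.output_tokens; // compliant provider: no-op

corrected({ input_tokens: 12, output_tokens: 135, total_tokens: 309 }); // 297 (backstop applies)
corrected({ input_tokens: 100, output_tokens: 500, total_tokens: 600 }); // 500 (no-op)
```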

Updated tests to cover both scenarios.
Danny Avila, 2026-05-08 10:29:35 -04:00
commit 0fe203aaca, parent 93c4ef4ba8
3 changed files with 171 additions and 6 deletions

usage.spec.ts

@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
    });
  });

  describe('reasoning token handling - issue #13006', () => {
    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
      const collectedUsage: UsageMetadata[] = [
        {
          input_tokens: 80657,
          output_tokens: 766,
          total_tokens: 83265,
          output_token_details: { reasoning: 1842 },
          model: 'gemini-3-flash-preview',
          provider: 'vertexai',
        },
      ];

      const result = await recordCollectedUsage(deps, {
        ...baseParams,
        collectedUsage,
      });

      expect(mockSpendTokens).toHaveBeenCalledWith(
        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
        { promptTokens: 80657, completionTokens: 2608 },
      );
      expect(result?.output_tokens).toBe(2608);
    });

    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
      const collectedUsage: UsageMetadata[] = [
        {
          input_tokens: 12,
          output_tokens: 135,
          total_tokens: 309,
          model: 'gemini-3-flash-preview',
          provider: 'vertexai',
        },
      ];

      const result = await recordCollectedUsage(deps, {
        ...baseParams,
        collectedUsage,
      });

      expect(mockSpendTokens).toHaveBeenCalledWith(
        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
        { promptTokens: 12, completionTokens: 297 },
      );
      expect(result?.output_tokens).toBe(297);
    });

    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
      const collectedUsage: UsageMetadata[] = [
        {
          input_tokens: 100,
          output_tokens: 500,
          total_tokens: 600,
          output_token_details: { reasoning: 200 },
          model: 'o1-preview',
          provider: 'openAI',
        },
      ];

      const result = await recordCollectedUsage(deps, {
        ...baseParams,
        collectedUsage,
      });

      expect(mockSpendTokens).toHaveBeenCalledWith(
        expect.objectContaining({ model: 'o1-preview' }),
        { promptTokens: 100, completionTokens: 500 },
      );
      expect(result?.output_tokens).toBe(500);
    });

    it('routes correction through structured spend when cache tokens are present', async () => {
      const collectedUsage: UsageMetadata[] = [
        {
          input_tokens: 80657,
          output_tokens: 766,
          total_tokens: 83265,
          output_token_details: { reasoning: 1842 },
          input_token_details: { cache_read: 30000 },
          model: 'gemini-3-flash-preview',
          provider: 'vertexai',
        },
      ];

      await recordCollectedUsage(deps, {
        ...baseParams,
        collectedUsage,
      });

      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
        {
          promptTokens: { input: 50657, write: 0, read: 30000 },
          completionTokens: 2608,
        },
      );
    });

    it('no-op when total_tokens is absent or zero', async () => {
      const collectedUsage: UsageMetadata[] = [
        {
          input_tokens: 100,
          output_tokens: 50,
          model: 'gpt-4',
          provider: 'openAI',
        },
      ];

      const result = await recordCollectedUsage(deps, {
        ...baseParams,
        collectedUsage,
      });

      expect(mockSpendTokens).toHaveBeenCalledWith(
        expect.anything(),
        { promptTokens: 100, completionTokens: 50 },
      );
      expect(result?.output_tokens).toBe(50);
    });
  });

  describe('mixed cache and non-cache entries', () => {
    it('should handle mixed entries correctly', async () => {
      const collectedUsage: UsageMetadata[] = [

usage.ts

@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
  return provider != null && SUBSET_PROVIDERS.has(provider);
}

/**
 * Resolves `completionTokens` for billing, repairing providers whose
 * `usage_metadata.output_tokens` undercounts.
 *
 * The documented `UsageMetadata` contract (`@langchain/core`) is
 * `total_tokens === input_tokens + output_tokens`. Compliant providers
 * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
 * include any reasoning/thinking tokens inside `output_tokens` already,
 * so the invariant holds.
 *
 * Vertex AI Gemini through `@langchain/google-common`'s streaming path
 * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
 * leaving `total - input > output`. When that gap shows up we use the
 * invariant to recover the correct billable output (`total - input`).
 * Compliant providers have a zero gap, so this is a no-op for them.
 *
 * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
 */
function resolveCompletionTokens(usage: UsageMetadata): number {
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;

  if (total > input + output) {
    return total - input;
  }
  return output;
}

interface SplitUsage {
  /** Non-cached input portion — what gets billed at the standard input rate */
  inputOnly: number;
@@ -57,6 +85,8 @@ interface SplitUsage {
  cacheRead: number;
  /** Total prompt tokens including cached portion */
  totalInput: number;
  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
  completion: number;
}

function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
  const cacheRead =
    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  const rawInput = Number(usage.input_tokens) || 0;
  const completion = resolveCompletionTokens(usage);

  if (inputTokensIncludesCache(usage.provider)) {
    return {
      inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
      cacheCreation,
      cacheRead,
      totalInput: rawInput,
      completion,
    };
  }

  return {
@@ -80,6 +112,7 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
    cacheCreation,
    cacheRead,
    totalInput: rawInput + cacheCreation + cacheRead,
    completion,
  };
}
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
      continue;
    }

-   const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+   const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);

-   total_output_tokens += Number(usage.output_tokens) || 0;
+   total_output_tokens += completion;

    const txMetadata: TxMetadata = {
      user,
@@ -187,7 +220,7 @@
            write: cacheCreation,
            read: cacheRead,
          },
-         completionTokens: usage.output_tokens,
+         completionTokens: completion,
        },
        pricing,
      )
@@ -195,7 +228,7 @@
        txMetadata,
        {
          promptTokens: inputOnly,
-         completionTokens: usage.output_tokens,
+         completionTokens: completion,
        },
        pricing,
      );
@@ -211,7 +244,7 @@
          write: cacheCreation,
          read: cacheRead,
        },
-       completionTokens: usage.output_tokens,
+       completionTokens: completion,
      })
      .catch((err) => {
        logger.error(
@@ -225,7 +258,7 @@
    deps
      .spendTokens(txMetadata, {
        promptTokens: inputOnly,
-       completionTokens: usage.output_tokens,
+       completionTokens: completion,
      })
      .catch((err) => {
        logger.error(

IJobStore.ts

@@ -98,6 +98,16 @@ export interface UsageMetadata {
   * Present for Claude models. Mutually exclusive with input_token_details.
   */
  cache_read_input_tokens?: number;
  /**
   * Breakdown of output token counts. Per the LangChain core contract,
   * `output_tokens` is the sum of all output token types; these fields
   * are subsets of `output_tokens`, *not* additional charges.
   */
  output_token_details?: {
    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
    reasoning?: number;
    audio?: number;
  };
}
/**