From 0fe203aaca6b69d9bb76c38d3f6f0ca00ac72779 Mon Sep 17 00:00:00 2001
From: Danny Avila
Date: Fri, 8 May 2026 10:29:35 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=A0=20fix:=20charge=20Gemini=20reasoni?=
 =?UTF-8?q?ng=20tokens=20in=20agent=20usage=20accounting=20(#13014)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

Resolves #13006.

`usage.ts` previously billed `usage.output_tokens` directly. For Vertex AI
Gemini thinking models, `@langchain/google-common`'s streaming path emits
`output_tokens = candidatesTokenCount` only, dropping `thoughtsTokenCount`.
Reasoning was billed at zero and the
`total_tokens === input_tokens + output_tokens` invariant was broken.

The fix lives in agents (danny-avila/agents#157) — but this is also a
defense-in-depth backstop in case agents misses a path or another provider
exhibits the same shape.

`resolveCompletionTokens(usage)` adds `output_token_details.reasoning` back
when (and only when) the gap is present (`total - input > output`), so
providers that already include reasoning in `output_tokens` (OpenAI
o-series, Anthropic, the Google-API wrapper) are no-ops — no
double-counting.

- `SplitUsage` gains a `completion` field; all four billing call sites in
  `processUsageGroup` use it instead of `usage.output_tokens`.
- `total_output_tokens` in the result also reflects the corrected count.
- `UsageMetadata` interface in `IJobStore.ts` adds the `output_token_details`
  field for type safety.
- 4 new tests in `usage.spec.ts` cover: Vertex undercount fix, OpenAI
  no-double-count, structured spend path with cache + reasoning, no-op when
  no details present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

Initial fix gated the correction on `output_token_details.reasoning > 0`,
which doesn't help in the live failure case: when google-common's stream
emits the buggy fallback usage_metadata, output_token_details is empty ({})
and the gate exits early.

Live debugging showed the reliable signal is the documented invariant
itself: `total_tokens === input_tokens + output_tokens`. When buggy streams
undercount output, total exceeds input + output by exactly the unbilled
reasoning. Use `total - input` as the corrected output. This is
provider-agnostic and stays a no-op for compliant providers
(OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the gap
is zero.

Live verified end-to-end against gemini-3-flash-preview:

- With agents fix in place: output_tokens=437 → billed 437 (no-op)
- Backstop only (no agents fix, buggy input): raw 135, billed 297
  (= total 309 - input 12, matches actual API charge)

Updated tests to cover both scenarios.
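For reference, a minimal standalone sketch of the invariant-based check
(same logic as `resolveCompletionTokens` in the diff below; `UsageLike` is a
simplified stand-in for the full `UsageMetadata` interface, not a type from
the codebase):

```ts
// Simplified stand-in for UsageMetadata; only the fields the check reads.
interface UsageLike {
  input_tokens?: number;
  output_tokens?: number;
  total_tokens?: number;
}

// Contract: total_tokens === input_tokens + output_tokens. A positive gap
// means the stream undercounted output (unbilled reasoning tokens), so
// total - input recovers the correct billable count; when the invariant
// already holds, the gap is zero and output_tokens passes through unchanged.
function resolveCompletionTokens(usage: UsageLike): number {
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;
  return total > input + output ? total - input : output;
}

// Buggy google-common stream shape from the live repro: 309 - 12 = 297.
console.log(resolveCompletionTokens({ input_tokens: 12, output_tokens: 135, total_tokens: 309 }));
// Compliant shape (invariant holds): returns 500 unchanged.
console.log(resolveCompletionTokens({ input_tokens: 100, output_tokens: 500, total_tokens: 600 }));
```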
---
 packages/api/src/agents/usage.spec.ts        | 122 ++++++++++++++++++
 packages/api/src/agents/usage.ts             |  45 ++++++-
 .../api/src/stream/interfaces/IJobStore.ts   |  10 ++
 3 files changed, 171 insertions(+), 6 deletions(-)

diff --git a/packages/api/src/agents/usage.spec.ts b/packages/api/src/agents/usage.spec.ts
index b9656ba2ec..bee0c5248b 100644
--- a/packages/api/src/agents/usage.spec.ts
+++ b/packages/api/src/agents/usage.spec.ts
@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('reasoning token handling - issue #13006', () => {
+    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 80657, completionTokens: 2608 },
+      );
+      expect(result?.output_tokens).toBe(2608);
+    });
+
+    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 12,
+          output_tokens: 135,
+          total_tokens: 309,
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 12, completionTokens: 297 },
+      );
+      expect(result?.output_tokens).toBe(297);
+    });
+
+    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 600,
+          output_token_details: { reasoning: 200 },
+          model: 'o1-preview',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'o1-preview' }),
+        { promptTokens: 100, completionTokens: 500 },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('routes correction through structured spend when cache tokens are present', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          input_token_details: { cache_read: 30000 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 50657, write: 0, read: 30000 },
+          completionTokens: 2608,
+        },
+      );
+    });
+
+    it('no-op when total_tokens is absent or zero', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.anything(),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(result?.output_tokens).toBe(50);
+    });
+  });
+
   describe('mixed cache and non-cache entries', () => {
     it('should handle mixed entries correctly', async () => {
       const collectedUsage: UsageMetadata[] = [
diff --git a/packages/api/src/agents/usage.ts b/packages/api/src/agents/usage.ts
index b5a2fa1f4d..3d8ddf3799 100644
--- a/packages/api/src/agents/usage.ts
+++ b/packages/api/src/agents/usage.ts
@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
   return provider != null && SUBSET_PROVIDERS.has(provider);
 }
 
+/**
+ * Resolves `completionTokens` for billing, repairing providers whose
+ * `usage_metadata.output_tokens` undercounts.
+ *
+ * The documented `UsageMetadata` contract (`@langchain/core`) is
+ * `total_tokens === input_tokens + output_tokens`. Compliant providers
+ * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
+ * include any reasoning/thinking tokens inside `output_tokens` already,
+ * so the invariant holds.
+ *
+ * Vertex AI Gemini through `@langchain/google-common`'s streaming path
+ * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * leaving `total - input > output`. When that gap shows up we use the
+ * invariant to recover the correct billable output (`total - input`).
+ * Compliant providers have a zero gap, so this is a no-op for them.
+ *
+ * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ */
+function resolveCompletionTokens(usage: UsageMetadata): number {
+  const output = Number(usage.output_tokens) || 0;
+  const total = Number(usage.total_tokens) || 0;
+  const input = Number(usage.input_tokens) || 0;
+  if (total > input + output) {
+    return total - input;
+  }
+  return output;
+}
+
 interface SplitUsage {
   /** Non-cached input portion — what gets billed at the standard input rate */
   inputOnly: number;
@@ -57,6 +85,8 @@
   cacheRead: number;
   /** Total prompt tokens including cached portion */
   totalInput: number;
+  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
+  completion: number;
 }
 
 function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
   const cacheRead =
     Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
   const rawInput = Number(usage.input_tokens) || 0;
+  const completion = resolveCompletionTokens(usage);
   if (inputTokensIncludesCache(usage.provider)) {
     return {
       inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
       cacheCreation,
       cacheRead,
       totalInput: rawInput,
+      completion,
     };
   }
   return {
@@ -80,6 +112,7 @@
     cacheCreation,
     cacheRead,
     totalInput: rawInput + cacheCreation + cacheRead,
+    completion,
   };
 }
 
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+    const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);
 
-    total_output_tokens += Number(usage.output_tokens) || 0;
+    total_output_tokens += completion;
 
     const txMetadata: TxMetadata = {
       user,
@@ -187,7 +220,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       )
@@ -195,7 +228,7 @@
         txMetadata,
         {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       );
@@ -211,7 +244,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
@@ -225,7 +258,7 @@
       deps
         .spendTokens(txMetadata, {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
diff --git a/packages/api/src/stream/interfaces/IJobStore.ts b/packages/api/src/stream/interfaces/IJobStore.ts
index 0d07b19538..afd6ac68a0 100644
--- a/packages/api/src/stream/interfaces/IJobStore.ts
+++ b/packages/api/src/stream/interfaces/IJobStore.ts
@@ -98,6 +98,16 @@ export interface UsageMetadata {
    * Present for Claude models. Mutually exclusive with input_token_details.
    */
   cache_read_input_tokens?: number;
+  /**
+   * Breakdown of output token counts. Per the LangChain core contract,
+   * `output_tokens` is the sum of all output token types — these fields
+   * are subsets of `output_tokens`, *not* additional charges.
+   */
+  output_token_details?: {
+    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
+    reasoning?: number;
+    audio?: number;
+  };
 }
 
 /**