diff --git a/packages/api/src/agents/usage.spec.ts b/packages/api/src/agents/usage.spec.ts
index b9656ba2ec..bee0c5248b 100644
--- a/packages/api/src/agents/usage.spec.ts
+++ b/packages/api/src/agents/usage.spec.ts
@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('reasoning token handling - issue #13006', () => {
+    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 80657, completionTokens: 2608 },
+      );
+      expect(result?.output_tokens).toBe(2608);
+    });
+
+    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 12,
+          output_tokens: 135,
+          total_tokens: 309,
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 12, completionTokens: 297 },
+      );
+      expect(result?.output_tokens).toBe(297);
+    });
+
+    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 600,
+          output_token_details: { reasoning: 200 },
+          model: 'o1-preview',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'o1-preview' }),
+        { promptTokens: 100, completionTokens: 500 },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('routes correction through structured spend when cache tokens are present', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          input_token_details: { cache_read: 30000 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 50657, write: 0, read: 30000 },
+          completionTokens: 2608,
+        },
+      );
+    });
+
+    it('no-op when total_tokens is absent or zero', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.anything(),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(result?.output_tokens).toBe(50);
+    });
+  });
+
   describe('mixed cache and non-cache entries', () => {
     it('should handle mixed entries correctly', async () => {
       const collectedUsage: UsageMetadata[] = [
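Reviewer note, not part of the patch: every expected figure in the tests above follows from the `UsageMetadata` invariant `total_tokens = input_tokens + output_tokens`, worked out here on the tests' own numbers:

  83265 - 80657 = 2608   (766 reported output + 1842 dropped reasoning tokens)
  309   - 12    = 297    (same repair with output_token_details absent)
  600   - 100   = 500    (zero gap, so the reported 500 stands)
  80657 - 30000 = 50657  (non-cached prompt portion after subtracting cache_read)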
diff --git a/packages/api/src/agents/usage.ts b/packages/api/src/agents/usage.ts
index b5a2fa1f4d..3d8ddf3799 100644
--- a/packages/api/src/agents/usage.ts
+++ b/packages/api/src/agents/usage.ts
@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
   return provider != null && SUBSET_PROVIDERS.has(provider);
 }
 
+/**
+ * Resolves `completionTokens` for billing, repairing providers whose
+ * `usage_metadata.output_tokens` undercounts.
+ *
+ * The documented `UsageMetadata` contract (`@langchain/core`) is
+ * `total_tokens === input_tokens + output_tokens`. Compliant providers
+ * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
+ * include any reasoning/thinking tokens inside `output_tokens` already,
+ * so the invariant holds.
+ *
+ * Vertex AI Gemini through `@langchain/google-common`'s streaming path
+ * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * leaving `total - input > output`. When that gap shows up we use the
+ * invariant to recover the correct billable output (`total - input`).
+ * Compliant providers have a zero gap, so this is a no-op for them.
+ *
+ * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ */
+function resolveCompletionTokens(usage: UsageMetadata): number {
+  const output = Number(usage.output_tokens) || 0;
+  const total = Number(usage.total_tokens) || 0;
+  const input = Number(usage.input_tokens) || 0;
+  if (total > input + output) {
+    return total - input;
+  }
+  return output;
+}
+
 interface SplitUsage {
   /** Non-cached input portion — what gets billed at the standard input rate */
   inputOnly: number;
@@ -57,6 +85,8 @@ interface SplitUsage {
   cacheRead: number;
   /** Total prompt tokens including cached portion */
   totalInput: number;
+  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
+  completion: number;
 }
 
 function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
   const cacheRead =
     Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
   const rawInput = Number(usage.input_tokens) || 0;
+  const completion = resolveCompletionTokens(usage);
   if (inputTokensIncludesCache(usage.provider)) {
     return {
       inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
       cacheCreation,
       cacheRead,
       totalInput: rawInput,
+      completion,
     };
   }
   return {
@@ -80,6 +112,7 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
     cacheCreation,
     cacheRead,
     totalInput: rawInput + cacheCreation + cacheRead,
+    completion,
   };
 }
 
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+    const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);
 
-    total_output_tokens += Number(usage.output_tokens) || 0;
+    total_output_tokens += completion;
 
     const txMetadata: TxMetadata = {
       user,
@@ -187,7 +220,7 @@ export async function recordCollectedUsage(
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       )
@@ -195,7 +228,7 @@ export async function recordCollectedUsage(
         txMetadata,
         {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       );
@@ -211,7 +244,7 @@ export async function recordCollectedUsage(
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
@@ -225,7 +258,7 @@ export async function recordCollectedUsage(
       deps
         .spendTokens(txMetadata, {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
diff --git a/packages/api/src/stream/interfaces/IJobStore.ts b/packages/api/src/stream/interfaces/IJobStore.ts
index 0d07b19538..afd6ac68a0 100644
--- a/packages/api/src/stream/interfaces/IJobStore.ts
+++ b/packages/api/src/stream/interfaces/IJobStore.ts
@@ -98,6 +98,16 @@ export interface UsageMetadata {
    * Present for Claude models. Mutually exclusive with input_token_details.
    */
   cache_read_input_tokens?: number;
+  /**
+   * Breakdown of output token counts. Per the LangChain core contract,
+   * `output_tokens` is the sum of all output token types — these fields
+   * are subsets of `output_tokens`, *not* additional charges.
+   */
+  output_token_details?: {
+    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
+    reasoning?: number;
+    audio?: number;
+  };
 }
 
 /**
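Reviewer note, not part of the patch: a standalone TypeScript sketch of how the new repair behaves on the exact figures exercised by the tests. The trimmed `Usage` shape is illustrative only; the patched code operates on the full `UsageMetadata` from IJobStore.ts.

// Sketch only: re-declares a minimal Usage shape instead of importing UsageMetadata.
interface Usage {
  input_tokens?: number;
  output_tokens?: number;
  total_tokens?: number;
}

// Same logic as the patched resolveCompletionTokens in usage.ts.
function resolveCompletionTokens(usage: Usage): number {
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;
  // A positive gap means reasoning tokens were dropped from output_tokens;
  // the invariant total === input + output recovers them as total - input.
  return total > input + output ? total - input : output;
}

// Vertex stream undercount: 83265 - 80657 = 2608 (766 visible + 1842 reasoning).
console.log(resolveCompletionTokens({ input_tokens: 80657, output_tokens: 766, total_tokens: 83265 })); // 2608

// Compliant provider (o-series): zero gap, reported output passes through.
console.log(resolveCompletionTokens({ input_tokens: 100, output_tokens: 500, total_tokens: 600 })); // 500

// total_tokens absent: falls back to output_tokens, a no-op.
console.log(resolveCompletionTokens({ input_tokens: 100, output_tokens: 50 })); // 50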