From c50b3c58d5b240fee7692bb73e7d74910de8dad3 Mon Sep 17 00:00:00 2001
From: "Joohan(Lucas)"
Date: Wed, 3 Jun 2026 09:45:37 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=8F=B7=EF=B8=8F=20fix:=20Prevent=20Bedroc?=
 =?UTF-8?q?k=20Cache=20Tokens=20from=20Inflating=20Completion=20Count=20(#?=
 =?UTF-8?q?13468)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🐛 fix: prevent Bedrock cache tokens from inflating completion count

* style: fix prettier formatting
---
 packages/api/src/agents/usage.spec.ts | 101 ++++++++++++++++++++++++++
 packages/api/src/agents/usage.ts      |  38 +++++++---
 2 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/packages/api/src/agents/usage.spec.ts b/packages/api/src/agents/usage.spec.ts
index 4521de8eb4..38e2fb5476 100644
--- a/packages/api/src/agents/usage.spec.ts
+++ b/packages/api/src/agents/usage.spec.ts
@@ -988,6 +988,107 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('Bedrock prompt caching — completion token inflation regression', () => {
+    it('does not fold cache_creation into completion on the first cached step', async () => {
+      // Bedrock: total = input + output + cache_creation (additive, not subset).
+      // Before fix: resolveCompletionTokens returned output + cache_creation (5500)
+      // instead of output (500).
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 5600,
+          cache_creation_input_tokens: 5000,
+          cache_read_input_tokens: 0,
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'claude-sonnet-4-6' }),
+        {
+          promptTokens: { input: 100, write: 5000, read: 0 },
+          completionTokens: 500,
+        },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('does not fold cache_read into completion on subsequent cached steps', async () => {
+      // Bedrock: total = input + output + cache_read on every read step.
+      // Before fix: each step returned output + cache_read instead of output.
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 200,
+          output_tokens: 300,
+          total_tokens: 4500,
+          cache_read_input_tokens: 4000,
+          cache_creation_input_tokens: 0,
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'claude-sonnet-4-6' }),
+        {
+          promptTokens: { input: 200, write: 0, read: 4000 },
+          completionTokens: 300,
+        },
+      );
+      expect(result?.output_tokens).toBe(300);
+    });
+
+    it('handles cache tokens in input_token_details format (alternate field path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 200,
+          output_tokens: 300,
+          total_tokens: 4500,
+          input_token_details: { cache_read: 4000, cache_creation: 0 },
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(result?.output_tokens).toBe(300);
+    });
+
+    it('accumulates only true output across a multi-step cached agent run', async () => {
+      // 1 write step + 4 read steps. Without the fix, each step folds its
+      // cache tokens into completion, inflating the total by the full cache size.
+      const writeStep: UsageMetadata = {
+        input_tokens: 100,
+        output_tokens: 500,
+        total_tokens: 5600,
+        cache_creation_input_tokens: 5000,
+        cache_read_input_tokens: 0,
+        model: 'claude-sonnet-4-6',
+      };
+      const readSteps: UsageMetadata[] = Array.from({ length: 4 }, (_, i) => ({
+        input_tokens: 200,
+        output_tokens: 300 + i * 50,
+        total_tokens: 200 + (300 + i * 50) + 5000,
+        cache_read_input_tokens: 5000,
+        cache_creation_input_tokens: 0,
+        model: 'claude-sonnet-4-6',
+      }));
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage: [writeStep, ...readSteps],
+      });
+
+      // True output: 500 + 300 + 350 + 400 + 450 = 2000
+      const trueOutput = 500 + readSteps.reduce((sum, s) => sum + (s.output_tokens ?? 0), 0);
+      expect(result?.output_tokens).toBe(trueOutput);
+    });
+  });
+
   describe('bulk write with summarization usage', () => {
     let mockInsertMany: jest.Mock;
     let mockUpdateBalance: jest.Mock;
diff --git a/packages/api/src/agents/usage.ts b/packages/api/src/agents/usage.ts
index 3d8ddf3799..cc15168547 100644
--- a/packages/api/src/agents/usage.ts
+++ b/packages/api/src/agents/usage.ts
@@ -58,22 +58,42 @@ function inputTokensIncludesCache(provider?: string): boolean {
  * `total_tokens === input_tokens + output_tokens`. Compliant providers
  * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
  * include any reasoning/thinking tokens inside `output_tokens` already,
- * so the invariant holds.
+ * so the invariant holds and this function is a no-op for them.
  *
- * Vertex AI Gemini through `@langchain/google-common`'s streaming path
- * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
- * leaving `total - input > output`. When that gap shows up we use the
- * invariant to recover the correct billable output (`total - input`).
- * Compliant providers have a zero gap, so this is a no-op for them.
+ * **Vertex AI undercount (issue #13006):** `@langchain/google-common`'s streaming
+ * path emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * so `total - input > output`. The gap is recovered as `total - input`.
  *
- * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ * **Bedrock / Anthropic cache inflation:** additive providers keep cache tokens
+ * separate from `input_tokens`, making
+ * `total = input + output + cache_read + cache_creation`. Without adjustment
+ * the Vertex recovery fires on every cached step and returns
+ * `output + cache_read + cache_creation` instead of `output`, inflating
+ * completion counts by orders of magnitude. The fix subtracts the cache
+ * adjustment before the gap test — but only for additive providers; subset
+ * providers (Google, OpenAI, …) already include cache inside `input_tokens`
+ * so their `cacheAdjustment` is zero and the Vertex recovery is unaffected.
  */
 function resolveCompletionTokens(usage: UsageMetadata): number {
   const output = Number(usage.output_tokens) || 0;
   const total = Number(usage.total_tokens) || 0;
   const input = Number(usage.input_tokens) || 0;
-  if (total > input + output) {
-    return total - input;
+
+  // For additive providers (Bedrock, Anthropic), cache tokens are separate
+  // from input_tokens and are included in total_tokens, widening the gap
+  // independently of any missing thinking tokens. Subtract them so the gap
+  // check only fires when output_tokens genuinely undercounts (Vertex case).
+  // Subset providers fold cache into input_tokens, so their adjustment is 0.
+  const cacheRead =
+    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+  const cacheCreation =
+    Number(usage.input_token_details?.cache_creation) ||
+    Number(usage.cache_creation_input_tokens) ||
+    0;
+  const cacheAdjustment = inputTokensIncludesCache(usage.provider) ? 0 : cacheRead + cacheCreation;
+
+  if (total > input + output + cacheAdjustment) {
+    return total - input - cacheAdjustment;
   }
   return output;
 }