From 89bf2ab7b465e983e31b59d67caeb1fda2d8ed97 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Wed, 29 Apr 2026 08:36:00 +0900 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=8E=20fix:=20Stop=20Double-Counting=20?= =?UTF-8?q?Cache=20Tokens=20for=20Gemini/OpenAI=20in=20Usage=20Spend=20(#1?= =?UTF-8?q?2868)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 💎 fix: Stop Double-Counting Cache Tokens for Gemini/OpenAI in Usage Spend (#12855) Different providers report `usage_metadata.input_tokens` with different semantics: - Anthropic / Bedrock: `input_tokens` EXCLUDES cache; cache reads/writes arrive separately and must be added to get the total prompt size. - Gemini / OpenAI: `input_tokens` ALREADY INCLUDES cached tokens (Google's `promptTokenCount`, OpenAI's `prompt_tokens`). Their `input_token_details.cache_*` are subsets of `input_tokens`. `recordCollectedUsage` treated both schemes as additive, so for cache-hit requests on Gemini/OpenAI it added cache tokens on top of an `input_tokens` value that already contained them — overcharging users by the cache_hit_rate (e.g., ~67% cache hit ≈ 1.67x overcharge). This matches the issue reporter's GCP billing comparison. Adds a small `splitUsage` helper that classifies the provider by model name and computes `inputOnly` (the non-cached portion) plus the all-inclusive `totalInput` for both the spend math and the returned `input_tokens` summary. The helper defaults to additive semantics (the historical behavior) so unknown providers are unaffected. Updates existing OpenAI-shaped tests that previously asserted the buggy additive math, and adds Gemini regression tests using the exact numbers from the issue report (input=11125, cache_read=7441 → input=3684). Anthropic / Bedrock paths remain bit-identical to before. * 🔧 refactor: Classify Cache-Token Semantics by Provider, Not Model Name Follows up the previous commit. Replaces a model-name regex (`gemini|gpt|o[1-9]|chatgpt`) with an explicit `Providers` enum lookup keyed off the `usage.provider` field — `UsageMetadata.provider` already exists in `IJobStore.ts` but was never being populated. - `callbacks.js#ModelEndHandler` now attaches `usage.provider` from `agentContext.provider` alongside `usage.model`. - `usage.ts` uses a `SUBSET_PROVIDERS` set (`openAI`, `azureOpenAI`, `google`, `vertexai`, `xai`, `deepseek`, `openrouter`, `moonshot`) backed by the canonical `Providers` enum from `librechat-data-provider`. - `xai`, `deepseek`, `openrouter`, `moonshot` extend `ChatOpenAI` so they inherit subset semantics (verified in node_modules). - Defaults to additive when `usage.provider` is missing, so the title flow (which doesn't propagate provider) and any pre-this-PR usage entries keep their existing behavior. Tests: switch fixtures from model-name signaling to explicit `provider` field, plus a Vertex AI case and a "missing provider" fallback case. --- api/server/controllers/agents/callbacks.js | 3 + .../api/src/agents/usage.bulk-parity.spec.ts | 32 ++++- packages/api/src/agents/usage.spec.ts | 115 +++++++++++++++++- packages/api/src/agents/usage.ts | 99 +++++++++++---- 4 files changed, 215 insertions(+), 34 deletions(-) diff --git a/api/server/controllers/agents/callbacks.js b/api/server/controllers/agents/callbacks.js index 439ac9b93e..2af4d5b451 100644 --- a/api/server/controllers/agents/callbacks.js +++ b/api/server/controllers/agents/callbacks.js @@ -75,6 +75,9 @@ class ModelEndHandler { if (modelName) { usage.model = modelName; } + if (agentContext.provider) { + usage.provider = agentContext.provider; + } const taggedUsage = markSummarizationUsage(usage, metadata); diff --git a/packages/api/src/agents/usage.bulk-parity.spec.ts b/packages/api/src/agents/usage.bulk-parity.spec.ts index 79dd50b2e3..a18565ba0d 100644 --- a/packages/api/src/agents/usage.bulk-parity.spec.ts +++ b/packages/api/src/agents/usage.bulk-parity.spec.ts @@ -171,31 +171,55 @@ describe('recordCollectedUsage — bulk path parity', () => { }); }); - describe('cache token handling - OpenAI format', () => { - it('should route cache entries to structured path — same input_tokens as legacy', async () => { + describe('cache token handling - subset providers (input_tokens already includes cache)', () => { + it('subtracts cache from input portion in bulk docs for OpenAI', async () => { const collectedUsage: UsageMetadata[] = [ { input_tokens: 100, output_tokens: 50, model: 'gpt-4', + provider: 'openAI', input_token_details: { cache_creation: 20, cache_read: 10 }, }, ]; const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage }); - expect(result?.input_tokens).toBe(130); // 100 + 20 + 10 + expect(result?.input_tokens).toBe(100); expect(mockInsertMany).toHaveBeenCalledTimes(1); expect(mockSpendStructuredTokens).not.toHaveBeenCalled(); expect(mockSpendTokens).not.toHaveBeenCalled(); const docs = mockInsertMany.mock.calls[0][0]; const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt'); - expect(promptDoc.inputTokens).toBe(-100); + expect(promptDoc.inputTokens).toBe(-70); expect(promptDoc.writeTokens).toBe(-20); expect(promptDoc.readTokens).toBe(-10); expect(promptDoc.model).toBe('gpt-4'); }); + + it('does not double-count cache_read for Gemini in bulk path — issue #12855', async () => { + const collectedUsage: UsageMetadata[] = [ + { + input_tokens: 11125, + output_tokens: 20, + model: 'gemini-3-flash-preview', + provider: 'google', + input_token_details: { cache_read: 7441 }, + }, + ]; + + const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage }); + + expect(result?.input_tokens).toBe(11125); + expect(mockInsertMany).toHaveBeenCalledTimes(1); + + const docs = mockInsertMany.mock.calls[0][0]; + const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt'); + expect(promptDoc.inputTokens).toBe(-3684); + expect(promptDoc.readTokens).toBe(-7441); + expect(promptDoc.writeTokens || 0).toBe(0); + }); }); describe('cache token handling - Anthropic format', () => { diff --git a/packages/api/src/agents/usage.spec.ts b/packages/api/src/agents/usage.spec.ts index b75baf69a8..b9656ba2ec 100644 --- a/packages/api/src/agents/usage.spec.ts +++ b/packages/api/src/agents/usage.spec.ts @@ -197,13 +197,14 @@ describe('recordCollectedUsage', () => { }); }); - describe('cache token handling - OpenAI format', () => { - it('should use spendStructuredTokens for cache tokens (input_token_details)', async () => { + describe('cache token handling - subset providers (input_tokens already includes cache)', () => { + it('subtracts cache from input_tokens for OpenAI to avoid double-counting', async () => { const collectedUsage: UsageMetadata[] = [ { input_tokens: 100, output_tokens: 50, model: 'gpt-4', + provider: 'openAI', input_token_details: { cache_creation: 20, cache_read: 10, @@ -218,6 +219,110 @@ describe('recordCollectedUsage', () => { expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); expect(mockSpendTokens).not.toHaveBeenCalled(); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4' }), + { + promptTokens: { input: 70, write: 20, read: 10 }, + completionTokens: 50, + }, + ); + expect(result?.input_tokens).toBe(100); + }); + + it('does not double-count cache_read for Gemini — issue #12855', async () => { + // Real numbers from the issue report + const collectedUsage: UsageMetadata[] = [ + { + input_tokens: 11125, + output_tokens: 20, + model: 'gemini-3-flash-preview', + provider: 'google', + input_token_details: { cache_read: 7441 }, + }, + ]; + + const result = await recordCollectedUsage(deps, { + ...baseParams, + collectedUsage, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gemini-3-flash-preview' }), + { + promptTokens: { input: 3684, write: 0, read: 7441 }, + completionTokens: 20, + }, + ); + expect(result?.input_tokens).toBe(11125); + }); + + it('also applies to Vertex AI', async () => { + const collectedUsage: UsageMetadata[] = [ + { + input_tokens: 5000, + output_tokens: 100, + model: 'gemini-2.5-pro', + provider: 'vertexai', + input_token_details: { cache_read: 4000 }, + }, + ]; + + await recordCollectedUsage(deps, { + ...baseParams, + collectedUsage, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gemini-2.5-pro' }), + { + promptTokens: { input: 1000, write: 0, read: 4000 }, + completionTokens: 100, + }, + ); + }); + + it('handles cache_read >= input_tokens defensively (clamps inputOnly to 0)', async () => { + const collectedUsage: UsageMetadata[] = [ + { + input_tokens: 1000, + output_tokens: 30, + model: 'gemini-2.5-pro', + provider: 'google', + input_token_details: { cache_read: 1000 }, + }, + ]; + + await recordCollectedUsage(deps, { + ...baseParams, + collectedUsage, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gemini-2.5-pro' }), + { + promptTokens: { input: 0, write: 0, read: 1000 }, + completionTokens: 30, + }, + ); + }); + + it('falls through to additive (historical default) when provider is missing', async () => { + // Defensive: an unclassified or pre-this-PR usage entry should keep old behavior + const collectedUsage: UsageMetadata[] = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'gpt-4', + input_token_details: { cache_creation: 20, cache_read: 10 }, + }, + ]; + + const result = await recordCollectedUsage(deps, { + ...baseParams, + collectedUsage, + }); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( expect.objectContaining({ model: 'gpt-4' }), { @@ -225,7 +330,7 @@ describe('recordCollectedUsage', () => { completionTokens: 50, }, ); - expect(result?.input_tokens).toBe(130); // 100 + 20 + 10 + expect(result?.input_tokens).toBe(130); }); }); @@ -397,6 +502,7 @@ describe('recordCollectedUsage', () => { input_tokens: 100, output_tokens: 50, model: 'gpt-4', + provider: 'openAI', input_token_details: { cache_creation: 20, cache_read: 10 }, }, ]; @@ -406,7 +512,8 @@ describe('recordCollectedUsage', () => { collectedUsage, }); - expect(result).toEqual({ input_tokens: 130, output_tokens: 50 }); + // openAI is a subset provider → input_tokens already includes cache + expect(result).toEqual({ input_tokens: 100, output_tokens: 50 }); }); }); diff --git a/packages/api/src/agents/usage.ts b/packages/api/src/agents/usage.ts index 3b2497c947..b5a2fa1f4d 100644 --- a/packages/api/src/agents/usage.ts +++ b/packages/api/src/agents/usage.ts @@ -1,4 +1,5 @@ import { logger } from '@librechat/data-schemas'; +import { Providers } from 'librechat-data-provider'; import type { TCustomConfig, TTransactionsConfig } from 'librechat-data-provider'; import type { StructuredTokenUsage, @@ -22,6 +23,66 @@ type SpendStructuredTokensFn = ( tokenUsage: StructuredTokenUsage, ) => Promise; +/** + * Providers whose `usage_metadata.input_tokens` ALREADY INCLUDES cached tokens + * (i.e. `input_token_details.cache_*` is a subset, not an additional charge): + * + * - Google / Vertex AI: `input_tokens` = `promptTokenCount` (includes `cachedContentTokenCount`) + * - OpenAI / Azure OpenAI: `input_tokens` = `prompt_tokens` (includes `prompt_tokens_details.cached_tokens`) + * - xAI, DeepSeek, OpenRouter, Moonshot: extend `ChatOpenAI`, same semantics + * + * Anthropic and Bedrock keep cache values separate from `input_tokens`, so they + * must be added back to compute the total prompt size — that's the historical + * additive default. Providers not listed here fall through to additive. + */ +const SUBSET_PROVIDERS: ReadonlySet = new Set([ + Providers.OPENAI, + Providers.AZURE, + Providers.GOOGLE, + Providers.VERTEXAI, + Providers.XAI, + Providers.DEEPSEEK, + Providers.OPENROUTER, + Providers.MOONSHOT, +]); + +function inputTokensIncludesCache(provider?: string): boolean { + return provider != null && SUBSET_PROVIDERS.has(provider); +} + +interface SplitUsage { + /** Non-cached input portion — what gets billed at the standard input rate */ + inputOnly: number; + cacheCreation: number; + cacheRead: number; + /** Total prompt tokens including cached portion */ + totalInput: number; +} + +function splitUsage(usage: UsageMetadata): SplitUsage { + const cacheCreation = + Number(usage.input_token_details?.cache_creation) || + Number(usage.cache_creation_input_tokens) || + 0; + const cacheRead = + Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0; + const rawInput = Number(usage.input_tokens) || 0; + if (inputTokensIncludesCache(usage.provider)) { + return { + inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead), + cacheCreation, + cacheRead, + totalInput: rawInput, + }; + } + return { + inputOnly: rawInput, + cacheCreation, + cacheRead, + totalInput: rawInput + cacheCreation + cacheRead, + }; +} + export interface RecordUsageDeps { spendTokens: SpendTokensFn; spendStructuredTokens: SpendStructuredTokensFn; @@ -83,16 +144,7 @@ export async function recordCollectedUsage( } const firstUsage = messageUsages[0]; - const input_tokens = - firstUsage == null - ? 0 - : (firstUsage.input_tokens || 0) + - (Number(firstUsage.input_token_details?.cache_creation) || - Number(firstUsage.cache_creation_input_tokens) || - 0) + - (Number(firstUsage.input_token_details?.cache_read) || - Number(firstUsage.cache_read_input_tokens) || - 0); + const input_tokens = firstUsage == null ? 0 : splitUsage(firstUsage).totalInput; let total_output_tokens = 0; @@ -109,12 +161,7 @@ export async function recordCollectedUsage( continue; } - const cache_creation = - Number(usage.input_token_details?.cache_creation) || - Number(usage.cache_creation_input_tokens) || - 0; - const cache_read = - Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0; + const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage); total_output_tokens += Number(usage.output_tokens) || 0; @@ -131,14 +178,14 @@ export async function recordCollectedUsage( if (useBulk) { const entries = - cache_creation > 0 || cache_read > 0 + cacheCreation > 0 || cacheRead > 0 ? prepareStructuredTokenSpend( txMetadata, { promptTokens: { - input: usage.input_tokens, - write: cache_creation, - read: cache_read, + input: inputOnly, + write: cacheCreation, + read: cacheRead, }, completionTokens: usage.output_tokens, }, @@ -147,7 +194,7 @@ export async function recordCollectedUsage( : prepareTokenSpend( txMetadata, { - promptTokens: usage.input_tokens, + promptTokens: inputOnly, completionTokens: usage.output_tokens, }, pricing, @@ -156,13 +203,13 @@ export async function recordCollectedUsage( continue; } - if (cache_creation > 0 || cache_read > 0) { + if (cacheCreation > 0 || cacheRead > 0) { deps .spendStructuredTokens(txMetadata, { promptTokens: { - input: usage.input_tokens, - write: cache_creation, - read: cache_read, + input: inputOnly, + write: cacheCreation, + read: cacheRead, }, completionTokens: usage.output_tokens, }) @@ -177,7 +224,7 @@ export async function recordCollectedUsage( deps .spendTokens(txMetadata, { - promptTokens: usage.input_tokens, + promptTokens: inputOnly, completionTokens: usage.output_tokens, }) .catch((err) => {