💎 fix: Stop Double-Counting Cache Tokens for Gemini/OpenAI in Usage Spend (#12868)

* 💎 fix: Stop Double-Counting Cache Tokens for Gemini/OpenAI in Usage Spend (#12855)

Different providers report `usage_metadata.input_tokens` with different
semantics:

  - Anthropic / Bedrock: `input_tokens` EXCLUDES cache; cache reads/writes
    arrive separately and must be added to get the total prompt size.
  - Gemini / OpenAI: `input_tokens` ALREADY INCLUDES cached tokens
    (Google's `promptTokenCount`, OpenAI's `prompt_tokens`). Their
    `input_token_details.cache_*` are subsets of `input_tokens`.
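
As a sketch (TypeScript object literals; the Gemini numbers are the
ones from the issue report, the Anthropic numbers are illustrative):

  // Anthropic / Bedrock (additive): cache tokens are NOT in input_tokens
  const anthropicUsage = {
    input_tokens: 100, // non-cached portion only
    input_token_details: { cache_creation: 20, cache_read: 10 },
  };
  // total prompt = 100 + 20 + 10 = 130

  // Gemini / OpenAI (subset): cache tokens are ALREADY in input_tokens
  const geminiUsage = {
    input_tokens: 11125, // total prompt, cached portion included
    input_token_details: { cache_read: 7441 },
  };
  // non-cached portion = 11125 - 7441 = 3684; total prompt = 11125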

`recordCollectedUsage` treated both schemes as additive, so for cache-hit
requests on Gemini/OpenAI it added cache tokens on top of an
`input_tokens` value that already contained them, inflating the billed
prompt by the cache-hit fraction (e.g., a ~67% cache-hit rate means a
~1.67x overcharge). This matches the issue reporter's GCP billing
comparison.

Adds a small `splitUsage` helper that classifies the provider by model
name and computes `inputOnly` (the non-cached portion) plus the
all-inclusive `totalInput`, which feed both the spend math and the
returned `input_tokens` summary. The helper defaults to additive
semantics (the historical behavior), so unknown providers are
unaffected.
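
For illustration, the same numbers run through the helper as follows
(a sketch: `claude-sonnet` is a hypothetical non-matching model, and
the full implementation is in the `usage.ts` diff below):

  // Matches the Gemini/OpenAI pattern → subset semantics
  splitUsage({ input_tokens: 11125, model: 'gemini-3-flash-preview',
    input_token_details: { cache_read: 7441 } });
  // → { inputOnly: 3684, cacheCreation: 0, cacheRead: 7441, totalInput: 11125 }

  // No match (hypothetical model) → additive semantics
  splitUsage({ input_tokens: 100, model: 'claude-sonnet',
    input_token_details: { cache_creation: 20, cache_read: 10 } });
  // → { inputOnly: 100, cacheCreation: 20, cacheRead: 10, totalInput: 130 }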

Updates existing OpenAI-shaped tests that previously asserted the buggy
additive math, and adds Gemini regression tests using the exact numbers
from the issue report (input=11125, cache_read=7441 → non-cached
input=3684).

Anthropic / Bedrock paths remain bit-identical to before.

* 🔧 refactor: Classify Cache-Token Semantics by Provider, Not Model Name

Follow-up to the previous commit: replaces the model-name regex
(`gemini|gpt|o[1-9]|chatgpt`) with an explicit `Providers` enum lookup
keyed off the `usage.provider` field. `UsageMetadata.provider` already
existed in `IJobStore.ts` but was never populated.

  - `callbacks.js#ModelEndHandler` now attaches `usage.provider` from
    `agentContext.provider` alongside `usage.model`.
  - `usage.ts` uses a `SUBSET_PROVIDERS` set (`openAI`, `azureOpenAI`,
    `google`, `vertexai`, `xai`, `deepseek`, `openrouter`, `moonshot`)
    backed by the canonical `Providers` enum from
    `librechat-data-provider`.
  - `xai`, `deepseek`, `openrouter`, `moonshot` extend `ChatOpenAI` so
    they inherit subset semantics (verified in node_modules).
  - Defaults to additive when `usage.provider` is missing, so the title
    flow (which doesn't propagate provider) and any pre-this-PR usage
    entries keep their existing behavior.

Tests: switch fixtures from model-name signaling to explicit `provider`
field, plus a Vertex AI case and a "missing provider" fallback case.
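
Sketch of the resulting classification (enum string values as listed
in the `usage.ts` diff below; 'anthropic' stands in for any provider
outside the set):

  inputTokensIncludesCache('google');     // true  → subset semantics
  inputTokensIncludesCache('openrouter'); // true  → ChatOpenAI subclass
  inputTokensIncludesCache('anthropic');  // false → additive
  inputTokensIncludesCache(undefined);    // false → additive (historical default)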

Danny Avila, 2026-04-29 08:36:00 +09:00, committed by GitHub
parent 46a86d849f, commit 89bf2ab7b4
4 changed files with 215 additions and 34 deletions

callbacks.js (ModelEndHandler):

@@ -75,6 +75,9 @@ class ModelEndHandler {
     if (modelName) {
       usage.model = modelName;
     }
+    if (agentContext.provider) {
+      usage.provider = agentContext.provider;
+    }
     const taggedUsage = markSummarizationUsage(usage, metadata);

recordCollectedUsage bulk-path parity spec:

@@ -171,31 +171,55 @@ describe('recordCollectedUsage — bulk path parity', () => {
     });
   });

-  describe('cache token handling - OpenAI format', () => {
-    it('should route cache entries to structured path — same input_tokens as legacy', async () => {
+  describe('cache token handling - subset providers (input_tokens already includes cache)', () => {
+    it('subtracts cache from input portion in bulk docs for OpenAI', async () => {
       const collectedUsage: UsageMetadata[] = [
         {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: { cache_creation: 20, cache_read: 10 },
         },
       ];

       const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });

-      expect(result?.input_tokens).toBe(130); // 100 + 20 + 10
+      expect(result?.input_tokens).toBe(100);
       expect(mockInsertMany).toHaveBeenCalledTimes(1);
       expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
       expect(mockSpendTokens).not.toHaveBeenCalled();

       const docs = mockInsertMany.mock.calls[0][0];
       const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt');
-      expect(promptDoc.inputTokens).toBe(-100);
+      expect(promptDoc.inputTokens).toBe(-70);
       expect(promptDoc.writeTokens).toBe(-20);
       expect(promptDoc.readTokens).toBe(-10);
       expect(promptDoc.model).toBe('gpt-4');
     });
+
+    it('does not double-count cache_read for Gemini in bulk path — issue #12855', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 11125,
+          output_tokens: 20,
+          model: 'gemini-3-flash-preview',
+          provider: 'google',
+          input_token_details: { cache_read: 7441 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(result?.input_tokens).toBe(11125);
+      expect(mockInsertMany).toHaveBeenCalledTimes(1);
+      const docs = mockInsertMany.mock.calls[0][0];
+      const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt');
+      expect(promptDoc.inputTokens).toBe(-3684);
+      expect(promptDoc.readTokens).toBe(-7441);
+      expect(promptDoc.writeTokens || 0).toBe(0);
+    });
   });

   describe('cache token handling - Anthropic format', () => {

recordCollectedUsage spec:

@@ -197,13 +197,14 @@ describe('recordCollectedUsage', () => {
     });
   });

-  describe('cache token handling - OpenAI format', () => {
-    it('should use spendStructuredTokens for cache tokens (input_token_details)', async () => {
+  describe('cache token handling - subset providers (input_tokens already includes cache)', () => {
+    it('subtracts cache from input_tokens for OpenAI to avoid double-counting', async () => {
       const collectedUsage: UsageMetadata[] = [
         {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: {
             cache_creation: 20,
             cache_read: 10,
@@ -218,6 +219,110 @@ describe('recordCollectedUsage', () => {
       expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
       expect(mockSpendTokens).not.toHaveBeenCalled();

+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gpt-4' }),
+        {
+          promptTokens: { input: 70, write: 20, read: 10 },
+          completionTokens: 50,
+        },
+      );
+      expect(result?.input_tokens).toBe(100);
+    });
+
+    it('does not double-count cache_read for Gemini — issue #12855', async () => {
+      // Real numbers from the issue report
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 11125,
+          output_tokens: 20,
+          model: 'gemini-3-flash-preview',
+          provider: 'google',
+          input_token_details: { cache_read: 7441 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 3684, write: 0, read: 7441 },
+          completionTokens: 20,
+        },
+      );
+      expect(result?.input_tokens).toBe(11125);
+    });
+
+    it('also applies to Vertex AI', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 5000,
+          output_tokens: 100,
+          model: 'gemini-2.5-pro',
+          provider: 'vertexai',
+          input_token_details: { cache_read: 4000 },
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-2.5-pro' }),
+        {
+          promptTokens: { input: 1000, write: 0, read: 4000 },
+          completionTokens: 100,
+        },
+      );
+    });
+
+    it('handles cache_read >= input_tokens defensively (clamps inputOnly to 0)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 1000,
+          output_tokens: 30,
+          model: 'gemini-2.5-pro',
+          provider: 'google',
+          input_token_details: { cache_read: 1000 },
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-2.5-pro' }),
+        {
+          promptTokens: { input: 0, write: 0, read: 1000 },
+          completionTokens: 30,
+        },
+      );
+    });
+
+    it('falls through to additive (historical default) when provider is missing', async () => {
+      // Defensive: an unclassified or pre-this-PR usage entry should keep old behavior
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          input_token_details: { cache_creation: 20, cache_read: 10 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
       expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
         expect.objectContaining({ model: 'gpt-4' }),
         {
@@ -225,7 +330,7 @@ describe('recordCollectedUsage', () => {
           completionTokens: 50,
         },
       );
-      expect(result?.input_tokens).toBe(130); // 100 + 20 + 10
+      expect(result?.input_tokens).toBe(130);
     });
   });
@@ -397,6 +502,7 @@ describe('recordCollectedUsage', () => {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: { cache_creation: 20, cache_read: 10 },
         },
       ];
@@ -406,7 +512,8 @@ describe('recordCollectedUsage', () => {
         collectedUsage,
       });

-      expect(result).toEqual({ input_tokens: 130, output_tokens: 50 });
+      // openAI is a subset provider → input_tokens already includes cache
+      expect(result).toEqual({ input_tokens: 100, output_tokens: 50 });
     });
   });

usage.ts:

@@ -1,4 +1,5 @@
 import { logger } from '@librechat/data-schemas';
+import { Providers } from 'librechat-data-provider';
 import type { TCustomConfig, TTransactionsConfig } from 'librechat-data-provider';
 import type {
   StructuredTokenUsage,
@@ -22,6 +23,66 @@
   tokenUsage: StructuredTokenUsage,
 ) => Promise<unknown>;

+/**
+ * Providers whose `usage_metadata.input_tokens` ALREADY INCLUDES cached tokens
+ * (i.e. `input_token_details.cache_*` is a subset, not an additional charge):
+ *
+ * - Google / Vertex AI: `input_tokens` = `promptTokenCount` (includes `cachedContentTokenCount`)
+ * - OpenAI / Azure OpenAI: `input_tokens` = `prompt_tokens` (includes `prompt_tokens_details.cached_tokens`)
+ * - xAI, DeepSeek, OpenRouter, Moonshot: extend `ChatOpenAI`, same semantics
+ *
+ * Anthropic and Bedrock keep cache values separate from `input_tokens`, so they
+ * must be added back to compute the total prompt size; that is the historical
+ * additive default. Providers not listed here fall through to additive.
+ */
+const SUBSET_PROVIDERS: ReadonlySet<string> = new Set([
+  Providers.OPENAI,
+  Providers.AZURE,
+  Providers.GOOGLE,
+  Providers.VERTEXAI,
+  Providers.XAI,
+  Providers.DEEPSEEK,
+  Providers.OPENROUTER,
+  Providers.MOONSHOT,
+]);
+
+function inputTokensIncludesCache(provider?: string): boolean {
+  return provider != null && SUBSET_PROVIDERS.has(provider);
+}
+
+interface SplitUsage {
+  /** Non-cached input portion — what gets billed at the standard input rate */
+  inputOnly: number;
+  cacheCreation: number;
+  cacheRead: number;
+  /** Total prompt tokens including cached portion */
+  totalInput: number;
+}
+
+function splitUsage(usage: UsageMetadata): SplitUsage {
+  const cacheCreation =
+    Number(usage.input_token_details?.cache_creation) ||
+    Number(usage.cache_creation_input_tokens) ||
+    0;
+  const cacheRead =
+    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+  const rawInput = Number(usage.input_tokens) || 0;
+
+  if (inputTokensIncludesCache(usage.provider)) {
+    return {
+      inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
+      cacheCreation,
+      cacheRead,
+      totalInput: rawInput,
+    };
+  }
+
+  return {
+    inputOnly: rawInput,
+    cacheCreation,
+    cacheRead,
+    totalInput: rawInput + cacheCreation + cacheRead,
+  };
+}
+
 export interface RecordUsageDeps {
   spendTokens: SpendTokensFn;
   spendStructuredTokens: SpendStructuredTokensFn;
@@ -83,16 +144,7 @@
   }

   const firstUsage = messageUsages[0];
-  const input_tokens =
-    firstUsage == null
-      ? 0
-      : (firstUsage.input_tokens || 0) +
-        (Number(firstUsage.input_token_details?.cache_creation) ||
-          Number(firstUsage.cache_creation_input_tokens) ||
-          0) +
-        (Number(firstUsage.input_token_details?.cache_read) ||
-          Number(firstUsage.cache_read_input_tokens) ||
-          0);
+  const input_tokens = firstUsage == null ? 0 : splitUsage(firstUsage).totalInput;

   let total_output_tokens = 0;
@@ -109,12 +161,7 @@
       continue;
     }

-    const cache_creation =
-      Number(usage.input_token_details?.cache_creation) ||
-      Number(usage.cache_creation_input_tokens) ||
-      0;
-    const cache_read =
-      Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);

     total_output_tokens += Number(usage.output_tokens) || 0;
@@ -131,14 +178,14 @@
     if (useBulk) {
       const entries =
-        cache_creation > 0 || cache_read > 0
+        cacheCreation > 0 || cacheRead > 0
           ? prepareStructuredTokenSpend(
               txMetadata,
               {
                 promptTokens: {
-                  input: usage.input_tokens,
-                  write: cache_creation,
-                  read: cache_read,
+                  input: inputOnly,
+                  write: cacheCreation,
+                  read: cacheRead,
                 },
                 completionTokens: usage.output_tokens,
               },
@@ -147,7 +194,7 @@
           : prepareTokenSpend(
               txMetadata,
               {
-                promptTokens: usage.input_tokens,
+                promptTokens: inputOnly,
                 completionTokens: usage.output_tokens,
               },
               pricing,
@@ -156,13 +203,13 @@
       continue;
     }

-    if (cache_creation > 0 || cache_read > 0) {
+    if (cacheCreation > 0 || cacheRead > 0) {
       deps
         .spendStructuredTokens(txMetadata, {
           promptTokens: {
-            input: usage.input_tokens,
-            write: cache_creation,
-            read: cache_read,
+            input: inputOnly,
+            write: cacheCreation,
+            read: cacheRead,
           },
           completionTokens: usage.output_tokens,
         })
@@ -177,7 +224,7 @@
       deps
         .spendTokens(txMetadata, {
-          promptTokens: usage.input_tokens,
+          promptTokens: inputOnly,
           completionTokens: usage.output_tokens,
         })
         .catch((err) => {