💎 fix: Stop Double-Counting Cache Tokens for Gemini/OpenAI in Usage Spend (#12868)
* 💎 fix: Stop Double-Counting Cache Tokens for Gemini/OpenAI in Usage Spend (#12855)

  Different providers report `usage_metadata.input_tokens` with different semantics:

  - Anthropic / Bedrock: `input_tokens` EXCLUDES cache; cache reads/writes arrive
    separately and must be added to get the total prompt size.
  - Gemini / OpenAI: `input_tokens` ALREADY INCLUDES cached tokens (Google's
    `promptTokenCount`, OpenAI's `prompt_tokens`). Their `input_token_details.cache_*`
    are subsets of `input_tokens`.

  `recordCollectedUsage` treated both schemes as additive, so for cache-hit requests
  on Gemini/OpenAI it added cache tokens on top of an `input_tokens` value that
  already contained them, overcharging users by the cache-hit rate (e.g., ~67% cache
  hit ≈ 1.67x overcharge). This matches the issue reporter's GCP billing comparison.

  Adds a small `splitUsage` helper that classifies the provider by model name and
  computes `inputOnly` (the non-cached portion) plus the all-inclusive `totalInput`
  for both the spend math and the returned `input_tokens` summary. The helper
  defaults to additive semantics (the historical behavior), so unknown providers are
  unaffected.

  Updates existing OpenAI-shaped tests that previously asserted the buggy additive
  math, and adds Gemini regression tests using the exact numbers from the issue
  report (input=11125, cache_read=7441 → input=3684). Anthropic / Bedrock paths
  remain bit-identical to before.

* 🔧 refactor: Classify Cache-Token Semantics by Provider, Not Model Name

  Follows up the previous commit. Replaces a model-name regex
  (`gemini|gpt|o[1-9]|chatgpt`) with an explicit `Providers` enum lookup keyed off
  the `usage.provider` field; `UsageMetadata.provider` already exists in
  `IJobStore.ts` but was never populated.

  - `callbacks.js#ModelEndHandler` now attaches `usage.provider` from
    `agentContext.provider` alongside `usage.model`.
  - `usage.ts` uses a `SUBSET_PROVIDERS` set (`openAI`, `azureOpenAI`, `google`,
    `vertexai`, `xai`, `deepseek`, `openrouter`, `moonshot`) backed by the canonical
    `Providers` enum from `librechat-data-provider`.
  - `xai`, `deepseek`, `openrouter`, `moonshot` extend `ChatOpenAI`, so they inherit
    subset semantics (verified in node_modules).
  - Defaults to additive when `usage.provider` is missing, so the title flow (which
    doesn't propagate provider) and any pre-this-PR usage entries keep their
    existing behavior.

  Tests: switch fixtures from model-name signaling to explicit `provider` field,
  plus a Vertex AI case and a "missing provider" fallback case.
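To make the overcharge concrete, a worked sketch in TypeScript using the numbers from the issue report (variable names are illustrative, not from the patch):

    // Gemini reports input_tokens = 11125, of which cache_read = 7441 is a subset.
    const inputTokens = 11125;
    const cacheRead = 7441;

    // Buggy additive accounting stacked the cached tokens on top of a total
    // that already contained them:
    const buggyPromptTotal = inputTokens + cacheRead; // 18566

    // Correct subset accounting: the total prompt is just input_tokens,
    // split into 3684 non-cached + 7441 cached for the rate math.
    const correctPromptTotal = inputTokens; // 11125

    // Overcharge factor at this ~67% cache-hit rate:
    console.log(buggyPromptTotal / correctPromptTotal); // ≈ 1.67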
This commit is contained in: parent 46a86d849f, commit 89bf2ab7b4
4 changed files with 215 additions and 34 deletions
callbacks.js

@@ -75,6 +75,9 @@ class ModelEndHandler {
     if (modelName) {
       usage.model = modelName;
     }
+    if (agentContext.provider) {
+      usage.provider = agentContext.provider;
+    }
 
     const taggedUsage = markSummarizationUsage(usage, metadata);
 
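Without this assignment every entry reaches the spend path unclassified, and the `splitUsage` helper added in `usage.ts` below falls back to additive semantics. A minimal sketch of the intended flow (literal values are illustrative):

    // As emitted by ModelEndHandler with the new provider tag:
    const tagged = {
      input_tokens: 11125,
      output_tokens: 20,
      model: 'gemini-3-flash-preview',
      provider: 'google', // copied from agentContext.provider
      input_token_details: { cache_read: 7441 },
    };

    // The title flow doesn't propagate provider, so its entries stay untagged:
    const untagged = { ...tagged, provider: undefined };

    // splitUsage(tagged)   -> subset semantics:  inputOnly 3684,  totalInput 11125
    // splitUsage(untagged) -> additive default:  inputOnly 11125, totalInput 18566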
Tests: bulk path parity

@@ -171,31 +171,55 @@ describe('recordCollectedUsage — bulk path parity', () => {
     });
   });
 
-  describe('cache token handling - OpenAI format', () => {
-    it('should route cache entries to structured path — same input_tokens as legacy', async () => {
+  describe('cache token handling - subset providers (input_tokens already includes cache)', () => {
+    it('subtracts cache from input portion in bulk docs for OpenAI', async () => {
       const collectedUsage: UsageMetadata[] = [
         {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: { cache_creation: 20, cache_read: 10 },
         },
       ];
 
       const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
 
-      expect(result?.input_tokens).toBe(130); // 100 + 20 + 10
+      expect(result?.input_tokens).toBe(100);
       expect(mockInsertMany).toHaveBeenCalledTimes(1);
       expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
       expect(mockSpendTokens).not.toHaveBeenCalled();
 
       const docs = mockInsertMany.mock.calls[0][0];
       const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt');
-      expect(promptDoc.inputTokens).toBe(-100);
+      expect(promptDoc.inputTokens).toBe(-70);
       expect(promptDoc.writeTokens).toBe(-20);
       expect(promptDoc.readTokens).toBe(-10);
       expect(promptDoc.model).toBe('gpt-4');
     });
+
+    it('does not double-count cache_read for Gemini in bulk path — issue #12855', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 11125,
+          output_tokens: 20,
+          model: 'gemini-3-flash-preview',
+          provider: 'google',
+          input_token_details: { cache_read: 7441 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(result?.input_tokens).toBe(11125);
+      expect(mockInsertMany).toHaveBeenCalledTimes(1);
+
+      const docs = mockInsertMany.mock.calls[0][0];
+      const promptDoc = docs.find((d: { tokenType: string }) => d.tokenType === 'prompt');
+      expect(promptDoc.inputTokens).toBe(-3684);
+      expect(promptDoc.readTokens).toBe(-7441);
+      expect(promptDoc.writeTokens || 0).toBe(0);
+    });
   });
 
   describe('cache token handling - Anthropic format', () => {
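A quick cross-check of those bulk-document values (my arithmetic, not an assertion from the patch): for subset providers the negated split portions must re-sum to the provider-reported total.

    // OpenAI fixture:  input + write + read = original input_tokens
    console.assert(-70 + -20 + -10 === -100);
    // Gemini fixture (issue #12855): no write tokens
    console.assert(-3684 + -7441 === -11125);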
Tests: recordCollectedUsage

@@ -197,13 +197,14 @@ describe('recordCollectedUsage', () => {
     });
   });
 
-  describe('cache token handling - OpenAI format', () => {
-    it('should use spendStructuredTokens for cache tokens (input_token_details)', async () => {
+  describe('cache token handling - subset providers (input_tokens already includes cache)', () => {
+    it('subtracts cache from input_tokens for OpenAI to avoid double-counting', async () => {
       const collectedUsage: UsageMetadata[] = [
         {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: {
             cache_creation: 20,
             cache_read: 10,
@@ -218,6 +219,110 @@ describe('recordCollectedUsage', () => {
 
       expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
       expect(mockSpendTokens).not.toHaveBeenCalled();
       expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
         expect.objectContaining({ model: 'gpt-4' }),
         {
+          promptTokens: { input: 70, write: 20, read: 10 },
+          completionTokens: 50,
+        },
+      );
+      expect(result?.input_tokens).toBe(100);
+    });
+
+    it('does not double-count cache_read for Gemini — issue #12855', async () => {
+      // Real numbers from the issue report
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 11125,
+          output_tokens: 20,
+          model: 'gemini-3-flash-preview',
+          provider: 'google',
+          input_token_details: { cache_read: 7441 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 3684, write: 0, read: 7441 },
+          completionTokens: 20,
+        },
+      );
+      expect(result?.input_tokens).toBe(11125);
+    });
+
+    it('also applies to Vertex AI', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 5000,
+          output_tokens: 100,
+          model: 'gemini-2.5-pro',
+          provider: 'vertexai',
+          input_token_details: { cache_read: 4000 },
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-2.5-pro' }),
+        {
+          promptTokens: { input: 1000, write: 0, read: 4000 },
+          completionTokens: 100,
+        },
+      );
+    });
+
+    it('handles cache_read >= input_tokens defensively (clamps inputOnly to 0)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 1000,
+          output_tokens: 30,
+          model: 'gemini-2.5-pro',
+          provider: 'google',
+          input_token_details: { cache_read: 1000 },
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-2.5-pro' }),
+        {
+          promptTokens: { input: 0, write: 0, read: 1000 },
+          completionTokens: 30,
+        },
+      );
+    });
+
+    it('falls through to additive (historical default) when provider is missing', async () => {
+      // Defensive: an unclassified or pre-this-PR usage entry should keep old behavior
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          input_token_details: { cache_creation: 20, cache_read: 10 },
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gpt-4' }),
+        {
@@ -225,7 +330,7 @@ describe('recordCollectedUsage', () => {
           completionTokens: 50,
         },
       );
-      expect(result?.input_tokens).toBe(130); // 100 + 20 + 10
+      expect(result?.input_tokens).toBe(130);
     });
   });
 
@@ -397,6 +502,7 @@ describe('recordCollectedUsage', () => {
           input_tokens: 100,
           output_tokens: 50,
           model: 'gpt-4',
+          provider: 'openAI',
           input_token_details: { cache_creation: 20, cache_read: 10 },
         },
       ];
@@ -406,7 +512,8 @@ describe('recordCollectedUsage', () => {
         collectedUsage,
       });
 
-      expect(result).toEqual({ input_tokens: 130, output_tokens: 50 });
+      // openAI is a subset provider → input_tokens already includes cache
+      expect(result).toEqual({ input_tokens: 100, output_tokens: 50 });
     });
   });
 
usage.ts

@@ -1,4 +1,5 @@
 import { logger } from '@librechat/data-schemas';
+import { Providers } from 'librechat-data-provider';
 import type { TCustomConfig, TTransactionsConfig } from 'librechat-data-provider';
 import type {
   StructuredTokenUsage,
@@ -22,6 +23,66 @@ type SpendStructuredTokensFn = (
   tokenUsage: StructuredTokenUsage,
 ) => Promise<unknown>;
 
+/**
+ * Providers whose `usage_metadata.input_tokens` ALREADY INCLUDES cached tokens
+ * (i.e. `input_token_details.cache_*` is a subset, not an additional charge):
+ *
+ * - Google / Vertex AI: `input_tokens` = `promptTokenCount` (includes `cachedContentTokenCount`)
+ * - OpenAI / Azure OpenAI: `input_tokens` = `prompt_tokens` (includes `prompt_tokens_details.cached_tokens`)
+ * - xAI, DeepSeek, OpenRouter, Moonshot: extend `ChatOpenAI`, same semantics
+ *
+ * Anthropic and Bedrock keep cache values separate from `input_tokens`, so they
+ * must be added back to compute the total prompt size — that's the historical
+ * additive default. Providers not listed here fall through to additive.
+ */
+const SUBSET_PROVIDERS: ReadonlySet<string> = new Set([
+  Providers.OPENAI,
+  Providers.AZURE,
+  Providers.GOOGLE,
+  Providers.VERTEXAI,
+  Providers.XAI,
+  Providers.DEEPSEEK,
+  Providers.OPENROUTER,
+  Providers.MOONSHOT,
+]);
+
+function inputTokensIncludesCache(provider?: string): boolean {
+  return provider != null && SUBSET_PROVIDERS.has(provider);
+}
+
+interface SplitUsage {
+  /** Non-cached input portion — what gets billed at the standard input rate */
+  inputOnly: number;
+  cacheCreation: number;
+  cacheRead: number;
+  /** Total prompt tokens including cached portion */
+  totalInput: number;
+}
+
+function splitUsage(usage: UsageMetadata): SplitUsage {
+  const cacheCreation =
+    Number(usage.input_token_details?.cache_creation) ||
+    Number(usage.cache_creation_input_tokens) ||
+    0;
+  const cacheRead =
+    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+  const rawInput = Number(usage.input_tokens) || 0;
+  if (inputTokensIncludesCache(usage.provider)) {
+    return {
+      inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
+      cacheCreation,
+      cacheRead,
+      totalInput: rawInput,
+    };
+  }
+  return {
+    inputOnly: rawInput,
+    cacheCreation,
+    cacheRead,
+    totalInput: rawInput + cacheCreation + cacheRead,
+  };
+}
+
 export interface RecordUsageDeps {
   spendTokens: SpendTokensFn;
   spendStructuredTokens: SpendStructuredTokensFn;
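A quick read of the helper against the fixtures above (hypothetical standalone calls; the return values follow directly from the code):

    // Subset provider: cache_read is carved out of input_tokens.
    splitUsage({
      input_tokens: 11125,
      output_tokens: 20,
      provider: 'google',
      input_token_details: { cache_read: 7441 },
    });
    // -> { inputOnly: 3684, cacheCreation: 0, cacheRead: 7441, totalInput: 11125 }

    // Missing provider: additive default, identical to the pre-PR math.
    splitUsage({
      input_tokens: 100,
      output_tokens: 50,
      input_token_details: { cache_creation: 20, cache_read: 10 },
    });
    // -> { inputOnly: 100, cacheCreation: 20, cacheRead: 10, totalInput: 130 }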
@@ -83,16 +144,7 @@ export async function recordCollectedUsage(
   }
 
   const firstUsage = messageUsages[0];
-  const input_tokens =
-    firstUsage == null
-      ? 0
-      : (firstUsage.input_tokens || 0) +
-        (Number(firstUsage.input_token_details?.cache_creation) ||
-          Number(firstUsage.cache_creation_input_tokens) ||
-          0) +
-        (Number(firstUsage.input_token_details?.cache_read) ||
-          Number(firstUsage.cache_read_input_tokens) ||
-          0);
+  const input_tokens = firstUsage == null ? 0 : splitUsage(firstUsage).totalInput;
 
   let total_output_tokens = 0;
 
@@ -109,12 +161,7 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const cache_creation =
-      Number(usage.input_token_details?.cache_creation) ||
-      Number(usage.cache_creation_input_tokens) ||
-      0;
-    const cache_read =
-      Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
 
     total_output_tokens += Number(usage.output_tokens) || 0;
 
@@ -131,14 +178,14 @@ export async function recordCollectedUsage(
 
     if (useBulk) {
       const entries =
-        cache_creation > 0 || cache_read > 0
+        cacheCreation > 0 || cacheRead > 0
           ? prepareStructuredTokenSpend(
               txMetadata,
               {
                 promptTokens: {
-                  input: usage.input_tokens,
-                  write: cache_creation,
-                  read: cache_read,
+                  input: inputOnly,
+                  write: cacheCreation,
+                  read: cacheRead,
                 },
                 completionTokens: usage.output_tokens,
               },
@@ -147,7 +194,7 @@ export async function recordCollectedUsage(
           : prepareTokenSpend(
               txMetadata,
               {
-                promptTokens: usage.input_tokens,
+                promptTokens: inputOnly,
                 completionTokens: usage.output_tokens,
               },
               pricing,
@@ -156,13 +203,13 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    if (cache_creation > 0 || cache_read > 0) {
+    if (cacheCreation > 0 || cacheRead > 0) {
       deps
         .spendStructuredTokens(txMetadata, {
           promptTokens: {
-            input: usage.input_tokens,
-            write: cache_creation,
-            read: cache_read,
+            input: inputOnly,
+            write: cacheCreation,
+            read: cacheRead,
           },
           completionTokens: usage.output_tokens,
         })
@@ -177,7 +224,7 @@ export async function recordCollectedUsage(
 
     deps
       .spendTokens(txMetadata, {
-        promptTokens: usage.input_tokens,
+        promptTokens: inputOnly,
         completionTokens: usage.output_tokens,
       })
       .catch((err) => {