mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-06-09 17:31:19 +00:00
🏷️ fix: Prevent Bedrock Cache Tokens from Inflating Completion Count (#13468)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
GitNexus Index / index (push) Waiting to run
GitNexus Index / post-index (push) Blocked by required conditions
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
GitNexus Index / index (push) Waiting to run
GitNexus Index / post-index (push) Blocked by required conditions
* 🐛 fix: prevent Bedrock cache tokens from inflating completion count
* style: fix prettier formatting
This commit is contained in:
parent
eae00250df
commit
c50b3c58d5
2 changed files with 130 additions and 9 deletions
|
|
@ -988,6 +988,107 @@ describe('recordCollectedUsage', () => {
|
|||
});
|
||||
});
|
||||
|
||||
describe('Bedrock prompt caching — completion token inflation regression', () => {
  /** Model id shared by every fixture in this suite. */
  const MODEL = 'claude-sonnet-4-6';

  /**
   * Builds a usage record in the additive layout these tests exercise:
   * cache tokens are reported in their own flat fields and
   * `total_tokens = input + output + cache_creation + cache_read`.
   */
  const additiveUsage = (
    inputTokens: number,
    outputTokens: number,
    cacheCreation: number,
    cacheRead: number,
  ): UsageMetadata => ({
    input_tokens: inputTokens,
    output_tokens: outputTokens,
    total_tokens: inputTokens + outputTokens + cacheCreation + cacheRead,
    cache_creation_input_tokens: cacheCreation,
    cache_read_input_tokens: cacheRead,
    model: MODEL,
  });

  /** Runs the function under test with the suite's base params and the given usage list. */
  const record = (collectedUsage: UsageMetadata[]) =>
    recordCollectedUsage(deps, { ...baseParams, collectedUsage });

  it('does not fold cache_creation into completion on the first cached step', async () => {
    // Cache-write step: 100 input + 500 output + 5000 cache_creation = 5600 total.
    // The regression reported completion as 5500 (output + cache_creation)
    // rather than the true 500.
    const result = await record([additiveUsage(100, 500, 5000, 0)]);

    const expectedSpend = {
      promptTokens: { input: 100, write: 5000, read: 0 },
      completionTokens: 500,
    };
    expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
      expect.objectContaining({ model: MODEL }),
      expectedSpend,
    );
    expect(result?.output_tokens).toBe(500);
  });

  it('does not fold cache_read into completion on subsequent cached steps', async () => {
    // Cache-read step: 200 input + 300 output + 4000 cache_read = 4500 total.
    // The regression reported output + cache_read instead of output.
    const result = await record([additiveUsage(200, 300, 0, 4000)]);

    const expectedSpend = {
      promptTokens: { input: 200, write: 0, read: 4000 },
      completionTokens: 300,
    };
    expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
      expect.objectContaining({ model: MODEL }),
      expectedSpend,
    );
    expect(result?.output_tokens).toBe(300);
  });

  it('handles cache tokens in input_token_details format (alternate field path)', async () => {
    // Same numbers as the cache-read case, but the cache counts are nested
    // under `input_token_details` and the flat cache fields are absent.
    const nestedUsage: UsageMetadata = {
      input_tokens: 200,
      output_tokens: 300,
      total_tokens: 4500,
      input_token_details: { cache_read: 4000, cache_creation: 0 },
      model: MODEL,
    };

    const result = await record([nestedUsage]);

    expect(result?.output_tokens).toBe(300);
  });

  it('accumulates only true output across a multi-step cached agent run', async () => {
    // One cache-write step followed by four cache-read steps whose outputs
    // are 300, 350, 400 and 450. Folding cache tokens into completion on
    // each step would inflate the sum by the full cache size every time.
    const writeStep = additiveUsage(100, 500, 5000, 0);
    const readSteps = [0, 1, 2, 3].map((i) => additiveUsage(200, 300 + i * 50, 0, 5000));
    const allSteps = [writeStep, ...readSteps];

    const result = await record(allSteps);

    // True output: 500 + 300 + 350 + 400 + 450 = 2000
    let trueOutput = 0;
    for (const step of allSteps) {
      trueOutput += step.output_tokens ?? 0;
    }
    expect(result?.output_tokens).toBe(trueOutput);
  });
});
|
||||
|
||||
describe('bulk write with summarization usage', () => {
|
||||
let mockInsertMany: jest.Mock;
|
||||
let mockUpdateBalance: jest.Mock;
|
||||
|
|
|
|||
|
|
@ -58,22 +58,42 @@ function inputTokensIncludesCache(provider?: string): boolean {
|
|||
* `total_tokens === input_tokens + output_tokens`. Compliant providers
|
||||
* (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
|
||||
* include any reasoning/thinking tokens inside `output_tokens` already,
|
||||
* so the invariant holds.
|
||||
* so the invariant holds and this function is a no-op for them.
|
||||
*
|
||||
* Vertex AI Gemini through `@langchain/google-common`'s streaming path
|
||||
* emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
|
||||
* leaving `total - input > output`. When that gap shows up we use the
|
||||
* invariant to recover the correct billable output (`total - input`).
|
||||
* Compliant providers have a zero gap, so this is a no-op for them.
|
||||
* **Vertex AI undercount (issue #13006):** `@langchain/google-common`'s streaming
|
||||
* path emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
|
||||
* so `total - input > output`. The gap is recovered as `total - input`.
|
||||
*
|
||||
* Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
|
||||
* **Bedrock / Anthropic cache inflation:** additive providers keep cache tokens
|
||||
* separate from `input_tokens`, making
|
||||
* `total = input + output + cache_read + cache_creation`. Without adjustment
|
||||
* the Vertex recovery fires on every cached step and returns
|
||||
* `output + cache_read + cache_creation` instead of `output`, inflating
|
||||
* completion counts by orders of magnitude. The fix subtracts the cache
|
||||
* adjustment before the gap test — but only for additive providers; subset
|
||||
* providers (Google, OpenAI, …) already include cache inside `input_tokens`
|
||||
* so their `cacheAdjustment` is zero and the Vertex recovery is unaffected.
|
||||
*/
|
||||
function resolveCompletionTokens(usage: UsageMetadata): number {
  // `Number(x) || 0` collapses undefined, null and NaN to 0 so a missing
  // field never poisons the arithmetic below.
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;

  // Cache counts can arrive nested under `input_token_details` or as flat
  // `cache_*_input_tokens` fields; the first non-zero source wins.
  const cacheRead =
    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  const cacheCreation =
    Number(usage.input_token_details?.cache_creation) ||
    Number(usage.cache_creation_input_tokens) ||
    0;

  // For additive providers (Bedrock, Anthropic), cache tokens are separate
  // from input_tokens and are included in total_tokens, widening the gap
  // independently of any missing thinking tokens. Subtract them so the gap
  // check only fires when output_tokens genuinely undercounts (Vertex case).
  // Subset providers fold cache into input_tokens, so their adjustment is 0.
  const cacheAdjustment = inputTokensIncludesCache(usage.provider) ? 0 : cacheRead + cacheCreation;

  // This must be the ONLY gap test. An unadjusted `total > input + output`
  // check placed ahead of it returns `output + cache` on every cached step
  // and makes the adjustment unreachable for exactly the case it exists for.
  if (total > input + output + cacheAdjustment) {
    return total - input - cacheAdjustment;
  }
  return output;
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue