🏷️ fix: Prevent Bedrock Cache Tokens from Inflating Completion Count (#13468)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
GitNexus Index / index (push) Waiting to run
GitNexus Index / post-index (push) Blocked by required conditions

* 🐛 fix: prevent Bedrock cache tokens from inflating completion count

* style: fix prettier formatting
This commit is contained in:
Joohan(Lucas) 2026-06-03 09:45:37 -04:00 committed by GitHub
parent eae00250df
commit c50b3c58d5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 130 additions and 9 deletions

View file

@ -988,6 +988,107 @@ describe('recordCollectedUsage', () => {
});
});
describe('Bedrock prompt caching — completion token inflation regression', () => {
  // Every fixture in this suite reports the same model id.
  const cachedModel = 'claude-sonnet-4-6';

  it('does not fold cache_creation into completion on the first cached step', async () => {
    // Bedrock reports total = input + output + cache_creation (additive, not a
    // subset of input). Prior to the fix the resolved completion count was
    // output + cache_creation (5500) rather than output (500).
    const cacheWriteStep: UsageMetadata = {
      input_tokens: 100,
      output_tokens: 500,
      total_tokens: 5600,
      cache_creation_input_tokens: 5000,
      cache_read_input_tokens: 0,
      model: cachedModel,
    };

    const summary = await recordCollectedUsage(deps, {
      ...baseParams,
      collectedUsage: [cacheWriteStep],
    });

    expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
      expect.objectContaining({ model: 'claude-sonnet-4-6' }),
      {
        promptTokens: { input: 100, write: 5000, read: 0 },
        completionTokens: 500,
      },
    );
    expect(summary?.output_tokens).toBe(500);
  });

  it('does not fold cache_read into completion on subsequent cached steps', async () => {
    // Bedrock reports total = input + output + cache_read on each read step.
    // Prior to the fix every such step resolved to output + cache_read.
    const cacheReadStep: UsageMetadata = {
      input_tokens: 200,
      output_tokens: 300,
      total_tokens: 4500,
      cache_read_input_tokens: 4000,
      cache_creation_input_tokens: 0,
      model: cachedModel,
    };

    const summary = await recordCollectedUsage(deps, {
      ...baseParams,
      collectedUsage: [cacheReadStep],
    });

    expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
      expect.objectContaining({ model: 'claude-sonnet-4-6' }),
      {
        promptTokens: { input: 200, write: 0, read: 4000 },
        completionTokens: 300,
      },
    );
    expect(summary?.output_tokens).toBe(300);
  });

  it('handles cache tokens in input_token_details format (alternate field path)', async () => {
    // Same numbers as the cache-read case, but the cache counts are supplied
    // through the nested `input_token_details` object instead of flat fields.
    const nestedDetailsStep: UsageMetadata = {
      input_tokens: 200,
      output_tokens: 300,
      total_tokens: 4500,
      input_token_details: { cache_read: 4000, cache_creation: 0 },
      model: cachedModel,
    };

    const summary = await recordCollectedUsage(deps, {
      ...baseParams,
      collectedUsage: [nestedDetailsStep],
    });

    expect(summary?.output_tokens).toBe(300);
  });

  it('accumulates only true output across a multi-step cached agent run', async () => {
    // One cache-write step followed by four cache-read steps. Without the fix
    // each step folds its cache tokens into completion, inflating the total by
    // the full cache size.
    const writeStep: UsageMetadata = {
      input_tokens: 100,
      output_tokens: 500,
      total_tokens: 5600,
      cache_creation_input_tokens: 5000,
      cache_read_input_tokens: 0,
      model: cachedModel,
    };
    const readSteps: UsageMetadata[] = [0, 1, 2, 3].map((stepIndex): UsageMetadata => {
      const stepOutput = 300 + stepIndex * 50;
      return {
        input_tokens: 200,
        output_tokens: stepOutput,
        total_tokens: 200 + stepOutput + 5000,
        cache_read_input_tokens: 5000,
        cache_creation_input_tokens: 0,
        model: cachedModel,
      };
    });

    const summary = await recordCollectedUsage(deps, {
      ...baseParams,
      collectedUsage: [writeStep, ...readSteps],
    });

    // Expected true output: 500 + 300 + 350 + 400 + 450 = 2000
    let trueOutput = 500;
    for (const step of readSteps) {
      trueOutput += step.output_tokens ?? 0;
    }
    expect(summary?.output_tokens).toBe(trueOutput);
  });
});
describe('bulk write with summarization usage', () => {
let mockInsertMany: jest.Mock;
let mockUpdateBalance: jest.Mock;

View file

@ -58,22 +58,42 @@ function inputTokensIncludesCache(provider?: string): boolean {
* `total_tokens === input_tokens + output_tokens`. Compliant providers
* (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
* include any reasoning/thinking tokens inside `output_tokens` already,
* so the invariant holds.
* so the invariant holds and this function is a no-op for them.
*
* Vertex AI Gemini through `@langchain/google-common`'s streaming path
* emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
* leaving `total - input > output`. When that gap shows up we use the
* invariant to recover the correct billable output (`total - input`).
* Compliant providers have a zero gap, so this is a no-op for them.
* **Vertex AI undercount (issue #13006):** `@langchain/google-common`'s streaming
* path emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
* so `total - input > output`. The gap is recovered as `total - input`.
*
* Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
* **Bedrock / Anthropic cache inflation:** additive providers keep cache tokens
* separate from `input_tokens`, making
* `total = input + output + cache_read + cache_creation`. Without adjustment
* the Vertex recovery fires on every cached step and returns
* `output + cache_read + cache_creation` instead of `output`, inflating
* completion counts by orders of magnitude. The fix subtracts the cache
* adjustment before the gap test but only for additive providers; subset
* providers (Google, OpenAI, etc.) already include cache inside `input_tokens`
* so their `cacheAdjustment` is zero and the Vertex recovery is unaffected.
*/
function resolveCompletionTokens(usage: UsageMetadata): number {
  // Missing or non-numeric counts are treated as 0.
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;

  // Cache counts may be supplied nested under `input_token_details` or as the
  // flat `cache_*_input_tokens` fields; the nested value wins when it is a
  // non-zero number, otherwise the flat field is used, otherwise 0.
  const cacheRead =
    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  const cacheCreation =
    Number(usage.input_token_details?.cache_creation) ||
    Number(usage.cache_creation_input_tokens) ||
    0;

  // For additive providers (Bedrock, Anthropic), cache tokens are separate
  // from input_tokens and are included in total_tokens, widening the gap
  // independently of any missing thinking tokens. Subtract them so the gap
  // check only fires when output_tokens genuinely undercounts (Vertex case).
  // Subset providers fold cache into input_tokens, so their adjustment is 0.
  const cacheAdjustment = inputTokensIncludesCache(usage.provider) ? 0 : cacheRead + cacheCreation;

  // A gap that remains after the cache adjustment means output_tokens is an
  // undercount; recover the completion count from the total instead.
  // NOTE: there must be no unadjusted `total > input + output` check ahead of
  // this one — it would return `total - input` and re-introduce the inflation.
  if (total > input + output + cacheAdjustment) {
    return total - input - cacheAdjustment;
  }
  return output;
}