🏷️ fix: Prevent Bedrock Cache Tokens from Inflating Completion Count (#13468)

* 🐛 fix: prevent Bedrock cache tokens from inflating completion count
* style: fix prettier formatting
2026-06-09 17:31:19 +00:00 · 2026-06-03 09:45:37 -04:00 · 2026-06-03 09:45:37 -04:00 · c50b3c58d5
commit c50b3c58d5
parent eae00250df
2 changed files with 130 additions and 9 deletions
--- a/packages/api/src/agents/usage.spec.ts
+++ b/packages/api/src/agents/usage.spec.ts
@@ -988,6 +988,107 @@ describe('recordCollectedUsage', () => {
    });
  });

+  describe('Bedrock prompt caching — completion token inflation regression', () => {
+    it('does not fold cache_creation into completion on the first cached step', async () => {
+      // Bedrock: total (5600) = input (100) + output (500) + cache_creation (5000), additive.
+      // Pre-fix, resolveCompletionTokens returned total - input = 5500 instead of output (500).
+      // NOTE(review): no `provider` is set; presumably treated as additive — confirm.
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 5600,
+          cache_creation_input_tokens: 5000,
+          cache_read_input_tokens: 0,
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'claude-sonnet-4-6' }),
+        {
+          promptTokens: { input: 100, write: 5000, read: 0 },
+          completionTokens: 500,
+        },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('does not fold cache_read into completion on subsequent cached steps', async () => {
+      // Bedrock: total (4500) = input (200) + output (300) + cache_read (4000) on a read step.
+      // Pre-fix, the step returned total - input = 4300 (output + cache_read) instead of 300.
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 200,
+          output_tokens: 300,
+          total_tokens: 4500,
+          cache_read_input_tokens: 4000,
+          cache_creation_input_tokens: 0,
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'claude-sonnet-4-6' }),
+        {
+          promptTokens: { input: 200, write: 0, read: 4000 },
+          completionTokens: 300,
+        },
+      );
+      expect(result?.output_tokens).toBe(300);
+    });
+
+    it('handles cache tokens in input_token_details format (alternate field path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 200,
+          output_tokens: 300,
+          total_tokens: 4500,
+          input_token_details: { cache_read: 4000, cache_creation: 0 }, // nested, not top-level
+          model: 'claude-sonnet-4-6',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, { ...baseParams, collectedUsage });
+
+      expect(result?.output_tokens).toBe(300);
+    });
+
+    it('accumulates only true output across a multi-step cached agent run', async () => {
+      // One cache-write step followed by four cache-read steps. Pre-fix, every step
+      // folded its cache tokens into completion, so the summed output was inflated.
+      const writeStep: UsageMetadata = {
+        input_tokens: 100,
+        output_tokens: 500,
+        total_tokens: 5600,
+        cache_creation_input_tokens: 5000,
+        cache_read_input_tokens: 0,
+        model: 'claude-sonnet-4-6',
+      };
+      const readSteps: UsageMetadata[] = Array.from({ length: 4 }, (_, i) => ({
+        input_tokens: 200,
+        output_tokens: 300 + i * 50,
+        total_tokens: 200 + (300 + i * 50) + 5000,
+        cache_read_input_tokens: 5000,
+        cache_creation_input_tokens: 0,
+        model: 'claude-sonnet-4-6',
+      }));
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage: [writeStep, ...readSteps],
+      });
+
+      // True output: 500 + 300 + 350 + 400 + 450 = 2000
+      const trueOutput = 500 + readSteps.reduce((sum, s) => sum + (s.output_tokens ?? 0), 0);
+      expect(result?.output_tokens).toBe(trueOutput);
+    });
+  });
+
  describe('bulk write with summarization usage', () => {
    let mockInsertMany: jest.Mock;
    let mockUpdateBalance: jest.Mock;
--- a/packages/api/src/agents/usage.ts
+++ b/packages/api/src/agents/usage.ts
@@ -58,22 +58,42 @@ function inputTokensIncludesCache(provider?: string): boolean {
 * `total_tokens === input_tokens + output_tokens`. Compliant providers
 * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
 * include any reasoning/thinking tokens inside `output_tokens` already,
- * so the invariant holds.
+ * so the invariant holds and this function is a no-op for them.
 *
- * Vertex AI Gemini through `@langchain/google-common`'s streaming path
- * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
- * leaving `total - input > output`. When that gap shows up we use the
- * invariant to recover the correct billable output (`total - input`).
- * Compliant providers have a zero gap, so this is a no-op for them.
+ * **Vertex AI undercount (issue #13006):** `@langchain/google-common`'s streaming
+ * path emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * so `total - input > output`. The gap is recovered as `total - input`.
 *
- * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ * **Bedrock / Anthropic cache inflation:** additive providers keep cache tokens
+ * separate from `input_tokens`, making
+ * `total = input + output + cache_read + cache_creation`. Without adjustment
+ * the Vertex recovery fires on every cached step and returns
+ * `output + cache_read + cache_creation` instead of `output`, inflating
+ * completion counts by orders of magnitude. The fix subtracts the cache
+ * adjustment before the gap test — but only for additive providers; subset
+ * providers (Google, OpenAI, …) already include cache inside `input_tokens`
+ * so their `cacheAdjustment` is zero and the Vertex recovery is unaffected.
 */
 function resolveCompletionTokens(usage: UsageMetadata): number {
  const output = Number(usage.output_tokens) || 0; // missing/NaN counts become 0
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;
-  if (total > input + output) {
-    return total - input;
+
+  // Additive providers (Bedrock, Anthropic) keep cache tokens out of
+  // input_tokens yet count them in total_tokens, so the total/input gap grows
+  // even when no thinking tokens are missing. Subtracting them keeps the gap
+  // check limited to genuine output_tokens undercounts (the Vertex case).
+  // When inputTokensIncludesCache(provider) is true, the adjustment is 0.
+  const cacheRead =
+    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+  const cacheCreation =
+    Number(usage.input_token_details?.cache_creation) ||
+    Number(usage.cache_creation_input_tokens) ||
+    0;
+  const cacheAdjustment = inputTokensIncludesCache(usage.provider) ? 0 : cacheRead + cacheCreation;
+
+  if (total > input + output + cacheAdjustment) {
+    return total - input - cacheAdjustment;
  }
  return output; // no unexplained gap: keep the reported output_tokens
 }