Mirror of https://github.com/danny-avila/LibreChat.git (synced 2026-05-13 16:07:30 +00:00)
🧠 fix: charge Gemini reasoning tokens in agent usage accounting (#13014)
* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

  Resolves #13006. `usage.ts` previously billed `usage.output_tokens` directly. For Vertex AI Gemini thinking models, `@langchain/google-common`'s streaming path emits `output_tokens = candidatesTokenCount` only, dropping `thoughtsTokenCount`. Reasoning was billed at zero and the `total_tokens === input_tokens + output_tokens` invariant was broken.

  The fix lives in agents (danny-avila/agents#157), but this change is also a defense-in-depth backstop in case agents misses a path or another provider exhibits the same shape. `resolveCompletionTokens(usage)` adds `output_token_details.reasoning` back when (and only when) the gap is present (`total - input > output`), so providers that already include reasoning in `output_tokens` (OpenAI o-series, Anthropic, the Google-API wrapper) are no-ops; there is no double-counting.

  - `SplitUsage` gains a `completion` field; all four billing call sites in `processUsageGroup` use it instead of `usage.output_tokens`.
  - `total_output_tokens` in the result also reflects the corrected count.
  - The `UsageMetadata` interface in `IJobStore.ts` adds the `output_token_details` field for type safety.
  - 4 new tests in `usage.spec.ts` cover: the Vertex undercount fix, the OpenAI no-double-count case, the structured spend path with cache + reasoning, and the no-op when no details are present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

  The initial fix gated the correction on `output_token_details.reasoning > 0`, which doesn't help in the live failure case: when google-common's stream emits the buggy fallback usage_metadata, `output_token_details` is empty (`{}`) and the gate exits early. Live debugging showed the reliable signal is the documented invariant itself: `total_tokens === input_tokens + output_tokens`. When buggy streams undercount output, total exceeds input + output by exactly the unbilled reasoning, so `total - input` is used as the corrected output. This is provider-agnostic and stays a no-op for compliant providers (OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the gap is zero.

  Live verified end-to-end against gemini-3-flash-preview:
  - With the agents fix in place: output_tokens=437 → billed 437 (no-op)
  - Backstop only (no agents fix, buggy input): raw 135, billed 297 (= total 309 - input 12, matching the actual API charge)

  Updated tests to cover both scenarios.
parent: 93c4ef4ba8
commit: 0fe203aaca
3 changed files with 171 additions and 6 deletions
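As a minimal sketch of the correction described in the commit message (the real implementation is `resolveCompletionTokens` in `usage.ts`, shown in the diff below; the `billableOutput` name and the trimmed `Usage` shape here are illustrative only, not code from the repository):

  // Trimmed usage shape for illustration; the real UsageMetadata lives in IJobStore.ts.
  interface Usage {
    input_tokens?: number;
    output_tokens?: number;
    total_tokens?: number;
  }

  // Invariant-based gap check: when total_tokens exceeds input_tokens + output_tokens,
  // the difference is the unbilled reasoning, so bill total - input instead of the
  // undercounted output. A zero gap leaves compliant providers untouched.
  function billableOutput(usage: Usage): number {
    const output = Number(usage.output_tokens) || 0;
    const total = Number(usage.total_tokens) || 0;
    const input = Number(usage.input_tokens) || 0;
    return total > input + output ? total - input : output;
  }

  billableOutput({ input_tokens: 12, output_tokens: 135, total_tokens: 309 });  // 297, the live-verified backstop case
  billableOutput({ input_tokens: 100, output_tokens: 500, total_tokens: 600 }); // 500, invariant holds, no-op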
usage.spec.ts

@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('reasoning token handling - issue #13006', () => {
+    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 80657, completionTokens: 2608 },
+      );
+      expect(result?.output_tokens).toBe(2608);
+    });
+
+    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 12,
+          output_tokens: 135,
+          total_tokens: 309,
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 12, completionTokens: 297 },
+      );
+      expect(result?.output_tokens).toBe(297);
+    });
+
+    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 600,
+          output_token_details: { reasoning: 200 },
+          model: 'o1-preview',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'o1-preview' }),
+        { promptTokens: 100, completionTokens: 500 },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('routes correction through structured spend when cache tokens are present', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          input_token_details: { cache_read: 30000 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 50657, write: 0, read: 30000 },
+          completionTokens: 2608,
+        },
+      );
+    });
+
+    it('no-op when total_tokens is absent or zero', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.anything(),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(result?.output_tokens).toBe(50);
+    });
+  });
+
   describe('mixed cache and non-cache entries', () => {
     it('should handle mixed entries correctly', async () => {
       const collectedUsage: UsageMetadata[] = [
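For reference, the expected values in the tests above follow directly from the invariant-based correction:

- Vertex undercount: 83265 (total) - 80657 (input) = 2608 billed output tokens, i.e. the 766 raw output plus the 1842 reasoning tokens.
- Raw google-common path: 309 - 12 = 297.
- OpenAI o-series: 600 - 100 = 500 = output_tokens, so the invariant already holds and nothing changes.
- Structured spend: non-cached input is 80657 - 30000 (cache_read) = 50657, and the output is corrected to 2608 as above.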
usage.ts

@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
   return provider != null && SUBSET_PROVIDERS.has(provider);
 }
 
+/**
+ * Resolves `completionTokens` for billing, repairing providers whose
+ * `usage_metadata.output_tokens` undercounts.
+ *
+ * The documented `UsageMetadata` contract (`@langchain/core`) is
+ * `total_tokens === input_tokens + output_tokens`. Compliant providers
+ * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
+ * include any reasoning/thinking tokens inside `output_tokens` already,
+ * so the invariant holds.
+ *
+ * Vertex AI Gemini through `@langchain/google-common`'s streaming path
+ * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * leaving `total - input > output`. When that gap shows up we use the
+ * invariant to recover the correct billable output (`total - input`).
+ * Compliant providers have a zero gap, so this is a no-op for them.
+ *
+ * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ */
+function resolveCompletionTokens(usage: UsageMetadata): number {
+  const output = Number(usage.output_tokens) || 0;
+  const total = Number(usage.total_tokens) || 0;
+  const input = Number(usage.input_tokens) || 0;
+  if (total > input + output) {
+    return total - input;
+  }
+  return output;
+}
+
 interface SplitUsage {
   /** Non-cached input portion — what gets billed at the standard input rate */
   inputOnly: number;
@@ -57,6 +85,8 @@ interface SplitUsage {
   cacheRead: number;
   /** Total prompt tokens including cached portion */
   totalInput: number;
+  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
+  completion: number;
 }
 
 function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
   const cacheRead =
     Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
   const rawInput = Number(usage.input_tokens) || 0;
+  const completion = resolveCompletionTokens(usage);
   if (inputTokensIncludesCache(usage.provider)) {
     return {
       inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
       cacheCreation,
       cacheRead,
       totalInput: rawInput,
+      completion,
     };
   }
   return {
@@ -80,6 +112,7 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
     cacheCreation,
     cacheRead,
     totalInput: rawInput + cacheCreation + cacheRead,
+    completion,
   };
 }
 
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+    const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);
 
-    total_output_tokens += Number(usage.output_tokens) || 0;
+    total_output_tokens += completion;
 
     const txMetadata: TxMetadata = {
       user,
@@ -187,7 +220,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       )
@@ -195,7 +228,7 @@
         txMetadata,
         {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       );
@@ -211,7 +244,7 @@
           write: cacheCreation,
           read: cacheRead,
         },
-        completionTokens: usage.output_tokens,
+        completionTokens: completion,
       })
       .catch((err) => {
         logger.error(
@@ -225,7 +258,7 @@
     deps
       .spendTokens(txMetadata, {
         promptTokens: inputOnly,
-        completionTokens: usage.output_tokens,
+        completionTokens: completion,
      })
      .catch((err) => {
        logger.error(
IJobStore.ts

@@ -98,6 +98,16 @@ export interface UsageMetadata {
    * Present for Claude models. Mutually exclusive with input_token_details.
    */
   cache_read_input_tokens?: number;
+  /**
+   * Breakdown of output token counts. Per the LangChain core contract,
+   * `output_tokens` is the sum of all output token types — these fields
+   * are subsets of `output_tokens`, *not* additional charges.
+   */
+  output_token_details?: {
+    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
+    reasoning?: number;
+    audio?: number;
+  };
 }
 
 /**
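To make the subset semantics concrete, using the values from the o1-preview test above (a sketch, not code from the repository):

  const usage: UsageMetadata = {
    input_tokens: 100,
    output_tokens: 500, // already includes the 200 reasoning tokens
    total_tokens: 600,  // 100 + 500, so the invariant holds
    output_token_details: { reasoning: 200 },
  };
  // Billable output stays 500; adding reasoning on top (700) would double-count.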