From 0fe203aaca6b69d9bb76c38d3f6f0ca00ac72779 Mon Sep 17 00:00:00 2001
From: Danny Avila
Date: Fri, 8 May 2026 10:29:35 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=A0=20fix:=20charge=20Gemini=20reasoni?=
 =?UTF-8?q?ng=20tokens=20in=20agent=20usage=20accounting=20(#13014)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

Resolves #13006.

`usage.ts` previously billed `usage.output_tokens` directly. For Vertex AI
Gemini thinking models, `@langchain/google-common`'s streaming path emits
`output_tokens = candidatesTokenCount` only, dropping `thoughtsTokenCount`.
Reasoning was billed at zero and the
`total_tokens === input_tokens + output_tokens` invariant was broken.

The fix lives in agents (danny-avila/agents#157) — but this is also a
defense-in-depth backstop in case agents misses a path or another provider
exhibits the same shape.

`resolveCompletionTokens(usage)` adds `output_token_details.reasoning` back
when (and only when) the gap is present (`total - input > output`), so
providers that already include reasoning in `output_tokens` (OpenAI
o-series, Anthropic, the Google-API wrapper) are no-ops — no
double-counting.

- `SplitUsage` gains a `completion` field; all four billing call sites in
  `processUsageGroup` use it instead of `usage.output_tokens`.
- `total_output_tokens` in the result also reflects the corrected count.
- `UsageMetadata` interface in `IJobStore.ts` adds the `output_token_details`
  field for type safety.
- 4 new tests in `usage.spec.ts` cover: Vertex undercount fix, OpenAI
  no-double-count, structured spend path with cache + reasoning, no-op when
  no details present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

Initial fix gated the correction on `output_token_details.reasoning > 0`,
which doesn't help in the live failure case: when google-common's stream
emits the buggy fallback usage_metadata, output_token_details is empty ({})
and the gate exits early.

Live debugging showed the reliable signal is the documented invariant
itself: `total_tokens === input_tokens + output_tokens`. When buggy streams
undercount output, total exceeds input + output by exactly the unbilled
reasoning. Use `total - input` as the corrected output. This is
provider-agnostic and stays a no-op for compliant providers
(OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the gap
is zero.

Live verified end-to-end against gemini-3-flash-preview:

- With agents fix in place: output_tokens=437 → billed 437 (no-op)
- Backstop only (no agents fix, buggy input): raw 135, billed 297
  (= total 309 - input 12, matches actual API charge)

Updated tests to cover both scenarios.
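For reference, a minimal standalone sketch of the invariant-based check
(same logic as `resolveCompletionTokens` in the diff below; `UsageLike` is a
simplified stand-in for the full `UsageMetadata` interface, not a type from
the codebase):

```ts
// Simplified stand-in for UsageMetadata; only the fields the check reads.
interface UsageLike {
  input_tokens?: number;
  output_tokens?: number;
  total_tokens?: number;
}

// Contract: total_tokens === input_tokens + output_tokens. A positive gap
// means the stream undercounted output (unbilled reasoning tokens), so
// total - input recovers the correct billable count; when the invariant
// already holds, the gap is zero and output_tokens passes through unchanged.
function resolveCompletionTokens(usage: UsageLike): number {
  const output = Number(usage.output_tokens) || 0;
  const total = Number(usage.total_tokens) || 0;
  const input = Number(usage.input_tokens) || 0;
  return total > input + output ? total - input : output;
}

// Buggy google-common stream shape from the live repro: 309 - 12 = 297.
console.log(resolveCompletionTokens({ input_tokens: 12, output_tokens: 135, total_tokens: 309 }));
// Compliant shape (invariant holds): returns 500 unchanged.
console.log(resolveCompletionTokens({ input_tokens: 100, output_tokens: 500, total_tokens: 600 }));
```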
---
 packages/api/src/agents/usage.spec.ts        | 122 ++++++++++++++++++
 packages/api/src/agents/usage.ts             |  45 ++++++-
 .../api/src/stream/interfaces/IJobStore.ts   |  10 ++
 3 files changed, 171 insertions(+), 6 deletions(-)

diff --git a/packages/api/src/agents/usage.spec.ts b/packages/api/src/agents/usage.spec.ts
index b9656ba2ec..bee0c5248b 100644
--- a/packages/api/src/agents/usage.spec.ts
+++ b/packages/api/src/agents/usage.spec.ts
@@ -364,6 +364,128 @@ describe('recordCollectedUsage', () => {
     });
   });
 
+  describe('reasoning token handling - issue #13006', () => {
+    it('uses total - input when output_tokens undercounts (Vertex stream undercount with details present)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 80657, completionTokens: 2608 },
+      );
+      expect(result?.output_tokens).toBe(2608);
+    });
+
+    it('uses total - input even when output_token_details is missing (raw langchain google-common path)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 12,
+          output_tokens: 135,
+          total_tokens: 309,
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 12, completionTokens: 297 },
+      );
+      expect(result?.output_tokens).toBe(297);
+    });
+
+    it('does not change output when invariant already holds (OpenAI o-series, reasoning already a subset)', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 500,
+          total_tokens: 600,
+          output_token_details: { reasoning: 200 },
+          model: 'o1-preview',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'o1-preview' }),
+        { promptTokens: 100, completionTokens: 500 },
+      );
+      expect(result?.output_tokens).toBe(500);
+    });
+
+    it('routes correction through structured spend when cache tokens are present', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 80657,
+          output_tokens: 766,
+          total_tokens: 83265,
+          output_token_details: { reasoning: 1842 },
+          input_token_details: { cache_read: 30000 },
+          model: 'gemini-3-flash-preview',
+          provider: 'vertexai',
+        },
+      ];
+
+      await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        {
+          promptTokens: { input: 50657, write: 0, read: 30000 },
+          completionTokens: 2608,
+        },
+      );
+    });
+
+    it('no-op when total_tokens is absent or zero', async () => {
+      const collectedUsage: UsageMetadata[] = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          provider: 'openAI',
+        },
+      ];
+
+      const result = await recordCollectedUsage(deps, {
+        ...baseParams,
+        collectedUsage,
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.anything(),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(result?.output_tokens).toBe(50);
+    });
+  });
+
   describe('mixed cache and non-cache entries', () => {
     it('should handle mixed entries correctly', async () => {
       const collectedUsage: UsageMetadata[] = [
diff --git a/packages/api/src/agents/usage.ts b/packages/api/src/agents/usage.ts
index b5a2fa1f4d..3d8ddf3799 100644
--- a/packages/api/src/agents/usage.ts
+++ b/packages/api/src/agents/usage.ts
@@ -50,6 +50,34 @@ function inputTokensIncludesCache(provider?: string): boolean {
   return provider != null && SUBSET_PROVIDERS.has(provider);
 }
 
+/**
+ * Resolves `completionTokens` for billing, repairing providers whose
+ * `usage_metadata.output_tokens` undercounts.
+ *
+ * The documented `UsageMetadata` contract (`@langchain/core`) is
+ * `total_tokens === input_tokens + output_tokens`. Compliant providers
+ * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
+ * include any reasoning/thinking tokens inside `output_tokens` already,
+ * so the invariant holds.
+ *
+ * Vertex AI Gemini through `@langchain/google-common`'s streaming path
+ * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
+ * leaving `total - input > output`. When that gap shows up we use the
+ * invariant to recover the correct billable output (`total - input`).
+ * Compliant providers have a zero gap, so this is a no-op for them.
+ *
+ * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
+ */
+function resolveCompletionTokens(usage: UsageMetadata): number {
+  const output = Number(usage.output_tokens) || 0;
+  const total = Number(usage.total_tokens) || 0;
+  const input = Number(usage.input_tokens) || 0;
+  if (total > input + output) {
+    return total - input;
+  }
+  return output;
+}
+
 interface SplitUsage {
   /** Non-cached input portion — what gets billed at the standard input rate */
   inputOnly: number;
@@ -57,6 +85,8 @@
   cacheRead: number;
   /** Total prompt tokens including cached portion */
   totalInput: number;
+  /** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
+  completion: number;
 }
 
 function splitUsage(usage: UsageMetadata): SplitUsage {
@@ -67,12 +97,14 @@ function splitUsage(usage: UsageMetadata): SplitUsage {
   const cacheRead =
     Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
   const rawInput = Number(usage.input_tokens) || 0;
+  const completion = resolveCompletionTokens(usage);
   if (inputTokensIncludesCache(usage.provider)) {
     return {
       inputOnly: Math.max(0, rawInput - cacheCreation - cacheRead),
       cacheCreation,
       cacheRead,
       totalInput: rawInput,
+      completion,
     };
   }
   return {
@@ -80,6 +112,7 @@
     cacheCreation,
     cacheRead,
     totalInput: rawInput + cacheCreation + cacheRead,
+    completion,
   };
 }
 
@@ -161,9 +194,9 @@ export async function recordCollectedUsage(
       continue;
     }
 
-    const { inputOnly, cacheCreation, cacheRead } = splitUsage(usage);
+    const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(usage);
 
-    total_output_tokens += Number(usage.output_tokens) || 0;
+    total_output_tokens += completion;
 
     const txMetadata: TxMetadata = {
       user,
@@ -187,7 +220,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       )
@@ -195,7 +228,7 @@
         txMetadata,
         {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         },
         pricing,
       );
@@ -211,7 +244,7 @@
             write: cacheCreation,
             read: cacheRead,
           },
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
@@ -225,7 +258,7 @@
       deps
         .spendTokens(txMetadata, {
           promptTokens: inputOnly,
-          completionTokens: usage.output_tokens,
+          completionTokens: completion,
         })
         .catch((err) => {
           logger.error(
diff --git a/packages/api/src/stream/interfaces/IJobStore.ts b/packages/api/src/stream/interfaces/IJobStore.ts
index 0d07b19538..afd6ac68a0 100644
--- a/packages/api/src/stream/interfaces/IJobStore.ts
+++ b/packages/api/src/stream/interfaces/IJobStore.ts
@@ -98,6 +98,16 @@ export interface UsageMetadata {
    * Present for Claude models. Mutually exclusive with input_token_details.
    */
   cache_read_input_tokens?: number;
+  /**
+   * Breakdown of output token counts. Per the LangChain core contract,
+   * `output_tokens` is the sum of all output token types — these fields
+   * are subsets of `output_tokens`, *not* additional charges.
+   */
+  output_token_details?: {
+    /** Reasoning/thinking tokens generated as chain-of-thought (o1, Gemini thinking, etc.) */
+    reasoning?: number;
+    audio?: number;
+  };
 }
 
 /**