From 5706e414fd954435e48d99bcaa3e9dff31422ef5 Mon Sep 17 00:00:00 2001
From: Danny Avila <danny@librechat.ai>
Date: Thu, 25 Jun 2026 15:48:41 -0400
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20Mirror=20send-path=20prunin?=
 =?UTF-8?q?g=20in=20the=20over-window=20context=20estimate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For a snapshot-less branch whose tokens exceed the window, the send path
prunes oldest-first (getMessagesWithinTokenLimit), so the next call can sit
well under the window. The gauge previously clamped the full sum to 100%,
hiding that headroom. Add prunedBranchTokens — a newest->oldest walk that
keeps messages until the next would overflow the message budget (max minus the
summary baseline), mirroring the pruner — and use it on the estimate path in
place of the clamp. Approximation: omits the instruction/tool overhead and
tool-call pairing the real pruner accounts for (unknowable for a snapshot-less
branch); superseded by an exact snapshot once the branch is generated.
---
 client/src/hooks/Chat/useTokenUsage.ts | 33 ++++++++++------
 client/src/utils/tokens.spec.ts        | 29 ++++++++++++++
 client/src/utils/tokens.ts             | 53 ++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/client/src/hooks/Chat/useTokenUsage.ts b/client/src/hooks/Chat/useTokenUsage.ts
index 39b6df9220..7b25d2aba1 100644
--- a/client/src/hooks/Chat/useTokenUsage.ts
+++ b/client/src/hooks/Chat/useTokenUsage.ts
@@ -21,6 +21,7 @@ import {
   clearIndex,
   mergeUsage,
   sumTotalUsage,
+  prunedBranchTokens,
   findBranchSnapshotAnchor,
 } from '~/utils';
 import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
@@ -252,23 +253,32 @@ export default function useTokenUsage({
      *  from re-summing the discarded pre-summary history (which otherwise pins the
      *  gauge at 100% after a compaction). */
     const maxTokens = limits.maxContextTokens;
+    const liveOnTail = liveTokens > 0;
     /** When a stream is live the tail is the in-flight response, already counted
      *  by `liveTokens`; drop its static estimate so a resumed/partial response
      *  isn't double-counted on the estimate path. */
     const estimatedTokens = Math.max(
       0,
-      branchTotals.estTokens - (liveTokens > 0 ? branchTotals.tailEstTokens : 0),
+      branchTotals.estTokens - (liveOnTail ? branchTotals.tailEstTokens : 0),
     );
-    const rawUsed =
-      branchTotals.input +
-      branchTotals.output +
-      estimatedTokens +
-      branchTotals.summaryBaseline +
-      liveTokens;
-    /** The send path prunes an over-window branch before calling the model, so the
-     *  live gauge never actually exceeds the window; clamp the display to the
-     *  window rather than show impossible values (e.g. 50k / 8k). */
-    const usedTokens = maxTokens != null && maxTokens > 0 ? Math.min(rawUsed, maxTokens) : rawUsed;
+    let messageTokens = branchTotals.input + branchTotals.output + estimatedTokens;
+    /** The send path prunes an over-window branch oldest-first before calling the
+     *  model, so the next call can sit well under the window even when the full
+     *  branch exceeds it. Mirror that: when the raw sum overflows the message
+     *  window (max minus the always-sent summary baseline), report the newest
+     *  messages that actually fit instead of clamping the whole branch to 100%. */
+    if (maxTokens != null && maxTokens > 0) {
+      const messageBudget = Math.max(0, maxTokens - branchTotals.summaryBaseline);
+      if (messageTokens > messageBudget) {
+        messageTokens = prunedBranchTokens(
+          conversationKey,
+          branchTotals.tailId,
+          messageBudget,
+          liveOnTail,
+        );
+      }
+    }
+    const usedTokens = messageTokens + branchTotals.summaryBaseline + liveTokens;
     return {
       usedTokens,
       maxTokens,
@@ -297,5 +307,6 @@ export default function useTokenUsage({
     liveTokens,
     limits,
     branchSnapshot,
+    conversationKey,
   ]);
 }
diff --git a/client/src/utils/tokens.spec.ts b/client/src/utils/tokens.spec.ts
index eddc7a2c89..ed4c865a4a 100644
--- a/client/src/utils/tokens.spec.ts
+++ b/client/src/utils/tokens.spec.ts
@@ -10,6 +10,7 @@ import {
   mergeUsage,
   setEntryUsage,
   sumTotalUsage,
+  prunedBranchTokens,
   findBranchSnapshotAnchor,
   estimateTokens,
   normalizeUsageUnits,
@@ -218,6 +219,34 @@ describe('token index', () => {
     expect(totals.tailEstTokens).toBe(5);
   });
 
+  describe('prunedBranchTokens (over-window mirror of getMessagesWithinTokenLimit)', () => {
+    /** Chain u1(2) ← a1(10, old) ← u2(2) ← a2(2, tail); parens hold each token count. */
+    const buildChain = () =>
+      buildIndex(CONVO, [
+        msg('u1', Constants.NO_PARENT, true, 2),
+        msg('a1', 'u1', false, 10),
+        msg('u2', 'a1', true, 2),
+        msg('a2', 'u2', false, 2),
+      ]);
+
+    it('keeps the newest messages that fit and stops at the first overflow', () => {
+      buildChain();
+      /** Budget 8: a2(2)+u2(2)=4 fit; a1(10) would overflow → pruned. */
+      expect(prunedBranchTokens(CONVO, 'a2', 8, false)).toBe(4);
+    });
+
+    it('returns the full branch sum when it fits the budget', () => {
+      buildChain();
+      expect(prunedBranchTokens(CONVO, 'a2', 100, false)).toBe(16);
+    });
+
+    it('skips the in-flight tail when excludeTail is set', () => {
+      buildChain();
+      /** Skip a2; a1(10)+u2(2)+u1(2)=14 all fit under 100. */
+      expect(prunedBranchTokens(CONVO, 'a2', 100, true)).toBe(14);
+    });
+  });
+
   it('caps the branch at a summary marker instead of re-summing compacted history', () => {
     const summarized = {
       messageId: 'a2',
diff --git a/client/src/utils/tokens.ts b/client/src/utils/tokens.ts
index 1372aad111..8ee65cc3dc 100644
--- a/client/src/utils/tokens.ts
+++ b/client/src/utils/tokens.ts
@@ -387,6 +387,59 @@ export function sumBranch(
   return { ...totals, tailEstTokens, tailId, usage, summaryBaseline };
 }
 
+/**
+ * Estimates the message tokens the send path would keep for an over-window branch. That path
+ * prunes oldest-first (`getMessagesWithinTokenLimit`), so this walks parent links from `tailId`
+ * newest→oldest and stops at the first message that would push the total past `budget`.
+ * Each message counts its stored `tokenCount` when positive, else `estTokens` (as `sumBranch`).
+ * Approximation: omits the instruction/tool-schema overhead and tool-call pairing the real
+ * pruner handles (unknowable for a snapshot-less branch); superseded by an exact snapshot.
+ * @param conversationId - Key used to look up the conversation's token index in `registry`.
+ * @param tailId - Newest message of the branch, where the walk starts.
+ * @param budget - Message window: max context minus the always-sent summary baseline.
+ * @param excludeTail - Skip the tail's tokens; the in-flight response rides on `liveTokens`.
+ * @returns Tokens kept; 0 when the index or `tailId` is missing or `budget` is not positive.
+ */
+export function prunedBranchTokens(
+  conversationId: string,
+  tailId: string | null | undefined,
+  budget: number,
+  excludeTail: boolean,
+): number {
+  const index = registry.get(conversationId);
+  if (!index || !tailId || budget <= 0) {
+    return 0;
+  }
+
+  let total = 0;
+  let currentId: string | null = tailId;
+  let guard = index.size;
+  let isTail = true;
+
+  while (currentId && currentId !== Constants.NO_PARENT && guard-- > 0) {
+    const entry: TokenEntry | undefined = index.get(currentId);
+    if (!entry) {
+      break;
+    }
+    const skip = isTail && excludeTail;
+    isTail = false;
+    if (!skip) {
+      const contribution = entry.tokenCount > 0 ? entry.tokenCount : entry.estTokens;
+      if (total + contribution > budget) {
+        break;
+      }
+      total += contribution;
+    }
+    /** Pre-summary turns are subsumed by the baseline the caller already reserved,
+     *  so stop after counting the summarizing turn — mirrors `sumBranch`. */
+    if (entry.summaryUsedTokens != null && entry.summaryUsedTokens > 0) {
+      break;
+    }
+    currentId = entry.parentMessageId;
+  }
+  return total;
+}
+
 /**
  * Sums provider usage/cost across EVERY message in the conversation (all
  * branches, including regenerated/abandoned responses) — the conversation