✨ feat: Mirror send-path pruning in the over-window context estimate

For a snapshot-less branch whose tokens exceed the window, the send path prunes oldest-first (getMessagesWithinTokenLimit), so the next call can sit well under the window. The gauge previously clamped the full sum to 100%, hiding that headroom. Add prunedBranchTokens — a newest->oldest walk that keeps messages until the next would overflow the message budget (max minus the summary baseline), mirroring the pruner — and use it on the estimate path in place of the clamp. Approximation: omits the instruction/tool overhead and tool-call pairing the real pruner accounts for (unknowable for a snapshot-less branch); superseded by an exact snapshot once the branch is generated.
2026-07-01 11:53:55 +00:00 · 2026-06-25 15:48:41 -04:00 · 2026-06-25 15:48:41 -04:00 · 5706e414fd
commit 5706e414fd
parent 376370d610
3 changed files with 104 additions and 11 deletions
--- a/client/src/hooks/Chat/useTokenUsage.ts
+++ b/client/src/hooks/Chat/useTokenUsage.ts
@ -21,6 +21,7 @@ import {
  clearIndex,
  mergeUsage,
  sumTotalUsage,
+  prunedBranchTokens,
  findBranchSnapshotAnchor,
 } from '~/utils';
 import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
@ -252,23 +253,32 @@ export default function useTokenUsage({
     *  from re-summing the discarded pre-summary history (which otherwise pins the
     *  gauge at 100% after a compaction). */
    const maxTokens = limits.maxContextTokens;
+    const liveOnTail = liveTokens > 0;
    /** When a stream is live the tail is the in-flight response, already counted
     *  by `liveTokens`; drop its static estimate so a resumed/partial response
     *  isn't double-counted on the estimate path. */
    const estimatedTokens = Math.max(
      0,
-      branchTotals.estTokens - (liveTokens > 0 ? branchTotals.tailEstTokens : 0),
+      branchTotals.estTokens - (liveOnTail ? branchTotals.tailEstTokens : 0),
    );
-    const rawUsed =
-      branchTotals.input +
-      branchTotals.output +
-      estimatedTokens +
-      branchTotals.summaryBaseline +
-      liveTokens;
-    /** The send path prunes an over-window branch before calling the model, so the
-     *  live gauge never actually exceeds the window; clamp the display to the
-     *  window rather than show impossible values (e.g. 50k / 8k). */
-    const usedTokens = maxTokens != null && maxTokens > 0 ? Math.min(rawUsed, maxTokens) : rawUsed;
+    let messageTokens = branchTotals.input + branchTotals.output + estimatedTokens;
+    /** The send path prunes an over-window branch oldest-first before calling the
+     *  model, so the next call can sit well under the window even when the full
+     *  branch exceeds it. Mirror that: when the raw sum overflows the message
+     *  window (max minus the always-sent summary baseline), report the newest
+     *  messages that actually fit instead of clamping the whole branch to 100%. */
+    if (maxTokens != null && maxTokens > 0) {
+      const messageBudget = Math.max(0, maxTokens - branchTotals.summaryBaseline);
+      if (messageTokens > messageBudget) {
+        messageTokens = prunedBranchTokens(
+          conversationKey,
+          branchTotals.tailId,
+          messageBudget,
+          liveOnTail,
+        );
+      }
+    }
+    const usedTokens = messageTokens + branchTotals.summaryBaseline + liveTokens;
    return {
      usedTokens,
      maxTokens,
@ -297,5 +307,6 @@ export default function useTokenUsage({
    liveTokens,
    limits,
    branchSnapshot,
+    conversationKey,
  ]);
 }
--- a/client/src/utils/tokens.spec.ts
+++ b/client/src/utils/tokens.spec.ts
@ -10,6 +10,7 @@ import {
  mergeUsage,
  setEntryUsage,
  sumTotalUsage,
+  prunedBranchTokens,
  findBranchSnapshotAnchor,
  estimateTokens,
  normalizeUsageUnits,
@ -218,6 +219,34 @@ describe('token index', () => {
    expect(totals.tailEstTokens).toBe(5);
  });

+  describe('prunedBranchTokens (over-window mirror of getMessagesWithinTokenLimit)', () => {
+    /** Linear chain u1(2) ← a1(10, old) ← u2(2) ← a2(2, tail): 16 tokens in total. */
+    const buildChain = () =>
+      buildIndex(CONVO, [
+        msg('u1', Constants.NO_PARENT, true, 2),
+        msg('a1', 'u1', false, 10),
+        msg('u2', 'a1', true, 2),
+        msg('a2', 'u2', false, 2),
+      ]);
+
+    it('keeps the newest messages that fit and stops at the first overflow', () => {
+      buildChain();
+      /** Budget 8: a2(2)+u2(2)=4 fit; a1(10) would overflow, so a1 and the older u1 are pruned. */
+      expect(prunedBranchTokens(CONVO, 'a2', 8, false)).toBe(4);
+    });
+
+    it('returns the full branch sum when it fits the budget', () => {
+      buildChain();
+      expect(prunedBranchTokens(CONVO, 'a2', 100, false)).toBe(16); // a2(2)+u2(2)+a1(10)+u1(2)
+    });
+
+    it('skips the in-flight tail when excludeTail is set', () => {
+      buildChain();
+      /** Tail a2 is skipped but still walked through; u2(2)+a1(10)+u1(2)=14 fit under 100. */
+      expect(prunedBranchTokens(CONVO, 'a2', 100, true)).toBe(14);
+    });
+  });
+
  it('caps the branch at a summary marker instead of re-summing compacted history', () => {
    const summarized = {
      messageId: 'a2',
--- a/client/src/utils/tokens.ts
+++ b/client/src/utils/tokens.ts
@ -387,6 +387,59 @@ export function sumBranch(
  return { ...totals, tailEstTokens, tailId, usage, summaryBaseline };
 }

+/**
+ * Message tokens that would actually be sent for an over-window branch. The send
+ * path prunes oldest-first (`getMessagesWithinTokenLimit`), so walk newest→oldest
+ * from `tailId` and stop before the first message that would exceed `budget`.
+ * Approximation: omits the instruction/tool-schema overhead and tool-call pairing
+ * the real pruner accounts for; superseded by an exact snapshot once generated.
+ * Per-message contribution matches `sumBranch`: `tokenCount` if > 0, else `estTokens`.
+ * @param conversationId - Key used to look up the conversation's index in `registry`.
+ * @param tailId - Newest message of the branch; parents are followed from here.
+ * @param budget - Message window (max minus the always-sent summary baseline).
+ * @param excludeTail - Skip the in-flight tail's tokens (it rides on `liveTokens`).
+ * @returns Kept total, never above `budget`; 0 if no index, no `tailId`, or `budget` <= 0.
+ */
+export function prunedBranchTokens(
+  conversationId: string,
+  tailId: string | null | undefined,
+  budget: number,
+  excludeTail: boolean,
+): number {
+  const index = registry.get(conversationId);
+  if (!index || !tailId || budget <= 0) {
+    return 0;
+  }
+
+  let total = 0;
+  let currentId: string | null = tailId;
+  let guard = index.size; // caps the walk at index.size steps should parent links ever loop
+  let isTail = true; // true only for the first visited entry
+
+  while (currentId && currentId !== Constants.NO_PARENT && guard-- > 0) {
+    const entry: TokenEntry | undefined = index.get(currentId);
+    if (!entry) {
+      break; // parent is missing from the index: return what was counted so far
+    }
+    const skip = isTail && excludeTail;
+    isTail = false;
+    if (!skip) {
+      const contribution = entry.tokenCount > 0 ? entry.tokenCount : entry.estTokens;
+      if (total + contribution > budget) {
+        break; // this message and everything older is treated as pruned
+      }
+      total += contribution;
+    }
+    /** Pre-summary turns are subsumed by the baseline the caller already reserved,
+     *  so stop after counting the summarizing turn — mirrors `sumBranch`. */
+    if (entry.summaryUsedTokens != null && entry.summaryUsedTokens > 0) {
+      break;
+    }
+    currentId = entry.parentMessageId;
+  }
+  return total;
+}
+
 /**
 * Sums provider usage/cost across EVERY message in the conversation (all
 * branches, including regenerated/abandoned responses) — the conversation