From 5706e414fd954435e48d99bcaa3e9dff31422ef5 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Thu, 25 Jun 2026 15:48:41 -0400 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20Mirror=20send-path=20prunin?= =?UTF-8?q?g=20in=20the=20over-window=20context=20estimate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a snapshot-less branch whose tokens exceed the window, the send path prunes oldest-first (getMessagesWithinTokenLimit), so the next call can sit well under the window. The gauge previously clamped the full sum to 100%, hiding that headroom. Add prunedBranchTokens — a newest->oldest walk that keeps messages until the next would overflow the message budget (max minus the summary baseline), mirroring the pruner — and use it on the estimate path in place of the clamp. Approximation: omits the instruction/tool overhead and tool-call pairing the real pruner accounts for (unknowable for a snapshot-less branch); superseded by an exact snapshot once the branch is generated. --- client/src/hooks/Chat/useTokenUsage.ts | 33 ++++++++++------ client/src/utils/tokens.spec.ts | 29 ++++++++++++++ client/src/utils/tokens.ts | 53 ++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/client/src/hooks/Chat/useTokenUsage.ts b/client/src/hooks/Chat/useTokenUsage.ts index 39b6df9220..7b25d2aba1 100644 --- a/client/src/hooks/Chat/useTokenUsage.ts +++ b/client/src/hooks/Chat/useTokenUsage.ts @@ -21,6 +21,7 @@ import { clearIndex, mergeUsage, sumTotalUsage, + prunedBranchTokens, findBranchSnapshotAnchor, } from '~/utils'; import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage'; @@ -252,23 +253,32 @@ export default function useTokenUsage({ * from re-summing the discarded pre-summary history (which otherwise pins the * gauge at 100% after a compaction). 
*/ const maxTokens = limits.maxContextTokens; + const liveOnTail = liveTokens > 0; /** When a stream is live the tail is the in-flight response, already counted * by `liveTokens`; drop its static estimate so a resumed/partial response * isn't double-counted on the estimate path. */ const estimatedTokens = Math.max( 0, - branchTotals.estTokens - (liveTokens > 0 ? branchTotals.tailEstTokens : 0), + branchTotals.estTokens - (liveOnTail ? branchTotals.tailEstTokens : 0), ); - const rawUsed = - branchTotals.input + - branchTotals.output + - estimatedTokens + - branchTotals.summaryBaseline + - liveTokens; - /** The send path prunes an over-window branch before calling the model, so the - * live gauge never actually exceeds the window; clamp the display to the - * window rather than show impossible values (e.g. 50k / 8k). */ - const usedTokens = maxTokens != null && maxTokens > 0 ? Math.min(rawUsed, maxTokens) : rawUsed; + let messageTokens = branchTotals.input + branchTotals.output + estimatedTokens; + /** The send path prunes an over-window branch oldest-first before calling the + * model, so the next call can sit well under the window even when the full + * branch exceeds it. Mirror that: when the raw sum overflows the message + * window (max minus the always-sent summary baseline), report the newest + * messages that actually fit instead of clamping the whole branch to 100%. 
*/ + if (maxTokens != null && maxTokens > 0) { + const messageBudget = Math.max(0, maxTokens - branchTotals.summaryBaseline); + if (messageTokens > messageBudget) { + messageTokens = prunedBranchTokens( + conversationKey, + branchTotals.tailId, + messageBudget, + liveOnTail, + ); + } + } + const usedTokens = messageTokens + branchTotals.summaryBaseline + liveTokens; return { usedTokens, maxTokens, @@ -297,5 +307,6 @@ export default function useTokenUsage({ liveTokens, limits, branchSnapshot, + conversationKey, ]); } diff --git a/client/src/utils/tokens.spec.ts b/client/src/utils/tokens.spec.ts index eddc7a2c89..ed4c865a4a 100644 --- a/client/src/utils/tokens.spec.ts +++ b/client/src/utils/tokens.spec.ts @@ -10,6 +10,7 @@ import { mergeUsage, setEntryUsage, sumTotalUsage, + prunedBranchTokens, findBranchSnapshotAnchor, estimateTokens, normalizeUsageUnits, @@ -218,6 +219,34 @@ describe('token index', () => { expect(totals.tailEstTokens).toBe(5); }); + describe('prunedBranchTokens (over-window mirror of getMessagesWithinTokenLimit)', () => { + /** u1 ← a1(huge, old) ← u2 ← a2(tail). */ + const buildChain = () => + buildIndex(CONVO, [ + msg('u1', Constants.NO_PARENT, true, 2), + msg('a1', 'u1', false, 10), + msg('u2', 'a1', true, 2), + msg('a2', 'u2', false, 2), + ]); + + it('keeps the newest messages that fit and stops at the first overflow', () => { + buildChain(); + /** Budget 8: a2(2)+u2(2)=4 fit; a1(10) would overflow → pruned. */ + expect(prunedBranchTokens(CONVO, 'a2', 8, false)).toBe(4); + }); + + it('returns the full branch sum when it fits the budget', () => { + buildChain(); + expect(prunedBranchTokens(CONVO, 'a2', 100, false)).toBe(16); + }); + + it('skips the in-flight tail when excludeTail is set', () => { + buildChain(); + /** Skip a2; a1(10)+u2(2)+u1(2)=14 all fit under 100. 
*/ + expect(prunedBranchTokens(CONVO, 'a2', 100, true)).toBe(14); + }); + }); + it('caps the branch at a summary marker instead of re-summing compacted history', () => { const summarized = { messageId: 'a2', diff --git a/client/src/utils/tokens.ts b/client/src/utils/tokens.ts index 1372aad111..8ee65cc3dc 100644 --- a/client/src/utils/tokens.ts +++ b/client/src/utils/tokens.ts @@ -387,6 +387,59 @@ export function sumBranch( return { ...totals, tailEstTokens, tailId, usage, summaryBaseline }; } +/** + * Message tokens that would actually be sent for an over-window branch. The send + * path prunes oldest-first to fit (`getMessagesWithinTokenLimit`), so walk the + * branch newest→oldest and stop once the next message would exceed `budget`, + * mirroring its "newest-that-fits" behavior for the gauge. Approximation: it omits + * the instruction/tool-schema overhead and tool-call pairing the real pruner also + * accounts for, which the client can't know for a snapshot-less branch — close + * enough for an estimate, and superseded by an exact snapshot once generated. + * `budget` is the message window (max minus the always-sent summary baseline); + * when `excludeTail`, the in-flight tail response is skipped (it rides on + * `liveTokens`). Per-message contribution matches `sumBranch`: stored `tokenCount` + * when counted, else the char-based `estTokens`. 
+ */
+export function prunedBranchTokens(
+  conversationId: string,
+  tailId: string | null | undefined,
+  budget: number,
+  excludeTail: boolean,
+): number {
+  const index = registry.get(conversationId);
+  if (!index || !tailId || budget <= 0) { // unknown conversation, no tail, or no room
+    return 0;
+  }
+
+  let total = 0;
+  let currentId: string | null = tailId;
+  let guard = index.size; // one step per indexed entry at most, so a parent cycle still ends
+  let isTail = true; // true only for the first entry visited
+
+  while (currentId && currentId !== Constants.NO_PARENT && guard-- > 0) {
+    const entry: TokenEntry | undefined = index.get(currentId);
+    if (!entry) {
+      break; // parent id not in the index: stop walking
+    }
+    const skip = isTail && excludeTail; // only the tail itself can be skipped
+    isTail = false;
+    if (!skip) {
+      const contribution = entry.tokenCount > 0 ? entry.tokenCount : entry.estTokens;
+      if (total + contribution > budget) {
+        break; // first message that does not fit ends the walk; older ones are not tried
+      }
+      total += contribution;
+    }
+    /** Pre-summary turns are subsumed by the baseline the caller already reserved,
+     * so stop once the summarizing turn has been visited — mirrors `sumBranch`. */
+    if (entry.summaryUsedTokens != null && entry.summaryUsedTokens > 0) {
+      break;
+    }
+    currentId = entry.parentMessageId;
+  }
+  return total;
+}
+
 /**
  * Sums provider usage/cost across EVERY message in the conversation (all
  * branches, including regenerated/abandoned responses) — the conversation