feat: Mirror send-path pruning in the over-window context estimate

For a snapshot-less branch whose tokens exceed the window, the send path
prunes oldest-first (getMessagesWithinTokenLimit), so the next call can sit
well under the window. The gauge previously clamped the full sum to 100%,
hiding that headroom. Add prunedBranchTokens — a newest->oldest walk that
keeps messages until the next would overflow the message budget (max minus the
summary baseline), mirroring the pruner — and use it on the estimate path in
place of the clamp. Approximation: omits the instruction/tool overhead and
tool-call pairing the real pruner accounts for (unknowable for a snapshot-less
branch); superseded by an exact snapshot once the branch is generated.
This commit is contained in:
Danny Avila 2026-06-25 15:48:41 -04:00
parent 376370d610
commit 5706e414fd
3 changed files with 104 additions and 11 deletions

View file

@ -21,6 +21,7 @@ import {
clearIndex,
mergeUsage,
sumTotalUsage,
prunedBranchTokens,
findBranchSnapshotAnchor,
} from '~/utils';
import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
@ -252,23 +253,32 @@ export default function useTokenUsage({
* from re-summing the discarded pre-summary history (which otherwise pins the
* gauge at 100% after a compaction). */
const maxTokens = limits.maxContextTokens;
const liveOnTail = liveTokens > 0;
/** When a stream is live the tail is the in-flight response, already counted
* by `liveTokens`; drop its static estimate so a resumed/partial response
* isn't double-counted on the estimate path. */
const estimatedTokens = Math.max(
0,
branchTotals.estTokens - (liveTokens > 0 ? branchTotals.tailEstTokens : 0),
branchTotals.estTokens - (liveOnTail ? branchTotals.tailEstTokens : 0),
);
const rawUsed =
branchTotals.input +
branchTotals.output +
estimatedTokens +
branchTotals.summaryBaseline +
liveTokens;
/** The send path prunes an over-window branch before calling the model, so the
* live gauge never actually exceeds the window; clamp the display to the
* window rather than show impossible values (e.g. 50k / 8k). */
const usedTokens = maxTokens != null && maxTokens > 0 ? Math.min(rawUsed, maxTokens) : rawUsed;
let messageTokens = branchTotals.input + branchTotals.output + estimatedTokens;
/** The send path prunes an over-window branch oldest-first before calling the
* model, so the next call can sit well under the window even when the full
* branch exceeds it. Mirror that: when the raw sum overflows the message
* window (max minus the always-sent summary baseline), report the newest
* messages that actually fit instead of clamping the whole branch to 100%. */
if (maxTokens != null && maxTokens > 0) {
const messageBudget = Math.max(0, maxTokens - branchTotals.summaryBaseline);
if (messageTokens > messageBudget) {
messageTokens = prunedBranchTokens(
conversationKey,
branchTotals.tailId,
messageBudget,
liveOnTail,
);
}
}
const usedTokens = messageTokens + branchTotals.summaryBaseline + liveTokens;
return {
usedTokens,
maxTokens,
@ -297,5 +307,6 @@ export default function useTokenUsage({
liveTokens,
limits,
branchSnapshot,
conversationKey,
]);
}

View file

@ -10,6 +10,7 @@ import {
mergeUsage,
setEntryUsage,
sumTotalUsage,
prunedBranchTokens,
findBranchSnapshotAnchor,
estimateTokens,
normalizeUsageUnits,
@ -218,6 +219,34 @@ describe('token index', () => {
expect(totals.tailEstTokens).toBe(5);
});
describe('prunedBranchTokens (over-window mirror of getMessagesWithinTokenLimit)', () => {
  /**
   * Seeds a single linear branch, oldest to newest:
   * u1(2) -> a1(10, the oversized old reply) -> u2(2) -> a2(2, the tail).
   * The whole branch sums to 16 tokens.
   */
  function seedChain() {
    return buildIndex(CONVO, [
      msg('u1', Constants.NO_PARENT, true, 2),
      msg('a1', 'u1', false, 10),
      msg('u2', 'a1', true, 2),
      msg('a2', 'u2', false, 2),
    ]);
  }

  it('keeps the newest messages that fit and stops at the first overflow', () => {
    seedChain();
    /** With a budget of 8 the walk keeps a2 + u2 (4 tokens); adding a1 (10) would overflow. */
    const kept = prunedBranchTokens(CONVO, 'a2', 8, false);
    expect(kept).toBe(4);
  });

  it('returns the full branch sum when it fits the budget', () => {
    seedChain();
    const kept = prunedBranchTokens(CONVO, 'a2', 100, false);
    expect(kept).toBe(16);
  });

  it('skips the in-flight tail when excludeTail is set', () => {
    seedChain();
    /** a2 is ignored; u2 (2) + a1 (10) + u1 (2) = 14 sits comfortably under 100. */
    const kept = prunedBranchTokens(CONVO, 'a2', 100, true);
    expect(kept).toBe(14);
  });
});
it('caps the branch at a summary marker instead of re-summing compacted history', () => {
const summarized = {
messageId: 'a2',

View file

@ -387,6 +387,59 @@ export function sumBranch(
return { ...totals, tailEstTokens, tailId, usage, summaryBaseline };
}
/**
 * Estimates the message tokens that would actually be sent for an over-window
 * branch. The send path prunes oldest-first to fit (`getMessagesWithinTokenLimit`),
 * so this walks the branch newest → oldest via `parentMessageId` and stops as
 * soon as the next message would push the running total past `budget`.
 *
 * This is an approximation: it leaves out the instruction/tool-schema overhead
 * and tool-call pairing that the real pruner also accounts for, which the client
 * cannot know for a snapshot-less branch. It is close enough for an estimate and
 * is superseded by an exact snapshot once the branch is generated.
 *
 * Each message contributes its stored `tokenCount` when that is positive, and
 * its char-based `estTokens` otherwise (the same rule as `sumBranch`).
 *
 * @param conversationId - Key of the conversation's token index in the registry.
 * @param tailId - Id of the newest message on the branch; the walk starts here.
 * @param budget - Message window: max context minus the always-sent summary baseline.
 * @param excludeTail - When true, the tail message is not counted (the in-flight
 *   response rides on `liveTokens` instead).
 * @returns Total tokens of the newest messages that fit within `budget`; `0` when
 *   the conversation has no index, `tailId` is empty, or `budget` is not positive.
 */
export function prunedBranchTokens(
  conversationId: string,
  tailId: string | null | undefined,
  budget: number,
  excludeTail: boolean,
): number {
  const branchIndex = registry.get(conversationId);
  if (!branchIndex || !tailId || budget <= 0) {
    return 0;
  }

  /** Bounding the hops by the index size keeps a cyclic parent chain from looping forever. */
  const maxHops = branchIndex.size;
  let kept = 0;
  let cursor: string | null = tailId;

  for (let hop = 0; hop < maxHops; hop++) {
    if (!cursor || cursor === Constants.NO_PARENT) {
      break;
    }
    const node: TokenEntry | undefined = branchIndex.get(cursor);
    if (!node) {
      break;
    }

    /** Only the very first hop is the tail; it is left out when the caller asks for that. */
    const isExcludedTail = hop === 0 && excludeTail;
    if (!isExcludedTail) {
      const cost = node.tokenCount > 0 ? node.tokenCount : node.estTokens;
      if (kept + cost > budget) {
        break;
      }
      kept += cost;
    }

    /** Pre-summary turns are subsumed by the baseline the caller already reserved,
     * so stop after counting the summarizing turn, mirroring `sumBranch`. */
    const isSummarizingTurn = node.summaryUsedTokens != null && node.summaryUsedTokens > 0;
    if (isSummarizingTurn) {
      break;
    }
    cursor = node.parentMessageId;
  }

  return kept;
}
/**
* Sums provider usage/cost across EVERY message in the conversation (all
* branches, including regenerated/abandoned responses) the conversation