mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-07-01 11:53:55 +00:00
✨ feat: Mirror send-path pruning in the over-window context estimate
For a snapshot-less branch whose tokens exceed the window, the send path prunes oldest-first (getMessagesWithinTokenLimit), so the next call can sit well under the window. The gauge previously clamped the full sum to 100%, hiding that headroom. Add prunedBranchTokens — a newest->oldest walk that keeps messages until the next would overflow the message budget (max minus the summary baseline), mirroring the pruner — and use it on the estimate path in place of the clamp. Approximation: omits the instruction/tool overhead and tool-call pairing the real pruner accounts for (unknowable for a snapshot-less branch); superseded by an exact snapshot once the branch is generated.
This commit is contained in:
parent
376370d610
commit
5706e414fd
3 changed files with 104 additions and 11 deletions
|
|
@ -21,6 +21,7 @@ import {
|
|||
clearIndex,
|
||||
mergeUsage,
|
||||
sumTotalUsage,
|
||||
prunedBranchTokens,
|
||||
findBranchSnapshotAnchor,
|
||||
} from '~/utils';
|
||||
import { useLatestMessageId } from '~/hooks/Messages/useLatestMessage';
|
||||
|
|
@ -252,23 +253,32 @@ export default function useTokenUsage({
|
|||
* from re-summing the discarded pre-summary history (which otherwise pins the
|
||||
* gauge at 100% after a compaction). */
|
||||
const maxTokens = limits.maxContextTokens;
|
||||
const liveOnTail = liveTokens > 0;
|
||||
/** When a stream is live the tail is the in-flight response, already counted
|
||||
* by `liveTokens`; drop its static estimate so a resumed/partial response
|
||||
* isn't double-counted on the estimate path. */
|
||||
const estimatedTokens = Math.max(
|
||||
0,
|
||||
branchTotals.estTokens - (liveTokens > 0 ? branchTotals.tailEstTokens : 0),
|
||||
branchTotals.estTokens - (liveOnTail ? branchTotals.tailEstTokens : 0),
|
||||
);
|
||||
const rawUsed =
|
||||
branchTotals.input +
|
||||
branchTotals.output +
|
||||
estimatedTokens +
|
||||
branchTotals.summaryBaseline +
|
||||
liveTokens;
|
||||
/** The send path prunes an over-window branch before calling the model, so the
|
||||
* live gauge never actually exceeds the window; clamp the display to the
|
||||
* window rather than show impossible values (e.g. 50k / 8k). */
|
||||
const usedTokens = maxTokens != null && maxTokens > 0 ? Math.min(rawUsed, maxTokens) : rawUsed;
|
||||
let messageTokens = branchTotals.input + branchTotals.output + estimatedTokens;
|
||||
/** The send path prunes an over-window branch oldest-first before calling the
|
||||
* model, so the next call can sit well under the window even when the full
|
||||
* branch exceeds it. Mirror that: when the raw sum overflows the message
|
||||
* window (max minus the always-sent summary baseline), report the newest
|
||||
* messages that actually fit instead of clamping the whole branch to 100%. */
|
||||
if (maxTokens != null && maxTokens > 0) {
|
||||
const messageBudget = Math.max(0, maxTokens - branchTotals.summaryBaseline);
|
||||
if (messageTokens > messageBudget) {
|
||||
messageTokens = prunedBranchTokens(
|
||||
conversationKey,
|
||||
branchTotals.tailId,
|
||||
messageBudget,
|
||||
liveOnTail,
|
||||
);
|
||||
}
|
||||
}
|
||||
const usedTokens = messageTokens + branchTotals.summaryBaseline + liveTokens;
|
||||
return {
|
||||
usedTokens,
|
||||
maxTokens,
|
||||
|
|
@ -297,5 +307,6 @@ export default function useTokenUsage({
|
|||
liveTokens,
|
||||
limits,
|
||||
branchSnapshot,
|
||||
conversationKey,
|
||||
]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import {
|
|||
mergeUsage,
|
||||
setEntryUsage,
|
||||
sumTotalUsage,
|
||||
prunedBranchTokens,
|
||||
findBranchSnapshotAnchor,
|
||||
estimateTokens,
|
||||
normalizeUsageUnits,
|
||||
|
|
@ -218,6 +219,34 @@ describe('token index', () => {
|
|||
expect(totals.tailEstTokens).toBe(5);
|
||||
});
|
||||
|
||||
describe('prunedBranchTokens (over-window mirror of getMessagesWithinTokenLimit)', () => {
  /**
   * Seeds the index for `CONVO` with one linear chain, oldest to newest:
   * u1 ← a1 (large, old) ← u2 ← a2 (tail). The last `msg` argument is the
   * per-message token figure that the expectations below add up.
   */
  function seedChain() {
    return buildIndex(CONVO, [
      msg('u1', Constants.NO_PARENT, true, 2),
      msg('a1', 'u1', false, 10),
      msg('u2', 'a1', true, 2),
      msg('a2', 'u2', false, 2),
    ]);
  }

  it('keeps the newest messages that fit and stops at the first overflow', () => {
    seedChain();
    /** With a budget of 8, a2 (2) and u2 (2) are kept; adding a1 (10) would
     * overflow, so the walk ends there and u1 is never reached. */
    const kept = prunedBranchTokens(CONVO, 'a2', 8, false);
    expect(kept).toBe(4);
  });

  it('returns the full branch sum when it fits the budget', () => {
    seedChain();
    /** 2 + 10 + 2 + 2 = 16, comfortably under a budget of 100. */
    const kept = prunedBranchTokens(CONVO, 'a2', 100, false);
    expect(kept).toBe(16);
  });

  it('skips the in-flight tail when excludeTail is set', () => {
    seedChain();
    /** a2 is left out; u2 (2) + a1 (10) + u1 (2) = 14 still fits under 100. */
    const kept = prunedBranchTokens(CONVO, 'a2', 100, true);
    expect(kept).toBe(14);
  });
});
|
||||
|
||||
it('caps the branch at a summary marker instead of re-summing compacted history', () => {
|
||||
const summarized = {
|
||||
messageId: 'a2',
|
||||
|
|
|
|||
|
|
@ -387,6 +387,59 @@ export function sumBranch(
|
|||
return { ...totals, tailEstTokens, tailId, usage, summaryBaseline };
|
||||
}
|
||||
|
||||
/**
 * Estimates how many message tokens of an over-window branch would still be
 * sent once the oldest messages are dropped to fit.
 *
 * The branch is followed from `tailId` towards the root through each entry's
 * `parentMessageId`, i.e. newest to oldest. Messages are accumulated until the
 * next one would push the running total past `budget`; the walk ends there, so
 * nothing older than the first overflowing message is counted. This is meant to
 * mirror the send path's oldest-first pruning (`getMessagesWithinTokenLimit`)
 * for the gauge. It is an approximation: instruction/tool-schema overhead and
 * tool-call pairing are not modelled here, since the client cannot know them for
 * a snapshot-less branch; an exact snapshot supersedes it once one exists.
 *
 * Each counted message contributes its stored `tokenCount` when that is
 * positive, otherwise its `estTokens` (the same per-message rule the original
 * note attributes to `sumBranch`).
 *
 * @param conversationId - Key of the conversation index in the registry.
 * @param tailId - Id of the newest message on the branch; the walk starts here.
 * @param budget - Message window: max context minus the always-sent summary
 *   baseline, as computed by the caller.
 * @param excludeTail - When true, the tail message itself is not counted (its
 *   tokens are carried by the caller's `liveTokens`).
 * @returns The token total of the newest messages that fit; `0` when the
 *   conversation has no index, `tailId` is empty, or `budget` is not positive.
 */
export function prunedBranchTokens(
  conversationId: string,
  tailId: string | null | undefined,
  budget: number,
  excludeTail: boolean,
): number {
  const index = registry.get(conversationId);
  if (!index || !tailId || budget <= 0) {
    return 0;
  }

  let kept = 0;
  let cursor: string | null = tailId;

  /** Visiting at most `index.size` entries bounds the walk even if the parent
   * links were ever to form a cycle. */
  for (let visited = 0; visited < index.size; visited++) {
    if (!cursor || cursor === Constants.NO_PARENT) {
      return kept;
    }

    const entry: TokenEntry | undefined = index.get(cursor);
    if (!entry) {
      return kept;
    }

    /** Only the very first entry visited is the tail. */
    const isExcludedTail = excludeTail && visited === 0;
    if (!isExcludedTail) {
      const cost = entry.tokenCount > 0 ? entry.tokenCount : entry.estTokens;
      if (kept + cost > budget) {
        /** This message does not fit; it and everything older is pruned. */
        return kept;
      }
      kept += cost;
    }

    /** A summarizing turn ends the walk after being counted: the turns before
     * it are covered by the summary baseline the caller already reserved. */
    const summarized = entry.summaryUsedTokens != null && entry.summaryUsedTokens > 0;
    if (summarized) {
      return kept;
    }

    cursor = entry.parentMessageId;
  }

  return kept;
}
|
||||
|
||||
/**
|
||||
* Sums provider usage/cost across EVERY message in the conversation (all
|
||||
* branches, including regenerated/abandoned responses) — the conversation
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue