mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-06-25 17:06:24 +00:00
* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

Resolves #13006.

`usage.ts` previously billed `usage.output_tokens` directly. For Vertex AI Gemini thinking models, `@langchain/google-common`'s streaming path emits `output_tokens = candidatesTokenCount` only, dropping `thoughtsTokenCount`. Reasoning was billed at zero and the `total_tokens === input_tokens + output_tokens` invariant was broken.

The fix lives in agents (danny-avila/agents#157) — but this is also a defense-in-depth backstop in case agents misses a path or another provider exhibits the same shape.

`resolveCompletionTokens(usage)` adds `output_token_details.reasoning` back when (and only when) the gap is present (`total - input > output`), so providers that already include reasoning in `output_tokens` (OpenAI o-series, Anthropic, the Google-API wrapper) are no-ops — no double-counting.

- `SplitUsage` gains a `completion` field; all four billing call sites in `processUsageGroup` use it instead of `usage.output_tokens`.
- `total_output_tokens` in the result also reflects the corrected count.
- `UsageMetadata` interface in `IJobStore.ts` adds the `output_token_details` field for type safety.
- 4 new tests in `usage.spec.ts` cover: Vertex undercount fix, OpenAI no-double-count, structured spend path with cache + reasoning, no-op when no details present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

Initial fix gated the correction on `output_token_details.reasoning > 0`, which doesn't help in the live failure case: when google-common's stream emits the buggy fallback usage_metadata, output_token_details is empty ({}) and the gate exits early.

Live debugging showed the reliable signal is the documented invariant itself: `total_tokens === input_tokens + output_tokens`. When buggy streams undercount output, total exceeds input + output by exactly the unbilled reasoning. Use `total - input` as the corrected output. This is provider-agnostic and stays a no-op for compliant providers (OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the gap is zero.

Live verified end-to-end against gemini-3-flash-preview:
- With agents fix in place: output_tokens=437 → billed 437 (no-op)
- Backstop only (no agents fix, buggy input): raw 135, billed 297 (= total 309 - input 12, matches actual API charge)

Updated tests to cover both scenarios.
287 lines
8.5 KiB
TypeScript
287 lines
8.5 KiB
TypeScript
import { logger } from '@librechat/data-schemas';
|
|
import { Providers } from 'librechat-data-provider';
|
|
import type { TCustomConfig, TTransactionsConfig } from 'librechat-data-provider';
|
|
import type {
|
|
StructuredTokenUsage,
|
|
BulkWriteDeps,
|
|
PreparedEntry,
|
|
TxMetadata,
|
|
TokenUsage,
|
|
PricingFns,
|
|
} from './transactions';
|
|
import type { UsageMetadata } from '~/stream/interfaces/IJobStore';
|
|
import type { EndpointTokenConfig } from '~/types/tokens';
|
|
import {
|
|
prepareStructuredTokenSpend,
|
|
bulkWriteTransactions,
|
|
prepareTokenSpend,
|
|
} from './transactions';
|
|
|
|
/** Callback that records a flat prompt/completion spend for one transaction. */
type SpendTokensFn = (
  txData: TxMetadata,
  tokenUsage: TokenUsage,
) => Promise<unknown>;

/** Callback that records a spend whose prompt side is broken down into structured parts. */
type SpendStructuredTokensFn = (
  txData: TxMetadata,
  tokenUsage: StructuredTokenUsage,
) => Promise<unknown>;
|
|
|
|
/**
 * Providers that report cached tokens INSIDE `usage_metadata.input_tokens`,
 * meaning `input_token_details.cache_*` describes a slice of the input count
 * rather than a separate charge on top of it:
 *
 * - Google / Vertex AI: `input_tokens` = `promptTokenCount` (includes `cachedContentTokenCount`)
 * - OpenAI / Azure OpenAI: `input_tokens` = `prompt_tokens` (includes `prompt_tokens_details.cached_tokens`)
 * - xAI, DeepSeek, OpenRouter, Moonshot: extend `ChatOpenAI`, same semantics
 *
 * Anthropic and Bedrock report cache values apart from `input_tokens`, so the
 * cache counts have to be added back to obtain the full prompt size. That
 * additive treatment is the historical default and applies to every provider
 * missing from this set.
 */
const SUBSET_PROVIDERS: ReadonlySet<string> = new Set<string>([
  Providers.OPENAI,
  Providers.AZURE,
  Providers.GOOGLE,
  Providers.VERTEXAI,
  Providers.XAI,
  Providers.DEEPSEEK,
  Providers.OPENROUTER,
  Providers.MOONSHOT,
]);
|
|
|
|
/**
 * Tells whether the given provider already counts cached tokens within
 * `input_tokens`. A missing provider is treated as "no" (additive semantics).
 */
function inputTokensIncludesCache(provider?: string): boolean {
  if (provider == null) {
    return false;
  }
  return SUBSET_PROVIDERS.has(provider);
}
|
|
|
|
/**
 * Resolves `completionTokens` for billing, repairing providers whose
 * `usage_metadata.output_tokens` undercounts.
 *
 * The documented `UsageMetadata` contract (`@langchain/core`) is
 * `total_tokens === input_tokens + output_tokens`. Compliant providers
 * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
 * include any reasoning/thinking tokens inside `output_tokens` already,
 * so the invariant holds.
 *
 * Vertex AI Gemini through `@langchain/google-common`'s streaming path
 * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
 * leaving `total - input > output`. When that gap shows up we use the
 * invariant to recover the correct billable output (`total - input`).
 * Compliant providers have a zero gap, so this is a no-op for them.
 *
 * The repair is only attempted when BOTH `total_tokens` and `input_tokens`
 * are present, finite and non-negative. If either operand is missing or
 * malformed the subtraction is meaningless (a missing `input_tokens` would
 * otherwise bill the entire prompt at the output rate), so the reported
 * `output_tokens` is returned unchanged.
 *
 * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
 *
 * @param usage - Usage metadata for a single LLM call.
 * @returns The billable output token count; `0` when `output_tokens` is
 * missing or not a valid count.
 */
function resolveCompletionTokens(usage: UsageMetadata): number {
  /** Coerces a raw field to a finite, non-negative count, or `undefined` if it is not one. */
  const toCount = (value: unknown): number | undefined => {
    if (value == null || value === '') {
      return undefined;
    }
    const parsed = Number(value);
    return Number.isFinite(parsed) && parsed >= 0 ? parsed : undefined;
  };

  const output = toCount(usage.output_tokens) ?? 0;
  const total = toCount(usage.total_tokens);
  const input = toCount(usage.input_tokens);

  // Without both operands the invariant cannot be evaluated; trust `output_tokens`.
  if (total == null || input == null) {
    return output;
  }

  // A positive gap means reasoning tokens were left out of `output_tokens`.
  const recovered = total - input;
  return recovered > output ? recovered : output;
}
|
|
|
|
/** Per-call token breakdown used for billing. */
interface SplitUsage {
  /** Input tokens excluding the cached portion; billed at the standard input rate */
  inputOnly: number;
  /** Tokens written to the prompt cache */
  cacheCreation: number;
  /** Tokens read from the prompt cache */
  cacheRead: number;
  /** Full prompt size, cached portion included */
  totalInput: number;
  /** Billable output tokens (reasoning is added back when `output_tokens` left it out) */
  completion: number;
}
|
|
|
|
/**
 * Breaks one usage record into the billing buckets of `SplitUsage`.
 *
 * Cache counts come from `input_token_details` first and fall back to the
 * top-level `cache_*_input_tokens` fields; anything missing or non-numeric
 * counts as 0. For providers whose `input_tokens` already contains the cached
 * portion, the cache counts are subtracted (floored at 0) to get `inputOnly`;
 * for all others they are added on top to get `totalInput`.
 */
function splitUsage(usage: UsageMetadata): SplitUsage {
  const details = usage.input_token_details;
  const cacheCreation =
    Number(details?.cache_creation) || Number(usage.cache_creation_input_tokens) || 0;
  const cacheRead = Number(details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  const rawInput = Number(usage.input_tokens) || 0;
  const completion = resolveCompletionTokens(usage);
  const cacheIsSubset = inputTokensIncludesCache(usage.provider);

  return {
    inputOnly: cacheIsSubset ? Math.max(0, rawInput - cacheCreation - cacheRead) : rawInput,
    cacheCreation,
    cacheRead,
    totalInput: cacheIsSubset ? rawInput : rawInput + cacheCreation + cacheRead,
    completion,
  };
}
|
|
|
|
/** Injected collaborators for `recordCollectedUsage`. */
export interface RecordUsageDeps {
  /** Per-entry spend used when an entry carries no cache tokens */
  spendTokens: SpendTokensFn;
  /** Per-entry spend used when an entry carries cache-creation or cache-read tokens */
  spendStructuredTokens: SpendStructuredTokensFn;
  /** Pricing functions; the bulk path is taken only when `bulkWriteOps` is supplied as well */
  pricing?: PricingFns;
  /** Bulk-write dependencies; the bulk path is taken only when `pricing` is supplied as well */
  bulkWriteOps?: BulkWriteDeps;
}
|
|
|
|
/** Inputs for `recordCollectedUsage`. */
export interface RecordUsageParams {
  user: string;
  conversationId: string;
  /** Usage records to bill; entries with `usage_type === 'summarization'` are recorded under the `'summarization'` context */
  collectedUsage: UsageMetadata[];
  /** Fallback model, used for entries that do not carry their own `model` */
  model?: string;
  /** Transaction context for non-summarization entries; `'message'` when omitted */
  context?: string;
  messageId?: string;
  balance?: Partial<TCustomConfig['balance']> | null;
  transactions?: Partial<TTransactionsConfig>;
  endpointTokenConfig?: EndpointTokenConfig;
}
|
|
|
|
/** Aggregate token counts reported by `recordCollectedUsage`. */
export interface RecordUsageResult {
  /** Total input of the first non-summarization usage entry, or 0 when there is none */
  input_tokens: number;
  /** Sum of billable output tokens over every processed usage entry */
  output_tokens: number;
}
|
|
|
|
/**
 * Bills every collected LLM usage entry and reports aggregate token counts.
 * Covers both sequential execution (tool calls) and parallel execution (multiple agents).
 *
 * Entries tagged `usage_type === 'summarization'` are recorded under the
 * `'summarization'` context; all others under `params.context` (default
 * `'message'`). Null entries are skipped.
 *
 * Write strategy:
 * - Bulk: when both `deps.pricing` and `deps.bulkWriteOps` are supplied, transaction
 *   entries are prepared in memory and flushed through a single awaited
 *   `bulkWriteTransactions` call; a failure there is logged, not rethrown.
 * - Per entry: otherwise each entry goes to `deps.spendStructuredTokens` (cache tokens
 *   present) or `deps.spendTokens`, without awaiting; rejections are logged.
 *
 * @returns `undefined` when `collectedUsage` is missing or empty; otherwise the total
 * input of the first non-summarization entry and the summed billable output of all entries.
 */
export async function recordCollectedUsage(
  deps: RecordUsageDeps,
  params: RecordUsageParams,
): Promise<RecordUsageResult | undefined> {
  const { collectedUsage } = params;
  if (!collectedUsage?.length) {
    return undefined;
  }

  const {
    user,
    model,
    balance,
    messageId,
    transactions,
    conversationId,
    endpointTokenConfig,
    context = 'message',
  } = params;
  const { pricing, bulkWriteOps } = deps;

  // Partition entries by usage type, dropping null/undefined slots.
  const messageUsages: UsageMetadata[] = [];
  const summarizationUsages: UsageMetadata[] = [];
  for (const entry of collectedUsage) {
    if (entry == null) {
      continue;
    }
    if (entry.usage_type === 'summarization') {
      summarizationUsages.push(entry);
    } else {
      messageUsages.push(entry);
    }
  }

  const leadUsage = messageUsages[0];
  const input_tokens = leadUsage != null ? splitUsage(leadUsage).totalInput : 0;

  let total_output_tokens = 0;
  const pendingDocs: PreparedEntry[] = [];

  /** Bills each entry of one group under the given transaction context. */
  const billGroup = (group: UsageMetadata[], groupContext: string): void => {
    for (const entry of group) {
      if (!entry) {
        continue;
      }

      const { inputOnly, cacheCreation, cacheRead, completion } = splitUsage(entry);
      total_output_tokens += completion;

      const txMetadata: TxMetadata = {
        user,
        balance,
        messageId,
        transactions,
        conversationId,
        endpointTokenConfig,
        context: groupContext,
        model: entry.model ?? model,
      };

      // Cache tokens present: structured spend, so each bucket is priced separately.
      if (cacheCreation > 0 || cacheRead > 0) {
        const structuredUsage = {
          promptTokens: {
            input: inputOnly,
            write: cacheCreation,
            read: cacheRead,
          },
          completionTokens: completion,
        };
        if (pricing && bulkWriteOps) {
          pendingDocs.push(...prepareStructuredTokenSpend(txMetadata, structuredUsage, pricing));
        } else {
          deps.spendStructuredTokens(txMetadata, structuredUsage).catch((err) => {
            logger.error(
              `[packages/api #recordCollectedUsage] Error spending structured ${groupContext} tokens`,
              err,
            );
          });
        }
        continue;
      }

      const plainUsage = {
        promptTokens: inputOnly,
        completionTokens: completion,
      };
      if (pricing && bulkWriteOps) {
        pendingDocs.push(...prepareTokenSpend(txMetadata, plainUsage, pricing));
      } else {
        deps.spendTokens(txMetadata, plainUsage).catch((err) => {
          logger.error(
            `[packages/api #recordCollectedUsage] Error spending ${groupContext} tokens`,
            err,
          );
        });
      }
    }
  };

  billGroup(messageUsages, context);
  billGroup(summarizationUsages, 'summarization');

  if (pricing && bulkWriteOps && pendingDocs.length > 0) {
    try {
      await bulkWriteTransactions({ user, docs: pendingDocs }, bulkWriteOps);
    } catch (err) {
      logger.error('[packages/api #recordCollectedUsage] Error in bulk write', err);
    }
  }

  return {
    input_tokens,
    output_tokens: total_output_tokens,
  };
}
|