LibreChat/packages/api/src/agents/usage.ts
Danny Avila 0fe203aaca
🧠 fix: charge Gemini reasoning tokens in agent usage accounting (#13014)
* 🧠 fix: charge Gemini reasoning tokens in agent usage accounting

Resolves #13006.

`usage.ts` previously billed `usage.output_tokens` directly. For Vertex
AI Gemini thinking models, `@langchain/google-common`'s streaming path
emits `output_tokens = candidatesTokenCount` only, dropping
`thoughtsTokenCount`. Reasoning was billed at zero and the
`total_tokens === input_tokens + output_tokens` invariant was broken.

The primary fix lives in agents (danny-avila/agents#157); this change is a
defense-in-depth backstop in case agents misses a path or another
provider exhibits the same shape. `resolveCompletionTokens(usage)` adds
`output_token_details.reasoning` back when (and only when) the gap is
present (`total - input > output`), so providers that already include
reasoning in `output_tokens` (OpenAI o-series, Anthropic, the
Google-API wrapper) are no-ops — no double-counting.

- `SplitUsage` gains a `completion` field; all four billing call sites
  in `processUsageGroup` use it instead of `usage.output_tokens`.
- `total_output_tokens` in the result also reflects the corrected
  count.
- `UsageMetadata` interface in `IJobStore.ts` adds the
  `output_token_details` field for type safety.
- 4 new tests in `usage.spec.ts` cover: Vertex undercount fix, OpenAI
  no-double-count, structured spend path with cache + reasoning, no-op
  when no details present.

* 🩹 fix: simplify reasoning correction to invariant-based gap check

Initial fix gated the correction on `output_token_details.reasoning > 0`,
which doesn't help in the live failure case: when google-common's stream
emits the buggy fallback usage_metadata, output_token_details is empty
({}) and the gate exits early.

Live debugging showed the reliable signal is the documented invariant
itself: `total_tokens === input_tokens + output_tokens`. When buggy
streams undercount output, total exceeds input + output by exactly the
unbilled reasoning. Use `total - input` as the corrected output.

This is provider-agnostic and stays a no-op for compliant providers
(OpenAI/Anthropic/Google-via-CustomChatGoogleGenerativeAI), where the
gap is zero.

Live verified end-to-end against gemini-3-flash-preview:
- With agents fix in place:  output_tokens=437 → billed 437  (no-op)
- Backstop only (no agents fix, buggy input): raw 135, billed 297
  (= total 309 - input 12, matches actual API charge)

Updated tests to cover both scenarios.
2026-05-08 12:29:43 -04:00

287 lines
8.5 KiB
TypeScript

import { logger } from '@librechat/data-schemas';
import { Providers } from 'librechat-data-provider';
import type { TCustomConfig, TTransactionsConfig } from 'librechat-data-provider';
import type {
StructuredTokenUsage,
BulkWriteDeps,
PreparedEntry,
TxMetadata,
TokenUsage,
PricingFns,
} from './transactions';
import type { UsageMetadata } from '~/stream/interfaces/IJobStore';
import type { EndpointTokenConfig } from '~/types/tokens';
import {
prepareStructuredTokenSpend,
bulkWriteTransactions,
prepareTokenSpend,
} from './transactions';
/**
* Callback that spends a flat token usage (`promptTokens` / `completionTokens`
* as plain counts) for the transaction described by `txData`.
* The resolved value is not inspected by this module.
*/
type SpendTokensFn = (txData: TxMetadata, tokenUsage: TokenUsage) => Promise<unknown>;
/**
* Callback that spends a structured token usage, where `promptTokens` is split
* into `input` / `write` / `read` portions, for the transaction described by `txData`.
* The resolved value is not inspected by this module.
*/
type SpendStructuredTokensFn = (
txData: TxMetadata,
tokenUsage: StructuredTokenUsage,
) => Promise<unknown>;
/**
 * Providers that report cached prompt tokens as PART OF
 * `usage_metadata.input_tokens`, i.e. `input_token_details.cache_*` is a
 * subset of the input count rather than an extra charge on top of it:
 *
 * - OpenAI / Azure OpenAI: `input_tokens` = `prompt_tokens`
 *   (includes `prompt_tokens_details.cached_tokens`)
 * - xAI, DeepSeek, OpenRouter, Moonshot: extend `ChatOpenAI`, same semantics
 * - Google / Vertex AI: `input_tokens` = `promptTokenCount`
 *   (includes `cachedContentTokenCount`)
 *
 * Anthropic and Bedrock report cache values separately from `input_tokens`,
 * so for them the cache counts are added back to obtain the total prompt
 * size. That additive behaviour is the historical default, and any provider
 * missing from this set is treated the same way.
 */
const SUBSET_PROVIDERS: ReadonlySet<string> = new Set<string>([
  // OpenAI and providers built on `ChatOpenAI`
  Providers.OPENAI,
  Providers.AZURE,
  Providers.XAI,
  Providers.DEEPSEEK,
  Providers.OPENROUTER,
  Providers.MOONSHOT,
  // Google family
  Providers.GOOGLE,
  Providers.VERTEXAI,
]);
/**
 * Tells whether `provider` reports cached tokens inside `input_tokens`.
 *
 * @param provider - Provider identifier from the usage entry; may be absent.
 * @returns `true` only when a provider is given and it is listed in
 * `SUBSET_PROVIDERS`; `false` for `null`/`undefined` or unlisted providers.
 */
function inputTokensIncludesCache(provider?: string): boolean {
  if (provider == null) {
    return false;
  }
  return SUBSET_PROVIDERS.has(provider);
}
/**
 * Resolves the completion-token count to bill, repairing providers whose
 * `usage_metadata.output_tokens` undercounts.
 *
 * The documented `UsageMetadata` contract (`@langchain/core`) is
 * `total_tokens === input_tokens + output_tokens`. Compliant providers
 * (OpenAI, Anthropic, Google API via agents' `CustomChatGoogleGenerativeAI`)
 * already include any reasoning/thinking tokens inside `output_tokens`,
 * so the invariant holds and this function returns `output_tokens` unchanged.
 *
 * Vertex AI Gemini through `@langchain/google-common`'s streaming path
 * emits `output_tokens = candidatesTokenCount` and drops `thoughtsTokenCount`,
 * leaving `total - input > output`. When that gap shows up, the invariant is
 * used to recover the billable output (`total - input`).
 *
 * The repair is applied only when BOTH `total_tokens` and `input_tokens` are
 * present, finite and non-negative. Without a trustworthy input count the
 * difference `total - input` would fold prompt tokens into the completion
 * charge, so the reported `output_tokens` is billed as-is instead.
 *
 * Tracked in: https://github.com/danny-avila/LibreChat/issues/13006
 *
 * @param usage - Usage entry as reported by the provider.
 * @returns Completion tokens to bill; `0` when `output_tokens` is missing or
 * not numeric and no repair is possible.
 */
function resolveCompletionTokens(usage: UsageMetadata): number {
  const output = Number(usage.output_tokens) || 0;

  // A missing operand cannot be treated as zero: `total - 0` would bill the
  // entire request (prompt included) at the completion rate.
  if (usage.total_tokens == null || usage.input_tokens == null) {
    return output;
  }

  const total = Number(usage.total_tokens);
  const input = Number(usage.input_tokens);
  const operandsUsable =
    Number.isFinite(total) && Number.isFinite(input) && total >= 0 && input >= 0;
  if (!operandsUsable) {
    return output;
  }

  // Gap present: output was undercounted, recover it from the invariant.
  return total > input + output ? total - input : output;
}
/** Per-call token breakdown returned by `splitUsage` and used to build spend entries. */
interface SplitUsage {
/** Non-cached input portion — what gets billed at the standard input rate */
inputOnly: number;
/** Cache-write tokens, taken from `input_token_details.cache_creation` or `cache_creation_input_tokens` (0 when absent) */
cacheCreation: number;
/** Cache-read tokens, taken from `input_token_details.cache_read` or `cache_read_input_tokens` (0 when absent) */
cacheRead: number;
/** Total prompt tokens including cached portion */
totalInput: number;
/** Output tokens for billing (includes reasoning when omitted from `output_tokens`) */
completion: number;
}
/**
 * Breaks one usage entry into the token buckets used for billing.
 *
 * Cache counts are read from `input_token_details` first and fall back to the
 * flat `cache_*_input_tokens` fields; anything missing or non-numeric counts
 * as 0.
 *
 * How `input_tokens` relates to the cache counts depends on the provider:
 * - cache already included in `input_tokens`: the cache counts are subtracted
 *   (floored at 0) to get `inputOnly`, and `totalInput` is `input_tokens`;
 * - otherwise: `inputOnly` is `input_tokens` and the cache counts are added on
 *   top to get `totalInput`.
 *
 * @param usage - Usage entry reported for a single LLM call.
 * @returns The split counts, with `completion` resolved by `resolveCompletionTokens`.
 */
function splitUsage(usage: UsageMetadata): SplitUsage {
  const details = usage.input_token_details;
  const cacheCreation =
    Number(details?.cache_creation) || Number(usage.cache_creation_input_tokens) || 0;
  const cacheRead = Number(details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  const reportedInput = Number(usage.input_tokens) || 0;
  const completion = resolveCompletionTokens(usage);
  const cacheIsSubset = inputTokensIncludesCache(usage.provider);

  return {
    inputOnly: cacheIsSubset
      ? Math.max(0, reportedInput - cacheCreation - cacheRead)
      : reportedInput,
    cacheCreation,
    cacheRead,
    totalInput: cacheIsSubset ? reportedInput : reportedInput + cacheCreation + cacheRead,
    completion,
  };
}
/** Injected dependencies of `recordCollectedUsage`. */
export interface RecordUsageDeps {
/** Invoked per usage entry that has no cache tokens, when the bulk path is not active */
spendTokens: SpendTokensFn;
/** Invoked per usage entry that has cache-creation or cache-read tokens, when the bulk path is not active */
spendStructuredTokens: SpendStructuredTokensFn;
/** Forwarded to `prepareTokenSpend` / `prepareStructuredTokenSpend`; the bulk path is active only when both `pricing` and `bulkWriteOps` are set */
pricing?: PricingFns;
/** Forwarded to `bulkWriteTransactions` together with the prepared entries */
bulkWriteOps?: BulkWriteDeps;
}
/** Input of `recordCollectedUsage`. */
export interface RecordUsageParams {
/** Copied into every transaction's metadata and passed to `bulkWriteTransactions` */
user: string;
/** Copied into every transaction's metadata */
conversationId: string;
/** Usage entries to bill; `null`/`undefined` entries are skipped, and entries with `usage_type === 'summarization'` are billed under the `'summarization'` context */
collectedUsage: UsageMetadata[];
/** Fallback model, used when a usage entry carries no `model` of its own */
model?: string;
/** Transaction context for non-summarization entries; defaults to `'message'` */
context?: string;
/** Copied into every transaction's metadata */
messageId?: string;
/** Copied into every transaction's metadata */
balance?: Partial<TCustomConfig['balance']> | null;
/** Copied into every transaction's metadata */
transactions?: Partial<TTransactionsConfig>;
/** Copied into every transaction's metadata */
endpointTokenConfig?: EndpointTokenConfig;
}
/** Aggregate token counts returned by `recordCollectedUsage`. */
export interface RecordUsageResult {
/** Total prompt size (`totalInput`) of the first non-summarization usage entry, or 0 when there is none */
input_tokens: number;
/** Sum of the billed completion tokens over all processed entries, summarization included */
output_tokens: number;
}
/**
 * Bills every collected LLM call and reports the aggregate token counts.
 * Covers sequential execution (tool calls) as well as parallel execution
 * (multiple agents).
 *
 * Entries tagged `usage_type === 'summarization'` are billed under the
 * `'summarization'` context; all other entries use `params.context`
 * (default `'message'`). Message entries are processed before summarization
 * entries.
 *
 * When both `deps.pricing` and `deps.bulkWriteOps` are provided, the
 * transaction entries are prepared in memory and handed to
 * `bulkWriteTransactions` in one awaited call; a failure there is logged and
 * not rethrown. Otherwise each entry is spent through `deps.spendTokens` or
 * `deps.spendStructuredTokens` without being awaited, and rejections are only
 * logged.
 *
 * @param deps - Spend callbacks plus the optional bulk-write dependencies.
 * @param params - Usage entries and the metadata attached to each transaction.
 * @returns `undefined` when `collectedUsage` is missing or empty; otherwise
 * the total prompt size of the first non-summarization entry and the summed
 * billed output tokens.
 */
export async function recordCollectedUsage(
  deps: RecordUsageDeps,
  params: RecordUsageParams,
): Promise<RecordUsageResult | undefined> {
  const { collectedUsage } = params;
  if (!collectedUsage || !collectedUsage.length) {
    return undefined;
  }

  const {
    user,
    model,
    balance,
    messageId,
    transactions,
    conversationId,
    endpointTokenConfig,
    context = 'message',
  } = params;

  // Partition the entries by billing context, dropping empty slots.
  const messageUsages: UsageMetadata[] = [];
  const summarizationUsages: UsageMetadata[] = [];
  for (const entry of collectedUsage) {
    if (entry == null) {
      continue;
    }
    if (entry.usage_type === 'summarization') {
      summarizationUsages.push(entry);
    } else {
      messageUsages.push(entry);
    }
  }

  // The reported input size is the prompt of the first message call only.
  const [leadUsage] = messageUsages;
  const input_tokens = leadUsage == null ? 0 : splitUsage(leadUsage).totalInput;

  const { pricing, bulkWriteOps } = deps;
  const useBulk = pricing && bulkWriteOps;
  let outputTotal = 0;

  const billGroup = (
    usages: UsageMetadata[],
    usageContext: string,
    docs: PreparedEntry[],
  ): void => {
    for (const usage of usages) {
      if (!usage) {
        continue;
      }

      const split = splitUsage(usage);
      outputTotal += split.completion;

      const txMetadata: TxMetadata = {
        user,
        balance,
        messageId,
        transactions,
        conversationId,
        endpointTokenConfig,
        context: usageContext,
        model: usage.model ?? model,
      };

      // Cache activity selects the structured (input/write/read) spend shape.
      const touchesCache = split.cacheCreation > 0 || split.cacheRead > 0;

      if (useBulk) {
        if (touchesCache) {
          docs.push(
            ...prepareStructuredTokenSpend(
              txMetadata,
              {
                promptTokens: {
                  input: split.inputOnly,
                  write: split.cacheCreation,
                  read: split.cacheRead,
                },
                completionTokens: split.completion,
              },
              pricing,
            ),
          );
        } else {
          docs.push(
            ...prepareTokenSpend(
              txMetadata,
              {
                promptTokens: split.inputOnly,
                completionTokens: split.completion,
              },
              pricing,
            ),
          );
        }
        continue;
      }

      // Non-bulk path: fire-and-forget, rejections are logged only.
      if (touchesCache) {
        deps
          .spendStructuredTokens(txMetadata, {
            promptTokens: {
              input: split.inputOnly,
              write: split.cacheCreation,
              read: split.cacheRead,
            },
            completionTokens: split.completion,
          })
          .catch((err) => {
            logger.error(
              `[packages/api #recordCollectedUsage] Error spending structured ${usageContext} tokens`,
              err,
            );
          });
        continue;
      }

      deps
        .spendTokens(txMetadata, {
          promptTokens: split.inputOnly,
          completionTokens: split.completion,
        })
        .catch((err) => {
          logger.error(
            `[packages/api #recordCollectedUsage] Error spending ${usageContext} tokens`,
            err,
          );
        });
    }
  };

  const allDocs: PreparedEntry[] = [];
  billGroup(messageUsages, context, allDocs);
  billGroup(summarizationUsages, 'summarization', allDocs);

  if (useBulk && allDocs.length > 0) {
    try {
      await bulkWriteTransactions({ user, docs: allDocs }, bulkWriteOps);
    } catch (err) {
      logger.error('[packages/api #recordCollectedUsage] Error in bulk write', err);
    }
  }

  return { input_tokens, output_tokens: outputTotal };
}