LibreChat/api/server/controllers/agents/client.js
Danny Avila db7011d567
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
GitNexus Index / index (push) Waiting to run
GitNexus Index / post-index (push) Blocked by required conditions
📊 feat: Real-Time Context Window & Token Usage Tracking (#13670)
* 📊 feat: Real-Time Context Window & Token Usage Tracking

* 🧪 fix: Align Pricing Spec Dep Signatures with TxDeps

* 🩹 fix: Resolve Codex Findings for Context Usage Tracking

* 📊 feat: Granular Tool Token Breakdown with Deferred Splits

* 🧪 test: Cover Session Cost in Mock E2E and Scope Usage Selectors

* 🧪 test: Live Host-Pipeline Usage Verification (Env-Gated)

* 🧪 test: Local Real-Provider Multi-Turn E2E Harness

* 🪙 fix: Keep Tagged Usage Buckets Out of the Live Context Estimate

* 🩹 fix: Scoped Token-Config Fallback and Sequential Visibility for Usage Events

* 🩹 fix: Address Usage Review Findings — Cost Timing, Scoped Caches, Finalized Output

- carry the post-snapshot output estimate into the context snapshot at
  finalize so the gauge keeps the last response after live resets
- accumulate per-rate billable units and price the session cost at
  render, so usage events arriving before the token-config load still
  count once it resolves
- pass user-scoped token-config cache keys through loadConfigModels
  fetches and drop the controller's unscoped fallback to prevent serving
  another user's resolved config
- tag emitted usage events with a per-run seq so resume dedupe never
  drops a distinct call with an identical payload
- admit the static tokenConfig override in the custom endpoint schema so
  it survives zod parsing into req.config

* 🩹 fix: Align Client Usage Accounting with Backend Cost Semantics

- classify cache tokens by provider (shared inputTokensIncludesCache from
  data-provider, consumed by both the backend billing path and the client)
  instead of a magnitude heuristic, so Anthropic/Bedrock turns where cache
  is smaller than uncached input no longer under-bill input
- mirror resolveCompletionTokens on the client so Vertex-style hidden
  thinking tokens are reflected in the Output row and session cost
- prefer endpoint pricing over adapter-provider pricing so a custom
  endpoint can price a known model name without built-in rates shadowing it
- carry static cacheRead/cacheWrite overrides through the tokenConfig
  schema and buildTokenConfigMap

* 🩹 fix: Honor Static Token Config in Billing; Tighten Usage Freshness

- initializeCustom now uses a static endpoint tokenConfig as the agent's
  endpointTokenConfig (billing + balance checks), not just the advertised
  UI config — previously the gauge showed admin rates while the agent
  billed against built-in tables
- invalidate the token-config query alongside models on user-key add/
  revoke so context windows and pricing refresh without a reload
- include maxContextTokens in ChatForm's stabilized conversation memo so
  the gauge reflects a changed context-window setting immediately
- feed the live output estimate from the legacy content path (direct and
  assistants streams), setting from cumulative part text rather than
  accumulating deltas

* 🩹 fix: Resume Usage Dedup, Agent Pricing, and Partial Override Billing

- fold usage events idempotently by (runId, seq) so resume backfill no
  longer resets the conversation totals — a mid-stream reconnect keeps the
  usage of prompts already completed earlier in the session
- tap replayed pending message/reasoning/content events so output streamed
  past the resume snapshot reaches the live estimate, not just the message
- resolve cost against the agent's backing endpoint (Agents conversations
  report endpoint `agents` / provider `openAI`, neither of which keys a
  custom endpoint's tokenConfig)
- getMultiplier/getCacheMultiplier fall back to the standard tables for
  models absent from a partial endpointTokenConfig, so a partial static
  override no longer bills non-listed models at defaultRate while the UI
  shows the correct pattern rate

* 🩹 fix: Repaired Output in Gauge, Cache-Rate Keys, Config Gate, Usage Cleanup

- live/completed gauge counts the repaired completion (normalized output),
  so under-reporting providers don't drop the response from used context
- translate static tokenConfig cacheWrite/cacheRead onto the write/read
  keys getCacheMultiplier reads, so cache tokens bill at the configured
  rate instead of the prompt-rate fallback
- clear the token index and usage atoms when leaving a conversation, so
  visited histories don't accumulate in memory for the tab's lifetime
- wait for startupConfig before mounting the gauge, so a deployment with
  contextUsage disabled never briefly mounts it or fires the token-config
  query on first load

* 🩹 fix: Move Token-Config Resolution to TS; Key Live Usage by Created Convo

- extract the token-config resolution (override gathering + cache lookup +
  buildTokenConfigMap) into resolveTokenConfigMap in packages/api, leaving
  the /api controller a thin request-scoped wrapper (CLAUDE.md TS rule)
- getConvoKey prefers the user message's real conversationId once the
  `created` event stamps it, so a new chat's first-response live gauge and
  totals land under the id TokenUsage subscribes to instead of NEW_CONVO

* 🩹 fix: Clear Stale Redis Job Usage; Live-Tap Legacy Streams; Share Fetched Config

- DEL the Redis job hash before re-creating it so a reused streamId can't
  inherit a prior run's contextUsage/tokenUsage and backfill stale usage
- tap the legacy {message,text} stream branch (non-agent OpenAI/Anthropic
  streams) into the live estimate, not just the content path
- copy a deduped fetch's token config to every sibling endpoint sharing the
  baseURL/key/headers, so /token-config resolves each by its own name

* revert: Don't DEL Redis job hash in createJob (breaks cross-replica resume)

createJob is an idempotent join — a second replica calls it for the same
streamId to share an in-flight stream's state. DELeting the hash wiped the
prior replica's persisted created/usage state, so a joining replica missed
the created event (GenerationJobManager cross-replica integration test).
Reverts the F1 change from 2bfce0c34b; the stale-usage concern doesn't
arise in practice (streamId is unique per generation).

* 🩹 fix: Best-Effort Usage Emit; Tag Hidden Sequential-Agent Usage

- wrap the ModelEndHandler usage emit in try/catch so a failed telemetry
  delivery (closed SSE / Redis publish error) can't abort the handler
  before thought-signature capture, which would break resumed tool calls
- tag hidden sequential-agent usage as 'sequential' (non-primary) so the
  client folds it into session cost/totals but not the live context gauge,
  instead of letting an undefined usage_type inflate the visible gauge

* 🩹 fix: Refetch Stale Token Config on Mount; Normalize Vertex for Lookup

- useTokenConfigQuery refetches on mount when stale, so a user-key change
  that invalidates tokenConfig while the gauge is unmounted takes effect on
  return instead of serving the prior key's resolved config
- normalize a Vertex-backed agent's provider (vertexai) to the google
  token-config key, so Gemini context windows and rates resolve instead of
  showing unknown context / $0 cost

* feat: Server-Side Per-Event Cost (Authoritative Pricing for the Gauge)

Move usage-cost pricing to the single source of truth. The backend prices
each model call with the same billing functions (premium tiers via
getMultiplier(inputTokenCount), cache rates) and emits the USD cost on
on_token_usage when interface.contextCost is enabled; the client sums
emitted costs instead of re-deriving from base token-config rates.

- computeUsageCostUSD reuses prepareTokenSpend/prepareStructuredTokenSpend
  so the emitted cost matches what is billed (incl. premium thresholds)
- getDefaultHandlers gains a usageCost pricing context; initialize.js wires
  db.getMultiplier/getCacheMultiplier gated on contextCost (agents path)
- client UsageTotals carries a summed costUSD; retire the client-side rate
  lookups (costFromUnits/calcUsageCost) that drifted from backend pricing
  and produced the provider-keying / cache-key / Vertex / premium findings
- keep normalizeUsageUnits for the displayed token counts; token-config is
  still used for the context-window meter

Fixes the premium-tier session-cost under-report (gpt-5.x / gemini-3.1
above their input thresholds).

* 🩹 fix: Branch-Accurate Usage Snapshot + Clearer Gauge Track Contrast

- re-anchor the context snapshot from the user message to the response
  message at finalize. Regenerating a response branches off a shared user
  message, so anchoring on it made the snapshot read as "active" on both
  branches — switching to the sibling branch showed the wrong (other
  branch's) context. The response message is branch-unique, so sibling
  branches now correctly fall back to their own per-branch totals.
- raise the gauge ring's track/fill contrast (muted track, prominent fill)
  so the used portion reads clearly as a fill-level indicator

* 🩹 fix: Tag Sequential Usage in Billing; Emit Subagent Cost; Reset Live on Resume Errors

- tag hidden sequential-agent usage `usage_type: 'sequential'` on the
  COLLECTED usage (not just the emit), and treat it as non-primary in
  recordCollectedUsage (billed, excluded from the reported output total) so
  hidden intermediate output stops inflating the parent's tokenCount/pruning
- emit on_token_usage from the subagent usage sink (tagged `subagent`, with
  authoritative cost when contextCost is on) so the gauge's session
  cost/totals include billed subagent usage; it stays out of the live meter
- call resetLive on the resumable 404 and max-retry terminal branches so the
  gauge doesn't keep counting stale in-flight tokens after the stream ends

* 🎨 fix: Contrast the Popup Context Bar; Revert Ring Restyle

- raise the popup breakdown's context progressbar contrast (muted
  surface-tertiary track, prominent text-primary fill) — that's the bar the
  contrast feedback was about
- revert the gauge ring restyle (kept its original border-heavy track /
  text-secondary fill); the ring wasn't the element in question

* 🩹 fix: Stop Snapshot Granularity Leaking Across Branches; Revert Tree Memo

- a null-anchor context snapshot was treated as active on every branch,
  leaking one generation's granular breakdown onto sibling branches. Require
  a non-null (response-message) anchor on the viewed branch instead, so
  siblings without a matching snapshot fall back to their own totals.
- revert the buildTree WeakMap memo in messages.ts. buildTree is pure (builds
  from shallow copies) so the memo was behaviorally identical, but it was the
  feature's only change to core branch-navigation selectors — removing it
  matches upstream and rules it out of branch-navigation debugging.

* 🪙 fix: Thread Endpoint Token Config to Agent Billing, Cost, and Context Limits

Custom-endpoint agents resolve an endpointTokenConfig during agent init but
it never reached the AgentClient, so spending, emitted cost, and runtime
max-token resolution all fell back to default rates for those agents.

- Surface options.endpointTokenConfig on the returned InitializedAgent.
- Pass it to the AgentClient (this.options.endpointTokenConfig) so the
  spending path bills at configured rates.
- Thread it through usageCost to computeUsageCostUSD so emitted per-event
  cost matches billing.
- getModelMaxTokens/getModelMaxOutputTokens fall back to the built-in map
  for models absent from a partial override (matches buildTokenConfigMap);
  consolidates the duplicated fallback in pricing.ts.

* 🪙 fix: Preserve Granular Breakdown Across Branch Switches

The granular context breakdown lives only in the live on_context_usage
snapshot — a single per-conversation slot, anchored to the latest response
and overwritten by each generation. Switching to a branch generated earlier
this session lost its tool/skill/system rows and fell back to coarse totals.

Retain each generation's finalized snapshot in a per-conversation map keyed
by its branch-unique response id (snapshotsByAnchorFamily). When the live
snapshot is off the viewed branch, walk the branch tail for its deepest
stored anchor and render that breakdown. Bounded by generation count and
cleared on conversation switch; the live/just-generated path is unchanged.

* 🪙 fix: Harden Resume Seeding and Subagent Usage Emission

- useResumableSSE: skip the trailing-output live seed when the resume
  carries a context snapshot; the snapshot's messageTokens already counts
  produced output, so seeding it again inflated usage until the next reset.
- AgentClient subagent emitter: await GenerationJobManager.emitChunk like
  every other caller (it persists before publishing), so a floating promise
  can't race job cleanup and a Redis/publish failure is caught by the
  emitter's try/catch instead of surfacing as an unhandled rejection.

* 🧪 test: Playwright Coverage for Context Breakdown Granularity

Add a test-only data-testid distinguishing the granular snapshot breakdown
(context-breakdown) from the coarse message-history estimate
(context-estimate), then assert granularity in the mock e2e harness:

- renders the granular breakdown from the live on_context_usage snapshot
  (guards that the snapshot event actually reaches the popover, not just the
  usage totals).
- preserves the granular breakdown after switching branches — regenerate to
  overwrite the single live snapshot, switch back, and confirm the rows
  survive via the per-anchor snapshot history map.

Branch regenerate/sibling selectors mirror the existing chat.spec branch test.
All three usage specs pass against the mock pipeline.

* 🪙 fix: Correct Resume Live-Seed, Fallback Re-index, and Subagent Emit Flush

Codex round on the prior commit:

- countTrailingOutputChars now counts only output at the very END of the
  aggregated content (0 when the model paused at a tool call), and the resume
  path always seeds it. The earlier skip-trailing-tool-parts behavior plus the
  skip-seed-when-snapshot gate together over- or under-counted in-flight
  output on resume; one rule fixes both — pre-invoke snapshot budget is never
  double-counted, and genuine in-flight output is no longer dropped.
- useTokenUsage re-indexes from the messages cache on tail change while
  submitting. The cache subscriber is muted during streaming, so without a
  context snapshot (non-agent streams) sumBranch missed the created tail and
  dropped history + prompt until finalize. Bounded — tailId only shifts on
  created/finalize/branch-switch.
- AgentClient tracks subagent usage emit promises and flushes them in
  chatCompletion's finally. The sink fires the emitter without awaiting, and
  resume reads the usage emitChunk persists (HSET), so cleanup must not race
  it or resumed clients miss billed subagent usage.
2026-06-13 19:38:28 -04:00

1715 lines
60 KiB
JavaScript

require('events').EventEmitter.defaultMaxListeners = 100;
const { logger } = require('@librechat/data-schemas');
const { getBufferString, HumanMessage } = require('@librechat/agents/langchain/messages');
const {
createRun,
isEnabled,
checkAccess,
buildToolSet,
logToolError,
sanitizeTitle,
payloadParser,
resolveHeaders,
createSafeUser,
initializeAgent,
countTokens,
getBalanceConfig,
omitTitleOptions,
getProviderConfig,
memoryInstructions,
createTokenCounter,
applyContextToAgent,
isMemoryAgentEnabled,
recordCollectedUsage,
sendEvent,
computeUsageCostUSD,
createSubagentUsageSink,
isDeepSeekReasoningProvider,
GenerationJobManager,
getTransactionsConfig,
resolveRecursionLimit,
createMemoryProcessor,
loadAgent: loadAgentFn,
createMultiAgentMapper,
filterMalformedContentParts,
countFormattedMessageTokens,
prependFileContext,
hydrateMissingIndexTokenCounts,
injectSkillPrimes,
collectFreshSkillPrimeNames,
isSkillPrimeMessage,
collectFileIds,
processTextWithTokenLimit,
buildAgentScopedContext,
buildSkillPrimeContentParts,
buildInitialToolSessions,
} = require('@librechat/api');
const {
Callback,
Providers,
TitleMethod,
formatMessage,
formatAgentMessages,
createMetadataAggregator,
} = require('@librechat/agents');
const {
Constants,
UsageEvents,
Permissions,
VisionModes,
ContentTypes,
EModelEndpoint,
PermissionTypes,
AgentCapabilities,
isAgentsEndpoint,
isEphemeralAgentId,
removeNullishValues,
DEFAULT_MEMORY_MAX_INPUT_TOKENS,
} = require('librechat-data-provider');
const { filterFilesByAgentAccess } = require('~/server/services/Files/permissions');
const { encodeAndFormat } = require('~/server/services/Files/images/encode');
const { createContextHandlers } = require('~/app/clients/prompts');
const { resolveConfigServers } = require('~/server/services/MCP');
const { getMCPServerTools } = require('~/server/services/Config');
const BaseClient = require('~/app/clients/BaseClient');
const { getMCPManager } = require('~/config');
const db = require('~/models');
/**
 * Loads an agent through the shared `loadAgentFn`, supplying this module's
 * `db.getAgent` and `getMCPServerTools` as its dependencies.
 * `db.getAgent` is looked up on every call rather than captured once.
 */
const loadAgent = (params) => {
  const dependencies = { getAgent: db.getAgent, getMCPServerTools };
  return loadAgentFn(params, dependencies);
};
/** NOTE(review): presumably a characters-per-token ratio used to size memory input — confirm at its use site. */
const MEMORY_INPUT_CHARS_PER_TOKEN = 8;
class AgentClient extends BaseClient {
constructor(options = {}) {
super(null, options);
/** The current client class
* @type {string} */
this.clientName = EModelEndpoint.agents;
/** @deprecated @type {true} - Is a Chat Completion Request */
this.isChatCompletion = true;
/** @type {AgentRun} */
this.run;
/** Resolves with the agent run once `chatCompletion` initializes it (or
* `null` if initialization fails), letting immediate-mode title generation
* await the run instead of throwing when fired before the run exists.
* @type {Promise<AgentRun | null> | null} */
this._runReady = null;
/** @type {((run: AgentRun | null) => void) | null} */
this._resolveRun = null;
const {
agentConfigs,
contentParts,
collectedUsage,
collectedThoughtSignatures,
artifactPromises,
maxContextTokens,
subagentAggregatorsByToolCallId,
...clientOptions
} = options;
this.agentConfigs = agentConfigs;
this.maxContextTokens = maxContextTokens;
/** @type {MessageContentComplex[]} */
this.contentParts = contentParts;
/** @type {Array<UsageMetadata>} */
this.collectedUsage = collectedUsage;
/** Vertex Gemini 3 thought signatures captured during the run, keyed by
* `tool_call_id`. Persisted on `responseMessage.metadata.thoughtSignatures`
* and restored as `additional_kwargs.signatures` on subsequent turns to
* keep tool round-trips valid across DB reconstruction.
* @type {Record<string, string> | undefined} */
this.collectedThoughtSignatures = collectedThoughtSignatures;
/** @type {ArtifactPromises} */
this.artifactPromises = artifactPromises;
/** Per-request map of `createContentAggregator` instances keyed by
* the parent's `tool_call_id`. `ON_SUBAGENT_UPDATE` events stream
* into each aggregator as they arrive; `finalizeSubagentContent`
* harvests `contentParts` onto the matching `subagent` tool_call
* so the child's full activity survives a page refresh. */
this.subagentAggregatorsByToolCallId = subagentAggregatorsByToolCallId ?? new Map();
/** In-flight `on_token_usage` emits from subagent child runs. The sink
* fires the emitter without awaiting, so chatCompletion's finally flushes
* these before returning — otherwise job cleanup can race the persist.
* @type {Promise<void>[]} */
this.pendingSubagentEmits = [];
/** @type {AgentClientOptions} */
this.options = Object.assign({ endpoint: options.endpoint }, clientOptions);
/** @type {string} */
this.model = this.options.agent.model_parameters.model;
/** The key for the usage object's input tokens
* @type {string} */
this.inputTokensKey = 'input_tokens';
/** The key for the usage object's output tokens
* @type {string} */
this.outputTokensKey = 'output_tokens';
/** @type {UsageMetadata} */
this.usage;
/** @type {Record<string, number>} */
this.indexTokenCountMap = {};
/** @type {Array<Record<string, unknown>> | null} */
this.memoryPayload = null;
/** @type {(messages: BaseMessage[]) => Promise<void>} */
this.processMemory;
}
/**
* Returns the aggregated content parts for the current run.
* @returns {MessageContentComplex[]} */
getContentParts() {
return this.contentParts;
}
/**
* Harvest the `contentParts` from each per-subagent `createContentAggregator`
* instance and attach them onto the matching parent `subagent` tool_call
* as `subagent_content`. Runs once per message save (from
* `sendCompletion`'s `finally`) so the child's full reasoning / tool
* calls / final text survive a page refresh — the client-side Recoil
* atom is session-only. Aggregators keyed by a tool_call_id that never
* appeared in `contentParts` are discarded (no home to attach to).
*/
finalizeSubagentContent() {
const buffer = this.subagentAggregatorsByToolCallId;
if (!buffer || buffer.size === 0 || !Array.isArray(this.contentParts)) {
return;
}
for (const part of this.contentParts) {
if (part?.type !== ContentTypes.TOOL_CALL) continue;
const toolCall = part[ContentTypes.TOOL_CALL];
if (!toolCall || toolCall.name !== Constants.SUBAGENT || !toolCall.id) continue;
const aggregator = buffer.get(toolCall.id);
if (!aggregator) continue;
try {
/** `createContentAggregator` returns a sparse array (undefined
* slots for indices that never received content). Strip those
* so the persisted shape is a clean `TMessageContentParts[]`. */
const parts = Array.isArray(aggregator.contentParts)
? aggregator.contentParts.filter((p) => p != null)
: [];
if (parts.length > 0) {
toolCall.subagent_content = parts;
}
} catch (err) {
logger.warn(
`[AgentClient] Failed to attach subagent content for tool_call ${toolCall.id}: ${err?.message ?? err}`,
);
}
}
buffer.clear();
}
/** Intentionally empty: this client does nothing with the options passed to `setOptions`. */
setOptions(_options) {}
/**
* `AgentClient` is not opinionated about vision requests, so this is an intentional no-op.
* The signature declares no parameters, so any `attachments` argument a caller passes is ignored.
* @param {MongoFile[]} attachments - Unused.
*/
checkVisionRequest() {}
getSaveOptions() {
let runOptions = {};
try {
runOptions = payloadParser(this.options) ?? {};
} catch (error) {
logger.error(
'[api/server/controllers/agents/client.js #getSaveOptions] Error parsing options',
error,
);
}
return removeNullishValues(
Object.assign(
{
spec: this.options.spec,
iconURL: this.options.iconURL,
chatProjectId: this.options.chatProjectId,
endpoint: this.options.endpoint,
agent_id: this.options.agent.id,
modelLabel: this.options.modelLabel,
resendFiles: this.options.resendFiles,
imageDetail: this.options.imageDetail,
maxContextTokens: this.maxContextTokens,
},
// TODO: PARSE OPTIONS BY PROVIDER, MAY CONTAIN SENSITIVE DATA
runOptions,
),
);
}
/**
* Returns build message options. For AgentClient, agent-specific instructions
* are retrieved directly from agent objects in buildMessages, so this returns empty.
* @returns {Object} Empty options object
*/
getBuildMessagesOptions() {
return {};
}
/**
*
* @param {TMessage} message
* @param {Array<MongoFile>} attachments
* @returns {Promise<Array<Partial<MongoFile>>>}
*/
async addImageURLs(message, attachments) {
const { files, image_urls } = await encodeAndFormat(
this.options.req,
attachments,
{
provider: this.options.agent.provider,
endpoint: this.options.endpoint,
},
VisionModes.agents,
);
message.image_urls = image_urls.length ? image_urls : undefined;
return files;
}
/**
* Orders and formats the conversation into the run payload, computes per-message
* token counts, and applies shared, memory and agent-scoped context to every agent
* taking part in the run (the agent objects are mutated in place).
* Side effects on `this`: `message_file_map`, `options.attachments`, `contextHandlers`,
* `augmentedPrompt`, `memoryPayload`, `indexTokenCountMap` and `contextMeta` may be set.
* @param {Array} messages - Messages handed to `getMessagesForConversation` for ordering.
* @param {string} parentMessageId - Passed to `getMessagesForConversation` together with `messages`.
* @param {unknown} _buildOptions - Not used by this method.
* @param {{ getReqData?: Function }} [opts] - When `getReqData` is a function it is called with `{ promptTokens }`.
* @returns {Promise<{ prompt: Array, tokenCountMap: Record<string, number>, promptTokens: number, messages: Array }>}
*/
async buildMessages(messages, parentMessageId, _buildOptions, opts) {
/** Always pass mapMethod; getMessagesForConversation applies it only to messages with addedConvo flag */
const orderedMessages = this.constructor.getMessagesForConversation({
messages,
parentMessageId,
summary: this.shouldSummarize,
mapMethod: createMultiAgentMapper(this.options.agent, this.agentConfigs),
mapCondition: (message) => message.addedConvo === true,
});
let payload;
/** @type {number | undefined} */
let promptTokens;
/** Normalize instruction fields before applying per-run context. */
const normalizeInstructions = (agent) => {
agent.instructions = agent.instructions?.trim() || undefined;
agent.additional_instructions = agent.additional_instructions?.trim() || undefined;
return agent;
};
/** Collect all agents for unified processing while preserving stable/dynamic instruction fields. */
const allAgents = [
{ agent: normalizeInstructions(this.options.agent), agentId: this.options.agent.id },
...(this.agentConfigs?.size > 0
? Array.from(this.agentConfigs.entries()).map(([agentId, agent]) => ({
agent: normalizeInstructions(agent),
agentId,
}))
: []),
];
/** File ids of this request's attachments; handed to `buildAgentScopedContext` below. */
const sharedRunAttachmentIds = new Set();
if (this.options.attachments) {
/** `options.attachments` is awaited here and replaced by the processed files at the end of this branch. */
const attachments = await this.options.attachments;
const latestMessage = orderedMessages[orderedMessages.length - 1];
for (const fileId of collectFileIds(attachments)) {
sharedRunAttachmentIds.add(fileId);
}
if (this.message_file_map) {
this.message_file_map[latestMessage.messageId] = attachments;
} else {
this.message_file_map = {
[latestMessage.messageId]: attachments,
};
}
await this.addFileContextToMessage(latestMessage, attachments);
const files = await this.processAttachments(latestMessage, attachments);
this.options.attachments = files;
}
/** Note: Bedrock uses legacy RAG API handling */
if (this.message_file_map && !isAgentsEndpoint(this.options.endpoint)) {
this.contextHandlers = createContextHandlers(
this.options.req,
orderedMessages[orderedMessages.length - 1].text,
);
}
/** @type {Record<number, number>} */
const indexTokenCountMap = {};
/** @type {Record<string, number>} */
const tokenCountMap = {};
const memoryPayload = [];
let hasFileContext = false;
let promptTokenTotal = 0;
const encoding = this.getEncoding();
const formattedMessages = orderedMessages.map((message, i) => {
const formattedMessage = formatMessage({
message,
userName: this.options?.name,
assistantName: this.options?.modelLabel,
});
/** Formatted a second time with the same arguments: this copy never gets file context
* prepended, and is what feeds the memory payload and the canonical token count. */
const memoryFormattedMessage = formatMessage({
message,
userName: this.options?.name,
assistantName: this.options?.modelLabel,
});
/**
* Bind file context to the message it belongs to. Historical attachments
* are resent inline, so the current turn's text attachment must be inline
* too instead of living only in the dynamic system tail.
*/
if (message.fileContext) {
hasFileContext = true;
prependFileContext(formattedMessage, message.fileContext);
}
memoryPayload.push(memoryFormattedMessage);
const dbTokenCount = Number(orderedMessages[i].tokenCount);
const hasDbTokenCount = Number.isFinite(dbTokenCount) && dbTokenCount > 0;
/** Recount when the stored count is missing or non-positive, or when this is a
* vision model and the message carries image URLs or files. */
const needsCanonicalTokenCount =
!hasDbTokenCount || (this.isVisionModel && (message.image_urls || message.files));
let canonicalTokenCount = hasDbTokenCount ? dbTokenCount : 0;
if (needsCanonicalTokenCount) {
canonicalTokenCount = countFormattedMessageTokens(memoryFormattedMessage, encoding);
}
/** The prompt-side count includes the prepended file context; the canonical count does not. */
const promptMessageTokenCount = message.fileContext
? countFormattedMessageTokens(formattedMessage, encoding)
: canonicalTokenCount;
/* Forward this message's embedded files to the context handlers (when present).
* No token cost is computed in this loop.
* NOTE(review): the trailing `codeEnvRef` check is the last statement of the loop body,
* so its `continue` has no effect. */
if (this.message_file_map && this.message_file_map[message.messageId]) {
const attachments = this.message_file_map[message.messageId];
for (const file of attachments) {
if (file.embedded) {
this.contextHandlers?.processFile(file);
continue;
}
if (file.metadata?.codeEnvRef) {
continue;
}
}
}
/** Clamp non-finite or non-positive counts to 0 before they are stored. */
const normalizedCanonicalTokenCount =
Number.isFinite(canonicalTokenCount) && canonicalTokenCount > 0 ? canonicalTokenCount : 0;
const normalizedPromptTokenCount =
Number.isFinite(promptMessageTokenCount) && promptMessageTokenCount > 0
? promptMessageTokenCount
: 0;
/** The canonical count is written back on the message and keyed by messageId;
* the prompt count is keyed by index and summed into the prompt total. */
orderedMessages[i].tokenCount = normalizedCanonicalTokenCount;
indexTokenCountMap[i] = normalizedPromptTokenCount;
promptTokenTotal += normalizedPromptTokenCount;
if (message.messageId) {
tokenCountMap[message.messageId] = normalizedCanonicalTokenCount;
}
if (isEnabled(process.env.AGENT_DEBUG_LOGGING)) {
const role = message.isCreatedByUser ? 'user' : 'assistant';
const hasSummary =
Array.isArray(message.content) && message.content.some((p) => p && p.type === 'summary');
const suffix = hasSummary ? '[S]' : '';
const id = (message.messageId ?? message.id ?? '').slice(-8);
const recalced = needsCanonicalTokenCount ? normalizedCanonicalTokenCount : null;
const promptRecalced = message.fileContext ? normalizedPromptTokenCount : null;
logger.debug(
`[AgentClient] msg[${i}] ${role}${suffix} id=…${id} db=${dbTokenCount} needsRecount=${needsCanonicalTokenCount} recalced=${recalced} promptRecalced=${promptRecalced} tokens=${normalizedPromptTokenCount}`,
);
}
return formattedMessage;
});
payload = formattedMessages;
/** The file-context-free payload is only kept when at least one message had file context. */
this.memoryPayload = hasFileContext ? memoryPayload : null;
messages = orderedMessages;
promptTokens = promptTokenTotal;
/**
* Build shared run context - applies to ALL agents in the run.
* Request attachment file context is already bound inline to the latest
* user message above; only side-channel context belongs here.
* Memory context is handled separately and applied per-agent based on config.
*/
const sharedRunContextParts = [];
/** Augmented prompt from RAG/context handlers */
if (this.contextHandlers) {
this.augmentedPrompt = await this.contextHandlers.createContext();
if (this.augmentedPrompt) {
sharedRunContextParts.push(this.augmentedPrompt);
}
}
/** Memory context (user preferences/memories) */
const withoutKeys = await this.useMemory();
const memoryContext = withoutKeys
? `${memoryInstructions}\n\n# Existing memory about the user:\n${withoutKeys}`
: undefined;
const sharedRunContext = sharedRunContextParts.join('\n\n');
const memoryAgentEnabled = isMemoryAgentEnabled(this.options.req.config?.memory);
const agentScopedContext = await buildAgentScopedContext({
agentIds: allAgents.map(({ agentId }) => agentId),
attachmentsByAgentId: this.options.agentContextAttachmentsByAgentId,
sharedRunAttachmentIds,
req: this.options.req,
tokenCountFn: (text) => countTokens(text),
});
/** Preserve prompt token counts for graph formatting and pruning. */
this.indexTokenCountMap = indexTokenCountMap;
/** Extract contextMeta from the parent response (second-to-last in ordered chain;
* last is the current user message). Seeds the pruner's calibration EMA for this run. */
const parentResponse =
orderedMessages.length >= 2 ? orderedMessages[orderedMessages.length - 2] : undefined;
if (parentResponse?.contextMeta && !parentResponse.isCreatedByUser) {
this.contextMeta = parentResponse.contextMeta;
}
const result = {
prompt: payload,
tokenCountMap,
promptTokens,
messages,
};
/** Report the prompt token total to the caller-supplied callback, when one was given. */
if (promptTokens >= 0 && typeof opts?.getReqData === 'function') {
opts.getReqData({ promptTokens });
}
/**
* Apply context to all agents.
* Stable agent/MCP instructions stay on `instructions`; shared runtime context
* is appended to `additional_instructions` as the dynamic system tail.
*
* NOTE: This intentionally mutates agent objects in place. The agentConfigs Map
* holds references to config objects that will be passed to the graph runtime.
*/
const ephemeralAgent = this.options.req.body.ephemeralAgent;
const mcpManager = getMCPManager();
const configServers = await resolveConfigServers(this.options.req);
await Promise.all(
allAgents.map(({ agent, agentId }) => {
const agentRunContextParts = [sharedRunContext];
/** Memory context goes to the primary agent, or to every agent when the memory agent is enabled. */
if (memoryContext && (agentId === this.options.agent.id || memoryAgentEnabled)) {
agentRunContextParts.push(memoryContext);
}
const scopedContext = agentScopedContext.get(agentId);
if (scopedContext) {
agentRunContextParts.push(scopedContext);
}
return applyContextToAgent({
agent,
agentId,
logger,
mcpManager,
configServers,
sharedRunContext: agentRunContextParts.filter(Boolean).join('\n\n'),
ephemeralAgent: agentId === this.options.agent.id ? ephemeralAgent : undefined,
});
}),
);
return result;
}
/**
* Creates a promise that resolves with the memory promise result or undefined after a timeout
* @param {Promise<(TAttachment | null)[] | undefined>} memoryPromise - The memory promise to await
* @param {number} timeoutMs - Timeout in milliseconds (default: 3000)
* @returns {Promise<(TAttachment | null)[] | undefined>}
*/
async awaitMemoryWithTimeout(memoryPromise, timeoutMs = 3000) {
if (!memoryPromise) {
return;
}
try {
const timeoutPromise = new Promise((_, reject) =>
setTimeout(() => reject(new Error('Memory processing timeout')), timeoutMs),
);
const attachments = await Promise.race([memoryPromise, timeoutPromise]);
return attachments;
} catch (error) {
if (error.message === 'Memory processing timeout') {
logger.warn('[AgentClient] Memory processing timed out after 3 seconds');
} else {
logger.error('[AgentClient] Error processing memory:', error);
}
return;
}
}
/**
* @returns {Promise<string | undefined>}
*/
async useMemory() {
const user = this.options.req.user;
if (user.personalization?.memories === false) {
return;
}
const hasAccess = await checkAccess({
user,
permissionType: PermissionTypes.MEMORIES,
permissions: [Permissions.USE],
getRoleByName: db.getRoleByName,
});
if (!hasAccess) {
logger.debug(
`[api/server/controllers/agents/client.js #useMemory] User ${user.id} does not have USE permission for memories`,
);
return;
}
const appConfig = this.options.req.config;
const memoryConfig = appConfig.memory;
if (!memoryConfig || memoryConfig.disabled === true) {
return;
}
const userId = this.options.req.user.id + '';
this.processMemory = undefined;
if (!isMemoryAgentEnabled(memoryConfig)) {
try {
const { withoutKeys } = await db.getFormattedMemories({ userId });
return withoutKeys;
} catch (error) {
logger.error(
'[api/server/controllers/agents/client.js #useMemory] Error loading memories',
error,
);
return;
}
}
/** @type {Agent} */
let prelimAgent;
const allowedProviders = new Set(
appConfig?.endpoints?.[EModelEndpoint.agents]?.allowedProviders,
);
try {
if (memoryConfig.agent?.id != null && memoryConfig.agent.id !== this.options.agent.id) {
prelimAgent = await loadAgent({
req: this.options.req,
agent_id: memoryConfig.agent.id,
endpoint: EModelEndpoint.agents,
});
} else if (memoryConfig.agent?.id != null) {
prelimAgent = this.options.agent;
} else if (
memoryConfig.agent?.id == null &&
memoryConfig.agent?.model != null &&
memoryConfig.agent?.provider != null
) {
prelimAgent = { id: Constants.EPHEMERAL_AGENT_ID, ...memoryConfig.agent };
}
} catch (error) {
logger.error(
'[api/server/controllers/agents/client.js #useMemory] Error loading agent for memory',
error,
);
}
if (!prelimAgent) {
return;
}
/** Forward the same `execute_code` capability gate the chat flow uses —
* memory agents are unlikely to list `execute_code`, but if one does,
* Phase 8 relies on this flag to expand the string into
* `bash_tool` + `read_file` (pre-Phase 8 the legacy `execute_code`
* tool registered unconditionally; without this passthrough the
* memory path would silently lose code-execution tooling). */
const memoryCapabilities = new Set(appConfig?.endpoints?.[EModelEndpoint.agents]?.capabilities);
const agent = await initializeAgent(
{
req: this.options.req,
res: this.options.res,
agent: prelimAgent,
allowedProviders,
endpointOption: {
endpoint: !isEphemeralAgentId(prelimAgent.id)
? EModelEndpoint.agents
: memoryConfig.agent?.provider,
},
codeEnvAvailable: memoryCapabilities.has(AgentCapabilities.execute_code),
},
{
getFiles: db.getFiles,
getUserKey: db.getUserKey,
getConvoFiles: db.getConvoFiles,
updateFilesUsage: db.updateFilesUsage,
getUserKeyValues: db.getUserKeyValues,
getToolFilesByIds: db.getToolFilesByIds,
getCodeGeneratedFiles: db.getCodeGeneratedFiles,
filterFilesByAgentAccess,
},
);
if (!agent) {
logger.warn(
'[api/server/controllers/agents/client.js #useMemory] No agent found for memory',
memoryConfig,
);
return;
}
const llmConfig = Object.assign(
{
provider: agent.provider,
model: agent.model,
},
agent.model_parameters,
);
/** @type {import('@librechat/api').MemoryConfig} */
const config = {
validKeys: memoryConfig.validKeys,
instructions: agent.instructions,
llmConfig,
tokenLimit: memoryConfig.tokenLimit,
};
const messageId = this.responseMessageId + '';
const conversationId = this.conversationId + '';
const streamId = this.options.req?._resumableStreamId || null;
const [withoutKeys, processMemory] = await createMemoryProcessor({
userId,
config,
messageId,
streamId,
conversationId,
memoryMethods: {
setMemory: db.setMemory,
deleteMemory: db.deleteMemory,
getFormattedMemories: db.getFormattedMemories,
},
res: this.options.res,
user: createSafeUser(this.options.req.user),
});
this.processMemory = processMemory;
return withoutKeys;
}
/**
* Filters out image URLs from message content
* @param {BaseMessage} message - The message to filter
* @returns {BaseMessage} - A new message with image URLs removed
*/
filterImageUrls(message) {
if (!message.content || typeof message.content === 'string') {
return message;
}
if (Array.isArray(message.content)) {
const filteredContent = message.content.filter(
(part) => part.type !== ContentTypes.IMAGE_URL,
);
if (filteredContent.length === 1 && filteredContent[0].type === ContentTypes.TEXT) {
const MessageClass = message.constructor;
return new MessageClass({
content: filteredContent[0].text,
additional_kwargs: message.additional_kwargs,
});
}
const MessageClass = message.constructor;
return new MessageClass({
content: filteredContent,
additional_kwargs: message.additional_kwargs,
});
}
return message;
}
/**
* @param {BaseMessage[]} messages
* @returns {Promise<void | (TAttachment | null)[]>}
*/
async runMemory(messages) {
try {
if (this.processMemory == null) {
return;
}
const appConfig = this.options.req.config;
const memoryConfig = appConfig.memory;
const messageWindowSize = memoryConfig?.messageWindowSize ?? 5;
/**
* Strip skill-primed meta messages before memory extraction. The primes
* sit next to the latest user message and carry large SKILL.md bodies,
* so letting them into the window would crowd out real chat turns and
* pollute extracted memories with synthetic instruction content the
* user never typed.
*/
const chatMessages = messages.filter((m) => !isSkillPrimeMessage(m));
let messagesToProcess = [...chatMessages];
if (chatMessages.length > messageWindowSize) {
for (let i = chatMessages.length - messageWindowSize; i >= 0; i--) {
const potentialWindow = chatMessages.slice(i, i + messageWindowSize);
if (potentialWindow[0]?.role === 'user') {
messagesToProcess = [...potentialWindow];
break;
}
}
if (messagesToProcess.length === chatMessages.length) {
messagesToProcess = [...chatMessages.slice(-messageWindowSize)];
}
}
const filteredMessages = messagesToProcess.map((msg) => this.filterImageUrls(msg));
const bufferString = getBufferString(filteredMessages);
const configuredMaxInputTokens = Number.isFinite(memoryConfig?.maxInputTokens)
? Math.floor(memoryConfig.maxInputTokens)
: undefined;
const maxInputTokens =
configuredMaxInputTokens != null && configuredMaxInputTokens > 0
? configuredMaxInputTokens
: DEFAULT_MEMORY_MAX_INPUT_TOKENS;
const maxInputChars = maxInputTokens * MEMORY_INPUT_CHARS_PER_TOKEN;
const isCharTruncated = bufferString.length > maxInputChars;
const memoryInput = `# Current Chat:\n\n${
isCharTruncated
? `[Earlier chat content omitted due to memory input limit]\n\n${bufferString.slice(
-maxInputChars,
)}`
: bufferString
}`;
const {
text: limitedMemoryInput,
tokenCount,
wasTruncated,
} = await processTextWithTokenLimit({
text: memoryInput,
tokenLimit: maxInputTokens,
tokenCountFn: (text) => countTokens(text),
preserve: 'end',
});
if (isCharTruncated || wasTruncated) {
logger.warn('[MemoryAgent] Memory input truncated before processing', {
tokenCount,
messageId: this.responseMessageId,
conversationId: this.conversationId,
maxInputTokens,
wasTruncated,
maxInputChars,
originalLength: bufferString.length,
});
}
const bufferMessage = new HumanMessage(limitedMemoryInput);
return await this.processMemory([bufferMessage]);
} catch (error) {
logger.error('Memory Agent failed to process memory', error);
}
}
/** @type {sendCompletion} */
async sendCompletion(payload, opts = {}) {
await this.chatCompletion({
payload,
onProgress: opts.onProgress,
userMCPAuthMap: opts.userMCPAuthMap,
abortController: opts.abortController,
});
const completion = filterMalformedContentParts(this.contentParts);
const signatures = this.collectedThoughtSignatures;
if (!signatures || Object.keys(signatures).length === 0) {
return { completion };
}
return { completion, metadata: { thoughtSignatures: signatures } };
}
/**
* @param {Object} params
* @param {string} [params.model]
* @param {string} [params.context='message']
* @param {AppConfig['balance']} [params.balance]
* @param {AppConfig['transactions']} [params.transactions]
* @param {UsageMetadata[]} [params.collectedUsage=this.collectedUsage]
*/
async recordCollectedUsage({
model,
balance,
transactions,
context = 'message',
collectedUsage = this.collectedUsage,
}) {
const result = await recordCollectedUsage(
{
spendTokens: db.spendTokens,
spendStructuredTokens: db.spendStructuredTokens,
pricing: { getMultiplier: db.getMultiplier, getCacheMultiplier: db.getCacheMultiplier },
bulkWriteOps: { insertMany: db.bulkInsertTransactions, updateBalance: db.updateBalance },
},
{
user: this.user ?? this.options.req.user?.id,
conversationId: this.conversationId,
collectedUsage,
model: model ?? this.model ?? this.options.agent.model_parameters.model,
context,
messageId: this.responseMessageId,
balance,
transactions,
endpointTokenConfig: this.options.endpointTokenConfig,
},
);
if (result) {
this.usage = result;
}
}
/**
* Get stream usage as returned by this client's API response.
* @returns {UsageMetadata} The stream usage object.
*/
getStreamUsage() {
return this.usage;
}
/**
* Builds the subagent usage emitter for {@link createSubagentUsageSink}.
* Streams each billed child-run usage to the client as an `on_token_usage`
* event tagged `subagent` (folds into session cost/totals, not the live
* gauge), with the authoritative cost when `interface.contextCost` is on.
* Returns undefined when there's no stream to write to.
* @param {AppConfig} [appConfig]
* @returns {((usage: UsageMetadata) => void) | undefined}
*/
buildSubagentUsageEmitter(appConfig) {
const res = this.options.res;
const streamId = this.options.req?._resumableStreamId || null;
if (!res && !streamId) {
return undefined;
}
const includeCost = appConfig?.interfaceConfig?.contextCost === true;
const endpointTokenConfig = this.options.endpointTokenConfig;
return (usage) => {
const data = {
input_tokens: usage.input_tokens,
output_tokens: usage.output_tokens,
total_tokens: usage.total_tokens,
input_token_details: this.subagentCacheDetails(usage),
model: usage.model,
provider: usage.provider,
usage_type: 'subagent',
runId: this.responseMessageId,
/** Unique per collected entry (post-push length) for resume dedupe */
seq: this.collectedUsage.length,
cost: includeCost
? computeUsageCostUSD(
usage,
{ getMultiplier: db.getMultiplier, getCacheMultiplier: db.getCacheMultiplier },
endpointTokenConfig,
)
: undefined,
};
/** The sink fires this without awaiting, so retain the promise and flush
* it in chatCompletion's finally — emitChunk persists (HSET) before
* publishing, and job cleanup must not race that persist or resumed
* clients miss billed subagent usage. */
const emit = (async () => {
try {
if (streamId) {
await GenerationJobManager.emitChunk(streamId, {
event: UsageEvents.ON_TOKEN_USAGE,
data,
});
} else {
sendEvent(res, { event: UsageEvents.ON_TOKEN_USAGE, data });
}
} catch (err) {
logger.warn('[AgentClient] Failed to emit subagent usage', err);
}
})();
this.pendingSubagentEmits.push(emit);
return emit;
};
}
/** Normalizes a subagent usage event's cache token details for emission. */
subagentCacheDetails(usage) {
const cache_creation =
usage.input_token_details?.cache_creation ?? usage.cache_creation_input_tokens;
const cache_read = usage.input_token_details?.cache_read ?? usage.cache_read_input_tokens;
if (cache_creation == null && cache_read == null) {
return undefined;
}
return { cache_creation, cache_read };
}
/**
* @param {TMessage} responseMessage
* @returns {number}
*/
getTokenCountForResponse({ content }) {
return countFormattedMessageTokens({ role: 'assistant', content }, this.getEncoding());
}
/**
* @param {object} params
* @param {string | ChatCompletionMessageParam[]} params.payload
* @param {Record<string, Record<string, string>>} [params.userMCPAuthMap]
* @param {AbortController} [params.abortController]
*/
async chatCompletion({ payload, userMCPAuthMap, abortController = null }) {
/** @type {Partial<GraphRunnableConfig>} */
let config;
/** @type {ReturnType<createRun>} */
let run;
/** @type {Promise<(TAttachment | null)[] | undefined>} */
let memoryPromise;
const appConfig = this.options.req.config;
const balanceConfig = getBalanceConfig(appConfig);
const transactionsConfig = getTransactionsConfig(appConfig);
try {
if (!abortController) {
abortController = new AbortController();
}
/** @type {AppConfig['endpoints']['agents']} */
const agentsEConfig = appConfig.endpoints?.[EModelEndpoint.agents];
config = {
runName: 'AgentRun',
configurable: {
thread_id: this.conversationId,
last_agent_index: this.agentConfigs?.size ?? 0,
user_id: this.user ?? this.options.req.user?.id,
hide_sequential_outputs: this.options.agent.hide_sequential_outputs,
requestBody: {
messageId: this.responseMessageId,
conversationId: this.conversationId,
parentMessageId: this.parentMessageId,
},
user: createSafeUser(this.options.req.user),
},
recursionLimit: resolveRecursionLimit(agentsEConfig, this.options.agent),
signal: abortController.signal,
streamMode: 'values',
version: 'v2',
};
const toolSet = buildToolSet(this.options.agent);
const tokenCounter = createTokenCounter(this.getEncoding());
/** Pre-resolve invoked skill bodies + re-prime files before formatting messages */
const skillPrimeResult = this.options.primeInvokedSkills
? await this.options.primeInvokedSkills(payload)
: undefined;
/**
* Seed `Graph.sessions` with code-env files primed across every
* reachable agent (primary, handoff/addedConvo, and nested
* subagents) plus skill-priming output. The merge logic and its
* run-wide semantics live in `buildInitialToolSessions`; see that
* helper's doc for why this is intentionally NOT per-agent.
*/
const initialSessions = buildInitialToolSessions({
skillSessions: skillPrimeResult?.initialSessions,
agents: [this.options.agent, ...(this.agentConfigs ? this.agentConfigs.values() : [])],
});
/** Spoof `Providers.DEEPSEEK` so the SDK preserves `reasoning_content` on tool turns (#13366). */
const hasDeepSeekAgent = (agent) =>
agent != null &&
isDeepSeekReasoningProvider(agent.provider, agent.model_parameters?.model ?? agent.model);
const needsDeepSeekFormat =
hasDeepSeekAgent(this.options.agent) ||
(this.agentConfigs != null &&
Array.from(this.agentConfigs.values()).some(hasDeepSeekAgent));
/**
* Skills primed fresh this turn — manual ($ popover) and always-apply
* (frontmatter). `injectSkillPrimes` (below) splices their SKILL.md
* bodies in, so `formatAgentMessages` must NOT also reconstruct the
* same names from a historical `skill` tool_call — otherwise the body
* lands twice and a prompt-cache marker can pin to the duplicated
* synthetic prefix. Names NOT primed this turn still reconstruct from
* history, preserving sticky manual re-priming across turns.
*/
const manualSkillPrimes = this.options.agent?.manualSkillPrimes;
const alwaysApplySkillPrimes = this.options.agent?.alwaysApplySkillPrimes;
const freshSkillPrimeNames = collectFreshSkillPrimeNames({
manualSkillPrimes,
alwaysApplySkillPrimes,
});
const formatOptions =
needsDeepSeekFormat || freshSkillPrimeNames.size > 0
? {
...(needsDeepSeekFormat ? { provider: Providers.DEEPSEEK } : {}),
...(freshSkillPrimeNames.size > 0
? { skipSkillBodyNames: freshSkillPrimeNames }
: {}),
}
: undefined;
let {
messages: initialMessages,
indexTokenCountMap,
summary: initialSummary,
boundaryTokenAdjustment,
} = formatAgentMessages(
payload,
this.indexTokenCountMap,
toolSet,
skillPrimeResult?.skills,
formatOptions,
);
if (boundaryTokenAdjustment) {
logger.debug(
`[AgentClient] Boundary token adjustment: ${boundaryTokenAdjustment.original}${boundaryTokenAdjustment.adjusted} (${boundaryTokenAdjustment.remainingChars}/${boundaryTokenAdjustment.totalChars} chars)`,
);
}
/**
* Skill priming — both manual ($ popover) and always-apply (frontmatter).
*
* Splice + index-shift logic lives in `injectSkillPrimes`
* (packages/api/src/agents/skills.ts) so the delicate position math
* can be unit-tested in TS without standing up AgentClient. The
* resolver enforces a combined ceiling (manual-first, always-apply
* truncated first when over cap) before reaching here; the splice
* re-applies the cap as defense-in-depth. Runs for both single-
* agent and multi-agent runs; how primes interact with handoff /
* added-convo agents' per-agent state is an agents-SDK concern,
* not this layer's to gate.
*
* `manualSkillPrimes` / `alwaysApplySkillPrimes` are resolved above
* (used to build `freshSkillPrimeNames` for dedupe against historical
* skill reconstruction).
*/
if (
(manualSkillPrimes && manualSkillPrimes.length > 0) ||
(alwaysApplySkillPrimes && alwaysApplySkillPrimes.length > 0)
) {
const primeResult = injectSkillPrimes({
initialMessages,
indexTokenCountMap,
manualSkillPrimes,
alwaysApplySkillPrimes,
});
indexTokenCountMap = primeResult.indexTokenCountMap;
if (primeResult.inserted > 0) {
const manualNames = (manualSkillPrimes ?? []).map((p) => p.name);
const alwaysApplyNames = (alwaysApplySkillPrimes ?? []).map((p) => p.name);
logger.debug(
`[AgentClient] Primed ${primeResult.inserted} skill(s) at message index ${primeResult.insertIdx} — manual: [${manualNames.join(', ')}], always-apply: [${alwaysApplyNames.join(', ')}]`,
);
}
if (primeResult.alwaysApplyDropped > 0) {
logger.warn(
`[AgentClient] Dropped ${primeResult.alwaysApplyDropped} always-apply prime(s) to stay within MAX_PRIMED_SKILLS_PER_TURN.`,
);
}
}
if (indexTokenCountMap && isEnabled(process.env.AGENT_DEBUG_LOGGING)) {
const entries = Object.entries(indexTokenCountMap);
const perMsg = entries.map(([idx, count]) => {
const msg = initialMessages[Number(idx)];
const type = msg ? msg._getType() : '?';
return `${idx}:${type}=${count}`;
});
logger.debug(
`[AgentClient] Token map after format: [${perMsg.join(', ')}] (payload=${payload.length}, formatted=${initialMessages.length})`,
);
}
indexTokenCountMap = hydrateMissingIndexTokenCounts({
messages: initialMessages,
indexTokenCountMap,
tokenCounter,
});
const memoryMessages =
this.processMemory && this.memoryPayload
? formatAgentMessages(
this.memoryPayload,
undefined,
toolSet,
skillPrimeResult?.skills,
formatOptions,
).messages
: initialMessages;
/**
* @param {BaseMessage[]} messages
*/
const runAgents = async (messages) => {
const agents = [this.options.agent];
// Include additional agents when:
// - agentConfigs has agents (from addedConvo parallel execution or agent handoffs)
// - Agents without incoming edges become start nodes and run in parallel automatically
if (this.agentConfigs && this.agentConfigs.size > 0) {
agents.push(...this.agentConfigs.values());
}
// TODO: needs to be added as part of AgentContext initialization
// const noSystemModelRegex = [/\b(o1-preview|o1-mini|amazon\.titan-text)\b/gi];
// const noSystemMessages = noSystemModelRegex.some((regex) =>
// agent.model_parameters.model.match(regex),
// );
// if (noSystemMessages === true && systemContent?.length) {
// const latestMessageContent = _messages.pop().content;
// if (typeof latestMessageContent !== 'string') {
// latestMessageContent[0].text = [systemContent, latestMessageContent[0].text].join('\n');
// _messages.push(new HumanMessage({ content: latestMessageContent }));
// } else {
// const text = [systemContent, latestMessageContent].join('\n');
// _messages.push(new HumanMessage(text));
// }
// }
// let messages = _messages;
// if (agent.useLegacyContent === true) {
// messages = formatContentStrings(messages);
// }
// if (
// agent.model_parameters?.clientOptions?.defaultHeaders?.['anthropic-beta']?.includes(
// 'prompt-caching',
// )
// ) {
// messages = addCacheControl(messages);
// }
if (this.processMemory) {
memoryPromise = this.runMemory(memoryMessages);
}
/** Seed calibration state from previous run if encoding matches */
const currentEncoding = this.getEncoding();
const prevMeta = this.contextMeta;
const encodingMatch = prevMeta?.encoding === currentEncoding;
const calibrationRatio =
encodingMatch && prevMeta?.calibrationRatio > 0 ? prevMeta.calibrationRatio : undefined;
if (prevMeta) {
logger.debug(
`[AgentClient] contextMeta from parent: ratio=${prevMeta.calibrationRatio}, encoding=${prevMeta.encoding}, current=${currentEncoding}, seeded=${calibrationRatio ?? 'none'}`,
);
}
run = await createRun({
agents,
messages,
indexTokenCountMap,
initialSummary,
initialSessions,
calibrationRatio,
runId: this.responseMessageId,
signal: abortController.signal,
customHandlers: this.options.eventHandlers,
requestBody: config.configurable.requestBody,
user: createSafeUser(this.options.req?.user),
summarizationConfig: appConfig?.summarization,
appConfig,
tokenCounter,
/** Bills subagent child-run model calls — child graphs execute
* outside the streamEvents loop, so ModelEndHandler never sees
* them. Entries land in collectedUsage tagged
* `usage_type: 'subagent'` and are spent by recordCollectedUsage.
* The sink also streams each as an `on_token_usage` event so the
* gauge's session cost/totals include billed subagent usage (the
* `subagent` tag keeps it out of the live context meter). */
subagentUsageSink: createSubagentUsageSink(
this.collectedUsage,
this.buildSubagentUsageEmitter(appConfig),
),
});
if (!run) {
throw new Error('Failed to create run');
}
this.run = run;
if (this._resolveRun) {
this._resolveRun(run);
this._resolveRun = null;
}
const streamId = this.options.req?._resumableStreamId;
if (streamId && run.Graph) {
GenerationJobManager.setGraph(streamId, run.Graph);
}
if (userMCPAuthMap != null) {
config.configurable.userMCPAuthMap = userMCPAuthMap;
}
/** @deprecated Agent Chain */
config.configurable.last_agent_id = agents[agents.length - 1].id;
await run.processStream({ messages }, config, {
callbacks: {
[Callback.TOOL_ERROR]: logToolError,
},
});
config.signal = null;
};
const hideSequentialOutputs = config.configurable.hide_sequential_outputs;
await runAgents(initialMessages);
/**
* Surface a completed `skill` tool_call content part per *manually*-
* primed skill so the existing `SkillCall` frontend renderer shows
* a "Skill X loaded" card on the assistant response. Applied after
* the graph finishes to avoid clashing with the aggregator's own
* per-step content indexing. Prepended (not appended) so cards sit
* above the model's output — priming ran before the turn, the
* reply follows.
*
* Always-apply primes intentionally do NOT emit assistant-side
* cards. `extractInvokedSkillsFromPayload` scans history for
* `skill` tool_calls and feeds `primeInvokedSkills`, which is
* Phase 3's sticky-re-prime path — that's the right behavior for
* manual (user picked `$skill` once; re-prime on every subsequent
* turn from history). For always-apply, `resolveAlwaysApplySkills`
* already re-primes every turn from fresh DB state, so persisting
* the card would cause the skill body to get primed twice per
* turn starting on turn 2. The user-facing acknowledgement for
* always-apply lives on the user bubble as the pinned
* `SkillPills` row (`message.alwaysAppliedSkills`), which
* is the durable signal the user wants: "this skill auto-primes".
*
* Live streaming display of manual user-bubble pills is handled
* by `SkillPills` reading `message.manualSkills`. No
* separate SSE emit is needed here; trying to stream a mid-run
* tool_call at index 0 collided with the LLM's first text
* content, while emitting at a sparse offset pushed the card
* below the reply on finalize. Post-run unshift keeps the final
* responseMessage.content in the right order.
*/
const manualPrimed = this.options.agent?.manualSkillPrimes ?? [];
if (manualPrimed.length > 0) {
const runId = this.responseMessageId ?? 'skill-prime';
const manualParts = buildSkillPrimeContentParts(manualPrimed, { runId });
this.contentParts.unshift(...manualParts);
}
/** @deprecated Agent Chain */
if (hideSequentialOutputs) {
this.contentParts = this.contentParts.filter((part, index) => {
// Include parts that are either:
// 1. At or after the finalContentStart index
// 2. Of type tool_call
// 3. Have tool_call_ids property
return (
index >= this.contentParts.length - 1 ||
part.type === ContentTypes.TOOL_CALL ||
part.tool_call_ids
);
});
}
} catch (err) {
logger.error(
'[api/server/controllers/agents/client.js #sendCompletion] Operation aborted',
err,
);
if (!abortController.signal.aborted) {
logger.error(
'[api/server/controllers/agents/client.js #sendCompletion] Unhandled error type',
err,
);
this.contentParts.push({
type: ContentTypes.ERROR,
[ContentTypes.ERROR]: `An error occurred while processing the request${err?.message ? `: ${err.message}` : ''}`,
});
}
} finally {
/** Capture calibration state from the run for persistence on the response message.
* Runs in finally so values are captured even on abort. */
const ratio = this.run?.getCalibrationRatio() ?? 0;
if (ratio > 0 && ratio !== 1) {
this.contextMeta = {
calibrationRatio: Math.round(ratio * 1000) / 1000,
encoding: this.getEncoding(),
};
} else {
this.contextMeta = undefined;
}
this.finalizeSubagentContent();
/** Flush subagent usage emits the sink fired without awaiting, so their
* persist/publish completes before we return and the job is cleaned up
* (resumed clients read this persisted usage). */
if (this.pendingSubagentEmits.length > 0) {
await Promise.allSettled(this.pendingSubagentEmits);
this.pendingSubagentEmits = [];
}
try {
const attachments = await this.awaitMemoryWithTimeout(memoryPromise);
if (attachments && attachments.length > 0) {
this.artifactPromises.push(...attachments);
}
/** Skip token spending if aborted - the abort handler (abortMiddleware.js) handles it
This prevents double-spending when user aborts via `/api/agents/chat/abort` */
const wasAborted = abortController?.signal?.aborted;
if (!wasAborted) {
await this.recordCollectedUsage({
context: 'message',
balance: balanceConfig,
transactions: transactionsConfig,
});
} else {
logger.debug(
'[api/server/controllers/agents/client.js #chatCompletion] Skipping token spending - handled by abort middleware',
);
}
} catch (err) {
logger.error(
'[api/server/controllers/agents/client.js #chatCompletion] Error in cleanup phase',
err,
);
}
if (this._resolveRun) {
this._resolveRun(this.run ?? null);
this._resolveRun = null;
}
run = null;
config = null;
memoryPromise = null;
}
}
/**
* Resolves with the agent run once it is initialized, or `null` if
* initialization fails. Lets immediate-mode title generation await the run
* instead of throwing when fired before `chatCompletion` assigns `this.run`.
* Rejects promptly if the provided signal aborts before the run is ready.
* @param {AbortSignal} [signal]
* @returns {Promise<AgentRun | null>}
*/
_waitForRun(signal) {
if (this.run) {
return Promise.resolve(this.run);
}
if (!this._runReady) {
this._runReady = new Promise((resolve) => {
this._resolveRun = resolve;
});
}
if (!signal) {
return this._runReady;
}
if (signal.aborted) {
return Promise.reject(new Error('Aborted before run initialization'));
}
return new Promise((resolve, reject) => {
const onAbort = () => reject(new Error('Aborted before run initialization'));
signal.addEventListener('abort', onAbort, { once: true });
this._runReady.then((run) => {
signal.removeEventListener('abort', onAbort);
resolve(run);
});
});
}
/**
* @param {Object} params
* @param {string} params.text
* @param {AbortController} params.abortController
* @param {boolean} [params.immediate] When true, the title is generated as soon
* as the request is made — the run is awaited (instead of throwing) and the
* title derives from the user's input only (`contentParts` is empty).
*/
async titleConvo({ text, abortController, immediate = false }) {
if (!this.run) {
if (!immediate) {
throw new Error('Run not initialized');
}
await this._waitForRun(abortController?.signal);
if (!this.run) {
logger.debug(
'[api/server/controllers/agents/client.js #titleConvo] Run unavailable for immediate title generation',
);
return;
}
}
const { handleLLMEnd, collected: collectedMetadata } = createMetadataAggregator();
const { req, agent } = this.options;
if (req?.body?.isTemporary) {
logger.debug(
`[api/server/controllers/agents/client.js #titleConvo] Skipping title generation for temporary conversation`,
);
return;
}
const appConfig = req.config;
let endpoint = agent.endpoint;
/** @type {import('@librechat/agents').ClientOptions} */
let clientOptions = {
model: agent.model || agent.model_parameters.model,
};
let titleProviderConfig = getProviderConfig({ provider: endpoint, appConfig });
/** @type {TEndpoint | undefined} */
const endpointConfig =
appConfig.endpoints?.all ??
appConfig.endpoints?.[endpoint] ??
titleProviderConfig.customEndpointConfig;
if (!endpointConfig) {
logger.debug(
`[api/server/controllers/agents/client.js #titleConvo] No endpoint config for "${endpoint}"`,
);
}
if (endpointConfig?.titleConvo === false) {
logger.debug(
`[api/server/controllers/agents/client.js #titleConvo] Title generation disabled for endpoint "${endpoint}"`,
);
return;
}
if (endpointConfig?.titleEndpoint && endpointConfig.titleEndpoint !== endpoint) {
try {
titleProviderConfig = getProviderConfig({
provider: endpointConfig.titleEndpoint,
appConfig,
});
endpoint = endpointConfig.titleEndpoint;
} catch (error) {
logger.warn(
`[api/server/controllers/agents/client.js #titleConvo] Error getting title endpoint config for "${endpointConfig.titleEndpoint}", falling back to default`,
error,
);
// Fall back to original provider config
endpoint = agent.endpoint;
titleProviderConfig = getProviderConfig({ provider: endpoint, appConfig });
}
}
if (
endpointConfig &&
endpointConfig.titleModel &&
endpointConfig.titleModel !== Constants.CURRENT_MODEL
) {
clientOptions.model = endpointConfig.titleModel;
}
const options = await titleProviderConfig.getOptions({
req,
endpoint,
model_parameters: clientOptions,
db: {
getUserKey: db.getUserKey,
getUserKeyValues: db.getUserKeyValues,
},
});
let provider = options.provider ?? titleProviderConfig.overrideProvider ?? agent.provider;
if (
endpoint === EModelEndpoint.azureOpenAI &&
options.llmConfig?.azureOpenAIApiInstanceName == null
) {
provider = Providers.OPENAI;
} else if (
endpoint === EModelEndpoint.azureOpenAI &&
options.llmConfig?.azureOpenAIApiInstanceName != null &&
provider !== Providers.AZURE
) {
provider = Providers.AZURE;
}
/** @type {import('@librechat/agents').ClientOptions} */
clientOptions = { ...options.llmConfig };
if (options.configOptions) {
clientOptions.configuration = options.configOptions;
}
if (clientOptions.maxTokens != null) {
delete clientOptions.maxTokens;
}
if (clientOptions?.modelKwargs?.max_completion_tokens != null) {
delete clientOptions.modelKwargs.max_completion_tokens;
}
if (clientOptions?.modelKwargs?.max_output_tokens != null) {
delete clientOptions.modelKwargs.max_output_tokens;
}
clientOptions = Object.assign(
Object.fromEntries(
Object.entries(clientOptions).filter(([key]) => !omitTitleOptions.has(key)),
),
);
if (
provider === Providers.GOOGLE &&
(endpointConfig?.titleMethod === TitleMethod.FUNCTIONS ||
endpointConfig?.titleMethod === TitleMethod.STRUCTURED)
) {
clientOptions.json = true;
}
/** Resolve request-based headers for Custom Endpoints. Note: if this is added to
* non-custom endpoints, needs consideration of varying provider header configs.
*/
if (clientOptions?.configuration?.defaultHeaders != null) {
clientOptions.configuration.defaultHeaders = resolveHeaders({
headers: clientOptions.configuration.defaultHeaders,
user: createSafeUser(this.options.req?.user),
body: {
messageId: this.responseMessageId,
conversationId: this.conversationId,
parentMessageId: this.parentMessageId,
},
});
}
try {
const titleResult = await this.run.generateTitle({
provider,
clientOptions,
inputText: text,
contentParts: immediate ? [] : this.contentParts,
titleMethod: endpointConfig?.titleMethod,
titlePrompt: endpointConfig?.titlePrompt,
titlePromptTemplate: endpointConfig?.titlePromptTemplate,
chainOptions: {
runName: 'TitleRun',
signal: abortController.signal,
callbacks: [
{
handleLLMEnd,
},
],
configurable: {
thread_id: this.conversationId,
user_id: this.user ?? this.options.req.user?.id,
},
},
});
const collectedUsage = collectedMetadata.map((item) => {
let input_tokens, output_tokens;
if (item.usage) {
input_tokens =
item.usage.prompt_tokens || item.usage.input_tokens || item.usage.inputTokens;
output_tokens =
item.usage.completion_tokens || item.usage.output_tokens || item.usage.outputTokens;
} else if (item.tokenUsage) {
input_tokens = item.tokenUsage.promptTokens;
output_tokens = item.tokenUsage.completionTokens;
} else if (item.usage_metadata) {
input_tokens = item.usage_metadata.input_tokens;
output_tokens = item.usage_metadata.output_tokens;
}
return {
input_tokens: input_tokens,
output_tokens: output_tokens,
};
});
const balanceConfig = getBalanceConfig(appConfig);
const transactionsConfig = getTransactionsConfig(appConfig);
await this.recordCollectedUsage({
collectedUsage,
context: 'title',
model: clientOptions.model,
balance: balanceConfig,
transactions: transactionsConfig,
messageId: this.responseMessageId,
}).catch((err) => {
logger.error(
'[api/server/controllers/agents/client.js #titleConvo] Error recording collected usage',
err,
);
});
return sanitizeTitle(titleResult.title);
} catch (err) {
logger.error('[api/server/controllers/agents/client.js #titleConvo] Error', err);
return;
}
}
/**
* @param {object} params
* @param {number} params.promptTokens
* @param {number} params.completionTokens
* @param {string} [params.model]
* @param {OpenAIUsageMetadata} [params.usage]
* @param {AppConfig['balance']} [params.balance]
* @param {string} [params.context='message']
* @returns {Promise<void>}
*/
async recordTokenUsage({
model,
usage,
balance,
promptTokens,
completionTokens,
context = 'message',
}) {
try {
await db.spendTokens(
{
model,
context,
balance,
messageId: this.responseMessageId,
conversationId: this.conversationId,
user: this.user ?? this.options.req.user?.id,
endpointTokenConfig: this.options.endpointTokenConfig,
},
{ promptTokens, completionTokens },
);
if (
usage &&
typeof usage === 'object' &&
'reasoning_tokens' in usage &&
typeof usage.reasoning_tokens === 'number'
) {
await db.spendTokens(
{
model,
balance,
context: 'reasoning',
messageId: this.responseMessageId,
conversationId: this.conversationId,
user: this.user ?? this.options.req.user?.id,
endpointTokenConfig: this.options.endpointTokenConfig,
},
{ completionTokens: usage.reasoning_tokens },
);
}
} catch (error) {
logger.error(
'[api/server/controllers/agents/client.js #recordTokenUsage] Error recording token usage',
error,
);
}
}
/** Anthropic Claude models use a distinct BPE tokenizer; all others default to o200k_base. */
getEncoding() {
if (this.model && this.model.toLowerCase().includes('claude')) {
return 'claude';
}
return 'o200k_base';
}
}
module.exports = AgentClient;