mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-05-13 16:07:30 +00:00
1041 lines
39 KiB
JavaScript
1041 lines
39 KiB
JavaScript
const path = require('path');
|
|
const { v4 } = require('uuid');
|
|
const { logger } = require('@librechat/data-schemas');
|
|
const { getCodeBaseURL } = require('@librechat/agents');
|
|
const {
|
|
withTimeout,
|
|
getBasePath,
|
|
logAxiosError,
|
|
hasOfficeHtmlPath,
|
|
sanitizeArtifactPath,
|
|
flattenArtifactPath,
|
|
createAxiosInstance,
|
|
classifyCodeArtifact,
|
|
codeServerHttpAgent,
|
|
codeServerHttpsAgent,
|
|
extractCodeArtifactText,
|
|
getExtractedTextFormat,
|
|
getStorageMetadata,
|
|
buildCodeEnvDownloadQuery,
|
|
} = require('@librechat/api');
|
|
const {
|
|
Tools,
|
|
megabyte,
|
|
fileConfig,
|
|
FileContext,
|
|
FileSources,
|
|
imageExtRegex,
|
|
inferMimeType,
|
|
EToolResources,
|
|
EModelEndpoint,
|
|
mergeFileConfig,
|
|
getEndpointFileConfig,
|
|
} = require('librechat-data-provider');
|
|
const { filterFilesByAgentAccess } = require('~/server/services/Files/permissions');
|
|
const { createFile, getFiles, updateFile, claimCodeFile } = require('~/models');
|
|
const { getStrategyFunctions } = require('~/server/services/Files/strategies');
|
|
const { convertImage } = require('~/server/services/Files/images/convert');
|
|
const { determineFileType } = require('~/server/utils');
|
|
|
|
const axios = createAxiosInstance();
|
|
|
|
/**
 * Builds a minimal file record that points the client at the code-env
 * download route instead of locally stored content. Used whenever the
 * file cannot be processed locally: size limit exceeded, storage
 * strategy unavailable, or a download/processing error occurred.
 *
 * @param {Object} params - The parameters.
 * @param {string} params.id - The file ID from the code environment.
 * @param {string} params.name - The filename.
 * @param {string} params.messageId - The current message ID.
 * @param {number} params.expiresAt - Expiration timestamp (24 hours from creation).
 * @param {string} params.session_id - The code execution session ID.
 * @param {string} params.toolCallId - The tool call ID that generated the file.
 * @param {string} params.conversationId - The current conversation ID.
 * @returns {Object} Fallback response carrying a download URL.
 */
const createDownloadFallback = (params) => {
  const { id, name, messageId, expiresAt, session_id, toolCallId, conversationId } = params;
  return {
    filename: name,
    filepath: `${getBasePath()}/api/files/code/download/${session_id}/${id}`,
    expiresAt,
    conversationId,
    toolCallId,
    messageId,
  };
};
|
|
|
|
/**
 * Hard ceiling on the deferred preview rendering (HTML extraction + DB
 * update). The inner office-render path already has its own 12s timeout
 * and a concurrency-limited queue; this is the outer guard that catches
 * pathological cases where queue wait + render + DB write would
 * otherwise hang the file in `status: 'pending'` indefinitely.
 *
 * If the timeout fires the record is updated to `status: 'failed'`
 * with `previewError: 'timeout'` and the UI shows download-only.
 */
const PREVIEW_FINALIZE_TIMEOUT_MS = 60_000;
|
|
|
|
/**
 * Render the inline HTML preview for a code-execution file (or plain
 * text for non-office buckets that still benefit from caching), then
 * atomically transition the DB record to `status: 'ready'` (with
 * `text`/`textFormat`) or `status: 'failed'` (with `previewError`).
 *
 * Decoupled from `processCodeOutput` so the agent's final response is
 * not blocked on potentially slow office rendering; the caller fires
 * this without awaiting and the frontend learns of completion via the
 * `attachment` SSE event or React Query polling. The only thing that
 * can lose progress is a process restart, which is covered by the
 * boot-time orphan sweep.
 *
 * Memory note: `buffer` is captured by the `finalize` closure returned
 * from `processCodeOutput`, so for a tool result emitting N office
 * files the peak heap usage from this path is up to N * fileSizeLimit
 * while queued renders wait on the inner concurrency limiter.
 *
 * @param {object} params
 * @param {Buffer} params.buffer - The full downloaded file contents.
 * @param {string} params.leafName - Basename used for classification.
 * @param {string} params.mimeType - Detected/inferred MIME type.
 * @param {string} params.category - Classifier output.
 * @param {string} params.file_id - The DB record key for the update.
 * @param {string} [params.previewRevision] - Generation marker stamped by
 *   the immediate persist step; the DB commit is conditional on it so a
 *   stale render (cross-turn filename reuse) is discarded rather than
 *   overwriting a newer record.
 * @returns {Promise<MongoFile | null>} The post-update record, or `null`
 *   when the DB write failed or the revision guard rejected it
 *   (extraction failure is persisted as `status: 'failed'`, not thrown).
 */
const finalizePreview = async ({
  buffer,
  leafName,
  mimeType,
  category,
  file_id,
  previewRevision,
}) => {
  let extracted = null;
  let previewError;

  try {
    extracted = await withTimeout(
      extractCodeArtifactText(buffer, leafName, mimeType, category),
      PREVIEW_FINALIZE_TIMEOUT_MS,
      `Preview extraction exceeded ${PREVIEW_FINALIZE_TIMEOUT_MS}ms`,
    );
  } catch (_error) {
    /* `extractCodeArtifactText` swallows its own errors and returns null,
     * so reaching here can only be a `withTimeout` rejection — i.e. the
     * queue + render combined exceeded the outer 60s ceiling. */
    previewError = 'timeout';
    logger.warn(
      `[finalizePreview] ${file_id}: extraction timed out after ${PREVIEW_FINALIZE_TIMEOUT_MS}ms`,
    );
  }

  /* HTML-or-null contract (PR #12934): a null result on an office file
   * must NOT fall back to plain text — it is surfaced as failed. The
   * caller gates on `hasOfficeHtmlPath`, so this path is always office. */
  const textFormat = getExtractedTextFormat(leafName, mimeType, extracted);
  const extractionFailed = extracted == null;
  if (extractionFailed && previewError === undefined) {
    previewError = 'parser-error';
  }

  const update = {
    file_id,
    text: extracted,
    textFormat,
    status: extractionFailed ? 'failed' : 'ready',
    previewError: extractionFailed ? previewError : null,
  };

  try {
    /* Conditional update: commit only while `previewRevision` still
     * matches what the immediate persist step stamped. A newer emit
     * rotates the revision, `updateFile` returns null, and the stale
     * render is silently dropped. (Codex P1 review on PR #12957.) */
    const updated = await updateFile(update, previewRevision ? { previewRevision } : undefined);
    if (!updated && previewRevision) {
      logger.debug(
        `[finalizePreview] ${file_id}: stale render skipped — newer emit has superseded revision ${previewRevision}`,
      );
    }
    return updated;
  } catch (error) {
    logger.error(
      `[finalizePreview] ${file_id}: failed to persist preview result: ${error?.message ?? error}`,
    );
    return null;
  }
};
|
|
|
|
/**
 * Run the background `finalize` thunk returned by `processCodeOutput`
 * and route the resolved record to the caller's emit logic. Shared
 * between `callbacks.js` (chat-completions + Open Responses) and
 * `tools.js` (direct tool endpoint) so the fire-and-forget pattern
 * doesn't drift across callsites.
 *
 * `onResolved` receives the post-update DB record and is the only
 * varying piece — chat-completions writes the legacy `attachment` SSE
 * event, Open Responses writes the spec-shaped `librechat:attachment`
 * event with a sequence number, and the direct tool endpoint passes a
 * no-op since it has no stream.
 *
 * The rejection path is a safety net for unexpected programming errors
 * inside `finalizePreview` ONLY (it is designed to never throw); without
 * it a regression would leave the record stuck at `'pending'` until the
 * boot-time orphan sweep. A best-effort `updateFile` marks the record
 * `'failed'` with `previewError: 'unexpected'` so the UI stops polling.
 *
 * `onResolved` errors are isolated in their own try/catch: a transient
 * transport-side failure (SSE write race, emitter listener throwing)
 * must not downgrade an already-resolved record to `failed` — the DB
 * keeps whatever `finalizePreview` wrote (typically `'ready'`) and the
 * polling layer / next page load still surfaces the resolved preview.
 *
 * @param {object} params
 * @param {(() => Promise<object | null>) | undefined} params.finalize - The
 *   thunk returned by `processCodeOutput`. No-op when undefined.
 * @param {string | undefined} params.fileId - DB key for the failure
 *   marker; if absent the catch only logs.
 * @param {string | undefined} [params.previewRevision] - Generation marker
 *   from the immediate persist step; the defensive failure write is
 *   conditional on it so a stale error can't override a newer record.
 * @param {(updated: object) => void} [params.onResolved] - Called once on
 *   success with the post-update record.
 */
const runPreviewFinalize = ({ finalize, fileId, previewRevision, onResolved }) => {
  if (typeof finalize !== 'function') {
    return;
  }

  /* Isolated handler — a throw from `onResolved` must never reach the
   * outer rejection handler, which would persist a spurious failure. */
  const emitResolved = (updated) => {
    if (!updated || !onResolved) {
      return;
    }
    try {
      onResolved(updated);
    } catch (emitError) {
      logger.error(
        `[runPreviewFinalize] onResolved threw for ${fileId}; record stays at the finalized status:`,
        emitError,
      );
    }
  };

  const markFailed = (error) => {
    logger.error('Error rendering deferred preview:', error);
    if (!fileId) {
      return;
    }
    const revisionQuery = previewRevision ? { previewRevision } : undefined;
    updateFile(
      {
        file_id: fileId,
        status: 'failed',
        previewError: 'unexpected',
      },
      revisionQuery,
    ).catch((updateErr) => {
      logger.error(
        `[runPreviewFinalize] also failed to mark ${fileId} as failed after error:`,
        updateErr,
      );
    });
  };

  finalize().then(emitResolved).catch(markFailed);
};
|
|
|
|
/**
 * Process code execution output files — downloads and saves both images
 * and non-image files. All files are saved to local storage with
 * `codeEnvRef` metadata for code env re-upload.
 *
 * Returns a two-part shape so callers can ship the attachment to the
 * client immediately and run preview extraction in the background:
 * - `file`: persisted metadata (file is on disk, downloadable, and
 *   has `status: 'pending'` if a preview is still being rendered).
 * - `finalize` (optional): a thunk returning the deferred preview
 *   result promise. Present only when an inline HTML preview is
 *   expected (office buckets — DOCX/XLSX/XLS/ODS/CSV/PPTX). Caller
 *   decides whether to await or fire-and-forget.
 *
 * Existing fallback paths (size limit, missing storage strategy, error
 * catch) return `{ file }` with no `finalize` — there's nothing to
 * extract.
 *
 * @param {Object} params - The parameters.
 * @param {ServerRequest} params.req - The Express request object.
 * @param {string} params.id - The file ID from the code environment.
 * @param {string} params.name - The filename.
 * @param {string} params.toolCallId - The tool call ID that generated the file.
 * @param {string} params.session_id - The code execution session ID.
 * @param {string} params.conversationId - The current conversation ID.
 * @param {string} params.messageId - The current message ID.
 * @returns {Promise<{ file: MongoFile & { messageId: string, toolCallId: string }, finalize?: () => Promise<MongoFile | null> }>}
 */
const processCodeOutput = async ({
  req,
  id,
  name,
  toolCallId,
  conversationId,
  messageId,
  session_id,
}) => {
  const appConfig = req.config;
  const currentDate = new Date();
  const baseURL = getCodeBaseURL();
  const fileExt = path.extname(name).toLowerCase();
  const isImage = fileExt && imageExtRegex.test(name);

  const mergedFileConfig = mergeFileConfig(appConfig.fileConfig);
  const endpointFileConfig = getEndpointFileConfig({
    fileConfig: mergedFileConfig,
    endpoint: EModelEndpoint.agents,
  });
  // Endpoint-specific limit wins; otherwise fall back to the server-wide ceiling.
  const fileSizeLimit = endpointFileConfig.fileSizeLimit ?? mergedFileConfig.serverFileSizeLimit;

  try {
    const formattedDate = currentDate.toISOString();
    /* Code-output files are always user-private — no skill execution
     * produces a skill-scoped output bucket. The download URL must
     * carry `?kind=user&id=<userId>` so codeapi's `sessionAuth`
     * resolves the matching `<tenant>:user:<userId>` sessionKey. See
     * codeapi #1455 / Phase C. */
    const downloadQuery = buildCodeEnvDownloadQuery({ kind: 'user', id: req.user.id });
    const response = await axios({
      method: 'get',
      url: `${baseURL}/download/${session_id}/${id}${downloadQuery}`,
      responseType: 'arraybuffer',
      headers: {
        'User-Agent': 'LibreChat/1.0',
      },
      httpAgent: codeServerHttpAgent,
      httpsAgent: codeServerHttpsAgent,
      timeout: 15000,
    });

    const buffer = Buffer.from(response.data, 'binary');

    // Enforce file size limit
    if (buffer.length > fileSizeLimit) {
      logger.warn(
        `[processCodeOutput] File "${name}" (${(buffer.length / megabyte).toFixed(2)} MB) exceeds size limit of ${(fileSizeLimit / megabyte).toFixed(2)} MB, falling back to download URL`,
      );
      return {
        file: createDownloadFallback({
          id,
          name,
          messageId,
          toolCallId,
          session_id,
          conversationId,
          expiresAt: currentDate.getTime() + 86400000,
        }),
      };
    }

    /* Code-output files belong to the user who ran the execution.
     * SessionKey on codeapi will be `<tenant>:user:<userId>` for these,
     * so cache and access stay user-private. */
    const codeEnvRef = {
      kind: 'user',
      id: req.user.id,
      storage_session_id: session_id,
      file_id: id,
    };

    /* `safeName` keeps the directory structure (`a/b/file.txt` -> `a/b/file.txt`)
     * so the next prime() can place the file at the same nested path in the
     * sandbox; flattening would re-create the bug where every nested artifact
     * collapsed into the root and read_file calls 404'd. The flat-form
     * storage key is composed below once `file_id` is known so we can cap
     * the total length at filesystem NAME_MAX. */
    const safeName = sanitizeArtifactPath(name);
    if (safeName !== name) {
      logger.warn(
        `[processCodeOutput] Filename sanitized: "${name}" -> "${safeName}" | conv=${conversationId}`,
      );
    }

    /**
     * Atomically claim a file_id for this (filename, conversationId, context) tuple.
     * Uses $setOnInsert so concurrent calls for the same filename converge on
     * a single record instead of creating duplicates (TOCTOU race fix).
     *
     * Claim by `safeName` (not raw `name`) so the claim and the eventual
     * `createFile` agree on the filename column — otherwise weird inputs
     * (e.g. `"proj name/file@v1.txt"`) would claim under the raw name and
     * then write under the sanitized one, leaving the claim row orphaned.
     */
    const newFileId = v4();
    const claimed = await claimCodeFile({
      filename: safeName,
      conversationId,
      file_id: newFileId,
      user: req.user.id,
      tenantId: req.user.tenantId,
    });
    const file_id = claimed.file_id;
    // If the claim returned a pre-existing id, this emit updates that
    // record instead of inserting a duplicate.
    const isUpdate = file_id !== newFileId;

    if (isUpdate) {
      logger.debug(
        `[processCodeOutput] Updating existing file "${safeName}" (${file_id}) instead of creating duplicate`,
      );
    }

    /**
     * Preserve the original `messageId` on update. Each `processCodeOutput`
     * call would otherwise overwrite it with the current run's run id, which
     * decouples the file from the assistant message that originally created
     * it. `getCodeGeneratedFiles` filters by `messageId IN <thread>`, so a
     * stale id (e.g. from a later regeneration / failed re-read attempt)
     * silently excludes the file from priming on subsequent turns.
     */
    const persistedMessageId = isUpdate ? (claimed.messageId ?? messageId) : messageId;

    if (isImage) {
      const usage = isUpdate ? (claimed.usage ?? 0) + 1 : 1;
      const _file = await convertImage(req, buffer, 'high', `${file_id}${fileExt}`);
      // Cache-bust the served path on re-emits so clients re-fetch the
      // updated image rather than showing a stale cached version.
      const filepath = usage > 1 ? `${_file.filepath}?v=${Date.now()}` : _file.filepath;
      const storageMetadata = getStorageMetadata({
        filepath: _file.filepath,
        source: appConfig.fileStrategy,
        storageKey: _file.storageKey,
        storageRegion: _file.storageRegion,
      });
      const file = {
        ..._file,
        filepath,
        ...storageMetadata,
        file_id,
        messageId: persistedMessageId,
        usage,
        filename: safeName,
        conversationId,
        user: req.user.id,
        tenantId: req.user.tenantId,
        type: `image/${appConfig.imageOutputType}`,
        createdAt: isUpdate ? claimed.createdAt : formattedDate,
        updatedAt: formattedDate,
        source: appConfig.fileStrategy,
        context: FileContext.execute_code,
        metadata: { codeEnvRef },
      };
      await createFile(file, true);
      // DB keeps `persistedMessageId`; the in-memory record returned to the
      // caller carries the current turn's `messageId` for the attachment event.
      return { file: Object.assign(file, { messageId, toolCallId }) };
    }

    const { saveBuffer } = getStrategyFunctions(appConfig.fileStrategy);
    if (!saveBuffer) {
      logger.warn(
        `[processCodeOutput] saveBuffer not available for strategy ${appConfig.fileStrategy}, falling back to download URL`,
      );
      return {
        file: createDownloadFallback({
          id,
          name,
          messageId,
          toolCallId,
          session_id,
          conversationId,
          expiresAt: currentDate.getTime() + 86400000,
        }),
      };
    }

    const detectedType = await determineFileType(buffer, true);
    const mimeType = detectedType?.mime || inferMimeType(name, '') || 'application/octet-stream';

    /** Check MIME type support - for code-generated files, we're lenient but log unsupported types */
    const isSupportedMimeType = fileConfig.checkType(
      mimeType,
      endpointFileConfig.supportedMimeTypes,
    );
    if (!isSupportedMimeType) {
      logger.warn(
        `[processCodeOutput] File "${name}" has unsupported MIME type "${mimeType}", proceeding with storage but may not be usable as tool resource`,
      );
    }

    /* Compose the storage key here, after `file_id` is known, so the
     * `flattenArtifactPath` cap budget can be calculated against the
     * actual prefix length. The full key has to fit in one filesystem
     * path component (NAME_MAX = 255 on most filesystems); without this
     * cap, deeply-nested artifact paths whose individual segments were
     * within bounds can still produce a flat form that overflows once
     * `${file_id}__` is prepended, causing `ENAMETOOLONG` inside
     * saveBuffer and falling back to a download URL. The 255 figure is
     * the conservative cross-platform NAME_MAX (Linux ext4, NTFS, APFS).
     */
    const NAME_MAX = 255;
    // Budget = NAME_MAX minus the `${file_id}__` prefix (id length + 2 underscores).
    const flatName = flattenArtifactPath(safeName, NAME_MAX - file_id.length - 2);
    const fileName = `${file_id}__${flatName}`;
    const filepath = await saveBuffer({
      userId: req.user.id,
      buffer,
      fileName,
      basePath: 'uploads',
      tenantId: req.user.tenantId,
    });
    const storageMetadata = getStorageMetadata({
      filepath,
      source: appConfig.fileStrategy,
    });

    /* `classifyCodeArtifact` and `extractCodeArtifactText` make
     * extension/bare-name decisions on the input string. With the
     * path-preserving sanitizer they can now receive a nested path like
     * `reports.v1/Makefile`, which the classifier's `extensionOf` reads
     * as `v1/Makefile` (the slice after the dot in the directory name)
     * and the bare-name branch rejects because it sees a `.` anywhere in
     * the string. Result: extensionless artifacts under dotted folders
     * (Makefile, Dockerfile, etc.) get misclassified as `other` and
     * skip text extraction. Pass the basename so classification matches
     * what it would have gotten with the old flat-name flow. */
    const leafName = path.basename(safeName);
    const category = classifyCodeArtifact(leafName, mimeType);

    /* Office-bucket files (DOCX/XLSX/XLS/ODS/CSV/PPTX) route through
     * `bufferToOfficeHtml` which is CPU-heavy. Persist the record now
     * with `status: 'pending'` and `text: null` so the agent's response
     * isn't blocked, then return a `finalize` thunk the caller can run
     * in the background. Non-office files have cheap or no extraction
     * — run it inline so the caller gets a fully-resolved record
     * without juggling a finalize step. */
    const expectsPreview = hasOfficeHtmlPath(leafName, mimeType);

    const baseFile = {
      file_id,
      filepath,
      ...storageMetadata,
      messageId: persistedMessageId,
      object: 'file',
      filename: safeName,
      type: mimeType,
      conversationId,
      user: req.user.id,
      tenantId: req.user.tenantId,
      bytes: buffer.length,
      updatedAt: formattedDate,
      metadata: { codeEnvRef },
      source: appConfig.fileStrategy,
      context: FileContext.execute_code,
      usage: isUpdate ? (claimed.usage ?? 0) + 1 : 1,
      createdAt: isUpdate ? claimed.createdAt : formattedDate,
    };

    if (expectsPreview) {
      /* Persist with `status: 'pending'` and explicit
       * `text: null` / `textFormat: null` so an update that previously
       * had cached text gets cleared. The deferred finalize transitions
       * to 'ready' (with text/textFormat) or 'failed' (with
       * previewError).
       *
       * `previewRevision` is a fresh UUID stamped on every emit. The
       * deferred finalize's `updateFile` is conditional on this — if
       * a newer turn (cross-turn filename reuse) has rotated the
       * revision before this render finishes, the stale render is
       * silently discarded rather than overwriting the newer record.
       * (Codex P1 review on PR #12957.) */
      const previewRevision = v4();
      const file = {
        ...baseFile,
        text: null,
        textFormat: null,
        status: 'pending',
        previewError: null,
        previewRevision,
      };
      await createFile(file, true);
      return {
        file: Object.assign(file, { messageId, toolCallId }),
        finalize: () =>
          finalizePreview({ buffer, leafName, mimeType, category, file_id, previewRevision }),
        previewRevision,
      };
    }

    /* Non-office path: extraction is cheap (utf8 decode, parseDocument
     * for PDF/ODT, or null for binaries). Run inline and return a
     * fully-resolved record — no `finalize` needed. */
    const text = await extractCodeArtifactText(buffer, leafName, mimeType, category);
    /* `textFormat` accompanies `text` so the client can gate
     * office-HTML-bucket routing on a trusted signal — clients MUST
     * NOT inject `text` into the iframe as HTML unless `textFormat ===
     * 'html'`. RAG-uploaded `.docx` etc. arrive with plain text from
     * mammoth.extractRawText and would otherwise be hijacked by the
     * extension-based office routing into the HTML-injection path
     * (Codex P1 review on PR #12934). null on extract failure — the
     * client treats absence as 'text' for safety. */
    const textFormat = getExtractedTextFormat(leafName, mimeType, text);
    const file = {
      ...baseFile,
      // Always set explicitly so an update which produces a binary or
      // oversized artifact clears any previously cached text — createFile
      // uses findOneAndUpdate with $set semantics.
      text: text ?? null,
      textFormat: textFormat ?? null,
      // Clear deferred-preview lifecycle fields in case the prior emit
      // at this (filename, conversationId) was an office file —
      // otherwise stale `pending`/`failed` would persist and the client
      // would render the wrong state for the now non-office artifact.
      status: null,
      previewError: null,
      previewRevision: null,
    };

    await createFile(file, true);
    return { file: Object.assign(file, { messageId, toolCallId }) };
  } catch (error) {
    if (error?.message === 'Path traversal detected in filename') {
      logger.warn(
        `[processCodeOutput] Path traversal blocked for file "${name}" | conv=${conversationId}`,
      );
    }
    logAxiosError({
      message: 'Error downloading/processing code environment file',
      error,
    });
    logger.warn(
      `[processCodeOutput] Falling back to Code API download URL for strategy ${appConfig.fileStrategy}`,
    );

    // Fallback for download errors - return download URL so user can still manually download
    return {
      file: createDownloadFallback({
        id,
        name,
        messageId,
        toolCallId,
        session_id,
        conversationId,
        expiresAt: currentDate.getTime() + 86400000,
      }),
    };
  }
};
|
|
|
|
/**
 * Reports whether a session timestamp is recent enough to be treated
 * as still active — i.e. strictly fewer than 23 hours have elapsed.
 *
 * @param {string} dateString - A date string parseable by `new Date()`.
 * @returns {boolean} True when less than 23 hours old; false otherwise
 *   (including unparseable input, which yields NaN and fails the check).
 */
function checkIfActive(dateString) {
  const MS_PER_HOUR = 1000 * 60 * 60;
  const elapsedMs = Date.now() - new Date(dateString).getTime();
  return elapsedMs / MS_PER_HOUR < 23;
}
|
|
|
|
/**
 * Retrieves the `lastModified` time string for a specified file from the
 * Code Execution Server.
 *
 * @param {import('librechat-data-provider').CodeEnvRef} ref - Typed pointer
 *   into codeapi storage. Carries kind/id/storage_session_id/file_id;
 *   codeapi resolves the sessionKey from the request's auth context.
 * @returns {Promise<string|null>} Resolves to the file's `lastModified`
 *   time string on success, or null on any initialization/fetch error.
 */
async function getSessionInfo(ref) {
  try {
    /* `/sessions/.../objects/...` is gated by codeapi's `sessionAuth`
     * middleware (post-Phase C). The middleware reconstructs the
     * sessionKey from the URL query (`kind`/`id`/`version?`) plus the
     * requester's auth context, then matches it against the cached
     * sessionKey on the storage bucket. We have the full `codeEnvRef`
     * here, so pass kind+id (+version when skill) directly. */
    const queryParams = { kind: ref.kind, id: ref.id };
    if (ref.kind === 'skill') {
      queryParams.version = ref.version;
    }
    const query = buildCodeEnvDownloadQuery(queryParams);

    const url = `${getCodeBaseURL()}/sessions/${ref.storage_session_id}/objects/${ref.file_id}${query}`;
    const response = await axios({
      method: 'get',
      url,
      headers: {
        'User-Agent': 'LibreChat/1.0',
      },
      httpAgent: codeServerHttpAgent,
      httpsAgent: codeServerHttpsAgent,
      timeout: 5000,
    });

    return response.data?.lastModified;
  } catch (error) {
    logAxiosError({
      message: `Error fetching session info: ${error.message}`,
      error,
    });
    return null;
  }
}
|
|
|
|
/**
 * Maps a file's deferred-preview lifecycle state to the parenthetical
 * suffix appended to its line in the tool context.
 *
 * @param {MongoFile} file - File record carrying `status` and `previewError`.
 * @returns {string} Suffix for 'pending'/'failed' states; empty otherwise.
 */
const getPreviewContextSuffix = (file) => {
  switch (file.status) {
    case 'pending':
      return ' (preview not yet generated)';
    case 'failed':
      if (file.previewError) {
        return ` (preview unavailable: ${file.previewError})`;
      }
      return ' (preview unavailable)';
    default:
      return '';
  }
};
|
|
|
|
/**
 * Renders one tool-context bullet line for a code-env file, or an empty
 * string for code-output files (context `execute_code`), which are not
 * surfaced in the visible file listing.
 *
 * @param {MongoFile} file - The file record.
 * @param {Set<string>} agentResourceIds - file_ids attached via the agent's
 *   tool resources; anything else is marked as attached by the user.
 * @returns {string} A `\n\t- /mnt/data/...` line, or '' when hidden.
 */
const getVisibleCodeFileContextLine = (file, agentResourceIds) => {
  if (file.context === FileContext.execute_code) {
    return '';
  }

  const isAgentResource = agentResourceIds.has(file.file_id);
  const ownerSuffix = isAgentResource ? '' : ' (attached by user)';
  const previewSuffix = getPreviewContextSuffix(file);
  return `\n\t- /mnt/data/${file.filename}${ownerSuffix}${previewSuffix}`;
};
|
|
|
|
/**
 * Appends a file's context line to the accumulating tool context,
 * emitting the "- Note: ..." header the first time a non-empty line
 * lands on an empty context.
 *
 * @param {string} toolContext - The context accumulated so far.
 * @param {string} contextLine - The line to append ('' means skip).
 * @returns {string} The updated tool context.
 */
const appendVisibleCodeFileContext = (toolContext, contextLine) => {
  if (!contextLine) {
    return toolContext;
  }

  // First visible file: start the listing with the header note.
  // (`Tools` is referenced lazily, only on this branch, as before.)
  if (!toolContext) {
    return `- Note: The following files are available in the "${Tools.execute_code}" tool environment:${contextLine}`;
  }

  return `${toolContext}${contextLine}`;
};
|
|
|
|
/**
|
|
*
|
|
* @param {Object} options
|
|
* @param {ServerRequest} options.req
|
|
* @param {Agent['tool_resources']} options.tool_resources
|
|
* @param {string} [options.agentId] - The agent ID for file access control
|
|
* @returns {Promise<{
|
|
* files: Array<{ id: string; session_id: string; name: string }>,
|
|
* toolContext: string,
|
|
* }>}
|
|
*/
|
|
// Resolves the agent's `execute_code` files into sandbox-mountable refs,
// reusing still-active sandbox sessions and re-uploading stale ones.
// Returns `{ files, toolContext }` for seeding the tool session.
const primeFiles = async (options) => {
  const { tool_resources, req, agentId } = options;
  const file_ids = tool_resources?.[EToolResources.execute_code]?.file_ids ?? [];
  // Set of ids attached directly to the agent; used by
  // `getVisibleCodeFileContextLine` to distinguish agent vs. request files.
  const agentResourceIds = new Set(file_ids);
  // Pre-resolved file records (e.g. request attachments) that bypass the DB
  // fetch + access filter below.
  const resourceFiles = tool_resources?.[EToolResources.execute_code]?.files ?? [];

  /* Step 1 of the priming trace: input volume. Pair with the
   * per-file `[primeCodeFiles] file=...` lines and the final
   * `[primeCodeFiles] returned=...` line below to locate which
   * layer drops a file the sandbox doesn't end up seeing. */
  logger.debug(
    `[primeCodeFiles] in: file_ids=${file_ids.length} resourceFiles=${resourceFiles.length}`,
    { agentId, file_ids, resourceFileIds: resourceFiles.map((f) => f?.file_id) },
  );

  // Get all files first (projection drops the large `text` field)
  const allFiles = (await getFiles({ file_id: { $in: file_ids } }, null, { text: 0 })) ?? [];

  // Filter by access if user and agent are provided
  let dbFiles;
  if (req?.user?.id && agentId) {
    dbFiles = await filterFilesByAgentAccess({
      files: allFiles,
      userId: req.user.id,
      role: req.user.role,
      agentId,
    });
  } else {
    dbFiles = allFiles;
  }

  // resourceFiles skip the access filter: they were provided directly on
  // tool_resources rather than looked up by id.
  dbFiles = dbFiles.concat(resourceFiles);

  const files = [];
  // Dedupe map: once a storage_session_id is known-good (fresh-active or
  // just re-uploaded), later files in the same session skip the probe.
  const sessions = new Map();
  let toolContext = '';

  /* Per-file path counters — emitted at the bottom so a single
   * grep on `[primeCodeFiles]` shows the input volume, the per-file
   * paths taken, and the final dispatch summary in one trace. */
  let skippedNoRef = 0;
  let reuploadFailures = 0;

  // NOTE: intentionally sequential — each iteration may read/populate the
  // `sessions` dedupe map, so files sharing a session must run in order.
  for (let i = 0; i < dbFiles.length; i++) {
    const file = dbFiles[i];
    if (!file) {
      continue;
    }

    // Files without a codeEnvRef were never uploaded to the code
    // environment; there is nothing to mount, so skip (counted + logged).
    const ref = file.metadata?.codeEnvRef;
    if (!ref) {
      skippedNoRef += 1;
      logger.debug(
        `[primeCodeFiles] file=${file.file_id} path=skip reason=no-codeenvref filename=${file.filename}`,
      );
      continue;
    }
    const session_id = ref.storage_session_id;
    const id = ref.file_id;

    /**
     * `pushFile` accepts optional overrides so the reupload path can
     * push the FRESH `(storage_session_id, file_id)` from the new
     * `codeEnvRef`. Without these overrides, the closure would
     * capture the stale pre-reupload refs from the outer loop and
     * the in-memory `files` array (now consumed by
     * `buildInitialToolSessions` to seed `Graph.sessions`) would
     * point at a sandbox object that no longer exists. The DB record
     * gets the new ref via `updateFile`, but the seed would still
     * inject the old one — bash_tool / read_file would 404 trying to
     * mount the file until the next turn re-reads metadata.
     *
     * `kind`, `id`, `version` are preserved on the in-memory ref so
     * codeapi can resolve sessionKey per-file (kind switch +
     * tenant prefix from auth context).
     */
    const pushFile = (overrideSessionId, overrideId) => {
      toolContext = appendVisibleCodeFileContext(
        toolContext,
        getVisibleCodeFileContextLine(file, agentResourceIds),
      );
      /* `id` is the storage file_id (drives codeapi's upload-key
       * existence check), `resource_id` is the entity that owns
       * the storage session (drives sessionKey re-derivation). For
       * code-output files this is `kind: 'user'` and `resource_id`
       * is informational (codeapi ignores it for user kind), but
       * we still send it for shape uniformity with shared kinds. */
      files.push({
        id: overrideId ?? id,
        resource_id: ref.id,
        storage_session_id: overrideSessionId ?? session_id,
        name: file.filename,
        kind: ref.kind,
        ...(ref.kind === 'skill' ? { version: ref.version } : {}),
      });
    };

    // Session already validated (or re-created) earlier this loop — reuse.
    if (sessions.has(session_id)) {
      logger.debug(
        `[primeCodeFiles] file=${file.file_id} path=cache-hit-by-session storage_session_id=${session_id}`,
      );
      pushFile();
      continue;
    }

    // Re-uploads the file from its storage backend into a fresh code-env
    // session, persists the new ref on the DB record, and pushes the
    // fresh (session, file_id) pair. Failures are counted + logged but do
    // not abort priming of the remaining files.
    const reuploadFile = async () => {
      try {
        const { getDownloadStream } = getStrategyFunctions(file.source);
        const { handleFileUpload: uploadCodeEnvFile } = getStrategyFunctions(
          FileSources.execute_code,
        );
        const stream = await getDownloadStream(options.req, file.filepath);
        /* Reupload preserves the resource identity from the existing
         * ref so codeapi re-buckets under the same sessionKey shape
         * (skill stays skill, user stays user). Without this, a
         * skill-cache-miss reupload would land in the user bucket
         * and never re-shareable cross-user. */
        const uploaded = await uploadCodeEnvFile({
          req: options.req,
          stream,
          filename: file.filename,
          kind: ref.kind,
          id: ref.id,
          ...(ref.kind === 'skill' ? { version: ref.version } : {}),
        });

        /**
         * Use the FRESH `(storage_session_id, file_id)` from the
         * reupload response and route it through the dedupe Map, the
         * persisted record, and the in-memory `files` list. The
         * original ref captured at the top of this iteration refers
         * to the old, expired/missing sandbox object — using it here
         * would silently re-introduce the bug `Graph.sessions`
         * seeding is supposed to fix.
         *
         * `kind`, `id`, `version` survive the round-trip: the
         * upload preserves the resource identity, only the storage
         * pointer changes.
         */
        const newRef = {
          kind: ref.kind,
          id: ref.id,
          storage_session_id: uploaded.storage_session_id,
          file_id: uploaded.file_id,
          ...(ref.kind === 'skill' ? { version: ref.version } : {}),
        };

        const updatedMetadata = {
          ...file.metadata,
          codeEnvRef: newRef,
        };

        await updateFile({
          file_id: file.file_id,
          metadata: updatedMetadata,
        });
        sessions.set(newRef.storage_session_id, true);
        pushFile(newRef.storage_session_id, newRef.file_id);
        logger.debug(
          `[primeCodeFiles] file=${file.file_id} path=reupload-success ` +
            `oldSession=${session_id} newSession=${newRef.storage_session_id} newFileId=${newRef.file_id}`,
        );
      } catch (error) {
        reuploadFailures += 1;
        logger.error(
          `[primeCodeFiles] file=${file.file_id} path=reupload-failed session=${session_id}: ${error.message}`,
          error,
        );
      }
    };
    // Probe the existing session: no upload time means the sandbox object
    // is unknown/expired; a stale time (per `checkIfActive`) means the
    // session TTL has lapsed. Either way, re-upload.
    const uploadTime = await getSessionInfo(ref);
    if (!uploadTime) {
      logger.debug(
        `[primeCodeFiles] file=${file.file_id} path=reupload reason=no-uploadtime ` +
          `storage_session_id=${session_id}`,
      );
      await reuploadFile();
      continue;
    }
    if (!checkIfActive(uploadTime)) {
      logger.debug(
        `[primeCodeFiles] file=${file.file_id} path=reupload reason=stale ` +
          `uploadTime=${uploadTime} storage_session_id=${session_id}`,
      );
      await reuploadFile();
      continue;
    }
    // Session is fresh and active — mark it good and reuse as-is.
    sessions.set(session_id, true);
    logger.debug(
      `[primeCodeFiles] file=${file.file_id} path=fresh-active storage_session_id=${session_id}`,
    );
    pushFile();
  }

  /* Dispatch summary — emitted unconditionally so a single grep on
   * `[primeCodeFiles] out` always shows the final state, not only
   * the per-path trail leading up to it. */
  logger.debug(
    `[primeCodeFiles] out: returned=${files.length} ` +
      `skippedNoRef=${skippedNoRef} reuploadFailures=${reuploadFailures}`,
  );

  return { files, toolContext };
};
|
|
|
|
/**
 * Reads one file out of the code-execution sandbox by running `cat`
 * against the sandbox `/exec` endpoint. Invoked by the `read_file` host
 * handler when the requested path is a code-env path (`/mnt/data/...`)
 * or cannot be resolved as a skill file.
 *
 * Resolution contract:
 * - `null` when no codeapi base URL is configured, or when the exec
 *   result carries no stdout (caller converts to a model-visible error);
 * - `{ content }` built from stdout on success;
 * - throws (axios-style, or an `Error` wrapping stderr) on failure so the
 *   caller can surface a meaningful message.
 *
 * `session_id` and `files` come from the seeded `tc.codeSessionContext`
 * (emitted by the agents-side `ToolNode` for `read_file` calls in
 * v3.1.72+) so the read lands in the same sandbox session that holds
 * the agent's prior-turn artifacts.
 *
 * @param {Object} params
 * @param {string} params.file_path - Absolute path inside the sandbox (e.g. `/mnt/data/foo.txt`).
 * @param {string} [params.session_id] - Sandbox session id from the seeded context.
 * @param {Array<{id: string, name: string, session_id?: string}>} [params.files] - File refs to mount.
 * @returns {Promise<{content: string} | null>}
 */
async function readSandboxFile({ file_path, session_id, files }) {
  const baseURL = getCodeBaseURL();
  if (!baseURL) {
    return null;
  }

  /** Single-quote the path, escaping any embedded quotes, so a malicious
   * filename cannot break out of the `cat` command. The upstream handler
   * has already established this is a code-env path the model
   * legitimately asked to read; this only keeps the shell quoting safe. */
  const escapedPath = file_path.replace(/'/g, `'\\''`);
  const quotedPath = `'${escapedPath}'`;

  /** @type {Record<string, unknown>} */
  const payload = { lang: 'bash', code: `cat ${quotedPath}` };
  if (session_id) {
    payload.session_id = session_id;
  }
  if (files && files.length > 0) {
    payload.files = files;
  }

  try {
    const response = await axios({
      method: 'post',
      url: `${baseURL}/exec`,
      data: payload,
      headers: {
        'Content-Type': 'application/json',
        'User-Agent': 'LibreChat/1.0',
      },
      httpAgent: codeServerHttpAgent,
      httpsAgent: codeServerHttpsAgent,
      timeout: 15000,
    });

    const { stdout, stderr } = response?.data ?? {};
    // stderr with no usable stdout means the read itself failed
    // (e.g. "No such file or directory") — surface it as an error.
    const stdoutEmpty = stdout == null || stdout === '';
    if (stderr && stdoutEmpty) {
      throw new Error(String(stderr).trim());
    }
    if (stdout == null) {
      return null;
    }
    return { content: String(stdout) };
  } catch (error) {
    logAxiosError({
      message: `Error reading sandbox file "${file_path}"`,
      error,
    });
    throw error;
  }
}
|
|
|
|
// Public surface. `checkIfActive`, `getSessionInfo`, `processCodeOutput`,
// and `runPreviewFinalize` are defined earlier in this file (outside this
// chunk's view); `primeFiles` and `readSandboxFile` are defined above.
module.exports = {
  primeFiles,
  checkIfActive,
  getSessionInfo,
  processCodeOutput,
  readSandboxFile,
  runPreviewFinalize,
};
|