diff --git a/api/server/services/Files/Code/__tests__/process-traversal.spec.js b/api/server/services/Files/Code/__tests__/process-traversal.spec.js index cf6b416261..016f3a71c6 100644 --- a/api/server/services/Files/Code/__tests__/process-traversal.spec.js +++ b/api/server/services/Files/Code/__tests__/process-traversal.spec.js @@ -8,7 +8,8 @@ jest.mock('@librechat/agents', () => ({ getCodeBaseURL: jest.fn(() => 'http://localhost:8000'), })); -const mockSanitizeFilename = jest.fn(); +const mockSanitizeArtifactPath = jest.fn(); +const mockFlattenArtifactPath = jest.fn((name) => name.replace(/\//g, '__')); const mockAxios = jest.fn().mockResolvedValue({ data: Buffer.from('file-content'), @@ -21,7 +22,8 @@ jest.mock('@librechat/api', () => { return { logAxiosError: jest.fn(), getBasePath: jest.fn(() => ''), - sanitizeFilename: mockSanitizeFilename, + sanitizeArtifactPath: mockSanitizeArtifactPath, + flattenArtifactPath: mockFlattenArtifactPath, createAxiosInstance: jest.fn(() => mockAxios), classifyCodeArtifact: jest.fn(() => 'other'), extractCodeArtifactText: jest.fn(async () => null), @@ -92,23 +94,25 @@ describe('processCodeOutput path traversal protection', () => { jest.clearAllMocks(); }); - test('sanitizeFilename is called with the raw artifact name', async () => { - mockSanitizeFilename.mockReturnValueOnce('output.csv'); + test('sanitizeArtifactPath is called with the raw artifact name', async () => { + mockSanitizeArtifactPath.mockReturnValueOnce('output.csv'); await processCodeOutput({ ...baseParams, name: 'output.csv' }); - expect(mockSanitizeFilename).toHaveBeenCalledWith('output.csv'); + expect(mockSanitizeArtifactPath).toHaveBeenCalledWith('output.csv'); }); - test('sanitized name is used in saveBuffer fileName', async () => { - mockSanitizeFilename.mockReturnValueOnce('sanitized-name.txt'); + test('sanitized name is used in saveBuffer fileName (and flattened to a single component)', async () => { + 
mockSanitizeArtifactPath.mockReturnValueOnce('sanitized-name.txt'); await processCodeOutput({ ...baseParams, name: '../../../tmp/poc.txt' }); - expect(mockSanitizeFilename).toHaveBeenCalledWith('../../../tmp/poc.txt'); + expect(mockSanitizeArtifactPath).toHaveBeenCalledWith('../../../tmp/poc.txt'); const call = mockSaveBuffer.mock.calls[0][0]; + /* `flattenArtifactPath` is identity for already-flat names; the assert + * is against the storage-key composition (`__`). */ expect(call.fileName).toBe('mock-uuid__sanitized-name.txt'); }); test('sanitized name is stored as filename in the file record', async () => { - mockSanitizeFilename.mockReturnValueOnce('safe-output.csv'); + mockSanitizeArtifactPath.mockReturnValueOnce('safe-output.csv'); await processCodeOutput({ ...baseParams, name: 'unsafe/../../output.csv' }); const fileArg = createFile.mock.calls[0][0]; @@ -122,10 +126,10 @@ describe('processCodeOutput path traversal protection', () => { bytes: 100, }); - mockSanitizeFilename.mockReturnValueOnce('safe-chart.png'); + mockSanitizeArtifactPath.mockReturnValueOnce('safe-chart.png'); await processCodeOutput({ ...baseParams, name: '../../../chart.png' }); - expect(mockSanitizeFilename).toHaveBeenCalledWith('../../../chart.png'); + expect(mockSanitizeArtifactPath).toHaveBeenCalledWith('../../../chart.png'); const fileArg = createFile.mock.calls[0][0]; expect(fileArg.filename).toBe('safe-chart.png'); }); diff --git a/api/server/services/Files/Code/process.js b/api/server/services/Files/Code/process.js index 72460e661e..5ae78a7e9a 100644 --- a/api/server/services/Files/Code/process.js +++ b/api/server/services/Files/Code/process.js @@ -5,7 +5,8 @@ const { getCodeBaseURL } = require('@librechat/agents'); const { getBasePath, logAxiosError, - sanitizeFilename, + sanitizeArtifactPath, + flattenArtifactPath, createAxiosInstance, classifyCodeArtifact, codeServerHttpAgent, @@ -134,14 +135,32 @@ const processCodeOutput = async ({ const fileIdentifier = `${session_id}/${id}`; + 
/* `safeName` keeps the directory structure (`a/b/file.txt` -> `a/b/file.txt`) + * so the next prime() can place the file at the same nested path in the + * sandbox; flattening would re-create the bug where every nested artifact + * collapsed into the root and read_file calls 404'd. The flat-form + * storage key is composed below once `file_id` is known so we can cap + * the total length at filesystem NAME_MAX. */ + const safeName = sanitizeArtifactPath(name); + if (safeName !== name) { + logger.warn( + `[processCodeOutput] Filename sanitized: "${name}" -> "${safeName}" | conv=${conversationId}`, + ); + } + /** * Atomically claim a file_id for this (filename, conversationId, context) tuple. * Uses $setOnInsert so concurrent calls for the same filename converge on * a single record instead of creating duplicates (TOCTOU race fix). + * + * Claim by `safeName` (not raw `name`) so the claim and the eventual + * `createFile` agree on the filename column — otherwise weird inputs + * (e.g. `"proj name/file@v1.txt"`) would claim under the raw name and + * then write under the sanitized one, leaving the claim row orphaned. 
*/ const newFileId = v4(); const claimed = await claimCodeFile({ - filename: name, + filename: safeName, conversationId, file_id: newFileId, user: req.user.id, @@ -151,14 +170,7 @@ const processCodeOutput = async ({ if (isUpdate) { logger.debug( - `[processCodeOutput] Updating existing file "${name}" (${file_id}) instead of creating duplicate`, - ); - } - - const safeName = sanitizeFilename(name); - if (safeName !== name) { - logger.warn( - `[processCodeOutput] Filename sanitized: "${name}" -> "${safeName}" | conv=${conversationId}`, + `[processCodeOutput] Updating existing file "${safeName}" (${file_id}) instead of creating duplicate`, ); } @@ -226,7 +238,19 @@ const processCodeOutput = async ({ ); } - const fileName = `${file_id}__${safeName}`; + /* Compose the storage key here, after `file_id` is known, so the + * `flattenArtifactPath` cap budget can be calculated against the + * actual prefix length. The full key has to fit in one filesystem + * path component (NAME_MAX = 255 on most filesystems); without this + * cap, deeply-nested artifact paths whose individual segments were + * within bounds can still produce a flat form that overflows once + * `${file_id}__` is prepended, causing `ENAMETOOLONG` inside + * saveBuffer and falling back to a download URL. The 255 figure is + * the conservative cross-platform NAME_MAX (Linux ext4, NTFS, APFS). + */ + const NAME_MAX = 255; + const flatName = flattenArtifactPath(safeName, NAME_MAX - file_id.length - 2); + const fileName = `${file_id}__${flatName}`; const filepath = await saveBuffer({ userId: req.user.id, buffer, @@ -234,8 +258,19 @@ const processCodeOutput = async ({ basePath: 'uploads', }); - const category = classifyCodeArtifact(safeName, mimeType); - const text = await extractCodeArtifactText(buffer, safeName, mimeType, category); + /* `classifyCodeArtifact` and `extractCodeArtifactText` make + * extension/bare-name decisions on the input string. 
With the + * path-preserving sanitizer they can now receive a nested path like + * `reports.v1/Makefile`, which the classifier's `extensionOf` reads + * as `v1/Makefile` (the slice after the dot in the directory name) + * and the bare-name branch rejects because it sees a `.` anywhere in + * the string. Result: extensionless artifacts under dotted folders + * (Makefile, Dockerfile, etc.) get misclassified as `other` and + * skip text extraction. Pass the basename so classification matches + * what it would have gotten with the old flat-name flow. */ + const leafName = path.basename(safeName); + const category = classifyCodeArtifact(leafName, mimeType); + const text = await extractCodeArtifactText(buffer, leafName, mimeType, category); const file = { file_id, diff --git a/api/server/services/Files/Code/process.spec.js b/api/server/services/Files/Code/process.spec.js index cdeee12600..e0d7b08fd3 100644 --- a/api/server/services/Files/Code/process.spec.js +++ b/api/server/services/Files/Code/process.spec.js @@ -49,7 +49,8 @@ jest.mock('@librechat/api', () => { return { logAxiosError: jest.fn(), getBasePath: jest.fn(() => ''), - sanitizeFilename: jest.fn((name) => name), + sanitizeArtifactPath: jest.fn((name) => name), + flattenArtifactPath: jest.fn((name) => name.replace(/\//g, '__')), createAxiosInstance: jest.fn(() => mockAxios), /** * Arrow-function indirection (vs. a direct `jest.fn()` reference) so @@ -274,6 +275,94 @@ describe('Code Process', () => { expect(result.bytes).toBe(100); }); + it('preserves nested directory paths in the DB record while flattening the storage key', async () => { + /* Regression test for the silent-data-loss path: when codeapi reports a + * file with a nested name like "test_folder/test_file.txt", LibreChat + * used to feed it through `sanitizeFilename` (basename-only), which + * persisted "test_file.txt" to the DB and made the file un-locatable on + * the next prime() (cat /mnt/data/test_folder/test_file.txt would + * 404). 
The fix: keep the path on the DB record (so primeFiles can + * place it back at the same nested location), but flatten it for the + * storage key (saveBuffer strategies key by single component). */ + const smallBuffer = Buffer.alloc(100); + mockAxios.mockResolvedValue({ data: smallBuffer }); + const mockSaveBuffer = jest.fn().mockResolvedValue('/uploads/saved.txt'); + getStrategyFunctions.mockReturnValue({ saveBuffer: mockSaveBuffer }); + + const result = await processCodeOutput({ + ...baseParams, + name: 'test_folder/test_file.txt', + }); + + // Storage key flattens `/` to `__` so on-disk strategies don't + // accidentally create real subdirectories under uploads/. + expect(mockSaveBuffer).toHaveBeenCalledWith( + expect.objectContaining({ + fileName: 'mock-uuid-1234__test_folder__test_file.txt', + }), + ); + // DB row keeps the nested path verbatim — that's what primeFiles + // ships back to the sandbox on the next turn. + expect(result.filename).toBe('test_folder/test_file.txt'); + // Claim is also keyed by the path-preserving name so the + // (filename, conversationId) compound key stays consistent. + expect(mockClaimCodeFile).toHaveBeenCalledWith( + expect.objectContaining({ filename: 'test_folder/test_file.txt' }), + ); + }); + + it('passes a NAME_MAX-aware budget to flattenArtifactPath when composing the storage key', async () => { + /* Codex review P1: per-segment caps on the path-preserving form + * aren't enough — once the segments are joined with `__` for the + * storage key, deeply-nested or moderately long paths can still + * exceed filesystem NAME_MAX (255) and cause `ENAMETOOLONG` in + * saveBuffer. processCodeOutput must pass a file_id-aware budget + * to flattenArtifactPath so the cap holds end-to-end. The unit + * tests in `packages/api/src/utils/files.spec.ts` cover the + * truncation logic itself; this test covers the integration. 
*/ + const smallBuffer = Buffer.alloc(100); + mockAxios.mockResolvedValue({ data: smallBuffer }); + const mockSaveBuffer = jest.fn().mockResolvedValue('/uploads/saved.bin'); + getStrategyFunctions.mockReturnValue({ saveBuffer: mockSaveBuffer }); + + const flattenSpy = require('@librechat/api').flattenArtifactPath; + flattenSpy.mockClear(); + + await processCodeOutput({ ...baseParams, name: 'a/b/c.csv' }); + + // The handler should call flattenArtifactPath with both the + // safeName AND a budget = NAME_MAX (255) minus the prefix + // (`${file_id}__`). file_id mock is `mock-uuid-1234` (14 chars), + // so the budget should be 255 - 14 - 2 = 239. + expect(flattenSpy).toHaveBeenCalledWith(expect.any(String), 239); + }); + + it('passes the basename (not the full nested path) to classifyCodeArtifact and extractCodeArtifactText', async () => { + /* Codex review P2: with the path-preserving sanitizer, `safeName` + * can be a nested string like `reports.v1/Makefile`. The + * classifier reads `extensionOf` against the full string, which + * sees `.v1/Makefile` after the dotted-dir's first dot and + * misclassifies the file as `other` (so text extraction is + * skipped). Pass `path.basename(safeName)` instead. 
*/ + const smallBuffer = Buffer.alloc(100); + mockAxios.mockResolvedValue({ data: smallBuffer }); + const mockSaveBuffer = jest.fn().mockResolvedValue('/uploads/saved.txt'); + getStrategyFunctions.mockReturnValue({ saveBuffer: mockSaveBuffer }); + + await processCodeOutput({ + ...baseParams, + name: 'reports.v1/Makefile', + }); + + expect(mockClassifyCodeArtifact).toHaveBeenCalledWith('Makefile', expect.any(String)); + expect(mockExtractCodeArtifactText).toHaveBeenCalledWith( + expect.any(Buffer), + 'Makefile', + expect.any(String), + expect.any(String), + ); + }); + it('should detect MIME type from buffer', async () => { const smallBuffer = Buffer.alloc(100); mockAxios.mockResolvedValue({ data: smallBuffer }); diff --git a/packages/api/src/utils/files.spec.ts b/packages/api/src/utils/files.spec.ts index 2f1b1346aa..fc41b00a11 100644 --- a/packages/api/src/utils/files.spec.ts +++ b/packages/api/src/utils/files.spec.ts @@ -1,4 +1,9 @@ -import { sanitizeFilename, resolveUploadErrorMessage } from './files'; +import { + sanitizeFilename, + sanitizeArtifactPath, + flattenArtifactPath, + resolveUploadErrorMessage, +} from './files'; jest.mock('node:crypto', () => { const actualModule = jest.requireActual('node:crypto'); @@ -8,6 +13,15 @@ jest.mock('node:crypto', () => { }; }); +/* `sanitizeArtifactPath` and `flattenArtifactPath` use `crypto.createHash` + * (real, not mocked) to produce a deterministic 6-hex disambiguator from + * the input. Tests compute the expected hash inline so we can compare + * exact strings without re-mocking. 
*/ +function expectedHexSuffix(input: string): string { + const { createHash } = jest.requireActual('node:crypto'); + return createHash('sha256').update(input).digest('hex').slice(0, 6); +} + describe('sanitizeFilename', () => { test('removes directory components (1/2)', () => { expect(sanitizeFilename('/path/to/file.txt')).toBe('file.txt'); @@ -52,6 +66,374 @@ describe('sanitizeFilename', () => { }); }); +describe('sanitizeArtifactPath', () => { + test('preserves a single nested directory component', () => { + expect(sanitizeArtifactPath('test_folder/test_file.txt')).toBe('test_folder/test_file.txt'); + }); + + test('preserves multiple nested directory components', () => { + expect(sanitizeArtifactPath('a/b/c/file.txt')).toBe('a/b/c/file.txt'); + }); + + test('replaces non-alphanumeric characters per segment + adds raw-input disambiguator', () => { + /* Different raw inputs that sanitize to the same form (`out 1.csv` + * vs `out_1.csv`, `out@1.csv` vs `out#1.csv`) would otherwise share + * a `claimCodeFile` compound key and silently overwrite each other. + * When character-level sanitization changed something, we embed a + * deterministic SHA-256 prefix of the raw input in the leaf so + * different raw inputs produce different safe forms. Same raw + * input is still idempotent. */ + const raw = 'proj name/file@v1.txt'; + expect(sanitizeArtifactPath(raw)).toBe(`proj_name/file_v1-${expectedHexSuffix(raw)}.txt`); + }); + + test('falls back to basename for parent traversal', () => { + expect(sanitizeArtifactPath('../escape.txt')).toBe('escape.txt'); + }); + + test('resolves embedded parent traversal via path normalization (with disambiguator)', () => { + /* `path.posix.normalize('a/../escape.txt')` collapses to `escape.txt`, + * which then passes the traversal guard and goes through the normal + * segment-split path. 
Because normalization mutated the input + * (raw had `a/../`, safe form doesn't), the disambiguator fires — + * raw `a/../escape.txt` and raw `escape.txt` would otherwise both + * resolve to `escape.txt` and collide on `claimCodeFile`. */ + const raw = 'a/../escape.txt'; + expect(sanitizeArtifactPath(raw)).toBe(`escape-${expectedHexSuffix(raw)}.txt`); + }); + + test('falls back to basename for absolute paths', () => { + expect(sanitizeArtifactPath('/abs/path.txt')).toBe('path.txt'); + }); + + test('collapses redundant separators (with disambiguator)', () => { + /* `dir//file.txt` and `dir/file.txt` would both resolve to + * `dir/file.txt` without disambiguation — collision on + * `claimCodeFile`. The empty-segment collapse counts as + * mutation, so the disambiguator fires for the doubled-slash + * variant and the clean one passes through unchanged. */ + const raw = 'dir//file.txt'; + expect(sanitizeArtifactPath(raw)).toBe(`dir/file-${expectedHexSuffix(raw)}.txt`); + }); + + test('strips current-directory segments (with disambiguator)', () => { + const raw = './dir/./file.txt'; + expect(sanitizeArtifactPath(raw)).toBe(`dir/file-${expectedHexSuffix(raw)}.txt`); + }); + + test('prepends underscore on dotfile leaf and disambiguates against literal _.x', () => { + /* `.hidden` and `_.hidden` both resolve to `_.hidden` without + * disambiguation. The dotfile-prefix step counts as mutation, so + * `.hidden` → `_.hidden-` and the literal `_.hidden` passes + * through. The disambiguator appends at the end (rather than + * splitting `_.hidden` into stem + extension) because `dot <= 1`. 
*/ + const raw = 'dir/.hidden'; + expect(sanitizeArtifactPath(raw)).toBe(`dir/_.hidden-${expectedHexSuffix(raw)}`); + }); + + test('returns underscore for empty input', () => { + expect(sanitizeArtifactPath('')).toBe('_'); + }); + + test('caps the leaf segment at 255 chars with extension-preserving truncation', () => { + /* Regression for the unbounded-leaf path: without the per-segment + * cap, a 300-char artifact name would flow through to saveBuffer's + * storage key (`${file_id}__${flatName}`) and trip ENAMETOOLONG on + * filesystems that enforce NAME_MAX. */ + const longName = 'a'.repeat(300) + '.txt'; + const result = sanitizeArtifactPath(longName); + expect(result.length).toBe(255); + expect(result).toMatch(new RegExp(`^a+-${expectedHexSuffix(longName)}\\.txt$`)); + }); + + test('caps the leaf when nested under a directory, preserving the directory verbatim', () => { + const longLeaf = 'b'.repeat(300) + '.csv'; + const result = sanitizeArtifactPath(`reports/${longLeaf}`); + const [dir, leaf] = result.split('/'); + expect(dir).toBe('reports'); + expect(leaf.length).toBe(255); + expect(leaf).toMatch(new RegExp(`^b+-${expectedHexSuffix(longLeaf)}\\.csv$`)); + }); + + test('caps non-leaf directory segments at 255 chars', () => { + /* Directory components hit `NAME_MAX` independently of the leaf — + * each `mkdir` along the path has to satisfy the per-component limit + * regardless of the basename truncation. 
*/ + const longDir = 'd'.repeat(300); + const result = sanitizeArtifactPath(`${longDir}/notes.txt`); + const [dir, leaf] = result.split('/'); + expect(dir.length).toBe(255); + expect(dir).toMatch(new RegExp(`^d+-${expectedHexSuffix(longDir)}$`)); + expect(leaf).toBe('notes.txt'); + }); + + test('produces deterministic output across calls (no orphaned uploads on re-truncation)', () => { + /* Codex review P2: `sanitizeFilename`'s `crypto.randomBytes(3)` made + * the truncated form non-deterministic — re-uploading the same long + * name would compute a different storage key, orphaning the previous + * on-disk file under the reused `file_id`. The new helpers hash the + * input so the same input always produces the same output. */ + const longName = 'a'.repeat(300) + '.txt'; + const a = sanitizeArtifactPath(longName); + const b = sanitizeArtifactPath(longName); + expect(a).toBe(b); + /* Two *different* long names that share a truncation prefix must + * still produce different outputs (collision avoidance). */ + const otherName = 'a'.repeat(299) + 'X.txt'; + expect(sanitizeArtifactPath(otherName)).not.toBe(a); + }); + + test('caps every segment in a nested path with mixed lengths', () => { + /* Every segment respects the cap — the truncate is per-component, + * not on the join. Use 2 segments here (rather than 3+) so the + * joined form stays under `ARTIFACT_PATH_TOTAL_MAX` (512); the + * total-cap fallback gets its own test below. */ + const segA = 'x'.repeat(260); + const leaf = 'z'.repeat(260) + '.json'; + const result = sanitizeArtifactPath(`${segA}/${leaf}`); + const parts = result.split('/'); + expect(parts).toHaveLength(2); + expect(parts[0].length).toBe(255); + expect(parts[1].length).toBe(255); + expect(parts[1]).toMatch(/\.json$/); + }); + + test('does not truncate filenames that are exactly at the 255-char limit', () => { + /* Off-by-one guard: 255 itself is allowed (filesystem boundary), 256 + * is not. 
*/ + const exact = 'e'.repeat(251) + '.txt'; // 255 chars total + expect(sanitizeArtifactPath(exact)).toBe(exact); + expect(sanitizeArtifactPath(`dir/${exact}`)).toBe(`dir/${exact}`); + }); + + test('falls back to leaf-only when total path length exceeds the DB-index cap (Codex review P2)', () => { + /* MongoDB 4.0 rejects indexed values past 1024 bytes, and even on + * 4.2+ where the limit is configurable, runaway nested paths bloat + * the unique compound index on (file_id, filename, conversationId, + * context). At 3+ at-cap (255-char) segments + separators the joined + * form blows past the safety budget; the helper falls back to leaf- + * only (already segment-capped to ≤ 255). */ + const segA = 'x'.repeat(255); + const segB = 'y'.repeat(255); + const segC = 'z'.repeat(255); + const result = sanitizeArtifactPath(`${segA}/${segB}/${segC}/file.txt`); + /* Joined would be ~768 chars + slashes; well past the 512 cap. */ + expect(result).toBe('file.txt'); + }); + + test('keeps the nested path when total length is within the DB-index cap', () => { + /* The cap doesn't fire for realistic outputs — typical artifact + * depth is ≤ 3 segments × short names. */ + const result = sanitizeArtifactPath('reports/2026/q1.csv'); + expect(result).toBe('reports/2026/q1.csv'); + }); + + describe('collision avoidance (Codex review P2)', () => { + /* `sanitizeArtifactPath` is not injective — multiple raw inputs can + * collapse to the same regex-and-normalize output. `claimCodeFile` + * is keyed on the schema's compound unique + * `(filename, conversationId, context, tenantId)` index, so a + * collision would silently overwrite an earlier artifact's bytes + * via a reused `file_id`. The disambiguator branch embeds a + * SHA-256 prefix of the raw input in the leaf to keep different + * raw inputs distinct. 
*/ + test('different raw inputs that sanitize to the same form get distinct safe forms', () => { + const a = sanitizeArtifactPath('out 1.csv'); + const b = sanitizeArtifactPath('out_1.csv'); + const c = sanitizeArtifactPath('out@1.csv'); + const d = sanitizeArtifactPath('out#1.csv'); + /* Pre-fix all four collapsed to `out_1.csv`. Post-fix only the + * already-clean `out_1.csv` keeps that form; the others get + * disambiguators based on their raw inputs. */ + expect(b).toBe('out_1.csv'); + expect(a).not.toBe(b); + expect(c).not.toBe(b); + expect(d).not.toBe(b); + expect(a).not.toBe(c); + expect(c).not.toBe(d); + }); + + test('whitespace-vs-underscore collision in directory segment', () => { + /* Codex's specific example: `reports 2026/out.csv` and + * `reports_2026/out.csv` would both have safeName + * `reports_2026/out.csv` without disambiguation. */ + const a = sanitizeArtifactPath('reports 2026/out.csv'); + const b = sanitizeArtifactPath('reports_2026/out.csv'); + expect(b).toBe('reports_2026/out.csv'); + expect(a).not.toBe(b); + /* The disambiguator is on the LEAF (not the mutated dir + * segment) so the layout matches normal path-preserving + * outputs. */ + expect(a.startsWith('reports_2026/out-')).toBe(true); + expect(a.endsWith('.csv')).toBe(true); + }); + + test('dotfile prefix collision: `.x` vs `_.x`', () => { + const a = sanitizeArtifactPath('.hidden'); + const b = sanitizeArtifactPath('_.hidden'); + expect(b).toBe('_.hidden'); + expect(a).not.toBe(b); + expect(a).toBe(`_.hidden-${expectedHexSuffix('.hidden')}`); + }); + + test('idempotent: same raw input always produces the same safe form', () => { + /* Disambiguator is deterministic (SHA-256 prefix of raw input), + * so re-uploading the same long-or-mutated name lands at the + * same storage key on every call. 
*/ + const raw = 'proj name/data file.csv'; + expect(sanitizeArtifactPath(raw)).toBe(sanitizeArtifactPath(raw)); + }); + + test('clean inputs (no mutation) skip the disambiguator', () => { + /* Cosmetic: don't clutter human-readable filenames with a hash + * when no collision is possible. The check is on the + * post-regex form vs the raw input — when they match exactly, + * the disambiguator branch doesn't fire. */ + expect(sanitizeArtifactPath('reports/2026/q1.csv')).toBe('reports/2026/q1.csv'); + expect(sanitizeArtifactPath('plain.txt')).toBe('plain.txt'); + }); + + test('disambiguator survives leaf truncation (long mutated leaf)', () => { + /* Long input + mutation: truncateLeafSegment caps at 255 first, + * then embedDisambiguatorInLeaf re-trims to insert the input + * hash. The seg-hash from the first truncation is replaced by + * the input-hash from the second pass — that's intentional: + * input-hash is the load-bearing collision-avoidance suffix. */ + const raw = 'data file ' + 'a'.repeat(280) + '.csv'; + const result = sanitizeArtifactPath(raw); + expect(result.length).toBe(255); + expect(result.endsWith(`-${expectedHexSuffix(raw)}.csv`)).toBe(true); + }); + + test('disambiguator survives total-cap fallback', () => { + /* When joined > ARTIFACT_PATH_TOTAL_MAX, we fall back to the + * leaf-only form. The leaf has already had the disambiguator + * embedded, so the fallback preserves collision avoidance for + * the pathological-depth case too. */ + const raw = + 'a b'.repeat(100) + '/' + 'c d'.repeat(100) + '/' + 'e f'.repeat(100) + '/file.csv'; + const result = sanitizeArtifactPath(raw); + /* Result is leaf-only (no slashes). */ + expect(result.includes('/')).toBe(false); + /* And carries the disambiguator. 
*/ + expect(result.endsWith(`-${expectedHexSuffix(raw)}.csv`)).toBe(true); + }); + }); + + test('preserves the dotfile underscore prefix when the leaf also needs truncation', () => { + /* A long hidden-file leaf (`._very_long_name`) goes through the + * underscore-prefix branch first; truncation must run AFTER that + * rewrite or the leaf would still leak past the cap. */ + const longHidden = '.' + 'a'.repeat(300); + const result = sanitizeArtifactPath(`dir/${longHidden}`); + const [dir, leaf] = result.split('/'); + expect(dir).toBe('dir'); + expect(leaf.length).toBe(255); + expect(leaf.startsWith('_.')).toBe(true); + }); +}); + +describe('flattenArtifactPath', () => { + test('joins path components with __ for storage keys', () => { + expect(flattenArtifactPath('a/b/c.txt')).toBe('a__b__c.txt'); + }); + + test('passes through paths with no separator', () => { + expect(flattenArtifactPath('file.txt')).toBe('file.txt'); + }); + + test('handles single-level nesting', () => { + expect(flattenArtifactPath('test_folder/test_file.txt')).toBe('test_folder__test_file.txt'); + }); + + test('passes through unchanged when no maxLength is supplied', () => { + /* The cap is opt-in — callers that aren't building filesystem keys + * (e.g. tests, log messages) shouldn't get truncation. */ + const longPath = 'a'.repeat(200) + '/' + 'b'.repeat(200) + '.txt'; + expect(flattenArtifactPath(longPath)).toBe(longPath.replace(/\//g, '__')); + }); + + test('passes through unchanged when flat form fits within maxLength', () => { + expect(flattenArtifactPath('short/file.txt', 100)).toBe('short__file.txt'); + expect(flattenArtifactPath('short/file.txt', 15)).toBe('short__file.txt'); + }); + + test('truncates flat form when it exceeds maxLength, preserving leaf extension', () => { + /* Regression for the deep-nesting flat-key overflow path: three + * 100-char segments → 308-char flat form that would blow past + * NAME_MAX=255 once `${file_id}__` is prepended. 
The truncation + * keeps the .ext on the leaf so download MIME inference still works. */ + const a = 'a'.repeat(100); + const b = 'b'.repeat(100); + const safePath = `${a}/${b}/c.txt`; + const result = flattenArtifactPath(safePath, 200); + expect(result.length).toBe(200); + expect(result.endsWith('.txt')).toBe(true); + expect(result).toMatch(new RegExp(`-${expectedHexSuffix(safePath)}\\.txt$`)); + }); + + test('preserves the extension even when only the leaf overflows', () => { + const longLeaf = 'L'.repeat(300); + const result = flattenArtifactPath(`${longLeaf}.json`, 200); + expect(result.length).toBe(200); + expect(result.endsWith('.json')).toBe(true); + }); + + test('falls back to whole-key truncation (no extension preservation) when ext is pathologically long', () => { + /* `path.extname`-style logic: a single dot followed by 100 chars is + * not a real extension. Keep the cap honored even if a contrived + * input would yield a "stem budget" of zero or negative. */ + const result = flattenArtifactPath('stem.' + 'x'.repeat(100), 50); + expect(result.length).toBe(50); + }); + + test('handles paths with no extension by truncating the stem', () => { + const longName = 'n'.repeat(300); + const result = flattenArtifactPath(longName, 50); + expect(result.length).toBe(50); + expect(result).toMatch(new RegExp(`-${expectedHexSuffix(longName)}$`)); + }); + + test('matches the boundary length exactly when input is right at the cap', () => { + /* Off-by-one guard: maxLength of N should allow flat forms of N. */ + const flat = 'a'.repeat(50); + expect(flattenArtifactPath(flat, 50)).toBe(flat); + expect(flattenArtifactPath(flat, 49).length).toBe(49); + }); + + test('produces deterministic output across calls (no orphaned uploads on re-flatten)', () => { + /* Codex review P2: re-flattening the same input must produce the + * same key so re-uploads land at the same storage location. The + * hash-based suffix replaces the previous random one. 
*/ + const longPath = 'a'.repeat(100) + '/b.csv'; + const a = flattenArtifactPath(longPath, 50); + const b = flattenArtifactPath(longPath, 50); + expect(a).toBe(b); + /* And different inputs that share a truncation prefix still produce + * different outputs (collision avoidance). */ + const otherPath = 'a'.repeat(100) + '/c.csv'; + expect(flattenArtifactPath(otherPath, 50)).not.toBe(a); + }); + + test('clamps the result to maxLength even when ext.length > maxLength - 7 (Codex review P2)', () => { + /* Pathological maxLength: 5, ext: ".txt" (4 chars). Stem budget is + * Math.max(0, 5 - 4 - 7) = 0, so the formula yields + * `'' + '-' + 6-char-hash + '.txt'` = 11 chars — past maxLength. + * The final clamp guarantees the result is ≤ maxLength regardless. */ + const result = flattenArtifactPath('foo/bar.txt', 5); + expect(result.length).toBeLessThanOrEqual(5); + }); + + test('returns empty string when maxLength <= 0', () => { + /* Edge case: a negative or zero budget can't fit any output. Don't + * attempt to construct a key; let the caller handle the empty case + * (in practice this never fires — `process.js` passes + * `255 - file_id.length - 2`, and file_id is bounded). */ + expect(flattenArtifactPath('a/b.txt', 0)).toBe(''); + expect(flattenArtifactPath('a/b.txt', -5)).toBe(''); + }); +}); + describe('resolveUploadErrorMessage', () => { test('returns default message for null error', () => { expect(resolveUploadErrorMessage(null)).toBe('Error processing file'); diff --git a/packages/api/src/utils/files.ts b/packages/api/src/utils/files.ts index 9e78d9289c..37c993a3f3 100644 --- a/packages/api/src/utils/files.ts +++ b/packages/api/src/utils/files.ts @@ -67,6 +67,275 @@ export function sanitizeFilename(inputName: string): string { return name; } +/** Per-path-component length cap. 
Mirrors `sanitizeFilename`'s 255-char + * basename cap and matches filesystem `NAME_MAX` (255 bytes on Linux/ext4, + * 255 chars on Windows/NTFS) — without it, `saveBuffer` writes + * `${file_id}__${flatName}` and a long artifact name surfaces as + * `ENAMETOOLONG` and falls back to a download URL instead of persisting. */ +const ARTIFACT_PATH_SEGMENT_MAX = 255; + +/** Whole-path length cap for the path-preserving form. The DB stores this + * value in `filename`, which participates in a compound unique index on + * the File schema — `{ filename, conversationId, context, tenantId }` + * with a partial filter for `context: FileContext.execute_code` + * (`packages/data-schemas/src/schema/file.ts`). MongoDB 4.0 and earlier + * reject indexed values past 1024 bytes; even on 4.2+ where the limit + * is configurable, runaway nested paths bloat the index for no real + * benefit. 512 chars is plenty for realistic code-execution outputs + * (typical depth ≤ 3 segments × short names) and gives headroom for + * BSON / index-overhead encoding. + * + * Above the cap we fall back to leaf-only — the same shape as the + * absolute-path / `..`-traversal branches above. This loses directory + * structure for the pathological case (the user's `cat /mnt/data// + * /file.txt` won't survive across turns), but DB-write failure + * with a missing artifact is strictly worse than a flat-name fallback. + * Pre-PR every artifact got this treatment regardless of depth, so the + * cap is monotonically better than the prior behavior. */ +const ARTIFACT_PATH_TOTAL_MAX = 512; + +/** + * Deterministic disambiguator suffix for truncated names. The original + * `sanitizeFilename` used `crypto.randomBytes(3)`, which made the + * truncated form non-deterministic — a re-upload of the same long + * filename would compute a *different* storage key, orphaning the + * previous on-disk file under `claimCodeFile`'s reused `file_id`. 
+ *
+ * Hashing the input gives stability across calls (same input → same
+ * output) while still disambiguating distinct inputs that share a
+ * truncation prefix. SHA-256 truncated to 6 hex chars is collision-safe
+ * for our scale (24 bits ≈ 16M values; we'd need ~5K simultaneous
+ * truncated names in one (filename, conversationId) bucket before a
+ * collision becomes likely, vs. the realistic ceiling of single-digit
+ * artifacts per turn).
+ */
+function deterministicHexSuffix(input: string): string {
+  return crypto.createHash('sha256').update(input).digest('hex').slice(0, 6);
+}
+
+/**
+ * Truncates a path-leaf segment while preserving its extension, matching the
+ * shape `sanitizeFilename` uses for the basename cap. The 6-hex-char
+ * suffix is a SHA-256 prefix of the original input, so re-truncating
+ * the same name produces the same key (no orphaned uploads).
+ *
+ * Falls back to whole-segment truncation (no extension preservation) when
+ * the extension itself is so long there's no room for a meaningful stem —
+ * `path.extname` happily returns 100+ char "extensions" for pathological
+ * inputs like `'_.' + 'a'.repeat(300)`, and trying to keep that extension would
+ * still blow past the cap.
+ */
+function truncateLeafSegment(leaf: string): string {
+  if (leaf.length <= ARTIFACT_PATH_SEGMENT_MAX) return leaf;
+  const ext = path.extname(leaf);
+  const stem = path.basename(leaf, ext);
+  // 8 = 1 (`-`) + 6 (hex disambiguator) + 1 (minimum 1-char stem)
+  if (ext.length > ARTIFACT_PATH_SEGMENT_MAX - 8) {
+    return truncateDirSegment(leaf);
+  }
+  const stemBudget = ARTIFACT_PATH_SEGMENT_MAX - ext.length - 7;
+  return stem.slice(0, stemBudget) + '-' + deterministicHexSuffix(leaf) + ext;
+}
+
+/** Truncates a non-leaf (directory) segment. Directory segments don't
+ * carry semantic extensions, so we just slice and append the same 6-hex
+ * disambiguation suffix. 
*/ +function truncateDirSegment(seg: string): string { + if (seg.length <= ARTIFACT_PATH_SEGMENT_MAX) return seg; + return seg.slice(0, ARTIFACT_PATH_SEGMENT_MAX - 7) + '-' + deterministicHexSuffix(seg); +} + +/** + * Embed a deterministic 6-hex hash of `hashSource` in `segment` (before its + * extension if any). Used to keep two different raw artifact names that + * sanitize to the same `segment` distinguishable — without this, + * `claimCodeFile`'s `(filename, conversationId, context, tenantId)` + * compound unique index would silently match the second upload to the + * first record and overwrite the first artifact's bytes. + * + * The disambiguator survives length capping: if appending pushes the + * segment past `ARTIFACT_PATH_SEGMENT_MAX`, the stem is trimmed to make + * room (the hash + extension are load-bearing for collision avoidance + * and MIME inference, the stem is just the human-readable prefix). + */ +function embedDisambiguatorInLeaf(segment: string, hashSource: string): string { + const suffix = '-' + deterministicHexSuffix(hashSource); + const dot = segment.lastIndexOf('.'); + /* `dot <= 1` treats dotfile-prefixed leaves (`_.hidden`, `_.x`) and + * extensionless names (`Makefile`, `out`) the same way: append the + * disambiguator at the end rather than splitting `_.hidden` into stem + * `_` + "extension" `.hidden`, which would produce the awkward + * `_-.hidden`. The `_.x.y` case (dotfile with a real extension) + * still has `dot > 1` and goes through the extension-preserving + * branch correctly. */ + let result: string; + if (dot <= 1) { + const proposed = segment + suffix; + result = + proposed.length <= ARTIFACT_PATH_SEGMENT_MAX + ? 
proposed + : segment.slice(0, ARTIFACT_PATH_SEGMENT_MAX - suffix.length) + suffix; + } else { + const stem = segment.slice(0, dot); + const ext = segment.slice(dot); + const proposed = stem + suffix + ext; + if (proposed.length <= ARTIFACT_PATH_SEGMENT_MAX) { + result = proposed; + } else { + // Trim stem to make room while preserving disambiguator + extension. + const stemBudget = Math.max(0, ARTIFACT_PATH_SEGMENT_MAX - suffix.length - ext.length); + result = stem.slice(0, stemBudget) + suffix + ext; + } + } + /* Defensive final clamp. The branches above already keep the result + * within `ARTIFACT_PATH_SEGMENT_MAX` for any input where the + * extension fits the segment, but a pathological extension (e.g. the + * `.aaaaa…` shape `path.extname` returns for contrived inputs) could + * still produce a longer result. Hard-clamp so the segment cap holds + * unconditionally. */ + return result.length <= ARTIFACT_PATH_SEGMENT_MAX + ? result + : result.slice(0, ARTIFACT_PATH_SEGMENT_MAX); +} + +/** + * Sanitize a code-execution artifact path while preserving directory components + * so subsequent prime() runs can place the file at the same nested location in + * the next sandbox session. Each segment is sanitized independently with the + * same rules as `sanitizeFilename`. Falls back to the basename for absolute + * paths or names containing `..` traversal. + * + * Each path component is capped at `ARTIFACT_PATH_SEGMENT_MAX` (255) chars + * — the leaf with extension preservation (matching `sanitizeFilename`), + * non-leaf segments with a plain truncate-and-disambiguate. Without the + * cap, long artifact names flow into `saveBuffer`'s storage key + * unbounded and trip `ENAMETOOLONG` instead of persisting. + */ +export function sanitizeArtifactPath(inputName: string): string { + if (!inputName) return '_'; + if (path.isAbsolute(inputName)) return sanitizeFilename(inputName); + const normalized = path.posix.normalize(inputName); + if ( + normalized === '..' 
|| + normalized.startsWith('../') || + normalized.includes('/../') || + normalized.endsWith('/..') || + normalized.endsWith('/') + ) { + return sanitizeFilename(inputName); + } + const segments = normalized + .split('/') + .map((seg) => seg.replace(/[^a-zA-Z0-9.-]/g, '_')) + .filter((seg) => seg.length > 0 && seg !== '.'); + if (segments.length === 0) return '_'; + const leafIdx = segments.length - 1; + if (segments[leafIdx].startsWith('.')) segments[leafIdx] = '_' + segments[leafIdx]; + /* Snapshot the post-regex-and-dotfile form BEFORE per-segment + * truncation so the collision-avoidance check below only fires for + * character-level mutations (regex replacement, normalization, + * dotfile prefix, empty-segment collapse). Truncation by itself is + * already disambiguated by `truncateLeafSegment`'s seg-hash; firing + * the input-hash branch on truncation alone would just stack a + * second hash for no collision-avoidance benefit. */ + const preCapJoined = segments.join('/'); + const capped = segments.map((seg, i) => + i === leafIdx ? truncateLeafSegment(seg) : truncateDirSegment(seg), + ); + let joined = capped.join('/'); + /* Collision avoidance. `sanitizeArtifactPath` is not injective: + * `out@1.csv` and `out#1.csv` (and `out 1.csv` and `out_1.csv`, and + * `.x` and `_.x`) all collapse onto the same safe form. Without + * disambiguation, `claimCodeFile` — keyed on the schema's compound + * unique `(filename, conversationId, context, tenantId)` index — + * matches the later upload to the earlier record and silently + * overwrites the first artifact's bytes via the reused `file_id`. + * + * When character-level sanitization changed something, embed a + * SHA-256 prefix of the raw input in the leaf segment. Same raw + * input → same safe form (idempotent for re-uploads); different raw + * inputs that would have collided → different safe forms. 
+ * + * Clean inputs (where the regex/normalize pass was a no-op) skip + * the disambiguator — no collision is possible because they're + * already distinct strings, and we don't want to clutter human- + * readable filenames with a hash when nothing was at risk. */ + if (preCapJoined !== inputName) { + capped[leafIdx] = embedDisambiguatorInLeaf(capped[leafIdx], inputName); + joined = capped.join('/'); + } + /* Per-segment caps are necessary but not sufficient — a deeply nested + * path with many at-cap segments can still exceed the DB's indexed-key + * limit (Mongo 4.0 ≤ 1024 bytes; later versions configurable). Fall + * back to leaf-only when the joined form blows past the total cap, so + * the (filename, conversationId, context, tenantId) compound unique + * index never sees an oversized key. The leaf is already capped by + * `truncateLeafSegment` + `embedDisambiguatorInLeaf`, so this + * guarantees ≤ ARTIFACT_PATH_SEGMENT_MAX (255) chars. Same shape as + * the absolute-path / `..`-traversal fallback above. */ + if (joined.length > ARTIFACT_PATH_TOTAL_MAX) { + return capped[leafIdx]; + } + return joined; +} + +/** Limit on `path.extname`'s "extension" length we'll honor when + * truncating a flat key. Real file extensions cap out around 8 chars + * (`.parquet`, `.tsv`, `.html`); a 16-char ceiling keeps us tolerant of + * legitimate edge cases (`.openshift`) while ignoring pathological + * outputs from `path.extname` on contrived inputs. Above that we treat + * the trailing `.foo` as part of the stem and just hard-truncate. */ +const FLAT_KEY_MAX_EXT_LENGTH = 16; + +/** + * Map a (sanitized) artifact path to a flat storage-safe key. The local + * storage strategies key by single component; this avoids creating + * unintended subdirectories on disk while the DB record retains the + * nested path for the next prime(). + * + * Optionally caps the result at `maxLength` characters. 
Per-segment caps
+ * applied by `sanitizeArtifactPath` aren't enough on their own —
+ * `${file_id}__${flatName}` has to fit in one filesystem path component
+ * (NAME_MAX = 255 on most filesystems), so a deeply-nested path whose
+ * segments are individually within bounds can still produce a flat form
+ * that overflows once `${file_id}__` is prepended. The caller passes
+ * `255 - prefix.length` and the truncation preserves the leaf extension
+ * with the same disambiguating hex suffix that `sanitizeFilename` uses.
+ *
+ * Without this cap, oversized flat keys hit `ENAMETOOLONG` inside
+ * `saveBuffer`, the artifact falls back to a download URL, and the next
+ * prime() never sees it.
+ */
+export function flattenArtifactPath(safePath: string, maxLength?: number): string {
+  const flat = safePath.replace(/\//g, '__');
+  if (maxLength == null || flat.length <= maxLength) return flat;
+  if (maxLength <= 0) return '';
+
+  /* Find the leaf's extension (last `.`) — segment separators are `__`,
+   * never `.`, so the last dot is always inside the leaf. Ignore
+   * "extensions" longer than `FLAT_KEY_MAX_EXT_LENGTH` so a pathological
+   * input doesn't leave us with no stem budget. */
+  const lastDot = flat.lastIndexOf('.');
+  const candidateExt = lastDot >= 0 ? flat.slice(lastDot) : '';
+  const ext =
+    candidateExt.length > 0 && candidateExt.length <= FLAT_KEY_MAX_EXT_LENGTH ? candidateExt : '';
+  const stem = ext ? flat.slice(0, lastDot) : flat;
+  // 7 = '-' + 6 hex disambiguator. Stem budget can collapse to 0 when
+  // `ext.length > maxLength - 7` — that's fine; the stem just disappears
+  // and we fall back to `'-' + hash + ext` (still bounded). The hash is a
+  // SHA-256 prefix of `safePath` so re-flattening the same input
+  // produces the same key (same storage location across re-uploads).
+  const stemBudget = Math.max(0, maxLength - ext.length - 7);
+  const truncated = stem.slice(0, stemBudget) + '-' + deterministicHexSuffix(safePath) + ext;
+  /* Final clamp. 
The stemBudget formula above keeps `truncated.length` + * exactly at `maxLength` for any maxLength ≥ ext.length + 7, and at + * `7 + ext.length` otherwise. The latter can still exceed maxLength + * for absurdly small budgets (maxLength < ext.length + 7) — clamp + * defensively so callers always get a key ≤ maxLength regardless of + * what they passed in. */ + return truncated.length <= maxLength ? truncated : truncated.slice(0, maxLength); +} + /** * Options for reading files */