mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-05-13 16:07:30 +00:00
📝 fix: Preserve Raw Markdown Formatting on Upload as Text (#12734)
* 🐛 fix: Preserve Raw Markdown on `Upload as Text` When `RAG_API_URL` is configured, `.md` uploads were sent to the RAG API `/text` endpoint, which routes Markdown through `UnstructuredMarkdownLoader` and strips formatting (`#`, `**`, lists, blockquotes). Users expect `Upload as Text` to preserve raw content - identical bytes in a `.txt` file round-trip verbatim, while the `.md` came back stripped. Short-circuit the RAG API call for Markdown files (by MIME type or `.md` / `.markdown` extension) and read the file verbatim via `parseTextNative`. Non-Markdown paths are unaffected, and the embedding path (`/embed`) keeps its existing loader so vector search quality is unchanged. * 🐛 fix: normalize markdown MIME and accept `text/md` Addressing review feedback on the `Upload as Text` short-circuit: - Accept `text/md` in the markdown MIME set (LibreChat treats it as a valid markdown type elsewhere, e.g. the artifact-rendering prompt). - Normalize the incoming MIME type (lowercase + strip parameters) before the set lookup so parameterized values like `text/markdown; charset=utf-8` and uppercase `TEXT/MARKDOWN` still short-circuit. Extensionless uploads relying only on the `Content-Type` header would otherwise fall through to the RAG `/text` endpoint and lose their markdown formatting. Extend `text.spec.ts` parametrized cases with `text/md`, parameterized MIME, uppercase, and whitespace-padded variants. * 🧹 chore: Address Code Review Follow-ups on `Upload as Text` fix Addressing comprehensive review feedback: - Debug log now includes filename and MIME type so operators can identify which upload triggered the short-circuit without having to correlate other logs. - Expand markdown extension detection beyond `.md` / `.markdown` to cover `.mdown`, `.mkdn`, `.mkd`, `.mdwn` (case-insensitive regex). - Tighten `normalizeMimeType` parameter type from `string | undefined` to `string` to match the actual Express.Multer.File type. 
The falsy-check still protects against empty strings at runtime. - Extend parametrized tests with the most common real-world shapes: `text/plain` + `.md` (the MIME most browsers/servers assign), the new rare extensions, and empty MIME + `.md` (pure extension fallback path). - Add a positive assertion that `readFileAsString` was called with the expected arguments on every short-circuit case, so tests fail loudly if the native-parse path ever regresses. * 🧪 test: Cover `.mdwn` regex branch in Markdown short-circuit Every other alternation in `MARKDOWN_EXTENSIONS_RE` has at least one test case (`md`, `markdown`, `mdown`, `mkdn`, `mkd`) but `mdwn` did not, leaving a typo in that branch undetectable.
This commit is contained in:
parent
3bd2681272
commit
2358d07b64
2 changed files with 101 additions and 0 deletions
|
|
@ -300,5 +300,73 @@ describe('text', () => {
|
|||
source: FileSources.text,
|
||||
});
|
||||
});
|
||||
|
||||
// With RAG_API_URL configured, every Markdown-looking upload below must bypass
// the RAG API (no axios traffic) and be read through `readFileAsString` instead.
it.each([
  // Recognized Markdown MIME types.
  { mimetype: 'text/markdown', originalname: 'notes.md' },
  { mimetype: 'text/x-markdown', originalname: 'notes.md' },
  { mimetype: 'text/md', originalname: 'notes' },
  { mimetype: 'application/markdown', originalname: 'notes.md' },
  { mimetype: 'application/x-markdown', originalname: 'notes.md' },
  // Non-Markdown MIME types: detection has to come from the filename extension,
  // including uppercase and the less common extensions.
  { mimetype: 'text/plain', originalname: 'notes.md' },
  { mimetype: 'application/octet-stream', originalname: 'README.md' },
  { mimetype: 'application/octet-stream', originalname: 'GUIDE.MARKDOWN' },
  { mimetype: 'application/octet-stream', originalname: 'post.mdown' },
  { mimetype: 'application/octet-stream', originalname: 'post.mkdn' },
  { mimetype: 'application/octet-stream', originalname: 'post.mkd' },
  { mimetype: 'application/octet-stream', originalname: 'docs.mdwn' },
  // Extensionless names: detection relies on MIME normalization
  // (parameters, letter case, surrounding whitespace).
  { mimetype: 'text/markdown; charset=utf-8', originalname: 'notes' },
  { mimetype: 'TEXT/MARKDOWN', originalname: 'notes' },
  { mimetype: ' text/markdown ; charset=UTF-8 ', originalname: 'notes' },
  // Empty MIME type: pure extension fallback.
  { mimetype: '', originalname: 'notes.md' },
])(
  'should short-circuit to native parsing for markdown file (%o)',
  async ({ mimetype, originalname }) => {
    process.env.RAG_API_URL = 'http://rag-api.test';
    const mockText = '# Heading\n\n**bold** text';
    const mockBytes = Buffer.byteLength(mockText, 'utf8');

    mockedReadFileAsString.mockResolvedValue({
      content: mockText,
      bytes: mockBytes,
    });

    const result = await parseText({
      req: mockReq,
      file: { ...mockFile, mimetype, originalname },
      file_id: mockFileId,
    });

    // No RAG API traffic at all: neither the GET nor the POST may fire.
    expect(mockedAxios.get).not.toHaveBeenCalled();
    expect(mockedAxios.post).not.toHaveBeenCalled();
    // Positive check that the native read path ran.
    // NOTE(review): the path and size presumably mirror `mockFile`, which is defined outside this view — confirm.
    expect(mockedReadFileAsString).toHaveBeenCalledWith('/tmp/test.txt', {
      fileSize: 100,
    });
    // The raw Markdown must come back untouched.
    expect(result).toEqual({
      text: mockText,
      bytes: mockBytes,
      source: FileSources.text,
    });
  },
);
|
||||
|
||||
// Guard against over-matching: an upload using the unmodified `mockFile`
// must still be posted to the RAG API `/text` endpoint.
// NOTE(review): assumes `mockFile` (defined outside this view) is a non-Markdown file — confirm.
it('should still call the RAG API for non-markdown text files', async () => {
  process.env.RAG_API_URL = 'http://rag-api.test';
  const mockText = 'plain text content';

  mockedAxios.get.mockResolvedValue({ status: 200, statusText: 'OK' });
  mockedAxios.post.mockResolvedValue({ data: { text: mockText } });

  await parseText({
    req: mockReq,
    file: mockFile,
    file_id: mockFileId,
  });

  expect(mockedAxios.post).toHaveBeenCalledWith(
    'http://rag-api.test/text',
    expect.any(Object),
    expect.objectContaining({ timeout: 300000 }),
  );
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -7,6 +7,32 @@ import type { ServerRequest } from '~/types';
|
|||
import { logAxiosError, readFileAsString } from '~/utils';
|
||||
import { generateShortLivedToken } from '~/crypto/jwt';
|
||||
|
||||
/**
 * MIME types treated as Markdown.
 * Entries are lowercase and carry no parameters, so a raw MIME type must be
 * normalized (parameters stripped, trimmed, lowercased) before lookup.
 */
const MARKDOWN_MIME_TYPES: ReadonlySet<string> = new Set([
  'text/markdown',
  'text/x-markdown',
  'text/md',
  'application/markdown',
  'application/x-markdown',
]);

/** Matches a filename that ends in a Markdown extension, case-insensitively. */
const MARKDOWN_EXTENSIONS_RE = /\.(md|markdown|mdown|mkdn|mkd|mdwn)$/i;
|
||||
|
||||
/**
 * Reduces a MIME type to a bare, comparable form: everything from the first
 * `;` onward is dropped, surrounding whitespace is trimmed, and the result is
 * lowercased.
 *
 * @param mimetype - Raw MIME type, e.g. `text/markdown; charset=utf-8`
 * @returns The normalized `type/subtype`, or `''` when the input is empty or not a string
 */
function normalizeMimeType(mimetype: string): string {
  // The parameter is typed `string`, but guard at runtime as well so that a
  // missing or non-string value yields '' instead of throwing.
  if (typeof mimetype !== 'string' || mimetype === '') {
    return '';
  }
  // Limit of 1: only the part before the first ';' is needed.
  const [base = ''] = mimetype.split(';', 1);
  return base.trim().toLowerCase();
}
|
||||
|
||||
/**
 * Decides whether an uploaded file should be treated as Markdown.
 * The normalized MIME type is checked first; when it is not a known Markdown
 * type, the extension of the original filename is used as a fallback.
 *
 * @param file - The uploaded file
 * @returns `true` when the MIME type or the filename extension indicates Markdown
 */
function isMarkdownFile(file: Express.Multer.File): boolean {
  const mimeType = normalizeMimeType(file.mimetype);
  if (MARKDOWN_MIME_TYPES.has(mimeType)) {
    return true;
  }
  // A missing filename is treated as '' so the regex simply fails to match.
  return MARKDOWN_EXTENSIONS_RE.test(file.originalname ?? '');
}
|
||||
|
||||
/**
|
||||
* Attempts to parse text using the RAG API, falling back to native text parsing; Markdown files skip the RAG API and are always parsed natively to preserve raw formatting
|
||||
* @param params - The parameters object
|
||||
|
|
@ -29,6 +55,13 @@ export async function parseText({
|
|||
return parseTextNative(file);
|
||||
}
|
||||
|
||||
if (isMarkdownFile(file)) {
|
||||
logger.debug(
|
||||
`[parseText] Markdown file detected (${file.originalname}, ${file.mimetype}), using native parsing to preserve raw formatting`,
|
||||
);
|
||||
return parseTextNative(file);
|
||||
}
|
||||
|
||||
const userId = req.user?.id;
|
||||
if (!userId) {
|
||||
logger.debug('[parseText] No user ID provided, falling back to native text parsing');
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue