From ded3cd8876501c1daa67d54bbb4b02d2b85efc3d Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Mon, 10 Mar 2025 17:23:46 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=8D=20feat:=20Mistral=20OCR=20API=20/?= =?UTF-8?q?=20Upload=20Files=20as=20Text=20(#6274)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: move `loadAuthValues` to `~/services/Tools/credentials` * feat: add createAxiosInstance function to configure axios with proxy support * WIP: First pass mistral ocr * refactor: replace getConvoFiles with getToolFiles for improved file retrieval logic * refactor: improve document formatting in encodeAndFormat function * refactor: remove unused resendFiles parameter from buildOptions function (this option comes from the agent config) * fix: update getFiles call to include files with `text` property as well * refactor: move file handling to `initializeAgentOptions` * refactor: enhance addImageURLs method to handle OCR text and improve message formatting * refactor: update message formatting to handle OCR text in various content types * refactor: remove unused resendFiles property from compactAgentsSchema * fix: add error handling for Mistral OCR document upload and logging * refactor: integrate OCR capability into file upload options and configuration * refactor: skip processing for text source files in delete request, as they are directly tied to database * feat: add metadata field to ExtendedFile type and update PanelColumns and PanelTable components for localization and metadata handling * fix: source icon styling * wip: first pass, frontend file context agent resources * refactor: add hover card with contextual information for File Context (OCR) in FileContext component * feat: enhance file processing by integrating file retrieval for OCR resources in agent initialization * feat: implement OCR config; fix: agent resource deletion for ocr files * feat: enhance agent initialization by adding OCR capability check in resource priming * ci: fix `~/config` module mock * ci: add OCR property expectation in AppService tests * refactor: simplify OCR config loading by removing environment variable extraction, to be done when OCR is actually performed * ci: add unit test to ensure environment variable references are not parsed in OCR config * refactor: disable base64 image inclusion in OCR request * refactor: enhance OCR configuration handling by validating environment variables and providing defaults * refactor: use file stream from disk for mistral ocr api --- api/app/clients/BaseClient.js | 10 +- api/app/clients/tools/util/handleTools.js | 41 +- api/app/clients/tools/util/index.js | 3 +- api/config/index.js | 16 + api/models/Conversation.js | 52 +- api/models/File.js | 6 +- api/server/controllers/agents/callbacks.js | 2 +- api/server/controllers/agents/client.js | 27 +- api/server/controllers/tools.js | 3 +- api/server/middleware/buildEndpointOption.js | 11 +- api/server/routes/files/files.js | 2 +- api/server/services/AppService.js | 9 +- api/server/services/AppService.spec.js | 30 + api/server/services/Endpoints/agents/build.js | 12 +- .../services/Endpoints/agents/initialize.js | 68 +- api/server/services/Files/MistralOCR/crud.js | 207 +++++ .../services/Files/MistralOCR/crud.spec.js | 737 ++++++++++++++++++ api/server/services/Files/MistralOCR/index.js | 5 + api/server/services/Files/images/encode.js | 8 + api/server/services/Files/process.js | 54 +- api/server/services/Files/strategies.js | 23 + api/server/services/Tools/credentials.js | 56 ++ api/server/utils/handleText.js | 1 + api/test/__mocks__/logger.js | 3 + api/typedefs.js | 48 ++ client/src/common/agents-types.ts | 1 + client/src/common/types.ts | 1 + .../Chat/Input/Files/AttachFileMenu.tsx | 13 +- .../Chat/Input/Files/DragDropModal.tsx | 8 +- .../Chat/Input/Files/FilePreview.tsx | 7 +- .../Chat/Input/Files/SourceIcon.tsx | 41 +- .../SidePanel/Agents/AgentConfig.tsx | 29 +- .../SidePanel/Agents/FileContext.tsx | 128 +++ .../SidePanel/Files/PanelColumns.tsx | 11 +- .../SidePanel/Files/PanelFileCell.tsx | 1 - .../components/SidePanel/Files/PanelTable.tsx | 1 + client/src/data-provider/Files/mutations.ts | 5 +- client/src/locales/en/translation.json | 7 +- client/src/utils/forms.tsx | 16 +- package-lock.json | 2 +- packages/data-provider/package.json | 2 +- packages/data-provider/src/config.ts | 17 +- packages/data-provider/src/index.ts | 1 + packages/data-provider/src/ocr.ts | 14 + packages/data-provider/src/schemas.ts | 1 - .../data-provider/src/types/assistants.ts | 6 +- packages/data-provider/src/types/files.ts | 2 + packages/data-schemas/src/schema/file.ts | 4 + 48 files changed, 1621 insertions(+), 131 deletions(-) create mode 100644 api/server/services/Files/MistralOCR/crud.js create mode 100644 api/server/services/Files/MistralOCR/crud.spec.js create mode 100644 api/server/services/Files/MistralOCR/index.js create mode 100644 api/server/services/Tools/credentials.js create mode 100644 client/src/components/SidePanel/Agents/FileContext.tsx create mode 100644 packages/data-provider/src/ocr.ts diff --git a/api/app/clients/BaseClient.js b/api/app/clients/BaseClient.js index 61b39a8f6d..77e14c07d0 100644 --- a/api/app/clients/BaseClient.js +++ b/api/app/clients/BaseClient.js @@ -1121,9 +1121,13 @@ class BaseClient { return message; } - const files = await getFiles({ - file_id: { $in: fileIds }, - }); + const files = await getFiles( + { + file_id: { $in: fileIds }, + }, + {}, + {}, + ); await this.addImageURLs(message, files, this.visionMode); diff --git a/api/app/clients/tools/util/handleTools.js b/api/app/clients/tools/util/handleTools.js index ae19a158ee..063d6e0327 100644 --- a/api/app/clients/tools/util/handleTools.js +++ b/api/app/clients/tools/util/handleTools.js @@ -21,6 +21,7 @@ const { } = require('../'); const { primeFiles: primeCodeFiles } = require('~/server/services/Files/Code/process'); const { createFileSearchTool, primeFiles: primeSearchFiles } = require('./fileSearch'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); const { createMCPTool } = require('~/server/services/MCP'); const { loadSpecs } = require('./loadSpecs'); const { logger } = require('~/config'); @@ -90,45 +91,6 @@ const validateTools = async (user, tools = []) => { } }; -const loadAuthValues = async ({ userId, authFields, throwError = true }) => { - let authValues = {}; - - /** - * Finds the first non-empty value for the given authentication field, supporting alternate fields. - * @param {string[]} fields Array of strings representing the authentication fields. Supports alternate fields delimited by "||". - * @returns {Promise<{ authField: string, authValue: string} | null>} An object containing the authentication field and value, or null if not found. - */ - const findAuthValue = async (fields) => { - for (const field of fields) { - let value = process.env[field]; - if (value) { - return { authField: field, authValue: value }; - } - try { - value = await getUserPluginAuthValue(userId, field, throwError); - } catch (err) { - if (field === fields[fields.length - 1] && !value) { - throw err; - } - } - if (value) { - return { authField: field, authValue: value }; - } - } - return null; - }; - - for (let authField of authFields) { - const fields = authField.split('||'); - const result = await findAuthValue(fields); - if (result) { - authValues[result.authField] = result.authValue; - } - } - - return authValues; -}; - /** @typedef {typeof import('@langchain/core/tools').Tool} ToolConstructor */ /** @typedef {import('@langchain/core/tools').Tool} Tool */ @@ -348,7 +310,6 @@ const loadTools = async ({ module.exports = { loadToolWithAuth, - loadAuthValues, validateTools, loadTools, }; diff --git a/api/app/clients/tools/util/index.js b/api/app/clients/tools/util/index.js index 73d10270b6..ea67bb4ced 100644 --- a/api/app/clients/tools/util/index.js +++ b/api/app/clients/tools/util/index.js @@ -1,9 +1,8 @@ -const { validateTools, loadTools, loadAuthValues } = require('./handleTools'); +const { validateTools, loadTools } = require('./handleTools'); const handleOpenAIErrors = require('./handleOpenAIErrors'); module.exports = { handleOpenAIErrors, - loadAuthValues, validateTools, loadTools, }; diff --git a/api/config/index.js b/api/config/index.js index aaf8bb2764..dd765efb8e 100644 --- a/api/config/index.js +++ b/api/config/index.js @@ -1,3 +1,4 @@ +const axios = require('axios'); const { EventSource } = require('eventsource'); const { Time, CacheKeys } = require('librechat-data-provider'); const logger = require('./winston'); @@ -47,9 +48,24 @@ const sendEvent = (res, event) => { res.write(`event: message\ndata: ${JSON.stringify(event)}\n\n`); }; +function createAxiosInstance() { + const instance = axios.create(); + + if (process.env.proxy) { + const url = new URL(process.env.proxy); + instance.defaults.proxy = { + host: url.hostname, + protocol: url.protocol.replace(':', ''), + }; + } + + return instance; +} + module.exports = { logger, sendEvent, getMCPManager, + createAxiosInstance, getFlowStateManager, }; diff --git a/api/models/Conversation.js b/api/models/Conversation.js index 9e51926ebc..f609b96c5c 100644 --- a/api/models/Conversation.js +++ b/api/models/Conversation.js @@ -15,19 +15,6 @@ const searchConversation = async (conversationId) => { throw new Error('Error searching conversation'); } }; -/** - * Searches for a conversation by conversationId and returns associated file ids. - * @param {string} conversationId - The conversation's ID. - * @returns {Promise} - */ -const getConvoFiles = async (conversationId) => { - try { - return (await Conversation.findOne({ conversationId }, 'files').lean())?.files ?? []; - } catch (error) { - logger.error('[getConvoFiles] Error getting conversation files', error); - throw new Error('Error getting conversation files'); - } -}; /** * Retrieves a single conversation for a given user and conversation ID. @@ -73,9 +60,46 @@ const deleteNullOrEmptyConversations = async () => { } }; +/** + * Retrieves files from a conversation that have either embedded=true + * or a metadata.fileIdentifier. Simplified and efficient query. + * + * @param {string} conversationId - The conversation ID + * @returns {Promise} - Filtered array of matching file objects + */ +const getToolFiles = async (conversationId) => { + try { + const [result] = await Conversation.aggregate([ + { $match: { conversationId } }, + { + $project: { + files: { + $filter: { + input: '$files', + as: 'file', + cond: { + $or: [ + { $eq: ['$$file.embedded', true] }, + { $ifNull: ['$$file.metadata.fileIdentifier', false] }, + ], + }, + }, + }, + _id: 0, + }, + }, + ]).exec(); + + return result?.files || []; + } catch (error) { + logger.error('[getConvoEmbeddedFiles] Error fetching embedded files:', error); + throw new Error('Error fetching embedded files'); + } +}; + module.exports = { Conversation, - getConvoFiles, + getToolFiles, searchConversation, deleteNullOrEmptyConversations, /** diff --git a/api/models/File.js b/api/models/File.js index 870a18a7c8..3d62f27abb 100644 --- a/api/models/File.js +++ b/api/models/File.js @@ -17,11 +17,13 @@ const findFileById = async (file_id, options = {}) => { * Retrieves files matching a given filter, sorted by the most recently updated. * @param {Object} filter - The filter criteria to apply. * @param {Object} [_sortOptions] - Optional sort parameters. + * @param {Object|String} [selectFields={ text: 0 }] - Fields to include/exclude in the query results. + * Default excludes the 'text' field. * @returns {Promise>} A promise that resolves to an array of file documents. */ -const getFiles = async (filter, _sortOptions) => { +const getFiles = async (filter, _sortOptions, selectFields = { text: 0 }) => { const sortOptions = { updatedAt: -1, ..._sortOptions }; - return await File.find(filter).sort(sortOptions).lean(); + return await File.find(filter).select(selectFields).sort(sortOptions).lean(); }; /** diff --git a/api/server/controllers/agents/callbacks.js b/api/server/controllers/agents/callbacks.js index 45beefe7e6..6622ec3815 100644 --- a/api/server/controllers/agents/callbacks.js +++ b/api/server/controllers/agents/callbacks.js @@ -10,8 +10,8 @@ const { ChatModelStreamHandler, } = require('@librechat/agents'); const { processCodeOutput } = require('~/server/services/Files/Code/process'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); const { saveBase64Image } = require('~/server/services/Files/process'); -const { loadAuthValues } = require('~/app/clients/tools/util'); const { logger, sendEvent } = require('~/config'); /** @typedef {import('@librechat/agents').Graph} Graph */ diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js index 628b62e5ea..d591fe3247 100644 --- a/api/server/controllers/agents/client.js +++ b/api/server/controllers/agents/client.js @@ -223,14 +223,23 @@ class AgentClient extends BaseClient { }; } + /** + * + * @param {TMessage} message + * @param {Array} attachments + * @returns {Promise>>} + */ async addImageURLs(message, attachments) { - const { files, image_urls } = await encodeAndFormat( + const { files, text, image_urls } = await encodeAndFormat( this.options.req, attachments, this.options.agent.provider, VisionModes.agents, ); message.image_urls = image_urls.length ? image_urls : undefined; + if (text && text.length) { + message.ocr = text; + } return files; } @@ -308,7 +317,21 @@ class AgentClient extends BaseClient { assistantName: this.options?.modelLabel, }); - const needsTokenCount = this.contextStrategy && !orderedMessages[i].tokenCount; + if (message.ocr && i !== orderedMessages.length - 1) { + if (typeof formattedMessage.content === 'string') { + formattedMessage.content = message.ocr + '\n' + formattedMessage.content; + } else { + const textPart = formattedMessage.content.find((part) => part.type === 'text'); + textPart + ? (textPart.text = message.ocr + '\n' + textPart.text) + : formattedMessage.content.unshift({ type: 'text', text: message.ocr }); + } + } else if (message.ocr && i === orderedMessages.length - 1) { + systemContent = [systemContent, message.ocr].join('\n'); + } + + const needsTokenCount = + (this.contextStrategy && !orderedMessages[i].tokenCount) || message.ocr; /* If tokens were never counted, or, is a Vision request and the message has files, count again */ if (needsTokenCount || (this.isVisionModel && (message.image_urls || message.files))) { diff --git a/api/server/controllers/tools.js b/api/server/controllers/tools.js index 1c5330af35..b37b6fcb8c 100644 --- a/api/server/controllers/tools.js +++ b/api/server/controllers/tools.js @@ -10,7 +10,8 @@ const { const { processFileURL, uploadImageBuffer } = require('~/server/services/Files/process'); const { processCodeOutput } = require('~/server/services/Files/Code/process'); const { createToolCall, getToolCallsByConvo } = require('~/models/ToolCall'); -const { loadAuthValues, loadTools } = require('~/app/clients/tools/util'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); +const { loadTools } = require('~/app/clients/tools/util'); const { checkAccess } = require('~/server/middleware'); const { getMessage } = require('~/models/Message'); const { logger } = require('~/config'); diff --git a/api/server/middleware/buildEndpointOption.js b/api/server/middleware/buildEndpointOption.js index a0ce754a1c..041864b025 100644 --- a/api/server/middleware/buildEndpointOption.js +++ b/api/server/middleware/buildEndpointOption.js @@ -10,7 +10,6 @@ const openAI = require('~/server/services/Endpoints/openAI'); const agents = require('~/server/services/Endpoints/agents'); const custom = require('~/server/services/Endpoints/custom'); const google = require('~/server/services/Endpoints/google'); -const { getConvoFiles } = require('~/models/Conversation'); const { handleError } = require('~/server/utils'); const buildFunction = { @@ -87,16 +86,8 @@ async function buildEndpointOption(req, res, next) { // TODO: use `getModelsConfig` only when necessary const modelsConfig = await getModelsConfig(req); - const { resendFiles = true } = req.body.endpointOption; req.body.endpointOption.modelsConfig = modelsConfig; - if (isAgents && resendFiles && req.body.conversationId) { - const fileIds = await getConvoFiles(req.body.conversationId); - const requestFiles = req.body.files ?? []; - if (requestFiles.length || fileIds.length) { - req.body.endpointOption.attachments = processFiles(requestFiles, fileIds); - } - } else if (req.body.files) { - // hold the promise + if (req.body.files && !isAgents) { req.body.endpointOption.attachments = processFiles(req.body.files); } next(); diff --git a/api/server/routes/files/files.js b/api/server/routes/files/files.js index c320f7705b..c371b8e28e 100644 --- a/api/server/routes/files/files.js +++ b/api/server/routes/files/files.js @@ -16,7 +16,7 @@ const { } = require('~/server/services/Files/process'); const { getStrategyFunctions } = require('~/server/services/Files/strategies'); const { getOpenAIClient } = require('~/server/controllers/assistants/helpers'); -const { loadAuthValues } = require('~/app/clients/tools/util'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); const { getAgent } = require('~/models/Agent'); const { getFiles } = require('~/models/File'); const { logger } = require('~/config'); diff --git a/api/server/services/AppService.js b/api/server/services/AppService.js index d194d31a6b..1accd7eba6 100644 --- a/api/server/services/AppService.js +++ b/api/server/services/AppService.js @@ -1,4 +1,9 @@ -const { FileSources, EModelEndpoint, getConfigDefaults } = require('librechat-data-provider'); +const { + FileSources, + EModelEndpoint, + loadOCRConfig, + getConfigDefaults, +} = require('librechat-data-provider'); const { checkVariables, checkHealth, checkConfig, checkAzureVariables } = require('./start/checks'); const { azureAssistantsDefaults, assistantsConfigSetup } = require('./start/assistants'); const { initializeFirebase } = require('./Files/Firebase/initialize'); @@ -25,6 +30,7 @@ const AppService = async (app) => { const config = (await loadCustomConfig()) ?? {}; const configDefaults = getConfigDefaults(); + const ocr = loadOCRConfig(config.ocr); const filteredTools = config.filteredTools; const includedTools = config.includedTools; const fileStrategy = config.fileStrategy ?? configDefaults.fileStrategy; @@ -57,6 +63,7 @@ const AppService = async (app) => { const interfaceConfig = await loadDefaultInterface(config, configDefaults); const defaultLocals = { + ocr, paths, fileStrategy, socialLogins, diff --git a/api/server/services/AppService.spec.js b/api/server/services/AppService.spec.js index 61ac80fc6c..e47bfe7d5d 100644 --- a/api/server/services/AppService.spec.js +++ b/api/server/services/AppService.spec.js @@ -120,6 +120,7 @@ describe('AppService', () => { }, }, paths: expect.anything(), + ocr: expect.anything(), imageOutputType: expect.any(String), fileConfig: undefined, secureImageLinks: undefined, @@ -588,4 +589,33 @@ describe('AppService updating app.locals and issuing warnings', () => { ); }); }); + + it('should not parse environment variable references in OCR config', async () => { + // Mock custom configuration with env variable references in OCR config + const mockConfig = { + ocr: { + apiKey: '${OCR_API_KEY_CUSTOM_VAR_NAME}', + baseURL: '${OCR_BASEURL_CUSTOM_VAR_NAME}', + strategy: 'mistral_ocr', + mistralModel: 'mistral-medium', + }, + }; + + require('./Config/loadCustomConfig').mockImplementationOnce(() => Promise.resolve(mockConfig)); + + // Set actual environment variables with different values + process.env.OCR_API_KEY_CUSTOM_VAR_NAME = 'actual-api-key'; + process.env.OCR_BASEURL_CUSTOM_VAR_NAME = 'https://actual-ocr-url.com'; + + // Initialize app + const app = { locals: {} }; + await AppService(app); + + // Verify that the raw string references were preserved and not interpolated + expect(app.locals.ocr).toBeDefined(); + expect(app.locals.ocr.apiKey).toEqual('${OCR_API_KEY_CUSTOM_VAR_NAME}'); + expect(app.locals.ocr.baseURL).toEqual('${OCR_BASEURL_CUSTOM_VAR_NAME}'); + expect(app.locals.ocr.strategy).toEqual('mistral_ocr'); + expect(app.locals.ocr.mistralModel).toEqual('mistral-medium'); + }); }); diff --git a/api/server/services/Endpoints/agents/build.js b/api/server/services/Endpoints/agents/build.js index 027937e7fd..999cdc16be 100644 --- a/api/server/services/Endpoints/agents/build.js +++ b/api/server/services/Endpoints/agents/build.js @@ -2,15 +2,8 @@ const { loadAgent } = require('~/models/Agent'); const { logger } = require('~/config'); const buildOptions = (req, endpoint, parsedBody) => { - const { - spec, - iconURL, - agent_id, - instructions, - maxContextTokens, - resendFiles = true, - ...model_parameters - } = parsedBody; + const { spec, iconURL, agent_id, instructions, maxContextTokens, ...model_parameters } = + parsedBody; const agentPromise = loadAgent({ req, agent_id, @@ -24,7 +17,6 @@ const buildOptions = (req, endpoint, parsedBody) => { iconURL, endpoint, agent_id, - resendFiles, instructions, maxContextTokens, model_parameters, diff --git a/api/server/services/Endpoints/agents/initialize.js b/api/server/services/Endpoints/agents/initialize.js index 1cf8ad7a67..11c8dc6fc4 100644 --- a/api/server/services/Endpoints/agents/initialize.js +++ b/api/server/services/Endpoints/agents/initialize.js @@ -2,6 +2,7 @@ const { createContentAggregator, Providers } = require('@librechat/agents'); const { EModelEndpoint, getResponseSender, + AgentCapabilities, providerEndpointMap, } = require('librechat-data-provider'); const { @@ -15,10 +16,13 @@ const initCustom = require('~/server/services/Endpoints/custom/initialize'); const initGoogle = require('~/server/services/Endpoints/google/initialize'); const generateArtifactsPrompt = require('~/app/clients/prompts/artifacts'); const { getCustomEndpointConfig } = require('~/server/services/Config'); +const { processFiles } = require('~/server/services/Files/process'); const { loadAgentTools } = require('~/server/services/ToolService'); const AgentClient = require('~/server/controllers/agents/client'); +const { getToolFiles } = require('~/models/Conversation'); const { getModelMaxTokens } = require('~/utils'); const { getAgent } = require('~/models/Agent'); +const { getFiles } = require('~/models/File'); const { logger } = require('~/config'); const providerConfigMap = { @@ -34,20 +38,38 @@ const providerConfigMap = { }; /** - * + * @param {ServerRequest} req * @param {Promise> | undefined} _attachments * @param {AgentToolResources | undefined} _tool_resources * @returns {Promise<{ attachments: Array | undefined, tool_resources: AgentToolResources | undefined }>} */ -const primeResources = async (_attachments, _tool_resources) => { +const primeResources = async (req, _attachments, _tool_resources) => { try { + /** @type {Array | undefined} */ + let attachments; + const tool_resources = _tool_resources ?? {}; + const isOCREnabled = (req.app.locals?.[EModelEndpoint.agents]?.capabilities ?? []).includes( + AgentCapabilities.ocr, + ); + if (tool_resources.ocr?.file_ids && isOCREnabled) { + const context = await getFiles( + { + file_id: { $in: tool_resources.ocr.file_ids }, + }, + {}, + {}, + ); + attachments = (attachments ?? []).concat(context); + } if (!_attachments) { - return { attachments: undefined, tool_resources: _tool_resources }; + return { attachments, tool_resources }; } /** @type {Array | undefined} */ const files = await _attachments; - const attachments = []; - const tool_resources = _tool_resources ?? {}; + if (!attachments) { + /** @type {Array} */ + attachments = []; + } for (const file of files) { if (!file) { @@ -82,7 +104,6 @@ const primeResources = async (_attachments, _tool_resources) => { * @param {ServerResponse} params.res * @param {Agent} params.agent * @param {object} [params.endpointOption] - * @param {AgentToolResources} [params.tool_resources] * @param {boolean} [params.isInitialAgent] * @returns {Promise} */ @@ -91,9 +112,28 @@ const initializeAgentOptions = async ({ res, agent, endpointOption, - tool_resources, isInitialAgent = false, }) => { + let currentFiles; + const requestFiles = req.body.files ?? []; + if ( + isInitialAgent && + req.body.conversationId != null && + agent.model_parameters?.resendFiles === true + ) { + const fileIds = (await getToolFiles(req.body.conversationId)).map((f) => f.file_id); + if (requestFiles.length || fileIds.length) { + currentFiles = await processFiles(requestFiles, fileIds); + } + } else if (isInitialAgent && requestFiles.length) { + currentFiles = await processFiles(requestFiles); + } + + const { attachments, tool_resources } = await primeResources( + req, + currentFiles, + agent.tool_resources, + ); const { tools, toolContextMap } = await loadAgentTools({ req, res, @@ -160,6 +200,7 @@ const initializeAgentOptions = async ({ return { ...agent, tools, + attachments, toolContextMap, maxContextTokens: agent.max_context_tokens ?? @@ -197,11 +238,6 @@ const initializeClient = async ({ req, res, endpointOption }) => { throw new Error('Agent not found'); } - const { attachments, tool_resources } = await primeResources( - endpointOption.attachments, - primaryAgent.tool_resources, - ); - const agentConfigs = new Map(); // Handle primary agent @@ -210,7 +246,6 @@ const initializeClient = async ({ req, res, endpointOption }) => { res, agent: primaryAgent, endpointOption, - tool_resources, isInitialAgent: true, }); @@ -240,18 +275,19 @@ const initializeClient = async ({ req, res, endpointOption }) => { const client = new AgentClient({ req, - agent: primaryConfig, sender, - attachments, contentParts, + agentConfigs, eventHandlers, collectedUsage, artifactPromises, + agent: primaryConfig, spec: endpointOption.spec, iconURL: endpointOption.iconURL, - agentConfigs, endpoint: EModelEndpoint.agents, + attachments: primaryConfig.attachments, maxContextTokens: primaryConfig.maxContextTokens, + resendFiles: primaryConfig.model_parameters?.resendFiles ?? true, }); return { client }; diff --git a/api/server/services/Files/MistralOCR/crud.js b/api/server/services/Files/MistralOCR/crud.js new file mode 100644 index 0000000000..cef8297519 --- /dev/null +++ b/api/server/services/Files/MistralOCR/crud.js @@ -0,0 +1,207 @@ +// ~/server/services/Files/MistralOCR/crud.js +const fs = require('fs'); +const path = require('path'); +const FormData = require('form-data'); +const { FileSources, envVarRegex, extractEnvVariable } = require('librechat-data-provider'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); +const { logger, createAxiosInstance } = require('~/config'); +const { logAxiosError } = require('~/utils'); + +const axios = createAxiosInstance(); + +/** + * Uploads a document to Mistral API using file streaming to avoid loading the entire file into memory + * + * @param {Object} params Upload parameters + * @param {string} params.filePath The path to the file on disk + * @param {string} [params.fileName] Optional filename to use (defaults to the name from filePath) + * @param {string} params.apiKey Mistral API key + * @param {string} [params.baseURL=https://api.mistral.ai/v1] Mistral API base URL + * @returns {Promise} The response from Mistral API + */ +async function uploadDocumentToMistral({ + filePath, + fileName = '', + apiKey, + baseURL = 'https://api.mistral.ai/v1', +}) { + const form = new FormData(); + form.append('purpose', 'ocr'); + const actualFileName = fileName || path.basename(filePath); + const fileStream = fs.createReadStream(filePath); + form.append('file', fileStream, { filename: actualFileName }); + + return axios + .post(`${baseURL}/files`, form, { + headers: { + Authorization: `Bearer ${apiKey}`, + ...form.getHeaders(), + }, + maxBodyLength: Infinity, + maxContentLength: Infinity, + }) + .then((res) => res.data) + .catch((error) => { + logger.error('Error uploading document to Mistral:', error.message); + throw error; + }); +} + +async function getSignedUrl({ + apiKey, + fileId, + expiry = 24, + baseURL = 'https://api.mistral.ai/v1', +}) { + return axios + .get(`${baseURL}/files/${fileId}/url?expiry=${expiry}`, { + headers: { + Authorization: `Bearer ${apiKey}`, + }, + }) + .then((res) => res.data) + .catch((error) => { + logger.error('Error fetching signed URL:', error.message); + throw error; + }); +} + +/** + * @param {Object} params + * @param {string} params.apiKey + * @param {string} params.documentUrl + * @param {string} [params.baseURL] + * @returns {Promise} + */ +async function performOCR({ + apiKey, + documentUrl, + model = 'mistral-ocr-latest', + baseURL = 'https://api.mistral.ai/v1', +}) { + return axios + .post( + `${baseURL}/ocr`, + { + model, + include_image_base64: false, + document: { + type: 'document_url', + document_url: documentUrl, + }, + }, + { + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}`, + }, + }, + ) + .then((res) => res.data) + .catch((error) => { + logger.error('Error performing OCR:', error.message); + throw error; + }); +} + +function extractVariableName(str) { + const match = str.match(envVarRegex); + return match ? match[1] : null; +} + +const uploadMistralOCR = async ({ req, file, file_id, entity_id }) => { + try { + /** @type {TCustomConfig['ocr']} */ + const ocrConfig = req.app.locals?.ocr; + + const apiKeyConfig = ocrConfig.apiKey || ''; + const baseURLConfig = ocrConfig.baseURL || ''; + + const isApiKeyEnvVar = envVarRegex.test(apiKeyConfig); + const isBaseURLEnvVar = envVarRegex.test(baseURLConfig); + + const isApiKeyEmpty = !apiKeyConfig.trim(); + const isBaseURLEmpty = !baseURLConfig.trim(); + + let apiKey, baseURL; + + if (isApiKeyEnvVar || isBaseURLEnvVar || isApiKeyEmpty || isBaseURLEmpty) { + const apiKeyVarName = isApiKeyEnvVar ? extractVariableName(apiKeyConfig) : 'OCR_API_KEY'; + const baseURLVarName = isBaseURLEnvVar ? extractVariableName(baseURLConfig) : 'OCR_BASEURL'; + + const authValues = await loadAuthValues({ + userId: req.user.id, + authFields: [baseURLVarName, apiKeyVarName], + optional: new Set([baseURLVarName]), + }); + + apiKey = authValues[apiKeyVarName]; + baseURL = authValues[baseURLVarName]; + } else { + apiKey = apiKeyConfig; + baseURL = baseURLConfig; + } + + const mistralFile = await uploadDocumentToMistral({ + filePath: file.path, + fileName: file.originalname, + apiKey, + baseURL, + }); + + const modelConfig = ocrConfig.mistralModel || ''; + const model = envVarRegex.test(modelConfig) + ? extractEnvVariable(modelConfig) + : modelConfig.trim() || 'mistral-ocr-latest'; + + const signedUrlResponse = await getSignedUrl({ + apiKey, + baseURL, + fileId: mistralFile.id, + }); + + const ocrResult = await performOCR({ + apiKey, + baseURL, + model, + documentUrl: signedUrlResponse.url, + }); + + let aggregatedText = ''; + const images = []; + ocrResult.pages.forEach((page, index) => { + if (ocrResult.pages.length > 1) { + aggregatedText += `# PAGE ${index + 1}\n`; + } + + aggregatedText += page.markdown + '\n\n'; + + if (page.images && page.images.length > 0) { + page.images.forEach((image) => { + if (image.image_base64) { + images.push(image.image_base64); + } + }); + } + }); + + return { + filename: file.originalname, + bytes: aggregatedText.length * 4, + filepath: FileSources.mistral_ocr, + text: aggregatedText, + images, + }; + } catch (error) { + const message = 'Error uploading document to Mistral OCR API'; + logAxiosError({ error, message }); + throw new Error(message); + } +}; + +module.exports = { + uploadDocumentToMistral, + uploadMistralOCR, + getSignedUrl, + performOCR, +}; diff --git a/api/server/services/Files/MistralOCR/crud.spec.js b/api/server/services/Files/MistralOCR/crud.spec.js new file mode 100644 index 0000000000..80ac6f73a4 --- /dev/null +++ b/api/server/services/Files/MistralOCR/crud.spec.js @@ -0,0 +1,737 @@ +const fs = require('fs'); + +const mockAxios = { + interceptors: { + request: { use: jest.fn(), eject: jest.fn() }, + response: { use: jest.fn(), eject: jest.fn() }, + }, + create: jest.fn().mockReturnValue({ + defaults: { + proxy: null, + }, + get: jest.fn().mockResolvedValue({ data: {} }), + post: jest.fn().mockResolvedValue({ data: {} }), + put: jest.fn().mockResolvedValue({ data: {} }), + delete: jest.fn().mockResolvedValue({ data: {} }), + }), + get: jest.fn().mockResolvedValue({ data: {} }), + post: jest.fn().mockResolvedValue({ data: {} }), + put: jest.fn().mockResolvedValue({ data: {} }), + delete: jest.fn().mockResolvedValue({ data: {} }), + reset: jest.fn().mockImplementation(function () { + this.get.mockClear(); + this.post.mockClear(); + this.put.mockClear(); + this.delete.mockClear(); + this.create.mockClear(); + }), +}; + +jest.mock('axios', () => mockAxios); +jest.mock('fs'); +jest.mock('~/utils', () => ({ + logAxiosError: jest.fn(), +})); +jest.mock('~/config', () => ({ + logger: { + error: jest.fn(), + }, + createAxiosInstance: () => mockAxios, +})); +jest.mock('~/server/services/Tools/credentials', () => ({ + loadAuthValues: jest.fn(), +})); + +const { uploadDocumentToMistral, uploadMistralOCR, getSignedUrl, performOCR } = require('./crud'); + +describe('MistralOCR Service', () => { + afterEach(() => { + mockAxios.reset(); + jest.clearAllMocks(); + }); + + describe('uploadDocumentToMistral', () => { + beforeEach(() => { + // Create a more complete mock for file streams that FormData can work with + const mockReadStream = { + on: jest.fn().mockImplementation(function (event, handler) { + // Simulate immediate 'end' event to make FormData complete processing + if (event === 'end') { + handler(); + } + return this; + }), + pipe: jest.fn().mockImplementation(function () { + return this; + }), + pause: jest.fn(), + resume: jest.fn(), + emit: jest.fn(), + once: jest.fn(), + destroy: jest.fn(), + }; + + fs.createReadStream = jest.fn().mockReturnValue(mockReadStream); + + // Mock FormData's append to avoid actual stream processing + jest.mock('form-data', () => { + const mockFormData = function () { + return { + append: jest.fn(), + getHeaders: jest + .fn() + .mockReturnValue({ 'content-type': 'multipart/form-data; boundary=---boundary' }), + getBuffer: jest.fn().mockReturnValue(Buffer.from('mock-form-data')), + getLength: jest.fn().mockReturnValue(100), + }; + }; + return mockFormData; + }); + }); + + it('should upload a document to Mistral API using file streaming', async () => { + const mockResponse = { data: { id: 'file-123', purpose: 'ocr' } }; + mockAxios.post.mockResolvedValueOnce(mockResponse); + + const result = await uploadDocumentToMistral({ + filePath: '/path/to/test.pdf', + fileName: 'test.pdf', + apiKey: 'test-api-key', + }); + + // Check that createReadStream was called with the correct file path + expect(fs.createReadStream).toHaveBeenCalledWith('/path/to/test.pdf'); + + // Since we're mocking FormData, we'll just check that axios was called correctly + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://api.mistral.ai/v1/files', + expect.anything(), + expect.objectContaining({ + headers: expect.objectContaining({ + Authorization: 'Bearer test-api-key', + }), + maxBodyLength: Infinity, + maxContentLength: Infinity, + }), + ); + expect(result).toEqual(mockResponse.data); + }); + + it('should handle errors during document upload', async () => { + const errorMessage = 'API error'; + mockAxios.post.mockRejectedValueOnce(new Error(errorMessage)); + + await expect( + uploadDocumentToMistral({ + filePath: '/path/to/test.pdf', + fileName: 'test.pdf', + apiKey: 'test-api-key', + }), + ).rejects.toThrow(); + + const { logger } = require('~/config'); + expect(logger.error).toHaveBeenCalledWith( + expect.stringContaining('Error uploading document to Mistral:'), + expect.any(String), + ); + }); + }); + + describe('getSignedUrl', () => { + it('should fetch signed URL from Mistral API', async () => { + const mockResponse = { data: { url: 'https://document-url.com' } }; + mockAxios.get.mockResolvedValueOnce(mockResponse); + + const result = await getSignedUrl({ + fileId: 'file-123', + apiKey: 'test-api-key', + }); + + expect(mockAxios.get).toHaveBeenCalledWith( + 'https://api.mistral.ai/v1/files/file-123/url?expiry=24', + { + headers: { + Authorization: 'Bearer test-api-key', + }, + }, + ); + expect(result).toEqual(mockResponse.data); + }); + + it('should handle errors when fetching signed URL', async () => { + const errorMessage = 'API error'; + mockAxios.get.mockRejectedValueOnce(new Error(errorMessage)); + + await expect( + getSignedUrl({ + fileId: 'file-123', + apiKey: 'test-api-key', + }), + ).rejects.toThrow(); + + const { logger } = require('~/config'); + expect(logger.error).toHaveBeenCalledWith('Error fetching signed URL:', errorMessage); + }); + }); + + describe('performOCR', () => { + it('should perform OCR using Mistral API', async () => { + const mockResponse = { + data: { + pages: [{ markdown: 'Page 1 content' }, { markdown: 'Page 2 content' }], + }, + }; + mockAxios.post.mockResolvedValueOnce(mockResponse); + + const result = await performOCR({ + apiKey: 'test-api-key', + documentUrl: 'https://document-url.com', + model: 'mistral-ocr-latest', + }); + + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://api.mistral.ai/v1/ocr', + { + model: 'mistral-ocr-latest', + include_image_base64: false, + document: { + type: 'document_url', + document_url: 'https://document-url.com', + }, + }, + { + headers: { + 'Content-Type': 'application/json', + Authorization: 'Bearer test-api-key', + }, + }, + ); + expect(result).toEqual(mockResponse.data); + }); + + it('should handle errors during OCR processing', async () => { + const errorMessage = 'OCR processing error'; + mockAxios.post.mockRejectedValueOnce(new Error(errorMessage)); + + await expect( + performOCR({ + apiKey: 'test-api-key', + documentUrl: 'https://document-url.com', + }), + ).rejects.toThrow(); + + const { logger } = require('~/config'); + expect(logger.error).toHaveBeenCalledWith('Error performing OCR:', errorMessage); + }); + }); + + describe('uploadMistralOCR', () => { + beforeEach(() => { + const mockReadStream = { + on: jest.fn().mockImplementation(function (event, handler) { + if (event === 'end') { + handler(); + } + return this; + }), + pipe: jest.fn().mockImplementation(function () { + return this; + }), + pause: jest.fn(), + resume: jest.fn(), + emit: jest.fn(), + once: jest.fn(), + destroy: jest.fn(), + }; + + fs.createReadStream = jest.fn().mockReturnValue(mockReadStream); + }); + + it('should process OCR for a file with standard configuration', async () => { + // Setup mocks + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + OCR_API_KEY: 'test-api-key', + OCR_BASEURL: 'https://api.mistral.ai/v1', + }); + + // Mock file upload response + mockAxios.post.mockResolvedValueOnce({ + data: { id: 'file-123', purpose: 'ocr' }, + }); + + // Mock signed URL response + mockAxios.get.mockResolvedValueOnce({ + data: { url: 'https://signed-url.com' }, + }); + + // Mock OCR response with text and images + mockAxios.post.mockResolvedValueOnce({ + data: { + pages: [ + { + markdown: 'Page 1 content', + images: [{ image_base64: 'base64image1' }], + }, + { + markdown: 'Page 2 content', + images: [{ image_base64: 'base64image2' }], + }, + ], + }, + }); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + // Use environment variable syntax to ensure loadAuthValues is called + apiKey: '${OCR_API_KEY}', + baseURL: '${OCR_BASEURL}', + mistralModel: 'mistral-medium', + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'document.pdf', + }; + + const result = await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + expect(loadAuthValues).toHaveBeenCalledWith({ + userId: 'user123', + authFields: ['OCR_BASEURL', 'OCR_API_KEY'], + optional: expect.any(Set), + }); + + // Verify OCR result + expect(result).toEqual({ + filename: 'document.pdf', + bytes: expect.any(Number), + filepath: 'mistral_ocr', + text: expect.stringContaining('# PAGE 1'), + images: ['base64image1', 'base64image2'], + }); + }); + + it('should process variable references in configuration', async () => { + // Setup mocks with environment variables + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + CUSTOM_API_KEY: 'custom-api-key', + CUSTOM_BASEURL: 'https://custom-api.mistral.ai/v1', + }); + + // Mock API responses + mockAxios.post.mockResolvedValueOnce({ + data: { id: 'file-123', purpose: 'ocr' }, + }); + mockAxios.get.mockResolvedValueOnce({ + data: { url: 'https://signed-url.com' }, + }); + mockAxios.post.mockResolvedValueOnce({ + data: { + pages: [{ markdown: 'Content from custom API' }], + }, + }); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + apiKey: '${CUSTOM_API_KEY}', + baseURL: '${CUSTOM_BASEURL}', + mistralModel: '${CUSTOM_MODEL}', + }, + }, + }, + }; + + // Set environment variable for model + process.env.CUSTOM_MODEL = 'mistral-large'; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'document.pdf', + }; + + const result = await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + // Verify that custom environment variables were extracted and used + expect(loadAuthValues).toHaveBeenCalledWith({ + userId: 'user123', + authFields: ['CUSTOM_BASEURL', 'CUSTOM_API_KEY'], + optional: expect.any(Set), + }); + + // Check that mistral-large was used in the OCR API call + expect(mockAxios.post).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + model: 'mistral-large', + }), + expect.anything(), + ); + + expect(result.text).toEqual('Content from custom API\n\n'); + }); + + it('should fall back to default values when variables are not properly formatted', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + OCR_API_KEY: 'default-api-key', + OCR_BASEURL: undefined, // Testing optional parameter + }); + + mockAxios.post.mockResolvedValueOnce({ + data: { id: 'file-123', purpose: 'ocr' }, + }); + mockAxios.get.mockResolvedValueOnce({ + data: { url: 'https://signed-url.com' }, + }); + mockAxios.post.mockResolvedValueOnce({ + data: { + pages: [{ markdown: 'Default API result' }], + }, + }); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + // Use environment variable syntax to ensure loadAuthValues is called + apiKey: '${INVALID_FORMAT}', // Using valid env var format but with an invalid name + baseURL: '${OCR_BASEURL}', // Using valid env var format + mistralModel: 'mistral-ocr-latest', // Plain string value + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'document.pdf', + }; + + await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + // Should use the default values + expect(loadAuthValues).toHaveBeenCalledWith({ + userId: 'user123', + authFields: ['OCR_BASEURL', 'INVALID_FORMAT'], + optional: expect.any(Set), + }); + + // Should use the default model when not using environment variable format + expect(mockAxios.post).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + model: 'mistral-ocr-latest', + }), + expect.anything(), + ); + }); + + it('should handle API errors during OCR process', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + OCR_API_KEY: 'test-api-key', + }); + + // Mock file upload to fail + mockAxios.post.mockRejectedValueOnce(new Error('Upload failed')); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + apiKey: 'OCR_API_KEY', + baseURL: 'OCR_BASEURL', + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'document.pdf', + }; + + await expect( + uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }), + ).rejects.toThrow('Error uploading document to Mistral OCR API'); + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + const { logAxiosError } = require('~/utils'); + expect(logAxiosError).toHaveBeenCalled(); + }); + + it('should handle single page documents without page numbering', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + loadAuthValues.mockResolvedValue({ + OCR_API_KEY: 'test-api-key', + OCR_BASEURL: 'https://api.mistral.ai/v1', // Make sure this is included + }); + + // Clear all previous mocks + mockAxios.post.mockClear(); + mockAxios.get.mockClear(); + + // 1. First mock: File upload response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ data: { id: 'file-123', purpose: 'ocr' } }), + ); + + // 2. Second mock: Signed URL response + mockAxios.get.mockImplementationOnce(() => + Promise.resolve({ data: { url: 'https://signed-url.com' } }), + ); + + // 3. Third mock: OCR response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ + data: { + pages: [{ markdown: 'Single page content' }], + }, + }), + ); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + apiKey: 'OCR_API_KEY', + baseURL: 'OCR_BASEURL', + mistralModel: 'mistral-ocr-latest', + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'single-page.pdf', + }; + + const result = await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + // Verify that single page documents don't include page numbering + expect(result.text).not.toContain('# PAGE'); + expect(result.text).toEqual('Single page content\n\n'); + }); + + it('should use literal values in configuration when provided directly', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + // We'll still mock this but it should not be used for literal values + loadAuthValues.mockResolvedValue({}); + + // Clear all previous mocks + mockAxios.post.mockClear(); + mockAxios.get.mockClear(); + + // 1. First mock: File upload response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ data: { id: 'file-123', purpose: 'ocr' } }), + ); + + // 2. Second mock: Signed URL response + mockAxios.get.mockImplementationOnce(() => + Promise.resolve({ data: { url: 'https://signed-url.com' } }), + ); + + // 3. Third mock: OCR response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ + data: { + pages: [{ markdown: 'Processed with literal config values' }], + }, + }), + ); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + // Direct values that should be used as-is, without variable substitution + apiKey: 'actual-api-key-value', + baseURL: 'https://direct-api-url.mistral.ai/v1', + mistralModel: 'mistral-direct-model', + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'direct-values.pdf', + }; + + const result = await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + // Verify the correct URL was used with the direct baseURL value + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://direct-api-url.mistral.ai/v1/files', + expect.any(Object), + expect.objectContaining({ + headers: expect.objectContaining({ + Authorization: 'Bearer actual-api-key-value', + }), + }), + ); + + // Check the OCR call was made with the direct model value + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://direct-api-url.mistral.ai/v1/ocr', + expect.objectContaining({ + model: 'mistral-direct-model', + }), + expect.any(Object), + ); + + // Verify the result + expect(result.text).toEqual('Processed with literal config values\n\n'); + + // Verify loadAuthValues was never called since we used direct values + expect(loadAuthValues).not.toHaveBeenCalled(); + }); + + it('should handle empty configuration values and use defaults', async () => { + const { loadAuthValues } = require('~/server/services/Tools/credentials'); + // Set up the mock values to be returned by loadAuthValues + loadAuthValues.mockResolvedValue({ + OCR_API_KEY: 'default-from-env-key', + OCR_BASEURL: 'https://default-from-env.mistral.ai/v1', + }); + + // Clear all previous mocks + mockAxios.post.mockClear(); + mockAxios.get.mockClear(); + + // 1. First mock: File upload response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ data: { id: 'file-123', purpose: 'ocr' } }), + ); + + // 2. Second mock: Signed URL response + mockAxios.get.mockImplementationOnce(() => + Promise.resolve({ data: { url: 'https://signed-url.com' } }), + ); + + // 3. Third mock: OCR response + mockAxios.post.mockImplementationOnce(() => + Promise.resolve({ + data: { + pages: [{ markdown: 'Content from default configuration' }], + }, + }), + ); + + const req = { + user: { id: 'user123' }, + app: { + locals: { + ocr: { + // Empty string values - should fall back to defaults + apiKey: '', + baseURL: '', + mistralModel: '', + }, + }, + }, + }; + + const file = { + path: '/tmp/upload/file.pdf', + originalname: 'empty-config.pdf', + }; + + const result = await uploadMistralOCR({ + req, + file, + file_id: 'file123', + entity_id: 'entity123', + }); + + expect(fs.createReadStream).toHaveBeenCalledWith('/tmp/upload/file.pdf'); + + // Verify loadAuthValues was called with the default variable names + expect(loadAuthValues).toHaveBeenCalledWith({ + userId: 'user123', + authFields: ['OCR_BASEURL', 'OCR_API_KEY'], + optional: expect.any(Set), + }); + + // Verify the API calls used the default values from loadAuthValues + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://default-from-env.mistral.ai/v1/files', + expect.any(Object), + expect.objectContaining({ + headers: expect.objectContaining({ + Authorization: 'Bearer default-from-env-key', + }), + }), + ); + + // Verify the OCR model defaulted to mistral-ocr-latest + expect(mockAxios.post).toHaveBeenCalledWith( + 'https://default-from-env.mistral.ai/v1/ocr', + expect.objectContaining({ + model: 'mistral-ocr-latest', + }), + expect.any(Object), + ); + + // Check result + expect(result.text).toEqual('Content from default configuration\n\n'); + }); + }); +}); diff --git a/api/server/services/Files/MistralOCR/index.js b/api/server/services/Files/MistralOCR/index.js new file mode 100644 index 0000000000..a6223d1ee5 --- /dev/null +++ b/api/server/services/Files/MistralOCR/index.js @@ -0,0 +1,5 @@ +const crud = require('./crud'); + +module.exports = { + ...crud, +}; diff --git a/api/server/services/Files/images/encode.js b/api/server/services/Files/images/encode.js index 94153ffc64..707632fb6a 100644 --- a/api/server/services/Files/images/encode.js +++ b/api/server/services/Files/images/encode.js @@ -49,6 +49,7 @@ async function encodeAndFormat(req, files, endpoint, mode) { const promises = []; const encodingMethods = {}; const result = { + text: '', files: [], image_urls: [], }; @@ -59,6 +60,9 @@ async function encodeAndFormat(req, files, endpoint, mode) { for (let file of files) { const source = file.source ?? FileSources.local; + if (source === FileSources.text && file.text) { + result.text += `${!result.text ? 'Attached document(s):\n```md' : '\n\n---\n\n'}# "${file.filename}"\n${file.text}\n`; + } if (!file.height) { promises.push([file, null]); @@ -85,6 +89,10 @@ async function encodeAndFormat(req, files, endpoint, mode) { promises.push(preparePayload(req, file)); } + if (result.text) { + result.text += '\n```'; + } + const detail = req.body.imageDetail ?? ImageDetail.auto; /** @type {Array<[MongoFile, string]>} */ diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index 8744eb409b..1bfadc4b23 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -28,8 +28,8 @@ const { addResourceFileId, deleteResourceFileId } = require('~/server/controller const { addAgentResourceFile, removeAgentResourceFiles } = require('~/models/Agent'); const { getOpenAIClient } = require('~/server/controllers/assistants/helpers'); const { createFile, updateFileUsage, deleteFiles } = require('~/models/File'); +const { loadAuthValues } = require('~/server/services/Tools/credentials'); const { getEndpointsConfig } = require('~/server/services/Config'); -const { loadAuthValues } = require('~/app/clients/tools/util'); const { LB_QueueAsyncCall } = require('~/server/utils/queue'); const { getStrategyFunctions } = require('./strategies'); const { determineFileType } = require('~/server/utils'); @@ -162,7 +162,6 @@ const processDeleteRequest = async ({ req, files }) => { for (const file of files) { const source = file.source ?? FileSources.local; - if (req.body.agent_id && req.body.tool_resource) { agentFiles.push({ tool_resource: req.body.tool_resource, @@ -170,6 +169,11 @@ const processDeleteRequest = async ({ req, files }) => { }); } + if (source === FileSources.text) { + resolvedFileIds.push(file.file_id); + continue; + } + if (checkOpenAIStorage(source) && !client[source]) { await initializeClients(); } @@ -521,6 +525,52 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { if (!isFileSearchEnabled) { throw new Error('File search is not enabled for Agents'); } + } else if (tool_resource === EToolResources.ocr) { + const isOCREnabled = await checkCapability(req, AgentCapabilities.ocr); + if (!isOCREnabled) { + throw new Error('OCR capability is not enabled for Agents'); + } + + const { handleFileUpload } = getStrategyFunctions( + req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr, + ); + const { file_id, temp_file_id } = metadata; + + const { + text, + bytes, + // TODO: OCR images support? + images, + filename, + filepath: ocrFileURL, + } = await handleFileUpload({ req, file, file_id, entity_id: agent_id }); + + const fileInfo = removeNullishValues({ + text, + bytes, + file_id, + temp_file_id, + user: req.user.id, + type: file.mimetype, + filepath: ocrFileURL, + source: FileSources.text, + filename: filename ?? file.originalname, + model: messageAttachment ? undefined : req.body.model, + context: messageAttachment ? FileContext.message_attachment : FileContext.agents, + }); + + if (!messageAttachment && tool_resource) { + await addAgentResourceFile({ + req, + file_id, + agent_id, + tool_resource, + }); + } + const result = await createFile(fileInfo, true); + return res + .status(200) + .json({ message: 'Agent file uploaded and processed successfully', ...result }); } const source = diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index ddfdd57469..128715bf3e 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -24,6 +24,7 @@ const { const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI'); const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code'); const { uploadVectors, deleteVectors } = require('./VectorDB'); +const { uploadMistralOCR } = require('./MistralOCR'); /** * Firebase Storage Strategy Functions @@ -127,6 +128,26 @@ const codeOutputStrategy = () => ({ getDownloadStream: getCodeOutputDownloadStream, }); +const mistralOCRStrategy = () => ({ + /** @type {typeof saveFileFromURL | null} */ + saveURL: null, + /** @type {typeof getLocalFileURL | null} */ + getFileURL: null, + /** @type {typeof saveLocalBuffer | null} */ + saveBuffer: null, + /** @type {typeof processLocalAvatar | null} */ + processAvatar: null, + /** @type {typeof uploadLocalImage | null} */ + handleImageUpload: null, + /** @type {typeof prepareImagesLocal | null} */ + prepareImagePayload: null, + /** @type {typeof deleteLocalFile | null} */ + deleteFile: null, + /** @type {typeof getLocalFileStream | null} */ + getDownloadStream: null, + handleFileUpload: uploadMistralOCR, +}); + // Strategy Selector const getStrategyFunctions = (fileSource) => { if (fileSource === FileSources.firebase) { @@ -141,6 +162,8 @@ const getStrategyFunctions = (fileSource) => { return vectorStrategy(); } else if (fileSource === FileSources.execute_code) { return codeOutputStrategy(); + } else if (fileSource === FileSources.mistral_ocr) { + return mistralOCRStrategy(); } else { throw new Error('Invalid file source'); } diff --git a/api/server/services/Tools/credentials.js b/api/server/services/Tools/credentials.js new file mode 100644 index 0000000000..b50a2460d4 --- /dev/null +++ b/api/server/services/Tools/credentials.js @@ -0,0 +1,56 @@ +const { getUserPluginAuthValue } = require('~/server/services/PluginService'); + +/** + * + * @param {Object} params + * @param {string} params.userId + * @param {string[]} params.authFields + * @param {Set} [params.optional] + * @param {boolean} [params.throwError] + * @returns + */ +const loadAuthValues = async ({ userId, authFields, optional, throwError = true }) => { + let authValues = {}; + + /** + * Finds the first non-empty value for the given authentication field, supporting alternate fields. + * @param {string[]} fields Array of strings representing the authentication fields. Supports alternate fields delimited by "||". + * @returns {Promise<{ authField: string, authValue: string} | null>} An object containing the authentication field and value, or null if not found. + */ + const findAuthValue = async (fields) => { + for (const field of fields) { + let value = process.env[field]; + if (value) { + return { authField: field, authValue: value }; + } + try { + value = await getUserPluginAuthValue(userId, field, throwError); + } catch (err) { + if (optional && optional.has(field)) { + return { authField: field, authValue: undefined }; + } + if (field === fields[fields.length - 1] && !value) { + throw err; + } + } + if (value) { + return { authField: field, authValue: value }; + } + } + return null; + }; + + for (let authField of authFields) { + const fields = authField.split('||'); + const result = await findAuthValue(fields); + if (result) { + authValues[result.authField] = result.authValue; + } + } + + return authValues; +}; + +module.exports = { + loadAuthValues, +}; diff --git a/api/server/utils/handleText.js b/api/server/utils/handleText.js index 8c681d8f4e..1b9cda28d7 100644 --- a/api/server/utils/handleText.js +++ b/api/server/utils/handleText.js @@ -203,6 +203,7 @@ function generateConfig(key, baseURL, endpoint) { AgentCapabilities.artifacts, AgentCapabilities.actions, AgentCapabilities.tools, + AgentCapabilities.ocr, ]; } diff --git a/api/test/__mocks__/logger.js b/api/test/__mocks__/logger.js index caeb004e39..549c57d5a4 100644 --- a/api/test/__mocks__/logger.js +++ b/api/test/__mocks__/logger.js @@ -39,7 +39,10 @@ jest.mock('winston-daily-rotate-file', () => { }); jest.mock('~/config', () => { + const actualModule = jest.requireActual('~/config'); return { + sendEvent: actualModule.sendEvent, + createAxiosInstance: actualModule.createAxiosInstance, logger: { info: jest.fn(), warn: jest.fn(), diff --git a/api/typedefs.js b/api/typedefs.js index 3045d9543b..21c4f1fecc 100644 --- a/api/typedefs.js +++ b/api/typedefs.js @@ -1787,3 +1787,51 @@ * @typedef {Promise<{ message: TMessage, conversation: TConversation }> | undefined} ClientDatabaseSavePromise * @memberof typedefs */ + +/** + * @exports OCRImage + * @typedef {Object} OCRImage + * @property {string} id - The identifier of the image. + * @property {number} top_left_x - X-coordinate of the top left corner of the image. + * @property {number} top_left_y - Y-coordinate of the top left corner of the image. + * @property {number} bottom_right_x - X-coordinate of the bottom right corner of the image. + * @property {number} bottom_right_y - Y-coordinate of the bottom right corner of the image. + * @property {string} image_base64 - Base64-encoded image data. + * @memberof typedefs + */ + +/** + * @exports PageDimensions + * @typedef {Object} PageDimensions + * @property {number} dpi - The dots per inch resolution of the page. + * @property {number} height - The height of the page in pixels. + * @property {number} width - The width of the page in pixels. + * @memberof typedefs + */ + +/** + * @exports OCRPage + * @typedef {Object} OCRPage + * @property {number} index - The index of the page in the document. + * @property {string} markdown - The extracted text content of the page in markdown format. + * @property {OCRImage[]} images - Array of images found on the page. + * @property {PageDimensions} dimensions - The dimensions of the page. + * @memberof typedefs + */ + +/** + * @exports OCRUsageInfo + * @typedef {Object} OCRUsageInfo + * @property {number} pages_processed - Number of pages processed in the document. + * @property {number} doc_size_bytes - Size of the document in bytes. + * @memberof typedefs + */ + +/** + * @exports OCRResult + * @typedef {Object} OCRResult + * @property {OCRPage[]} pages - Array of pages extracted from the document. + * @property {string} model - The model used for OCR processing. + * @property {OCRUsageInfo} usage_info - Usage information for the OCR operation. + * @memberof typedefs + */ diff --git a/client/src/common/agents-types.ts b/client/src/common/agents-types.ts index a9c24106bc..f4b8aac9fe 100644 --- a/client/src/common/agents-types.ts +++ b/client/src/common/agents-types.ts @@ -5,6 +5,7 @@ import type { OptionWithIcon, ExtendedFile } from './types'; export type TAgentOption = OptionWithIcon & Agent & { knowledge_files?: Array<[string, ExtendedFile]>; + context_files?: Array<[string, ExtendedFile]>; code_files?: Array<[string, ExtendedFile]>; }; diff --git a/client/src/common/types.ts b/client/src/common/types.ts index 380ec573b8..f0b2b8a238 100644 --- a/client/src/common/types.ts +++ b/client/src/common/types.ts @@ -483,6 +483,7 @@ export interface ExtendedFile { attached?: boolean; embedded?: boolean; tool_resource?: string; + metadata?: t.TFile['metadata']; } export type ContextType = { navVisible: boolean; setNavVisible: (visible: boolean) => void }; diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx index 54a8a595c4..8841a0ae51 100644 --- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx +++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx @@ -1,7 +1,7 @@ import * as Ariakit from '@ariakit/react'; import React, { useRef, useState, useMemo } from 'react'; -import { FileSearch, ImageUpIcon, TerminalSquareIcon } from 'lucide-react'; import { EToolResources, EModelEndpoint } from 'librechat-data-provider'; +import { FileSearch, ImageUpIcon, TerminalSquareIcon, FileType2Icon } from 'lucide-react'; import { FileUpload, TooltipAnchor, DropdownPopup } from '~/components/ui'; import { useGetEndpointsQuery } from '~/data-provider'; import { AttachmentIcon } from '~/components/svg'; @@ -49,6 +49,17 @@ const AttachFile = ({ isRTL, disabled, handleFileChange }: AttachFileProps) => { }, ]; + if (capabilities.includes(EToolResources.ocr)) { + items.push({ + label: localize('com_ui_upload_ocr_text'), + onClick: () => { + setToolResource(EToolResources.ocr); + handleUploadClick(); + }, + icon: , + }); + } + if (capabilities.includes(EToolResources.file_search)) { items.push({ label: localize('com_ui_upload_file_search'), diff --git a/client/src/components/Chat/Input/Files/DragDropModal.tsx b/client/src/components/Chat/Input/Files/DragDropModal.tsx index b252ae1a93..2abc15a45b 100644 --- a/client/src/components/Chat/Input/Files/DragDropModal.tsx +++ b/client/src/components/Chat/Input/Files/DragDropModal.tsx @@ -1,6 +1,6 @@ import React, { useMemo } from 'react'; import { EModelEndpoint, EToolResources } from 'librechat-data-provider'; -import { FileSearch, ImageUpIcon, TerminalSquareIcon } from 'lucide-react'; +import { FileSearch, ImageUpIcon, FileType2Icon, TerminalSquareIcon } from 'lucide-react'; import OGDialogTemplate from '~/components/ui/OGDialogTemplate'; import { useGetEndpointsQuery } from '~/data-provider'; import useLocalize from '~/hooks/useLocalize'; @@ -50,6 +50,12 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD value: EToolResources.execute_code, icon: , }); + } else if (capability === EToolResources.ocr) { + _options.push({ + label: localize('com_ui_upload_ocr_text'), + value: EToolResources.ocr, + icon: , + }); } } diff --git a/client/src/components/Chat/Input/Files/FilePreview.tsx b/client/src/components/Chat/Input/Files/FilePreview.tsx index 80933b8503..02851119af 100644 --- a/client/src/components/Chat/Input/Files/FilePreview.tsx +++ b/client/src/components/Chat/Input/Files/FilePreview.tsx @@ -19,7 +19,7 @@ const FilePreview = ({ }; className?: string; }) => { - const radius = 55; // Radius of the SVG circle + const radius = 55; const circumference = 2 * Math.PI * radius; const progress = useProgress( file?.['progress'] ?? 1, @@ -27,16 +27,15 @@ const FilePreview = ({ (file as ExtendedFile | undefined)?.size ?? 1, ); - // Calculate the offset based on the loading progress const offset = circumference - progress * circumference; const circleCSSProperties = { transition: 'stroke-dashoffset 0.5s linear', }; return ( -
+
- + {progress < 1 && ( + + + +
+ ); + } + + if (source === FileSources.text) { + return ( +
+ + + +
+ ); + } + + if (source === FileSources.vectordb) { + return ( +
+ + + +
+ ); } const endpoint = sourceToEndpoint[source ?? '']; @@ -31,7 +64,7 @@ export default function SourceIcon({ return null; } return ( - +
); } diff --git a/client/src/components/SidePanel/Agents/AgentConfig.tsx b/client/src/components/SidePanel/Agents/AgentConfig.tsx index 9fc7674158..7ece82bed1 100644 --- a/client/src/components/SidePanel/Agents/AgentConfig.tsx +++ b/client/src/components/SidePanel/Agents/AgentConfig.tsx @@ -23,6 +23,7 @@ import { processAgentOption } from '~/utils'; import AdminSettings from './AdminSettings'; import DeleteButton from './DeleteButton'; import AgentAvatar from './AgentAvatar'; +import FileContext from './FileContext'; import { Spinner } from '~/components'; import FileSearch from './FileSearch'; import ShareAgent from './ShareAgent'; @@ -82,6 +83,10 @@ export default function AgentConfig({ () => agentsConfig?.capabilities.includes(AgentCapabilities.artifacts) ?? false, [agentsConfig], ); + const ocrEnabled = useMemo( + () => agentsConfig?.capabilities.includes(AgentCapabilities.ocr) ?? false, + [agentsConfig], + ); const fileSearchEnabled = useMemo( () => agentsConfig?.capabilities.includes(AgentCapabilities.file_search) ?? false, [agentsConfig], @@ -91,6 +96,26 @@ export default function AgentConfig({ [agentsConfig], ); + const context_files = useMemo(() => { + if (typeof agent === 'string') { + return []; + } + + if (agent?.id !== agent_id) { + return []; + } + + if (agent.context_files) { + return agent.context_files; + } + + const _agent = processAgentOption({ + agent, + fileMap, + }); + return _agent.context_files ?? []; + }, [agent, agent_id, fileMap]); + const knowledge_files = useMemo(() => { if (typeof agent === 'string') { return []; @@ -334,7 +359,7 @@ export default function AgentConfig({ - {(codeEnabled || fileSearchEnabled || artifactsEnabled) && ( + {(codeEnabled || fileSearchEnabled || artifactsEnabled || ocrEnabled) && (
)} {/* Agent Tools & Actions */} diff --git a/client/src/components/SidePanel/Agents/FileContext.tsx b/client/src/components/SidePanel/Agents/FileContext.tsx new file mode 100644 index 0000000000..eb0e9e9a13 --- /dev/null +++ b/client/src/components/SidePanel/Agents/FileContext.tsx @@ -0,0 +1,128 @@ +import { useState, useRef } from 'react'; +import { + EModelEndpoint, + EToolResources, + mergeFileConfig, + fileConfig as defaultFileConfig, +} from 'librechat-data-provider'; +import type { ExtendedFile } from '~/common'; +import { useFileHandling, useLocalize, useLazyEffect } from '~/hooks'; +import FileRow from '~/components/Chat/Input/Files/FileRow'; +import { useGetFileConfig } from '~/data-provider'; +import { HoverCard, HoverCardContent, HoverCardPortal, HoverCardTrigger } from '~/components/ui'; +import { AttachmentIcon, CircleHelpIcon } from '~/components/svg'; +import { useChatContext } from '~/Providers'; +import { ESide } from '~/common'; + +export default function FileContext({ + agent_id, + files: _files, +}: { + agent_id: string; + files?: [string, ExtendedFile][]; +}) { + const localize = useLocalize(); + const { setFilesLoading } = useChatContext(); + const fileInputRef = useRef(null); + const [files, setFiles] = useState>(new Map()); + + const { data: fileConfig = defaultFileConfig } = useGetFileConfig({ + select: (data) => mergeFileConfig(data), + }); + + const { handleFileChange } = useFileHandling({ + overrideEndpoint: EModelEndpoint.agents, + additionalMetadata: { agent_id, tool_resource: EToolResources.ocr }, + fileSetter: setFiles, + }); + + useLazyEffect( + () => { + if (_files) { + setFiles(new Map(_files)); + } + }, + [_files], + 750, + ); + + const endpointFileConfig = fileConfig.endpoints[EModelEndpoint.agents]; + const isUploadDisabled = endpointFileConfig.disabled ?? false; + + if (isUploadDisabled) { + return null; + } + + const handleButtonClick = () => { + // necessary to reset the input + if (fileInputRef.current) { + fileInputRef.current.value = ''; + } + fileInputRef.current?.click(); + }; + + return ( +
+ +
+ + + + + + + + +
+

+ {localize('com_agents_file_context_info')} +

+
+
+
+
+
+
+ {/* File Context (OCR) Files */} +
{children}
} + /> +
+ +
+ {/* Disabled Message */} + {agent_id ? null : ( +
+ {localize('com_agents_file_context_disabled')} +
+ )} +
+
+ ); +} diff --git a/client/src/components/SidePanel/Files/PanelColumns.tsx b/client/src/components/SidePanel/Files/PanelColumns.tsx index d8fc15f6c6..5d199dfc9a 100644 --- a/client/src/components/SidePanel/Files/PanelColumns.tsx +++ b/client/src/components/SidePanel/Files/PanelColumns.tsx @@ -1,21 +1,23 @@ import { ArrowUpDown } from 'lucide-react'; import type { ColumnDef } from '@tanstack/react-table'; import type { TFile } from 'librechat-data-provider'; +import useLocalize from '~/hooks/useLocalize'; import PanelFileCell from './PanelFileCell'; import { Button } from '~/components/ui'; import { formatDate } from '~/utils'; -export const columns: ColumnDef[] = [ +export const columns: ColumnDef[] = [ { accessorKey: 'filename', header: ({ column }) => { + const localize = useLocalize(); return ( ); @@ -31,20 +33,21 @@ export const columns: ColumnDef[] = [ size: '10%', }, header: ({ column }) => { + const localize = useLocalize(); return ( ); }, cell: ({ row }) => ( - {formatDate(row.original.updatedAt?.toString() ?? '')} + {formatDate(row.original?.updatedAt?.toString() ?? '')} ), }, diff --git a/client/src/components/SidePanel/Files/PanelFileCell.tsx b/client/src/components/SidePanel/Files/PanelFileCell.tsx index e039b8b257..2a38a57172 100644 --- a/client/src/components/SidePanel/Files/PanelFileCell.tsx +++ b/client/src/components/SidePanel/Files/PanelFileCell.tsx @@ -6,7 +6,6 @@ import { getFileType } from '~/utils'; export default function PanelFileCell({ row }: { row: Row }) { const file = row.original; - return (
{file?.type.startsWith('image') === true ? ( diff --git a/client/src/components/SidePanel/Files/PanelTable.tsx b/client/src/components/SidePanel/Files/PanelTable.tsx index 51d0c864c1..87a965a968 100644 --- a/client/src/components/SidePanel/Files/PanelTable.tsx +++ b/client/src/components/SidePanel/Files/PanelTable.tsx @@ -159,6 +159,7 @@ export default function DataTable({ columns, data }: DataTablePro filename: fileData.filename, source: fileData.source, size: fileData.bytes, + metadata: fileData.metadata, }); }, [addFile, fileMap, conversation, localize, showToast, fileConfig.endpoints], diff --git a/client/src/data-provider/Files/mutations.ts b/client/src/data-provider/Files/mutations.ts index 9f51a5e717..8ac2a5e49e 100644 --- a/client/src/data-provider/Files/mutations.ts +++ b/client/src/data-provider/Files/mutations.ts @@ -63,8 +63,9 @@ export const useUploadFileMutation = ( const update = {}; const prevResources = agent.tool_resources ?? {}; - const prevResource: t.ExecuteCodeResource | t.AgentFileSearchResource = agent - .tool_resources?.[tool_resource] ?? { + const prevResource: t.ExecuteCodeResource | t.AgentFileResource = agent.tool_resources?.[ + tool_resource + ] ?? { file_ids: [], }; if (!prevResource.file_ids) { diff --git a/client/src/locales/en/translation.json b/client/src/locales/en/translation.json index d471d6a00a..e56024e169 100644 --- a/client/src/locales/en/translation.json +++ b/client/src/locales/en/translation.json @@ -11,6 +11,9 @@ "com_agents_create_error": "There was an error creating your agent.", "com_agents_description_placeholder": "Optional: Describe your Agent here", "com_agents_enable_file_search": "Enable File Search", + "com_agents_file_context": "File Context (OCR)", + "com_agents_file_context_disabled": "Agent must be created before uploading files for File Context.", + "com_agents_file_context_info": "Files uploaded as \"Context\" are processed using OCR to extract text, which is then added to the Agent's instructions. Ideal for documents, images with text, or PDFs where you need the full text content of a file", "com_agents_file_search_disabled": "Agent must be created before uploading files for File Search.", "com_agents_file_search_info": "When enabled, the agent will be informed of the exact filenames listed below, allowing it to retrieve relevant context from these files.", "com_agents_instructions_placeholder": "The system instructions that the agent uses", @@ -811,10 +814,12 @@ "com_ui_upload_code_files": "Upload for Code Interpreter", "com_ui_upload_delay": "Uploading \"{{0}}\" is taking more time than anticipated. Please wait while the file finishes indexing for retrieval.", "com_ui_upload_error": "There was an error uploading your file", + "com_ui_upload_file_context": "Upload File Context", "com_ui_upload_file_search": "Upload for File Search", "com_ui_upload_files": "Upload files", "com_ui_upload_image": "Upload an image", "com_ui_upload_image_input": "Upload Image", + "com_ui_upload_ocr_text": "Upload as Text", "com_ui_upload_invalid": "Invalid file for upload. Must be an image not exceeding the limit", "com_ui_upload_invalid_var": "Invalid file for upload. Must be an image not exceeding {{0}} MB", "com_ui_upload_success": "Successfully uploaded file", @@ -835,4 +840,4 @@ "com_ui_zoom": "Zoom", "com_user_message": "You", "com_warning_resubmit_unsupported": "Resubmitting the AI message is not supported for this endpoint." -} \ No newline at end of file +} diff --git a/client/src/utils/forms.tsx b/client/src/utils/forms.tsx index 04e52c9893..f8ccc7cb42 100644 --- a/client/src/utils/forms.tsx +++ b/client/src/utils/forms.tsx @@ -58,6 +58,9 @@ export const processAgentOption = ({ label: _agent?.name ?? '', value: _agent?.id ?? '', icon: isGlobal ? : null, + context_files: _agent?.tool_resources?.ocr?.file_ids + ? ([] as Array<[string, ExtendedFile]>) + : undefined, knowledge_files: _agent?.tool_resources?.file_search?.file_ids ? ([] as Array<[string, ExtendedFile]>) : undefined, @@ -83,7 +86,7 @@ export const processAgentOption = ({ const source = tool_resource === EToolResources.file_search ? FileSources.vectordb - : file?.source ?? FileSources.local; + : (file?.source ?? FileSources.local); if (file) { list?.push([ @@ -97,6 +100,7 @@ export const processAgentOption = ({ height: file.height, size: file.bytes, preview: file.filepath, + metadata: file.metadata, progress: 1, source, }, @@ -117,6 +121,16 @@ export const processAgentOption = ({ } }; + if (agent.context_files && _agent?.tool_resources?.ocr?.file_ids) { + _agent.tool_resources.ocr.file_ids.forEach((file_id) => + handleFile({ + file_id, + list: agent.context_files, + tool_resource: EToolResources.ocr, + }), + ); + } + if (agent.knowledge_files && _agent?.tool_resources?.file_search?.file_ids) { _agent.tool_resources.file_search.file_ids.forEach((file_id) => handleFile({ diff --git a/package-lock.json b/package-lock.json index e9d9c1ef16..d7e4f93476 100644 --- a/package-lock.json +++ b/package-lock.json @@ -41014,7 +41014,7 @@ }, "packages/data-provider": { "name": "librechat-data-provider", - "version": "0.7.7", + "version": "0.7.71", "license": "ISC", "dependencies": { "axios": "^1.8.2", diff --git a/packages/data-provider/package.json b/packages/data-provider/package.json index 2e151fd072..5ebf9877c0 100644 --- a/packages/data-provider/package.json +++ b/packages/data-provider/package.json @@ -1,6 +1,6 @@ { "name": "librechat-data-provider", - "version": "0.7.7", + "version": "0.7.71", "description": "data services for librechat apps", "main": "dist/index.js", "module": "dist/index.es.js", diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index d4c400c827..1cc0d5a785 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -168,6 +168,7 @@ export enum AgentCapabilities { artifacts = 'artifacts', actions = 'actions', tools = 'tools', + ocr = 'ocr', } export const defaultAssistantsVersion = { @@ -242,6 +243,7 @@ export const agentsEndpointSChema = baseEndpointSchema.merge( AgentCapabilities.artifacts, AgentCapabilities.actions, AgentCapabilities.tools, + AgentCapabilities.ocr, ]), }), ); @@ -534,9 +536,22 @@ export type TStartupConfig = { bundlerURL?: string; }; +export enum OCRStrategy { + MISTRAL_OCR = 'mistral_ocr', + CUSTOM_OCR = 'custom_ocr', +} + +export const ocrSchema = z.object({ + mistralModel: z.string().optional(), + apiKey: z.string().optional().default('OCR_API_KEY'), + baseURL: z.string().optional().default('OCR_BASEURL'), + strategy: z.nativeEnum(OCRStrategy).default(OCRStrategy.MISTRAL_OCR), +}); + export const configSchema = z.object({ version: z.string(), cache: z.boolean().default(true), + ocr: ocrSchema.optional(), secureImageLinks: z.boolean().optional(), imageOutputType: z.nativeEnum(EImageOutputType).default(EImageOutputType.PNG), includedTools: z.array(z.string()).optional(), @@ -1175,7 +1190,7 @@ export enum Constants { /** Key for the app's version. */ VERSION = 'v0.7.7', /** Key for the Custom Config's version (librechat.yaml). */ - CONFIG_VERSION = '1.2.1', + CONFIG_VERSION = '1.2.2', /** Standard value for the first message's `parentMessageId` value, to indicate no parent exists. */ NO_PARENT = '00000000-0000-0000-0000-000000000000', /** Standard value for the initial conversationId before a request is sent */ diff --git a/packages/data-provider/src/index.ts b/packages/data-provider/src/index.ts index 90b396001b..028ed07f19 100644 --- a/packages/data-provider/src/index.ts +++ b/packages/data-provider/src/index.ts @@ -7,6 +7,7 @@ export * from './file-config'; export * from './artifacts'; /* schema helpers */ export * from './parsers'; +export * from './ocr'; export * from './zod'; /* custom/dynamic configurations */ export * from './generate'; diff --git a/packages/data-provider/src/ocr.ts b/packages/data-provider/src/ocr.ts new file mode 100644 index 0000000000..cfde43025b --- /dev/null +++ b/packages/data-provider/src/ocr.ts @@ -0,0 +1,14 @@ +import type { TCustomConfig } from '../src/config'; +import { OCRStrategy } from '../src/config'; + +export function loadOCRConfig(config: TCustomConfig['ocr']): TCustomConfig['ocr'] { + const baseURL = config?.baseURL ?? ''; + const apiKey = config?.apiKey ?? ''; + const mistralModel = config?.mistralModel ?? ''; + return { + apiKey, + baseURL, + mistralModel, + strategy: config?.strategy ?? OCRStrategy.MISTRAL_OCR, + }; +} diff --git a/packages/data-provider/src/schemas.ts b/packages/data-provider/src/schemas.ts index 533d6ffc37..5be5b44453 100644 --- a/packages/data-provider/src/schemas.ts +++ b/packages/data-provider/src/schemas.ts @@ -1152,7 +1152,6 @@ export const compactAgentsSchema = tConversationSchema iconURL: true, greeting: true, agent_id: true, - resendFiles: true, instructions: true, additional_instructions: true, }) diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index a351752298..7092f98ae6 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -27,6 +27,7 @@ export enum EToolResources { code_interpreter = 'code_interpreter', execute_code = 'execute_code', file_search = 'file_search', + ocr = 'ocr', } export type Tool = { @@ -163,7 +164,8 @@ export type AgentModelParameters = { export interface AgentToolResources { execute_code?: ExecuteCodeResource; - file_search?: AgentFileSearchResource; + file_search?: AgentFileResource; + ocr?: Omit; } export interface ExecuteCodeResource { /** @@ -177,7 +179,7 @@ export interface ExecuteCodeResource { files?: Array; } -export interface AgentFileSearchResource { +export interface AgentFileResource { /** * The ID of the vector store attached to this agent. There * can be a maximum of 1 vector store attached to the agent. diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index 5985096f4c..78ef7781e8 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -8,6 +8,8 @@ export enum FileSources { s3 = 's3', vectordb = 'vectordb', execute_code = 'execute_code', + mistral_ocr = 'mistral_ocr', + text = 'text', } export const checkOpenAIStorage = (source: string) => diff --git a/packages/data-schemas/src/schema/file.ts b/packages/data-schemas/src/schema/file.ts index b9fddd82d5..6d3b22a5a5 100644 --- a/packages/data-schemas/src/schema/file.ts +++ b/packages/data-schemas/src/schema/file.ts @@ -8,6 +8,7 @@ export interface IMongoFile extends Document { file_id: string; temp_file_id?: string; bytes: number; + text?: string; filename: string; filepath: string; object: 'file'; @@ -72,6 +73,9 @@ const file: Schema = new Schema( type: String, required: true, }, + text: { + type: String, + }, context: { type: String, },