diff options
| author | Luccas Paroni <[email protected]> | 2025-08-05 16:19:47 -0300 |
|---|---|---|
| committer | GitHub <[email protected]> | 2025-08-05 19:19:47 +0000 |
| commit | 2778c7d851740631b4dbacf907b63db26a6e1816 (patch) | |
| tree | f3361e32a59277ba5677ec2d06b6773d2f8c57c3 /packages/core/src | |
| parent | b4651452293295020874dfb6ba6707a47c555175 (diff) | |
feat(core): Parse Multimodal MCP Tool responses (#5529)
Co-authored-by: Luccas Paroni <[email protected]>
Diffstat (limited to 'packages/core/src')
| -rw-r--r-- | packages/core/src/tools/mcp-tool.test.ts | 369 | ||||
| -rw-r--r-- | packages/core/src/tools/mcp-tool.ts | 207 |
2 files changed, 524 insertions, 52 deletions
diff --git a/packages/core/src/tools/mcp-tool.test.ts b/packages/core/src/tools/mcp-tool.test.ts index b5843b95..f8a9a8ba 100644 --- a/packages/core/src/tools/mcp-tool.test.ts +++ b/packages/core/src/tools/mcp-tool.test.ts @@ -131,8 +131,11 @@ describe('DiscoveredMCPTool', () => { success: true, details: 'executed', }; - const mockFunctionResponseContent: Part[] = [ - { text: JSON.stringify(mockToolSuccessResultObject) }, + const mockFunctionResponseContent = [ + { + type: 'text', + text: JSON.stringify(mockToolSuccessResultObject), + }, ]; const mockMcpToolResponseParts: Part[] = [ { @@ -149,11 +152,13 @@ describe('DiscoveredMCPTool', () => { expect(mockCallTool).toHaveBeenCalledWith([ { name: serverToolName, args: params }, ]); - expect(toolResult.llmContent).toEqual(mockMcpToolResponseParts); const stringifiedResponseContent = JSON.stringify( mockToolSuccessResultObject, ); + expect(toolResult.llmContent).toEqual([ + { text: stringifiedResponseContent }, + ]); expect(toolResult.returnDisplay).toBe(stringifiedResponseContent); }); @@ -170,6 +175,9 @@ describe('DiscoveredMCPTool', () => { mockCallTool.mockResolvedValue(mockMcpToolResponsePartsEmpty); const toolResult: ToolResult = await tool.execute(params); expect(toolResult.returnDisplay).toBe('```json\n[]\n```'); + expect(toolResult.llmContent).toEqual([ + { text: '[Error: Could not parse tool response]' }, + ]); }); it('should propagate rejection if mcpTool.callTool rejects', async () => { @@ -186,6 +194,361 @@ describe('DiscoveredMCPTool', () => { await expect(tool.execute(params)).rejects.toThrow(expectedError); }); + + it('should handle a simple text response correctly', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { query: 'test' }; + const successMessage = 'This is a success message.'; + + // Simulate the response from the GenAI SDK, which wraps the MCP + // response in a functionResponse Part. + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + // The `content` array contains MCP ContentBlocks. + content: [{ type: 'text', text: successMessage }], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + // 1. Assert that the llmContent sent to the scheduler is a clean Part array. + expect(toolResult.llmContent).toEqual([{ text: successMessage }]); + + // 2. Assert that the display output is the simple text message. + expect(toolResult.returnDisplay).toBe(successMessage); + + // 3. Verify that the underlying callTool was made correctly. + expect(mockCallTool).toHaveBeenCalledWith([ + { name: serverToolName, args: params }, + ]); + }); + + it('should handle an AudioBlock response', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { action: 'play' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { + type: 'audio', + data: 'BASE64_AUDIO_DATA', + mimeType: 'audio/mp3', + }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { + text: `[Tool '${serverToolName}' provided the following audio data with mime-type: audio/mp3]`, + }, + { + inlineData: { + mimeType: 'audio/mp3', + data: 'BASE64_AUDIO_DATA', + }, + }, + ]); + expect(toolResult.returnDisplay).toBe('[Audio: audio/mp3]'); + }); + + it('should handle a ResourceLinkBlock response', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { resource: 'get' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { + type: 'resource_link', + uri: 'file:///path/to/thing', + name: 'resource-name', + title: 'My Resource', + }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { + text: 'Resource Link: My Resource at file:///path/to/thing', + }, + ]); + expect(toolResult.returnDisplay).toBe( + '[Link to My Resource: file:///path/to/thing]', + ); + }); + + it('should handle an embedded text ResourceBlock response', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { resource: 'get' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { + type: 'resource', + resource: { + uri: 'file:///path/to/text.txt', + text: 'This is the text content.', + mimeType: 'text/plain', + }, + }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { text: 'This is the text content.' }, + ]); + expect(toolResult.returnDisplay).toBe('This is the text content.'); + }); + + it('should handle an embedded binary ResourceBlock response', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { resource: 'get' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { + type: 'resource', + resource: { + uri: 'file:///path/to/data.bin', + blob: 'BASE64_BINARY_DATA', + mimeType: 'application/octet-stream', + }, + }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { + text: `[Tool '${serverToolName}' provided the following embedded resource with mime-type: application/octet-stream]`, + }, + { + inlineData: { + mimeType: 'application/octet-stream', + data: 'BASE64_BINARY_DATA', + }, + }, + ]); + expect(toolResult.returnDisplay).toBe( + '[Embedded Resource: application/octet-stream]', + ); + }); + + it('should handle a mix of content block types', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { action: 'complex' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { type: 'text', text: 'First part.' }, + { + type: 'image', + data: 'BASE64_IMAGE_DATA', + mimeType: 'image/jpeg', + }, + { type: 'text', text: 'Second part.' }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { text: 'First part.' }, + { + text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`, + }, + { + inlineData: { + mimeType: 'image/jpeg', + data: 'BASE64_IMAGE_DATA', + }, + }, + { text: 'Second part.' }, + ]); + expect(toolResult.returnDisplay).toBe( + 'First part.\n[Image: image/jpeg]\nSecond part.', + ); + }); + + it('should ignore unknown content block types', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { action: 'test' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { type: 'text', text: 'Valid part.' }, + { type: 'future_block', data: 'some-data' }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([{ text: 'Valid part.' }]); + expect(toolResult.returnDisplay).toBe( + 'Valid part.\n[Unknown content type: future_block]', + ); + }); + + it('should handle a complex mix of content block types', async () => { + const tool = new DiscoveredMCPTool( + mockCallableToolInstance, + serverName, + serverToolName, + baseDescription, + inputSchema, + ); + const params = { action: 'super-complex' }; + const sdkResponse: Part[] = [ + { + functionResponse: { + name: serverToolName, + response: { + content: [ + { type: 'text', text: 'Here is a resource.' }, + { + type: 'resource_link', + uri: 'file:///path/to/resource', + name: 'resource-name', + title: 'My Resource', + }, + { + type: 'resource', + resource: { + uri: 'file:///path/to/text.txt', + text: 'Embedded text content.', + mimeType: 'text/plain', + }, + }, + { + type: 'image', + data: 'BASE64_IMAGE_DATA', + mimeType: 'image/jpeg', + }, + ], + }, + }, + }, + ]; + mockCallTool.mockResolvedValue(sdkResponse); + + const toolResult = await tool.execute(params); + + expect(toolResult.llmContent).toEqual([ + { text: 'Here is a resource.' }, + { + text: 'Resource Link: My Resource at file:///path/to/resource', + }, + { text: 'Embedded text content.' }, + { + text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`, + }, + { + inlineData: { + mimeType: 'image/jpeg', + data: 'BASE64_IMAGE_DATA', + }, + }, + ]); + expect(toolResult.returnDisplay).toBe( + 'Here is a resource.\n[Link to My Resource: file:///path/to/resource]\nEmbedded text content.\n[Image: image/jpeg]', + ); + }); }); describe('shouldConfirmExecute', () => { diff --git a/packages/core/src/tools/mcp-tool.ts b/packages/core/src/tools/mcp-tool.ts index 9e814bba..3dd62e2b 100644 --- a/packages/core/src/tools/mcp-tool.ts +++ b/packages/core/src/tools/mcp-tool.ts @@ -22,6 +22,40 @@ import { type ToolParams = Record<string, unknown>; +// Discriminated union for MCP Content Blocks to ensure type safety. +type McpTextBlock = { + type: 'text'; + text: string; +}; + +type McpMediaBlock = { + type: 'image' | 'audio'; + mimeType: string; + data: string; +}; + +type McpResourceBlock = { + type: 'resource'; + resource: { + text?: string; + blob?: string; + mimeType?: string; + }; +}; + +type McpResourceLinkBlock = { + type: 'resource_link'; + uri: string; + title?: string; + name?: string; +}; + +type McpContentBlock = + | McpTextBlock + | McpMediaBlock + | McpResourceBlock + | McpResourceLinkBlock; + export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> { private static readonly allowlist: Set<string> = new Set(); @@ -114,70 +148,145 @@ export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> { }, ]; - const responseParts: Part[] = await this.mcpTool.callTool(functionCalls); + const rawResponseParts = await this.mcpTool.callTool(functionCalls); + const transformedParts = transformMcpContentToParts(rawResponseParts); return { - llmContent: responseParts, - returnDisplay: getStringifiedResultForDisplay(responseParts), + llmContent: transformedParts, + returnDisplay: getStringifiedResultForDisplay(rawResponseParts), }; } } +function transformTextBlock(block: McpTextBlock): Part { + return { text: block.text }; +} + +function transformImageAudioBlock( + block: McpMediaBlock, + toolName: string, +): Part[] { + return [ + { + text: `[Tool '${toolName}' provided the following ${ + block.type + } data with mime-type: ${block.mimeType}]`, + }, + { + inlineData: { + mimeType: block.mimeType, + data: block.data, + }, + }, + ]; +} + +function transformResourceBlock( + block: McpResourceBlock, + toolName: string, +): Part | Part[] | null { + const resource = block.resource; + if (resource?.text) { + return { text: resource.text }; + } + if (resource?.blob) { + const mimeType = resource.mimeType || 'application/octet-stream'; + return [ + { + text: `[Tool '${toolName}' provided the following embedded resource with mime-type: ${mimeType}]`, + }, + { + inlineData: { + mimeType, + data: resource.blob, + }, + }, + ]; + } + return null; +} + +function transformResourceLinkBlock(block: McpResourceLinkBlock): Part { + return { + text: `Resource Link: ${block.title || block.name} at ${block.uri}`, + }; +} + /** - * Processes an array of `Part` objects, primarily from a tool's execution result, - * to generate a user-friendly string representation, typically for display in a CLI. - * - * The `result` array can contain various types of `Part` objects: - * 1. `FunctionResponse` parts: - * - If the `response.content` of a `FunctionResponse` is an array consisting solely - * of `TextPart` objects, their text content is concatenated into a single string. - * This is to present simple textual outputs directly. - * - If `response.content` is an array but contains other types of `Part` objects (or a mix), - * the `content` array itself is preserved. This handles structured data like JSON objects or arrays - * returned by a tool. - * - If `response.content` is not an array or is missing, the entire `functionResponse` - * object is preserved. - * 2. Other `Part` types (e.g., `TextPart` directly in the `result` array): - * - These are preserved as is. - * - * All processed parts are then collected into an array, which is JSON.stringify-ed - * with indentation and wrapped in a markdown JSON code block. + * Transforms the raw MCP content blocks from the SDK response into a + * standard GenAI Part array. + * @param sdkResponse The raw Part[] array from `mcpTool.callTool()`. + * @returns A clean Part[] array ready for the scheduler. */ -function getStringifiedResultForDisplay(result: Part[]) { - if (!result || result.length === 0) { - return '```json\n[]\n```'; +function transformMcpContentToParts(sdkResponse: Part[]): Part[] { + const funcResponse = sdkResponse?.[0]?.functionResponse; + const mcpContent = funcResponse?.response?.content as McpContentBlock[]; + const toolName = funcResponse?.name || 'unknown tool'; + + if (!Array.isArray(mcpContent)) { + return [{ text: '[Error: Could not parse tool response]' }]; } - const processFunctionResponse = (part: Part) => { - if (part.functionResponse) { - const responseContent = part.functionResponse.response?.content; - if (responseContent && Array.isArray(responseContent)) { - // Check if all parts in responseContent are simple TextParts - const allTextParts = responseContent.every( - (p: Part) => p.text !== undefined, - ); - if (allTextParts) { - return responseContent.map((p: Part) => p.text).join(''); - } - // If not all simple text parts, return the array of these content parts for JSON stringification - return responseContent; + const transformed = mcpContent.flatMap( + (block: McpContentBlock): Part | Part[] | null => { + switch (block.type) { + case 'text': + return transformTextBlock(block); + case 'image': + case 'audio': + return transformImageAudioBlock(block, toolName); + case 'resource': + return transformResourceBlock(block, toolName); + case 'resource_link': + return transformResourceLinkBlock(block); + default: + return null; } + }, + ); - // If no content, or not an array, or not a functionResponse, stringify the whole functionResponse part for inspection - return part.functionResponse; - } - return part; // Fallback for unexpected structure or non-FunctionResponsePart - }; + return transformed.filter((part): part is Part => part !== null); +} - const processedResults = - result.length === 1 - ? processFunctionResponse(result[0]) - : result.map(processFunctionResponse); - if (typeof processedResults === 'string') { - return processedResults; +/** + * Processes the raw response from the MCP tool to generate a clean, + * human-readable string for display in the CLI. It summarizes non-text + * content and presents text directly. + * + * @param rawResponse The raw Part[] array from the GenAI SDK. + * @returns A formatted string representing the tool's output. + */ +function getStringifiedResultForDisplay(rawResponse: Part[]): string { + const mcpContent = rawResponse?.[0]?.functionResponse?.response + ?.content as McpContentBlock[]; + + if (!Array.isArray(mcpContent)) { + return '```json\n' + JSON.stringify(rawResponse, null, 2) + '\n```'; } - return '```json\n' + JSON.stringify(processedResults, null, 2) + '\n```'; + const displayParts = mcpContent.map((block: McpContentBlock): string => { + switch (block.type) { + case 'text': + return block.text; + case 'image': + return `[Image: ${block.mimeType}]`; + case 'audio': + return `[Audio: ${block.mimeType}]`; + case 'resource_link': + return `[Link to ${block.title || block.name}: ${block.uri}]`; + case 'resource': + if (block.resource?.text) { + return block.resource.text; + } + return `[Embedded Resource: ${ + block.resource?.mimeType || 'unknown type' + }]`; + default: + return `[Unknown content type: ${(block as { type: string }).type}]`; + } + }); + + return displayParts.join('\n'); } /** Visible for testing */ |
