diff options
Diffstat (limited to 'packages/core/src/utils/fileUtils.ts')
| -rw-r--r-- | packages/core/src/utils/fileUtils.ts | 280 |
1 files changed, 280 insertions, 0 deletions
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import fs from 'fs';
import path from 'path';
import { PartUnion } from '@google/genai';
import mime from 'mime-types';

// Constants for text file processing
const DEFAULT_MAX_LINES_TEXT_FILE = 2000;
const MAX_LINE_LENGTH_TEXT_FILE = 2000;

// Constants for binary content sniffing
// Maximum number of leading bytes sampled from a file.
const BINARY_SNIFF_BYTES = 4096;
// Share of control bytes in the sample above which the file is treated as binary.
const BINARY_NON_PRINTABLE_RATIO = 0.3;

// Default encoding used when reading text files
export const DEFAULT_ENCODING: BufferEncoding = 'utf-8';

// Extensions classified as binary without inspecting file content.
// Hoisted to module scope so the collection is built once, not on every call.
const KNOWN_BINARY_EXTENSIONS: ReadonlySet<string> = new Set([
  '.zip',
  '.tar',
  '.gz',
  '.exe',
  '.dll',
  '.so',
  '.class',
  '.jar',
  '.war',
  '.7z',
  '.doc',
  '.docx',
  '.xls',
  '.xlsx',
  '.ppt',
  '.pptx',
  '.odt',
  '.ods',
  '.odp',
  '.bin',
  '.dat',
  '.obj',
  '.o',
  '.a',
  '.lib',
  '.wasm',
  '.pyc',
  '.pyo',
]);

/**
 * Checks if a path is within a given root directory.
 *
 * The comparison is a string-prefix test on the `path.normalize`d inputs, so
 * `..` segments are collapsed before comparing. The file system is not
 * consulted.
 *
 * @param pathToCheck The absolute path to check.
 * @param rootDirectory The absolute root directory.
 * @returns True if the path equals the root directory or lies beneath it,
 *   false otherwise.
 */
export function isWithinRoot(
  pathToCheck: string,
  rootDirectory: string,
): boolean {
  const normalizedPathToCheck = path.normalize(pathToCheck);
  const normalizedRootDirectory = path.normalize(rootDirectory);

  // Compare against the root *with* a trailing separator so that a sibling
  // such as '/root-other' is not mistaken for a child of '/root'. A root that
  // already ends with a separator (including '/' or 'C:\') is used as is.
  const rootWithSeparator = normalizedRootDirectory.endsWith(path.sep)
    ? normalizedRootDirectory
    : normalizedRootDirectory + path.sep;

  return (
    normalizedPathToCheck === normalizedRootDirectory ||
    normalizedPathToCheck.startsWith(rootWithSeparator)
  );
}

/**
 * Determines if a file is likely binary based on content sampling.
 *
 * Up to the first 4KB of the file are inspected. Any NUL byte marks the file
 * as binary; otherwise it is binary when more than 30% of the sampled bytes
 * are control characters other than 9-13 (tab, LF, VT, FF, CR).
 *
 * @param filePath Path to the file.
 * @returns True if the file appears to be binary. Returns false for empty
 *   files and when the file cannot be opened or read, so that higher-level
 *   functions can report existence/access errors themselves.
 */
export function isBinaryFile(filePath: string): boolean {
  let fd: number | undefined;
  try {
    fd = fs.openSync(filePath, 'r');

    // Read up to 4KB or file size, whichever is smaller
    const fileSize = fs.fstatSync(fd).size;
    if (fileSize === 0) {
      // Empty file is not considered binary for content checking
      return false;
    }

    const buffer = Buffer.alloc(Math.min(BINARY_SNIFF_BYTES, fileSize));
    const bytesRead = fs.readSync(fd, buffer, 0, buffer.length, 0);
    if (bytesRead === 0) return false;

    let nonPrintableCount = 0;
    for (const byte of buffer.subarray(0, bytesRead)) {
      if (byte === 0) return true; // Null byte is a strong indicator
      if (byte < 9 || (byte > 13 && byte < 32)) {
        nonPrintableCount++;
      }
    }
    // If >30% non-printable characters, consider it binary
    return nonPrintableCount / bytesRead > BINARY_NON_PRINTABLE_RATIO;
  } catch {
    // If any error occurs (e.g. file not found, permissions),
    // treat as not binary here; let higher-level functions handle existence/access errors.
    return false;
  } finally {
    // Always release the descriptor, including when fstat/read threw above.
    if (fd !== undefined) {
      try {
        fs.closeSync(fd);
      } catch {
        // Best effort: a failed close must not change the detection result.
      }
    }
  }
}

/**
 * Detects the type of file based on extension and content.
 *
 * Checks run in this order: MIME type looked up from the file name (image,
 * then PDF), the list of known binary extensions, and finally content
 * sampling via `isBinaryFile`.
 *
 * @param filePath Path to the file.
 * @returns 'text', 'image', 'pdf', or 'binary'.
 */
export function detectFileType(
  filePath: string,
): 'text' | 'image' | 'pdf' | 'binary' {
  const ext = path.extname(filePath).toLowerCase();
  const lookedUpMimeType = mime.lookup(filePath); // Returns false if not found, or the mime type string

  if (lookedUpMimeType) {
    if (lookedUpMimeType.startsWith('image/')) {
      return 'image';
    }
    if (lookedUpMimeType === 'application/pdf') {
      return 'pdf';
    }
  }

  // Stricter binary check for common non-text extensions before content check
  // These are often not well-covered by mime-types or might be misidentified.
  if (KNOWN_BINARY_EXTENSIONS.has(ext)) {
    return 'binary';
  }

  // Fallback to content-based check if mime type wasn't conclusive for image/pdf
  // and it's not a known binary extension.
  return isBinaryFile(filePath) ? 'binary' : 'text';
}

/** Outcome of reading a single file for presentation to the LLM and the user. */
export interface ProcessedFileReadResult {
  llmContent: PartUnion; // string for text, Part for image/pdf/unreadable binary
  returnDisplay: string;
  error?: string; // Optional error message for the LLM if file processing failed
  isTruncated?: boolean; // For text files, indicates if content was truncated
  originalLineCount?: number; // For text files
  linesShown?: [number, number]; // For text files [startLine, endLine] (1-based for display)
}

/**
 * Reads and processes a single file, handling text, images, and PDFs.
 *
 * Text files are returned as a string limited to a window of lines, with
 * over-long lines cut at 2000 characters. Whenever the returned window does
 * not cover the whole file (it starts after the first line or stops before
 * the last one), a notice describing the shown range is prepended and
 * `isTruncated` is set. Images and PDFs are returned base64-encoded as inline
 * data. Binary files are not read; a placeholder message is returned instead.
 *
 * Failures are reported through the `error` field of the result rather than
 * by rejecting the returned promise.
 *
 * @param filePath Absolute path to the file.
 * @param rootDirectory Absolute path to the project root for relative path display.
 * @param offset Optional offset for text files (0-based line number). Negative
 *   values are treated as 0.
 * @param limit Optional limit for text files (number of lines to read).
 *   Defaults to 2000; negative values are treated as 0.
 * @returns ProcessedFileReadResult object.
 */
export async function processSingleFileContent(
  filePath: string,
  rootDirectory: string,
  offset?: number,
  limit?: number,
): Promise<ProcessedFileReadResult> {
  try {
    if (!fs.existsSync(filePath)) {
      // Sync check is acceptable before async read
      return {
        llmContent: '',
        returnDisplay: 'File not found.',
        error: `File not found: ${filePath}`,
      };
    }
    const stats = fs.statSync(filePath); // Sync check
    if (stats.isDirectory()) {
      return {
        llmContent: '',
        returnDisplay: 'Path is a directory.',
        error: `Path is a directory, not a file: ${filePath}`,
      };
    }

    const fileType = detectFileType(filePath);
    const relativePathForDisplay = path
      .relative(rootDirectory, filePath)
      .replace(/\\/g, '/');

    switch (fileType) {
      case 'binary': {
        return {
          llmContent: `Cannot display content of binary file: ${relativePathForDisplay}`,
          returnDisplay: `Skipped binary file: ${relativePathForDisplay}`,
        };
      }
      case 'text': {
        const content = await fs.promises.readFile(filePath, DEFAULT_ENCODING);
        const lines = content.split('\n');
        const originalLineCount = lines.length;

        // `||` rather than `??` so that NaN also falls back to 0. Clamp to
        // zero because a negative start would make slice() count from the
        // end of the array and produce a meaningless reported range.
        const startLine = Math.max(0, offset || 0);
        const effectiveLimit =
          limit === undefined
            ? DEFAULT_MAX_LINES_TEXT_FILE
            : Math.max(0, limit);
        // Ensure the window does not start beyond the end of the file
        const actualStartLine = Math.min(startLine, originalLineCount);
        // Ensure endLine does not exceed originalLineCount
        const endLine = Math.min(
          actualStartLine + effectiveLimit,
          originalLineCount,
        );
        const selectedLines = lines.slice(actualStartLine, endLine);

        let linesWereTruncatedInLength = false;
        const formattedLines = selectedLines.map((line) => {
          if (line.length > MAX_LINE_LENGTH_TEXT_FILE) {
            linesWereTruncatedInLength = true;
            return (
              line.substring(0, MAX_LINE_LENGTH_TEXT_FILE) + '... [truncated]'
            );
          }
          return line;
        });

        // The range is partial when lines were skipped at the start OR left
        // out at the end; checking only the end would report an offset read
        // that reaches the last line as the complete file.
        const contentRangeTruncated =
          startLine > 0 || endLine < originalLineCount;
        const isTruncated = contentRangeTruncated || linesWereTruncatedInLength;

        let llmTextContent = '';
        if (contentRangeTruncated) {
          llmTextContent += `[File content truncated: showing lines ${actualStartLine + 1}-${endLine} of ${originalLineCount} total lines. Use offset/limit parameters to view more.]\n`;
        } else if (linesWereTruncatedInLength) {
          llmTextContent += `[File content partially truncated: some lines exceeded maximum length of ${MAX_LINE_LENGTH_TEXT_FILE} characters.]\n`;
        }
        llmTextContent += formattedLines.join('\n');

        return {
          llmContent: llmTextContent,
          returnDisplay: isTruncated ? '(truncated)' : '',
          isTruncated,
          originalLineCount,
          linesShown: [actualStartLine + 1, endLine],
        };
      }
      case 'image':
      case 'pdf': {
        const contentBuffer = await fs.promises.readFile(filePath);
        const base64Data = contentBuffer.toString('base64');
        return {
          llmContent: {
            inlineData: {
              data: base64Data,
              mimeType: mime.lookup(filePath) || 'application/octet-stream',
            },
          },
          returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
        };
      }
      default: {
        // Should not happen with current detectFileType logic
        const exhaustiveCheck: never = fileType;
        return {
          llmContent: `Unhandled file type: ${exhaustiveCheck}`,
          returnDisplay: `Skipped unhandled file type: ${relativePathForDisplay}`,
          error: `Unhandled file type for ${filePath}`,
        };
      }
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    const displayPath = path
      .relative(rootDirectory, filePath)
      .replace(/\\/g, '/');
    return {
      llmContent: `Error reading file ${displayPath}: ${errorMessage}`,
      returnDisplay: `Error reading file ${displayPath}: ${errorMessage}`,
      error: `Error reading file ${filePath}: ${errorMessage}`,
    };
  }
}
