Diffstat (limited to 'packages/core/src')
 -rw-r--r--  packages/core/src/index.ts                                |   1
 -rw-r--r--  packages/core/src/utils/filesearch/crawlCache.test.ts     | 112
 -rw-r--r--  packages/core/src/utils/filesearch/crawlCache.ts          |  65
 -rw-r--r--  packages/core/src/utils/filesearch/fileSearch.test.ts     | 642
 -rw-r--r--  packages/core/src/utils/filesearch/fileSearch.ts          | 269
 -rw-r--r--  packages/core/src/utils/filesearch/ignore.test.ts         |  65
 -rw-r--r--  packages/core/src/utils/filesearch/ignore.ts              |  93
 -rw-r--r--  packages/core/src/utils/filesearch/result-cache.test.ts   |  56
 -rw-r--r--  packages/core/src/utils/filesearch/result-cache.ts        |  70
9 files changed, 1373 insertions(+), 0 deletions(-)
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index d7dfd90f..e60bd048 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -40,6 +40,7 @@ export * from './utils/shell-utils.js';
 export * from './utils/systemEncoding.js';
 export * from './utils/textUtils.js';
 export * from './utils/formatters.js';
+export * from './utils/filesearch/fileSearch.js';
 
 // Export services
 export * from './services/fileDiscoveryService.js';
diff --git a/packages/core/src/utils/filesearch/crawlCache.test.ts b/packages/core/src/utils/filesearch/crawlCache.test.ts
new file mode 100644
index 00000000..2feab61a
--- /dev/null
+++ b/packages/core/src/utils/filesearch/crawlCache.test.ts
@@ -0,0 +1,112 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest';
+import { getCacheKey, read, write, clear } from './crawlCache.js';
+
+describe('CrawlCache', () => {
+  describe('getCacheKey', () => {
+    it('should generate a consistent hash', () => {
+      const key1 = getCacheKey('/foo', 'bar');
+      const key2 = getCacheKey('/foo', 'bar');
+      expect(key1).toBe(key2);
+    });
+
+    it('should generate a different hash for different directories', () => {
+      const key1 = getCacheKey('/foo', 'bar');
+      const key2 = getCacheKey('/bar', 'bar');
+      expect(key1).not.toBe(key2);
+    });
+
+    it('should generate a different hash for different ignore content', () => {
+      const key1 = getCacheKey('/foo', 'bar');
+      const key2 = getCacheKey('/foo', 'baz');
+      expect(key1).not.toBe(key2);
+    });
+  });
+
+  describe('in-memory cache operations', () => {
+    beforeEach(() => {
+      // Ensure a clean slate before each test
+      clear();
+    });
+
+    afterEach(() => {
+      // Restore real timers after each test that uses fake ones
+      vi.useRealTimers();
+    });
+
+    it('should write and read data from the cache', () => {
+      const key = 'test-key';
+      const data = ['foo', 'bar'];
+      write(key, data, 10000); // 10 second TTL
+      const cachedData = read(key);
+      expect(cachedData).toEqual(data);
+    });
+
+    it('should return undefined for a nonexistent key', () => {
+      const cachedData = read('nonexistent-key');
+      expect(cachedData).toBeUndefined();
+    });
+
+    it('should clear the cache', () => {
+      const key = 'test-key';
+      const data = ['foo', 'bar'];
+      write(key, data, 10000);
+      clear();
+      const cachedData = read(key);
+      expect(cachedData).toBeUndefined();
+    });
+
+    it('should automatically evict a cache entry after its TTL expires', async () => {
+      vi.useFakeTimers();
+      const key = 'ttl-key';
+      const data = ['foo'];
+      const ttl = 5000; // 5 seconds
+
+      write(key, data, ttl);
+
+      // Should exist immediately after writing
+      expect(read(key)).toEqual(data);
+
+      // Advance time just before expiration
+      await vi.advanceTimersByTimeAsync(ttl - 1);
+      expect(read(key)).toEqual(data);
+
+      // Advance time past expiration
+      await vi.advanceTimersByTimeAsync(1);
+      expect(read(key)).toBeUndefined();
+    });
+
+    it('should reset the timer when an entry is updated', async () => {
+      vi.useFakeTimers();
+      const key = 'update-key';
+      const initialData = ['initial'];
+      const updatedData = ['updated'];
+      const ttl = 5000; // 5 seconds
+
+      // Write initial data
+      write(key, initialData, ttl);
+
+      // Advance time, but not enough to expire
+      await vi.advanceTimersByTimeAsync(3000);
+      expect(read(key)).toEqual(initialData);
+
+      // Update the data, which should reset the timer
+      write(key, updatedData, ttl);
+      expect(read(key)).toEqual(updatedData);
+      // Advance time again. If the timer wasn't reset, the total elapsed
+      // time (3000 + 3000 = 6000) would cause an eviction.
+      await vi.advanceTimersByTimeAsync(3000);
+      expect(read(key)).toEqual(updatedData);
+
+      // Advance past the new expiration time
+      await vi.advanceTimersByTimeAsync(2001);
+      expect(read(key)).toBeUndefined();
+    });
+  });
+});
diff --git a/packages/core/src/utils/filesearch/crawlCache.ts b/packages/core/src/utils/filesearch/crawlCache.ts
new file mode 100644
index 00000000..3cc948c6
--- /dev/null
+++ b/packages/core/src/utils/filesearch/crawlCache.ts
@@ -0,0 +1,65 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import crypto from 'node:crypto';
+
+const crawlCache = new Map<string, string[]>();
+const cacheTimers = new Map<string, NodeJS.Timeout>();
+
+/**
+ * Generates a unique cache key based on the project directory and the content
+ * of ignore files. This ensures that the cache is invalidated if the project
+ * or ignore rules change.
+ */
+export const getCacheKey = (
+  directory: string,
+  ignoreContent: string,
+): string => {
+  const hash = crypto.createHash('sha256');
+  hash.update(directory);
+  hash.update(ignoreContent);
+  return hash.digest('hex');
+};
+
+/**
+ * Reads cached data from the in-memory cache.
+ * Returns undefined if the key is not found.
+ */
+export const read = (key: string): string[] | undefined => crawlCache.get(key);
+
+/**
+ * Writes data to the in-memory cache and sets a timer to evict it after the TTL.
+ */
+export const write = (key: string, results: string[], ttlMs: number): void => {
+  // Clear any existing timer for this key to prevent premature deletion
+  if (cacheTimers.has(key)) {
+    clearTimeout(cacheTimers.get(key)!);
+  }
+
+  // Store the new data
+  crawlCache.set(key, results);
+
+  // Set a timer to automatically delete the cache entry after the TTL
+  const timerId = setTimeout(() => {
+    crawlCache.delete(key);
+    cacheTimers.delete(key);
+  }, ttlMs);
+
+  // Store the timer handle so we can clear it if the entry is updated
+  cacheTimers.set(key, timerId);
+};
+
+/**
+ * Clears the entire cache and all active timers.
+ * Primarily used for testing.
+ */
+export const clear = (): void => {
+  for (const timerId of cacheTimers.values()) {
+    clearTimeout(timerId);
+  }
+  crawlCache.clear();
+  cacheTimers.clear();
+};
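For reference, a minimal sketch of how a caller might drive this module; the path, ignore content, and TTL below are illustrative, not taken from the diff:

    import { getCacheKey, read, write } from './crawlCache.js';

    // Key the cache on the directory plus the ignore-rule fingerprint, so a
    // change to either one produces a different key and hence a cache miss.
    const key = getCacheKey('/path/to/project', 'dist/\nnode_modules/');

    let files = read(key);
    if (files === undefined) {
      files = ['src/', 'src/main.ts']; // normally the output of a crawl
      write(key, files, 10_000); // evicted automatically after 10 seconds
    }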
diff --git a/packages/core/src/utils/filesearch/fileSearch.test.ts b/packages/core/src/utils/filesearch/fileSearch.test.ts
new file mode 100644
index 00000000..b804d623
--- /dev/null
+++ b/packages/core/src/utils/filesearch/fileSearch.test.ts
@@ -0,0 +1,642 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import * as fs from 'fs/promises';
+import * as path from 'path';
+import * as cache from './crawlCache.js';
+import { FileSearch, AbortError, filter } from './fileSearch.js';
+import { createTmpDir, cleanupTmpDir } from '@google/gemini-cli-test-utils';
+
+type FileSearchWithPrivateMethods = FileSearch & {
+  performCrawl: () => Promise<void>;
+};
+
+describe('FileSearch', () => {
+  let tmpDir: string;
+  afterEach(async () => {
+    if (tmpDir) {
+      await cleanupTmpDir(tmpDir);
+    }
+    vi.restoreAllMocks();
+  });
+
+  it('should use .geminiignore rules', async () => {
+    tmpDir = await createTmpDir({
+      '.geminiignore': 'dist/',
+      dist: ['ignored.js'],
+      src: ['not-ignored.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: true,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual(['src/', '.geminiignore', 'src/not-ignored.js']);
+  });
+
+  it('should combine .gitignore and .geminiignore rules', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': 'dist/',
+      '.geminiignore': 'build/',
+      dist: ['ignored-by-git.js'],
+      build: ['ignored-by-gemini.js'],
+      src: ['not-ignored.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: true,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual([
+      'src/',
+      '.geminiignore',
+      '.gitignore',
+      'src/not-ignored.js',
+    ]);
+  });
+
+  it('should use ignoreDirs option', async () => {
+    tmpDir = await createTmpDir({
+      logs: ['some.log'],
+      src: ['main.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: ['logs'],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual(['src/', 'src/main.js']);
+  });
+
+  it('should handle negated directories', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': ['build/**', '!build/public', '!build/public/**'].join(
+        '\n',
+      ),
+      build: {
+        'private.js': '',
+        public: ['index.html'],
+      },
+      src: ['main.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual([
+      'build/',
+      'build/public/',
+      'src/',
+      '.gitignore',
+      'build/public/index.html',
+      'src/main.js',
+    ]);
+  });
+
+  it('should filter results with a search pattern', async () => {
+    tmpDir = await createTmpDir({
+      src: {
+        'main.js': '',
+        'util.ts': '',
+        'style.css': '',
+      },
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('**/*.js');
+
+    expect(results).toEqual(['src/main.js']);
+  });
+
+  it('should handle root-level file negation', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': ['*.mk', '!Foo.mk'].join('\n'),
+      'bar.mk': '',
+      'Foo.mk': '',
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual(['.gitignore', 'Foo.mk']);
+  });
+
+  it('should handle directory negation with glob', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': [
+        'third_party/**',
+        '!third_party/foo',
+        '!third_party/foo/bar',
+        '!third_party/foo/bar/baz_buffer',
+      ].join('\n'),
+      third_party: {
+        foo: {
+          bar: {
+            baz_buffer: '',
+          },
+        },
+        ignore_this: '',
+      },
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual([
+      'third_party/',
+      'third_party/foo/',
+      'third_party/foo/bar/',
+      '.gitignore',
+      'third_party/foo/bar/baz_buffer',
+    ]);
+  });
+
+  it('should correctly handle negated patterns in .gitignore', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': ['dist/**', '!dist/keep.js'].join('\n'),
+      dist: ['ignore.js', 'keep.js'],
+      src: ['main.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual([
+      'dist/',
+      'src/',
+      '.gitignore',
+      'dist/keep.js',
+      'src/main.js',
+    ]);
+  });
+
+  // New test cases start here
+
+  it('should initialize correctly when ignore files are missing', async () => {
+    tmpDir = await createTmpDir({
+      src: ['file1.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: true,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    // Expect no errors to be thrown during initialization
+    await expect(fileSearch.initialize()).resolves.toBeUndefined();
+    const results = await fileSearch.search('');
+    expect(results).toEqual(['src/', 'src/file1.js']);
+  });
+
+  it('should respect maxResults option in search', async () => {
+    tmpDir = await createTmpDir({
+      src: {
+        'file1.js': '',
+        'file2.js': '',
+        'file3.js': '',
+        'file4.js': '',
+      },
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('**/*.js', { maxResults: 2 });
+
+    expect(results).toEqual(['src/file1.js', 'src/file2.js']); // Assuming alphabetical sort
+  });
+
+  it('should return empty array when no matches are found', async () => {
+    tmpDir = await createTmpDir({
+      src: ['file1.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('nonexistent-file.xyz');
+
+    expect(results).toEqual([]);
+  });
+
+  it('should throw AbortError when filter is aborted', async () => {
+    const controller = new AbortController();
+    const dummyPaths = Array.from({ length: 5000 }, (_, i) => `file${i}.js`); // Large array to ensure yielding
+
+    const filterPromise = filter(dummyPaths, '*.js', controller.signal);
+
+    // Abort after a short delay to ensure filter has started
+    setTimeout(() => controller.abort(), 1);
+
+    await expect(filterPromise).rejects.toThrow(AbortError);
+  });
+
+  describe('with in-memory cache', () => {
+    beforeEach(() => {
+      cache.clear();
+    });
+
+    afterEach(() => {
+      vi.useRealTimers();
+    });
+
+    it('should throw an error if search is called before initialization', async () => {
+      tmpDir = await createTmpDir({});
+      const fileSearch = new FileSearch({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useGeminiignore: false,
+        ignoreDirs: [],
+        cache: false,
+        cacheTtl: 0,
+      });
+
+      await expect(fileSearch.search('')).rejects.toThrow(
+        'Engine not initialized. Call initialize() first.',
+      );
+    });
+
+    it('should hit the cache for subsequent searches', async () => {
+      tmpDir = await createTmpDir({ 'file1.js': '' });
+      const getOptions = () => ({
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useGeminiignore: false,
+        ignoreDirs: [],
+        cache: true,
+        cacheTtl: 10,
+      });
+
+      const fs1 = new FileSearch(getOptions());
+      const crawlSpy1 = vi.spyOn(
+        fs1 as FileSearchWithPrivateMethods,
+        'performCrawl',
+      );
+      await fs1.initialize();
+      expect(crawlSpy1).toHaveBeenCalledTimes(1);
+
+      // Second search should hit the cache because the options are identical
+      const fs2 = new FileSearch(getOptions());
+      const crawlSpy2 = vi.spyOn(
+        fs2 as FileSearchWithPrivateMethods,
+        'performCrawl',
+      );
+      await fs2.initialize();
+      expect(crawlSpy2).not.toHaveBeenCalled();
+    });
+
+    it('should miss the cache when ignore rules change', async () => {
+      tmpDir = await createTmpDir({
+        '.gitignore': 'a.txt',
+        'a.txt': '',
+        'b.txt': '',
+      });
+      const options = {
+        projectRoot: tmpDir,
+        useGitignore: true,
+        useGeminiignore: false,
+        ignoreDirs: [],
+        cache: true,
+        cacheTtl: 10000,
+      };
+
+      // Initial search to populate the cache
+      const fs1 = new FileSearch(options);
+      const crawlSpy1 = vi.spyOn(
+        fs1 as FileSearchWithPrivateMethods,
+        'performCrawl',
+      );
+      await fs1.initialize();
+      const results1 = await fs1.search('');
+      expect(crawlSpy1).toHaveBeenCalledTimes(1);
+      expect(results1).toEqual(['.gitignore', 'b.txt']);
+
+      // Modify the ignore file
+      await fs.writeFile(path.join(tmpDir, '.gitignore'), 'b.txt');
+
+      // Second search should miss the cache and trigger a recrawl
+      const fs2 = new FileSearch(options);
+      const crawlSpy2 = vi.spyOn(
+        fs2 as FileSearchWithPrivateMethods,
+        'performCrawl',
+      );
+      await fs2.initialize();
+      const results2 = await fs2.search('');
+      expect(crawlSpy2).toHaveBeenCalledTimes(1);
+      expect(results2).toEqual(['.gitignore', 'a.txt']);
+    });
+
+    it('should miss the cache after TTL expires', async () => {
+      vi.useFakeTimers();
+      tmpDir = await createTmpDir({ 'file1.js': '' });
+      const options = {
+        projectRoot: tmpDir,
+        useGitignore: false,
+        useGeminiignore: false,
+        ignoreDirs: [],
+        cache: true,
+        cacheTtl: 10, // 10 seconds
+      };
+
+      // Initial search to populate the cache
+      const fs1 = new FileSearch(options);
+      await fs1.initialize();
+
+      // Advance time past the TTL
+      await vi.advanceTimersByTimeAsync(11000);
+
+      // Second search should miss the cache and trigger a recrawl
+      const fs2 = new FileSearch(options);
+      const crawlSpy = vi.spyOn(
+        fs2 as FileSearchWithPrivateMethods,
+        'performCrawl',
+      );
+      await fs2.initialize();
+
+      expect(crawlSpy).toHaveBeenCalledTimes(1);
+    });
+  });
+
+  it('should handle empty or commented-only ignore files', async () => {
+    tmpDir = await createTmpDir({
+      '.gitignore': '# This is a comment\n\n \n',
+      src: ['main.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: true,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual(['src/', '.gitignore', 'src/main.js']);
+  });
+
+  it('should always ignore the .git directory', async () => {
+    tmpDir = await createTmpDir({
+      '.git': ['config', 'HEAD'],
+      src: ['main.js'],
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false, // Explicitly disable .gitignore to isolate this rule
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+    const results = await fileSearch.search('');
+
+    expect(results).toEqual(['src/', 'src/main.js']);
+  });
+
+  it('should be cancellable via AbortSignal', async () => {
+    const largeDir: Record<string, string> = {};
+    for (let i = 0; i < 100; i++) {
+      largeDir[`file${i}.js`] = '';
+    }
+    tmpDir = await createTmpDir(largeDir);
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+
+    const controller = new AbortController();
+    const searchPromise = fileSearch.search('**/*.js', {
+      signal: controller.signal,
+    });
+
+    // Yield to allow the search to start before aborting.
+    await new Promise((resolve) => setImmediate(resolve));
+
+    controller.abort();
+
+    await expect(searchPromise).rejects.toThrow(AbortError);
+  });
+
+  it('should leverage ResultCache for bestBaseQuery optimization', async () => {
+    tmpDir = await createTmpDir({
+      src: {
+        'foo.js': '',
+        'bar.ts': '',
+        nested: {
+          'baz.js': '',
+        },
+      },
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: true, // Enable caching for this test
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+
+    // Perform a broad search to prime the cache
+    const broadResults = await fileSearch.search('src/**');
+    expect(broadResults).toEqual([
+      'src/',
+      'src/nested/',
+      'src/bar.ts',
+      'src/foo.js',
+      'src/nested/baz.js',
+    ]);
+
+    // Perform a more specific search that should leverage the broad search's cached results
+    const specificResults = await fileSearch.search('src/**/*.js');
+    expect(specificResults).toEqual(['src/foo.js', 'src/nested/baz.js']);
+
+    // Although we can't directly inspect ResultCache.hits/misses from here,
+    // the correctness of specificResults after a broad search implicitly
+    // verifies that the caching mechanism, including bestBaseQuery, is working.
+  });
+
+  it('should be case-insensitive by default', async () => {
+    tmpDir = await createTmpDir({
+      'File1.Js': '',
+      'file2.js': '',
+      'FILE3.JS': '',
+      'other.txt': '',
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: false,
+      cacheTtl: 0,
+    });
+
+    await fileSearch.initialize();
+
+    // Search with a lowercase pattern
+    let results = await fileSearch.search('file*.js');
+    expect(results).toHaveLength(3);
+    expect(results).toEqual(
+      expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+    );
+
+    // Search with an uppercase pattern
+    results = await fileSearch.search('FILE*.JS');
+    expect(results).toHaveLength(3);
+    expect(results).toEqual(
+      expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+    );
+
+    // Search with a mixed-case pattern
+    results = await fileSearch.search('FiLe*.Js');
+    expect(results).toHaveLength(3);
+    expect(results).toEqual(
+      expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+    );
+  });
+
+  it('should respect maxResults even when the cache returns an exact match', async () => {
+    tmpDir = await createTmpDir({
+      'file1.js': '',
+      'file2.js': '',
+      'file3.js': '',
+      'file4.js': '',
+      'file5.js': '',
+    });
+
+    const fileSearch = new FileSearch({
+      projectRoot: tmpDir,
+      useGitignore: false,
+      useGeminiignore: false,
+      ignoreDirs: [],
+      cache: true, // Ensure caching is enabled
+      cacheTtl: 10000,
+    });
+
+    await fileSearch.initialize();
+
+    // 1. Perform a broad search to populate the cache with an exact match.
+    const initialResults = await fileSearch.search('*.js');
+    expect(initialResults).toEqual([
+      'file1.js',
+      'file2.js',
+      'file3.js',
+      'file4.js',
+      'file5.js',
+    ]);
+
+    // 2. Perform the same search again, but this time with a maxResults limit.
+    const limitedResults = await fileSearch.search('*.js', { maxResults: 2 });
+
+    // 3. Assert that the maxResults limit was respected, even with a cache hit.
+    expect(limitedResults).toEqual(['file1.js', 'file2.js']);
+  });
+});
diff --git a/packages/core/src/utils/filesearch/fileSearch.ts b/packages/core/src/utils/filesearch/fileSearch.ts
new file mode 100644
index 00000000..5915821a
--- /dev/null
+++ b/packages/core/src/utils/filesearch/fileSearch.ts
@@ -0,0 +1,269 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import path from 'node:path';
+import fs from 'node:fs';
+import { fdir } from 'fdir';
+import picomatch from 'picomatch';
+import { Ignore } from './ignore.js';
+import { ResultCache } from './result-cache.js';
+import * as cache from './crawlCache.js';
+
+export type FileSearchOptions = {
+  projectRoot: string;
+  ignoreDirs: string[];
+  useGitignore: boolean;
+  useGeminiignore: boolean;
+  cache: boolean;
+  cacheTtl: number;
+};
+
+export class AbortError extends Error {
+  constructor(message = 'Search aborted') {
+    super(message);
+    this.name = 'AbortError';
+  }
+}
+
+/**
+ * Filters a list of paths based on a given pattern.
+ * @param allPaths The list of all paths to filter.
+ * @param pattern The picomatch pattern to filter by.
+ * @param signal An AbortSignal to cancel the operation.
+ * @returns A promise that resolves to the filtered and sorted list of paths.
+ */
+export async function filter(
+  allPaths: string[],
+  pattern: string,
+  signal: AbortSignal | undefined,
+): Promise<string[]> {
+  const patternFilter = picomatch(pattern, {
+    dot: true,
+    contains: true,
+    nocase: true,
+  });
+
+  const results: string[] = [];
+  for (const [i, p] of allPaths.entries()) {
+    // Yield control to the event loop periodically to prevent blocking.
+    if (i % 1000 === 0) {
+      await new Promise((resolve) => setImmediate(resolve));
+      if (signal?.aborted) {
+        throw new AbortError();
+      }
+    }
+
+    if (patternFilter(p)) {
+      results.push(p);
+    }
+  }
+
+  results.sort((a, b) => {
+    const aIsDir = a.endsWith('/');
+    const bIsDir = b.endsWith('/');
+
+    if (aIsDir && !bIsDir) return -1;
+    if (!aIsDir && bIsDir) return 1;
+
+    // This is 40% faster than localeCompare, and the only thing we would
+    // really gain from localeCompare is case-sensitive sorting.
+    return a < b ? -1 : a > b ? 1 : 0;
+  });
+
+  return results;
+}
+
+export type SearchOptions = {
+  signal?: AbortSignal;
+  maxResults?: number;
+};
+
+/**
+ * Provides a fast and efficient way to search for files within a project,
+ * respecting .gitignore and .geminiignore rules, and utilizing caching
+ * for improved performance.
+ */
+export class FileSearch {
+  private readonly absoluteDir: string;
+  private readonly ignore: Ignore = new Ignore();
+  private resultCache: ResultCache | undefined;
+  private allFiles: string[] = [];
+
+  /**
+   * Constructs a new `FileSearch` instance.
+   * @param options Configuration options for the file search.
+   */
+  constructor(private readonly options: FileSearchOptions) {
+    this.absoluteDir = path.resolve(options.projectRoot);
+  }
+
+  /**
+   * Initializes the file search engine by loading ignore rules, crawling the
+   * file system, and building the in-memory cache. This method must be called
+   * before performing any searches.
+   */
+  async initialize(): Promise<void> {
+    this.loadIgnoreRules();
+    await this.crawlFiles();
+    this.buildResultCache();
+  }
+
+  /**
+   * Searches for files matching a given pattern.
+   * @param pattern The picomatch pattern to search for (e.g., '*.js', 'src/**').
+   * @param options Search options, including an AbortSignal and maxResults.
+   * @returns A promise that resolves to a list of matching file paths,
+   *   relative to the project root.
+   */
+  async search(
+    pattern: string,
+    options: SearchOptions = {},
+  ): Promise<string[]> {
+    if (!this.resultCache) {
+      throw new Error('Engine not initialized. Call initialize() first.');
+    }
+
+    pattern = pattern || '*';
+
+    const { files: candidates, isExactMatch } =
+      await this.resultCache!.get(pattern);
+
+    let filteredCandidates;
+    if (isExactMatch) {
+      filteredCandidates = candidates;
+    } else {
+      // Apply the user's picomatch pattern filter
+      filteredCandidates = await filter(candidates, pattern, options.signal);
+      this.resultCache!.set(pattern, filteredCandidates);
+    }
+
+    // Trade-off: We apply a two-stage filtering process.
+    // 1. During the file system crawl (`performCrawl`), we only apply
+    //    directory-level ignore rules (e.g., `node_modules/`, `dist/`),
+    //    because applying a full ignore filter (which includes file-specific
+    //    patterns like `*.log`) during the crawl can significantly slow down
+    //    `fdir`.
+    // 2. Here, in the `search` method, we apply the full ignore filter
+    //    (including file patterns) to the `filteredCandidates`, which have
+    //    already been filtered by the user's search pattern and sorted.
+    //    For autocomplete, the number of displayed results is small
+    //    (MAX_SUGGESTIONS_TO_SHOW), so applying the full filter to this
+    //    truncated list is much more efficient than applying it to every
+    //    file during the initial crawl.
+    const fileFilter = this.ignore.getFileFilter();
+    const results: string[] = [];
+    for (const [i, candidate] of filteredCandidates.entries()) {
+      // Yield to the event loop to avoid blocking on large result sets.
+      if (i % 1000 === 0) {
+        await new Promise((resolve) => setImmediate(resolve));
+        if (options.signal?.aborted) {
+          throw new AbortError();
+        }
+      }
+
+      if (results.length >= (options.maxResults ?? Infinity)) {
+        break;
+      }
+      // The `ignore` library throws an error if the path is '.', so we skip it.
+      if (candidate === '.') {
+        continue;
+      }
+      if (!fileFilter(candidate)) {
+        results.push(candidate);
+      }
+    }
+    return results;
+  }
+
+  /**
+   * Loads ignore rules from .gitignore and .geminiignore files, and applies
+   * any additional ignore directories specified in the options.
+   */
+  private loadIgnoreRules(): void {
+    if (this.options.useGitignore) {
+      const gitignorePath = path.join(this.absoluteDir, '.gitignore');
+      if (fs.existsSync(gitignorePath)) {
+        this.ignore.add(fs.readFileSync(gitignorePath, 'utf8'));
+      }
+    }
+
+    if (this.options.useGeminiignore) {
+      const geminiignorePath = path.join(this.absoluteDir, '.geminiignore');
+      if (fs.existsSync(geminiignorePath)) {
+        this.ignore.add(fs.readFileSync(geminiignorePath, 'utf8'));
+      }
+    }
+
+    const ignoreDirs = ['.git', ...this.options.ignoreDirs];
+    this.ignore.add(
+      ignoreDirs.map((dir) => {
+        if (dir.endsWith('/')) {
+          return dir;
+        }
+        return `${dir}/`;
+      }),
+    );
+  }
+
+  /**
+   * Crawls the file system to get a list of all files and directories,
+   * optionally using a cache for faster initialization.
+   */
+  private async crawlFiles(): Promise<void> {
+    if (this.options.cache) {
+      const cacheKey = cache.getCacheKey(
+        this.absoluteDir,
+        this.ignore.getFingerprint(),
+      );
+      const cachedResults = cache.read(cacheKey);
+
+      if (cachedResults) {
+        this.allFiles = cachedResults;
+        return;
+      }
+    }
+
+    this.allFiles = await this.performCrawl();
+
+    if (this.options.cache) {
+      const cacheKey = cache.getCacheKey(
+        this.absoluteDir,
+        this.ignore.getFingerprint(),
+      );
+      cache.write(cacheKey, this.allFiles, this.options.cacheTtl * 1000);
+    }
+  }
+
+  /**
+   * Performs the actual file system crawl using `fdir`, applying directory
+   * ignore rules.
+   * @returns A promise that resolves to a list of all files and directories.
+   */
+  private async performCrawl(): Promise<string[]> {
+    const dirFilter = this.ignore.getDirectoryFilter();
+
+    // We use `fdir` for fast file system traversal. A key performance
+    // optimization for large workspaces is to exclude entire directories
+    // early in the traversal process. This is why we apply directory-specific
+    // ignore rules (e.g., `node_modules/`, `dist/`) directly to `fdir`'s
+    // exclude filter.
+    const api = new fdir()
+      .withRelativePaths()
+      .withDirs()
+      .withPathSeparator('/') // Always use unix style paths
+      .exclude((_, dirPath) => {
+        const relativePath = path.relative(this.absoluteDir, dirPath);
+        return dirFilter(`${relativePath}/`);
+      });
+
+    return api.crawl(this.absoluteDir).withPromise();
+  }
+
+  /**
+   * Builds the in-memory cache for fast pattern matching.
+   */
+  private buildResultCache(): void {
+    this.resultCache = new ResultCache(this.allFiles, this.absoluteDir);
+  }
+}
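Putting the pieces together, a typical consumer of this class would look roughly like the sketch below; the project path and pattern are illustrative, and the surrounding wiring is assumed rather than part of the diff:

    import { FileSearch, AbortError } from './fileSearch.js';

    const fileSearch = new FileSearch({
      projectRoot: '/path/to/project', // hypothetical path
      useGitignore: true,
      useGeminiignore: true,
      ignoreDirs: ['node_modules'],
      cache: true,
      cacheTtl: 30, // seconds; crawlFiles() multiplies by 1000 before caching
    });

    await fileSearch.initialize(); // required before any search()

    const controller = new AbortController();
    try {
      const results = await fileSearch.search('src/**/*.ts', {
        signal: controller.signal,
        maxResults: 20,
      });
      console.log(results);
    } catch (err) {
      // Aborted searches reject with the AbortError exported above.
      if (!(err instanceof AbortError)) throw err;
    }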
diff --git a/packages/core/src/utils/filesearch/ignore.test.ts b/packages/core/src/utils/filesearch/ignore.test.ts
new file mode 100644
index 00000000..ff375e3f
--- /dev/null
+++ b/packages/core/src/utils/filesearch/ignore.test.ts
@@ -0,0 +1,65 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { Ignore } from './ignore.js';
+
+describe('Ignore', () => {
+  describe('getDirectoryFilter', () => {
+    it('should ignore directories matching directory patterns', () => {
+      const ig = new Ignore().add(['foo/', 'bar/']);
+      const dirFilter = ig.getDirectoryFilter();
+      expect(dirFilter('foo/')).toBe(true);
+      expect(dirFilter('bar/')).toBe(true);
+      expect(dirFilter('baz/')).toBe(false);
+    });
+
+    it('should not ignore directories with file patterns', () => {
+      const ig = new Ignore().add(['foo.js', '*.log']);
+      const dirFilter = ig.getDirectoryFilter();
+      expect(dirFilter('foo.js')).toBe(false);
+      expect(dirFilter('foo.log')).toBe(false);
+    });
+  });
+
+  describe('getFileFilter', () => {
+    it('should not ignore files with directory patterns', () => {
+      const ig = new Ignore().add(['foo/', 'bar/']);
+      const fileFilter = ig.getFileFilter();
+      expect(fileFilter('foo')).toBe(false);
+      expect(fileFilter('foo/file.txt')).toBe(false);
+    });
+
+    it('should ignore files matching file patterns', () => {
+      const ig = new Ignore().add(['*.log', 'foo.js']);
+      const fileFilter = ig.getFileFilter();
+      expect(fileFilter('foo.log')).toBe(true);
+      expect(fileFilter('foo.js')).toBe(true);
+      expect(fileFilter('bar.txt')).toBe(false);
+    });
+  });
+
+  it('should accumulate patterns across multiple add() calls', () => {
+    const ig = new Ignore().add('foo.js');
+    ig.add('bar.js');
+    const fileFilter = ig.getFileFilter();
+    expect(fileFilter('foo.js')).toBe(true);
+    expect(fileFilter('bar.js')).toBe(true);
+    expect(fileFilter('baz.js')).toBe(false);
+  });
+
+  it('should return a stable and consistent fingerprint', () => {
+    const ig1 = new Ignore().add(['foo', '!bar']);
+    const ig2 = new Ignore().add('foo\n!bar');
+
+    // Fingerprints should be identical for the same rules.
+    expect(ig1.getFingerprint()).toBe(ig2.getFingerprint());
+
+    // Adding a new rule should change the fingerprint.
+    ig2.add('baz');
+    expect(ig1.getFingerprint()).not.toBe(ig2.getFingerprint());
+  });
+});
diff --git a/packages/core/src/utils/filesearch/ignore.ts b/packages/core/src/utils/filesearch/ignore.ts
new file mode 100644
index 00000000..9f756f93
--- /dev/null
+++ b/packages/core/src/utils/filesearch/ignore.ts
@@ -0,0 +1,93 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import ignore from 'ignore';
+import picomatch from 'picomatch';
+
+const hasFileExtension = picomatch('**/*[*.]*');
+
+export class Ignore {
+  private readonly allPatterns: string[] = [];
+  private dirIgnorer = ignore();
+  private fileIgnorer = ignore();
+
+  /**
+   * Adds one or more ignore patterns.
+   * @param patterns A single pattern string or an array of pattern strings.
+   *   Each pattern can be a glob-like string similar to .gitignore rules.
+   * @returns The `Ignore` instance for chaining.
+   */
+  add(patterns: string | string[]): this {
+    if (typeof patterns === 'string') {
+      patterns = patterns.split(/\r?\n/);
+    }
+
+    for (const p of patterns) {
+      const pattern = p.trim();
+
+      if (pattern === '' || pattern.startsWith('#')) {
+        continue;
+      }
+
+      this.allPatterns.push(pattern);
+
+      const isPositiveDirPattern =
+        pattern.endsWith('/') && !pattern.startsWith('!');
+
+      if (isPositiveDirPattern) {
+        this.dirIgnorer.add(pattern);
+      } else {
+        // An ambiguous pattern (e.g., "build") could match a file or a
+        // directory. To optimize the file system crawl, we use a heuristic:
+        // patterns without a dot in the last segment are included in the
+        // directory exclusion check.
+        //
+        // This heuristic can fail. For example, an ignore pattern of
+        // "my.assets" intended to exclude a directory will not be treated as
+        // a directory pattern because it contains a ".". This results in
+        // crawling a directory that should have been excluded, reducing
+        // efficiency. Correctness is still maintained: the incorrectly
+        // crawled directory will be filtered out by the final ignore check.
+        //
+        // For maximum crawl efficiency, users should explicitly mark
+        // directory patterns with a trailing slash (e.g., "my.assets/").
+        this.fileIgnorer.add(pattern);
+        if (!hasFileExtension(pattern)) {
+          this.dirIgnorer.add(pattern);
+        }
+      }
+    }
+
+    return this;
+  }
+
+  /**
+   * Returns a predicate that matches explicit directory ignore patterns
+   * (patterns ending with '/').
+   * @returns {(dirPath: string) => boolean}
+   */
+  getDirectoryFilter(): (dirPath: string) => boolean {
+    return (dirPath: string) => this.dirIgnorer.ignores(dirPath);
+  }
+
+  /**
+   * Returns a predicate that matches file ignore patterns (all patterns not
+   * ending with '/'). Note: this may also match directories if a file pattern
+   * matches a directory name, but all explicit directory patterns are handled
+   * by getDirectoryFilter.
+   * @returns {(filePath: string) => boolean}
+   */
+  getFileFilter(): (filePath: string) => boolean {
+    return (filePath: string) => this.fileIgnorer.ignores(filePath);
+  }
+
+  /**
+   * Returns a string representing the current set of ignore patterns.
+   * This can be used to generate a unique identifier for the ignore
+   * configuration, useful for caching purposes.
+   * @returns A string fingerprint of the ignore patterns.
+   */
+  getFingerprint(): string {
+    return this.allPatterns.join('\n');
+  }
+}
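A short sketch of the heuristic described in the add() comments: a dot-less pattern like "build" lands in both filters, while "*.log" and "my.assets" stay file-only. The pattern values here are illustrative, not taken from the diff:

    import { Ignore } from './ignore.js';

    const ig = new Ignore().add(['build', '*.log', 'my.assets']);

    const dirFilter = ig.getDirectoryFilter();
    const fileFilter = ig.getFileFilter();

    dirFilter('build/');     // true:  no dot, so the directory is pruned during the crawl
    dirFilter('my.assets/'); // false: the "." makes the heuristic treat it as a file
    fileFilter('debug.log'); // true
    fileFilter('my.assets'); // true:  still filtered out after the crawl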
diff --git a/packages/core/src/utils/filesearch/result-cache.test.ts b/packages/core/src/utils/filesearch/result-cache.test.ts
new file mode 100644
index 00000000..0b1b4e17
--- /dev/null
+++ b/packages/core/src/utils/filesearch/result-cache.test.ts
@@ -0,0 +1,56 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import path from 'node:path';
+import { test, expect } from 'vitest';
+import { ResultCache } from './result-cache.js';
+
+test('ResultCache basic usage', async () => {
+  const files = [
+    'foo.txt',
+    'bar.js',
+    'baz.md',
+    'subdir/file.txt',
+    'subdir/other.js',
+    'subdir/nested/file.md',
+  ];
+  const cache = new ResultCache(files, path.resolve('.'));
+  const { files: resultFiles, isExactMatch } = await cache.get('*.js');
+  expect(resultFiles).toEqual(files);
+  expect(isExactMatch).toBe(false);
+});
+
+test('ResultCache cache hit/miss', async () => {
+  const files = ['foo.txt', 'bar.js', 'baz.md'];
+  const cache = new ResultCache(files, path.resolve('.'));
+  // First call: miss
+  const { files: result1Files, isExactMatch: isExactMatch1 } =
+    await cache.get('*.js');
+  expect(result1Files).toEqual(files);
+  expect(isExactMatch1).toBe(false);
+
+  // Simulate FileSearch applying the filter and setting the result
+  cache.set('*.js', ['bar.js']);
+
+  // Second call: hit
+  const { files: result2Files, isExactMatch: isExactMatch2 } =
+    await cache.get('*.js');
+  expect(result2Files).toEqual(['bar.js']);
+  expect(isExactMatch2).toBe(true);
+});
+
+test('ResultCache best base query', async () => {
+  const files = ['foo.txt', 'foobar.js', 'baz.md'];
+  const cache = new ResultCache(files, path.resolve('.'));
+
+  // Cache a broader query
+  cache.set('foo', ['foo.txt', 'foobar.js']);
+
+  // Search for a more specific query that starts with the broader one
+  const { files: resultFiles, isExactMatch } = await cache.get('foobar');
+  expect(resultFiles).toEqual(['foo.txt', 'foobar.js']);
+  expect(isExactMatch).toBe(false);
+});
diff --git a/packages/core/src/utils/filesearch/result-cache.ts b/packages/core/src/utils/filesearch/result-cache.ts
new file mode 100644
index 00000000..77b99aec
--- /dev/null
+++ b/packages/core/src/utils/filesearch/result-cache.ts
@@ -0,0 +1,70 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Implements an in-memory cache for file search results.
+ * This cache optimizes subsequent searches by leveraging previously computed
+ * results.
+ */
+export class ResultCache {
+  private readonly cache: Map<string, string[]>;
+  private hits = 0;
+  private misses = 0;
+
+  constructor(
+    private readonly allFiles: string[],
+    private readonly absoluteDir: string,
+  ) {
+    this.cache = new Map();
+  }
+
+  /**
+   * Retrieves cached search results for a given query, or provides a base set
+   * of files to search from.
+   * @param query The search query pattern.
+   * @returns An object containing the files to search and a boolean
+   *   indicating if the result is an exact cache hit.
+   */
+  async get(
+    query: string,
+  ): Promise<{ files: string[]; isExactMatch: boolean }> {
+    const isCacheHit = this.cache.has(query);
+
+    if (isCacheHit) {
+      this.hits++;
+      return { files: this.cache.get(query)!, isExactMatch: true };
+    }
+
+    this.misses++;
+
+    // This is the core optimization of the memory cache.
+    // If a user first searches for "foo", and then for "foobar",
+    // we don't need to search through all files again. We can start
+    // from the results of the "foo" search.
+    // This finds the most specific, already-cached query that is a prefix
+    // of the current query.
+    let bestBaseQuery = '';
+    for (const key of this.cache?.keys?.() ?? []) {
+      if (query.startsWith(key) && key.length > bestBaseQuery.length) {
+        bestBaseQuery = key;
+      }
+    }
+
+    const filesToSearch = bestBaseQuery
+      ? this.cache.get(bestBaseQuery)!
+      : this.allFiles;
+
+    return { files: filesToSearch, isExactMatch: false };
+  }
+
+  /**
+   * Stores search results in the cache.
+   * @param query The search query pattern.
+   * @param results The matching file paths to cache.
+   */
+  set(query: string, results: string[]): void {
+    this.cache.set(query, results);
+  }
+}
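To make the prefix-reuse behavior concrete, here is a hypothetical interaction mirroring the tests above (run inside an async context; the file names are illustrative):

    import { ResultCache } from './result-cache.js';

    const cache = new ResultCache(['foo.txt', 'foobar.js', 'baz.md'], '/project');

    // Miss: nothing is cached yet, so the full file list is the search base.
    let { files, isExactMatch } = await cache.get('foo'); // all three files, false
    cache.set('foo', ['foo.txt', 'foobar.js']); // the caller stores the filtered result

    // 'foobar' starts with the cached key 'foo', so only those two files
    // come back as the base set instead of the whole crawl.
    ({ files, isExactMatch } = await cache.get('foobar')); // ['foo.txt', 'foobar.js'], false

    // After the caller sets the specific query, it becomes an exact hit.
    cache.set('foobar', ['foobar.js']);
    ({ files, isExactMatch } = await cache.get('foobar')); // ['foobar.js'], true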
