author     Bryant Chandler <[email protected]>  2025-08-05 16:18:03 -0700
committer  GitHub <[email protected]>            2025-08-05 23:18:03 +0000
commit     12a9bc3ed94fab3071529b5304d46bcc5b4fe756 (patch)
tree       90967b6670668c6c476719ac04422e1744cbabd6 /packages/core/src
parent     2141b39c3d713a19f2dd8012a76c2ff8b7c30a5e (diff)
feat(core, cli): Introduce high-performance FileSearch engine (#5136)
Co-authored-by: Jacob Richman <[email protected]>
Diffstat (limited to 'packages/core/src')
-rw-r--r--  packages/core/src/index.ts                                 1
-rw-r--r--  packages/core/src/utils/filesearch/crawlCache.test.ts    112
-rw-r--r--  packages/core/src/utils/filesearch/crawlCache.ts          65
-rw-r--r--  packages/core/src/utils/filesearch/fileSearch.test.ts    642
-rw-r--r--  packages/core/src/utils/filesearch/fileSearch.ts         269
-rw-r--r--  packages/core/src/utils/filesearch/ignore.test.ts         65
-rw-r--r--  packages/core/src/utils/filesearch/ignore.ts              93
-rw-r--r--  packages/core/src/utils/filesearch/result-cache.test.ts   56
-rw-r--r--  packages/core/src/utils/filesearch/result-cache.ts        70
9 files changed, 1373 insertions, 0 deletions
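For orientation, here is a minimal usage sketch of the new engine as added in this commit (the relative import path and the option values are illustrative; the API shown is the one defined in fileSearch.ts below):

  import { FileSearch } from './utils/filesearch/fileSearch.js';

  const fileSearch = new FileSearch({
    projectRoot: process.cwd(), // directory to crawl
    useGitignore: true,         // respect .gitignore
    useGeminiignore: true,      // respect .geminiignore
    ignoreDirs: [],             // extra directories to exclude
    cache: true,                // reuse crawl results across instances
    cacheTtl: 30,               // crawl-cache TTL, in seconds
  });

  await fileSearch.initialize(); // crawl the project and build the result cache
  const matches = await fileSearch.search('src/**/*.ts', { maxResults: 20 });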
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index d7dfd90f..e60bd048 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -40,6 +40,7 @@ export * from './utils/shell-utils.js';
 export * from './utils/systemEncoding.js';
 export * from './utils/textUtils.js';
 export * from './utils/formatters.js';
+export * from './utils/filesearch/fileSearch.js';

 // Export services
 export * from './services/fileDiscoveryService.js';
diff --git a/packages/core/src/utils/filesearch/crawlCache.test.ts b/packages/core/src/utils/filesearch/crawlCache.test.ts
new file mode 100644
index 00000000..2feab61a
--- /dev/null
+++ b/packages/core/src/utils/filesearch/crawlCache.test.ts
@@ -0,0 +1,112 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, afterEach, beforeEach } from 'vitest';
+import { getCacheKey, read, write, clear } from './crawlCache.js';
+
+describe('CrawlCache', () => {
+ describe('getCacheKey', () => {
+ it('should generate a consistent hash', () => {
+ const key1 = getCacheKey('/foo', 'bar');
+ const key2 = getCacheKey('/foo', 'bar');
+ expect(key1).toBe(key2);
+ });
+
+ it('should generate a different hash for different directories', () => {
+ const key1 = getCacheKey('/foo', 'bar');
+ const key2 = getCacheKey('/bar', 'bar');
+ expect(key1).not.toBe(key2);
+ });
+
+ it('should generate a different hash for different ignore content', () => {
+ const key1 = getCacheKey('/foo', 'bar');
+ const key2 = getCacheKey('/foo', 'baz');
+ expect(key1).not.toBe(key2);
+ });
+ });
+
+ describe('in-memory cache operations', () => {
+ beforeEach(() => {
+ // Ensure a clean slate before each test
+ clear();
+ });
+
+ afterEach(() => {
+ // Restore real timers after each test that uses fake ones
+ vi.useRealTimers();
+ });
+
+ it('should write and read data from the cache', () => {
+ const key = 'test-key';
+ const data = ['foo', 'bar'];
+ write(key, data, 10000); // 10 second TTL
+ const cachedData = read(key);
+ expect(cachedData).toEqual(data);
+ });
+
+ it('should return undefined for a nonexistent key', () => {
+ const cachedData = read('nonexistent-key');
+ expect(cachedData).toBeUndefined();
+ });
+
+ it('should clear the cache', () => {
+ const key = 'test-key';
+ const data = ['foo', 'bar'];
+ write(key, data, 10000);
+ clear();
+ const cachedData = read(key);
+ expect(cachedData).toBeUndefined();
+ });
+
+ it('should automatically evict a cache entry after its TTL expires', async () => {
+ vi.useFakeTimers();
+ const key = 'ttl-key';
+ const data = ['foo'];
+ const ttl = 5000; // 5 seconds
+
+ write(key, data, ttl);
+
+ // Should exist immediately after writing
+ expect(read(key)).toEqual(data);
+
+ // Advance time just before expiration
+ await vi.advanceTimersByTimeAsync(ttl - 1);
+ expect(read(key)).toEqual(data);
+
+ // Advance time past expiration
+ await vi.advanceTimersByTimeAsync(1);
+ expect(read(key)).toBeUndefined();
+ });
+
+ it('should reset the timer when an entry is updated', async () => {
+ vi.useFakeTimers();
+ const key = 'update-key';
+ const initialData = ['initial'];
+ const updatedData = ['updated'];
+ const ttl = 5000; // 5 seconds
+
+ // Write initial data
+ write(key, initialData, ttl);
+
+ // Advance time, but not enough to expire
+ await vi.advanceTimersByTimeAsync(3000);
+ expect(read(key)).toEqual(initialData);
+
+ // Update the data, which should reset the timer
+ write(key, updatedData, ttl);
+ expect(read(key)).toEqual(updatedData);
+
+ // Advance time again. If the timer wasn't reset, the total elapsed
+ // time (3000 + 3000 = 6000) would cause an eviction.
+ await vi.advanceTimersByTimeAsync(3000);
+ expect(read(key)).toEqual(updatedData);
+
+ // Advance past the new expiration time
+ await vi.advanceTimersByTimeAsync(2001);
+ expect(read(key)).toBeUndefined();
+ });
+ });
+});
diff --git a/packages/core/src/utils/filesearch/crawlCache.ts b/packages/core/src/utils/filesearch/crawlCache.ts
new file mode 100644
index 00000000..3cc948c6
--- /dev/null
+++ b/packages/core/src/utils/filesearch/crawlCache.ts
@@ -0,0 +1,65 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import crypto from 'node:crypto';
+
+const crawlCache = new Map<string, string[]>();
+const cacheTimers = new Map<string, NodeJS.Timeout>();
+
+/**
+ * Generates a unique cache key based on the project directory and the content
+ * of ignore files. This ensures that the cache is invalidated if the project
+ * or ignore rules change.
+ */
+export const getCacheKey = (
+ directory: string,
+ ignoreContent: string,
+): string => {
+ const hash = crypto.createHash('sha256');
+ hash.update(directory);
+ hash.update(ignoreContent);
+ return hash.digest('hex');
+};
+
+/**
+ * Reads cached data from the in-memory cache.
+ * Returns undefined if the key is not found.
+ */
+export const read = (key: string): string[] | undefined => crawlCache.get(key);
+
+/**
+ * Writes data to the in-memory cache and sets a timer to evict it after the TTL.
+ */
+export const write = (key: string, results: string[], ttlMs: number): void => {
+ // Clear any existing timer for this key to prevent premature deletion
+ if (cacheTimers.has(key)) {
+ clearTimeout(cacheTimers.get(key)!);
+ }
+
+ // Store the new data
+ crawlCache.set(key, results);
+
+ // Set a timer to automatically delete the cache entry after the TTL
+ const timerId = setTimeout(() => {
+ crawlCache.delete(key);
+ cacheTimers.delete(key);
+ }, ttlMs);
+
+ // Store the timer handle so we can clear it if the entry is updated
+ cacheTimers.set(key, timerId);
+};
+
+/**
+ * Clears the entire cache and all active timers.
+ * Primarily used for testing.
+ */
+export const clear = (): void => {
+ for (const timerId of cacheTimers.values()) {
+ clearTimeout(timerId);
+ }
+ crawlCache.clear();
+ cacheTimers.clear();
+};
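A small sketch of how this crawl cache is consumed (it mirrors the calls made from fileSearch.ts below; the literal values are illustrative):

  import { getCacheKey, read, write } from './crawlCache.js';

  const key = getCacheKey('/my/project', 'node_modules/\ndist/'); // directory + ignore fingerprint
  let files = read(key);               // undefined on a cold cache
  if (!files) {
    files = ['src/', 'src/main.ts'];   // normally the output of a crawl
    write(key, files, 30_000);         // evicted automatically after 30 seconds
  }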
diff --git a/packages/core/src/utils/filesearch/fileSearch.test.ts b/packages/core/src/utils/filesearch/fileSearch.test.ts
new file mode 100644
index 00000000..b804d623
--- /dev/null
+++ b/packages/core/src/utils/filesearch/fileSearch.test.ts
@@ -0,0 +1,642 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import * as fs from 'fs/promises';
+import * as path from 'path';
+import * as cache from './crawlCache.js';
+import { FileSearch, AbortError, filter } from './fileSearch.js';
+import { createTmpDir, cleanupTmpDir } from '@google/gemini-cli-test-utils';
+
+type FileSearchWithPrivateMethods = FileSearch & {
+ performCrawl: () => Promise<void>;
+};
+
+describe('FileSearch', () => {
+ let tmpDir: string;
+ afterEach(async () => {
+ if (tmpDir) {
+ await cleanupTmpDir(tmpDir);
+ }
+ vi.restoreAllMocks();
+ });
+
+ it('should use .geminiignore rules', async () => {
+ tmpDir = await createTmpDir({
+ '.geminiignore': 'dist/',
+ dist: ['ignored.js'],
+ src: ['not-ignored.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: true,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual(['src/', '.geminiignore', 'src/not-ignored.js']);
+ });
+
+ it('should combine .gitignore and .geminiignore rules', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': 'dist/',
+ '.geminiignore': 'build/',
+ dist: ['ignored-by-git.js'],
+ build: ['ignored-by-gemini.js'],
+ src: ['not-ignored.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: true,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual([
+ 'src/',
+ '.geminiignore',
+ '.gitignore',
+ 'src/not-ignored.js',
+ ]);
+ });
+
+ it('should use ignoreDirs option', async () => {
+ tmpDir = await createTmpDir({
+ logs: ['some.log'],
+ src: ['main.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: ['logs'],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual(['src/', 'src/main.js']);
+ });
+
+ it('should handle negated directories', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': ['build/**', '!build/public', '!build/public/**'].join(
+ '\n',
+ ),
+ build: {
+ 'private.js': '',
+ public: ['index.html'],
+ },
+ src: ['main.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual([
+ 'build/',
+ 'build/public/',
+ 'src/',
+ '.gitignore',
+ 'build/public/index.html',
+ 'src/main.js',
+ ]);
+ });
+
+ it('should filter results with a search pattern', async () => {
+ tmpDir = await createTmpDir({
+ src: {
+ 'main.js': '',
+ 'util.ts': '',
+ 'style.css': '',
+ },
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('**/*.js');
+
+ expect(results).toEqual(['src/main.js']);
+ });
+
+ it('should handle root-level file negation', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': ['*.mk', '!Foo.mk'].join('\n'),
+ 'bar.mk': '',
+ 'Foo.mk': '',
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual(['.gitignore', 'Foo.mk']);
+ });
+
+ it('should handle directory negation with glob', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': [
+ 'third_party/**',
+ '!third_party/foo',
+ '!third_party/foo/bar',
+ '!third_party/foo/bar/baz_buffer',
+ ].join('\n'),
+ third_party: {
+ foo: {
+ bar: {
+ baz_buffer: '',
+ },
+ },
+ ignore_this: '',
+ },
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual([
+ 'third_party/',
+ 'third_party/foo/',
+ 'third_party/foo/bar/',
+ '.gitignore',
+ 'third_party/foo/bar/baz_buffer',
+ ]);
+ });
+
+ it('should correctly handle negated patterns in .gitignore', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': ['dist/**', '!dist/keep.js'].join('\n'),
+ dist: ['ignore.js', 'keep.js'],
+ src: ['main.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual([
+ 'dist/',
+ 'src/',
+ '.gitignore',
+ 'dist/keep.js',
+ 'src/main.js',
+ ]);
+ });
+
+ // New test cases start here
+
+ it('should initialize correctly when ignore files are missing', async () => {
+ tmpDir = await createTmpDir({
+ src: ['file1.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: true,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ // Expect no errors to be thrown during initialization
+ await expect(fileSearch.initialize()).resolves.toBeUndefined();
+ const results = await fileSearch.search('');
+ expect(results).toEqual(['src/', 'src/file1.js']);
+ });
+
+ it('should respect maxResults option in search', async () => {
+ tmpDir = await createTmpDir({
+ src: {
+ 'file1.js': '',
+ 'file2.js': '',
+ 'file3.js': '',
+ 'file4.js': '',
+ },
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('**/*.js', { maxResults: 2 });
+
+ expect(results).toEqual(['src/file1.js', 'src/file2.js']); // Assuming alphabetical sort
+ });
+
+ it('should return empty array when no matches are found', async () => {
+ tmpDir = await createTmpDir({
+ src: ['file1.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('nonexistent-file.xyz');
+
+ expect(results).toEqual([]);
+ });
+
+ it('should throw AbortError when filter is aborted', async () => {
+ const controller = new AbortController();
+ const dummyPaths = Array.from({ length: 5000 }, (_, i) => `file${i}.js`); // Large array to ensure yielding
+
+ const filterPromise = filter(dummyPaths, '*.js', controller.signal);
+
+ // Abort after a short delay to ensure filter has started
+ setTimeout(() => controller.abort(), 1);
+
+ await expect(filterPromise).rejects.toThrow(AbortError);
+ });
+
+ describe('with in-memory cache', () => {
+ beforeEach(() => {
+ cache.clear();
+ });
+
+ afterEach(() => {
+ vi.useRealTimers();
+ });
+
+ it('should throw an error if search is called before initialization', async () => {
+ tmpDir = await createTmpDir({});
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await expect(fileSearch.search('')).rejects.toThrow(
+ 'Engine not initialized. Call initialize() first.',
+ );
+ });
+
+ it('should hit the cache for subsequent searches', async () => {
+ tmpDir = await createTmpDir({ 'file1.js': '' });
+ const getOptions = () => ({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: true,
+ cacheTtl: 10,
+ });
+
+ const fs1 = new FileSearch(getOptions());
+ const crawlSpy1 = vi.spyOn(
+ fs1 as FileSearchWithPrivateMethods,
+ 'performCrawl',
+ );
+ await fs1.initialize();
+ expect(crawlSpy1).toHaveBeenCalledTimes(1);
+
+ // Second search should hit the cache because the options are identical
+ const fs2 = new FileSearch(getOptions());
+ const crawlSpy2 = vi.spyOn(
+ fs2 as FileSearchWithPrivateMethods,
+ 'performCrawl',
+ );
+ await fs2.initialize();
+ expect(crawlSpy2).not.toHaveBeenCalled();
+ });
+
+ it('should miss the cache when ignore rules change', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': 'a.txt',
+ 'a.txt': '',
+ 'b.txt': '',
+ });
+ const options = {
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: true,
+ cacheTtl: 10000,
+ };
+
+ // Initial search to populate the cache
+ const fs1 = new FileSearch(options);
+ const crawlSpy1 = vi.spyOn(
+ fs1 as FileSearchWithPrivateMethods,
+ 'performCrawl',
+ );
+ await fs1.initialize();
+ const results1 = await fs1.search('');
+ expect(crawlSpy1).toHaveBeenCalledTimes(1);
+ expect(results1).toEqual(['.gitignore', 'b.txt']);
+
+ // Modify the ignore file
+ await fs.writeFile(path.join(tmpDir, '.gitignore'), 'b.txt');
+
+ // Second search should miss the cache and trigger a recrawl
+ const fs2 = new FileSearch(options);
+ const crawlSpy2 = vi.spyOn(
+ fs2 as FileSearchWithPrivateMethods,
+ 'performCrawl',
+ );
+ await fs2.initialize();
+ const results2 = await fs2.search('');
+ expect(crawlSpy2).toHaveBeenCalledTimes(1);
+ expect(results2).toEqual(['.gitignore', 'a.txt']);
+ });
+
+ it('should miss the cache after TTL expires', async () => {
+ vi.useFakeTimers();
+ tmpDir = await createTmpDir({ 'file1.js': '' });
+ const options = {
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: true,
+ cacheTtl: 10, // 10 seconds
+ };
+
+ // Initial search to populate the cache
+ const fs1 = new FileSearch(options);
+ await fs1.initialize();
+
+ // Advance time past the TTL
+ await vi.advanceTimersByTimeAsync(11000);
+
+ // Second search should miss the cache and trigger a recrawl
+ const fs2 = new FileSearch(options);
+ const crawlSpy = vi.spyOn(
+ fs2 as FileSearchWithPrivateMethods,
+ 'performCrawl',
+ );
+ await fs2.initialize();
+
+ expect(crawlSpy).toHaveBeenCalledTimes(1);
+ });
+ });
+
+ it('should handle empty or commented-only ignore files', async () => {
+ tmpDir = await createTmpDir({
+ '.gitignore': '# This is a comment\n\n \n',
+ src: ['main.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: true,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual(['src/', '.gitignore', 'src/main.js']);
+ });
+
+ it('should always ignore the .git directory', async () => {
+ tmpDir = await createTmpDir({
+ '.git': ['config', 'HEAD'],
+ src: ['main.js'],
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false, // Explicitly disable .gitignore to isolate this rule
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+ const results = await fileSearch.search('');
+
+ expect(results).toEqual(['src/', 'src/main.js']);
+ });
+
+ it('should be cancellable via AbortSignal', async () => {
+ const largeDir: Record<string, string> = {};
+ for (let i = 0; i < 100; i++) {
+ largeDir[`file${i}.js`] = '';
+ }
+ tmpDir = await createTmpDir(largeDir);
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+
+ const controller = new AbortController();
+ const searchPromise = fileSearch.search('**/*.js', {
+ signal: controller.signal,
+ });
+
+ // Yield to allow the search to start before aborting.
+ await new Promise((resolve) => setImmediate(resolve));
+
+ controller.abort();
+
+ await expect(searchPromise).rejects.toThrow(AbortError);
+ });
+
+ it('should leverage ResultCache for bestBaseQuery optimization', async () => {
+ tmpDir = await createTmpDir({
+ src: {
+ 'foo.js': '',
+ 'bar.ts': '',
+ nested: {
+ 'baz.js': '',
+ },
+ },
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: true, // Enable caching for this test
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+
+ // Perform a broad search to prime the cache
+ const broadResults = await fileSearch.search('src/**');
+ expect(broadResults).toEqual([
+ 'src/',
+ 'src/nested/',
+ 'src/bar.ts',
+ 'src/foo.js',
+ 'src/nested/baz.js',
+ ]);
+
+ // Perform a more specific search that should leverage the broad search's cached results
+ const specificResults = await fileSearch.search('src/**/*.js');
+ expect(specificResults).toEqual(['src/foo.js', 'src/nested/baz.js']);
+
+ // Although we can't directly inspect ResultCache.hits/misses from here,
+ // the correctness of specificResults after a broad search implicitly
+ // verifies that the caching mechanism, including bestBaseQuery, is working.
+ });
+
+ it('should be case-insensitive by default', async () => {
+ tmpDir = await createTmpDir({
+ 'File1.Js': '',
+ 'file2.js': '',
+ 'FILE3.JS': '',
+ 'other.txt': '',
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: false,
+ cacheTtl: 0,
+ });
+
+ await fileSearch.initialize();
+
+ // Search with a lowercase pattern
+ let results = await fileSearch.search('file*.js');
+ expect(results).toHaveLength(3);
+ expect(results).toEqual(
+ expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+ );
+
+ // Search with an uppercase pattern
+ results = await fileSearch.search('FILE*.JS');
+ expect(results).toHaveLength(3);
+ expect(results).toEqual(
+ expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+ );
+
+ // Search with a mixed-case pattern
+ results = await fileSearch.search('FiLe*.Js');
+ expect(results).toHaveLength(3);
+ expect(results).toEqual(
+ expect.arrayContaining(['File1.Js', 'file2.js', 'FILE3.JS']),
+ );
+ });
+
+ it('should respect maxResults even when the cache returns an exact match', async () => {
+ tmpDir = await createTmpDir({
+ 'file1.js': '',
+ 'file2.js': '',
+ 'file3.js': '',
+ 'file4.js': '',
+ 'file5.js': '',
+ });
+
+ const fileSearch = new FileSearch({
+ projectRoot: tmpDir,
+ useGitignore: false,
+ useGeminiignore: false,
+ ignoreDirs: [],
+ cache: true, // Ensure caching is enabled
+ cacheTtl: 10000,
+ });
+
+ await fileSearch.initialize();
+
+ // 1. Perform a broad search to populate the cache with an exact match.
+ const initialResults = await fileSearch.search('*.js');
+ expect(initialResults).toEqual([
+ 'file1.js',
+ 'file2.js',
+ 'file3.js',
+ 'file4.js',
+ 'file5.js',
+ ]);
+
+ // 2. Perform the same search again, but this time with a maxResults limit.
+ const limitedResults = await fileSearch.search('*.js', { maxResults: 2 });
+
+ // 3. Assert that the maxResults limit was respected, even with a cache hit.
+ expect(limitedResults).toEqual(['file1.js', 'file2.js']);
+ });
+});
diff --git a/packages/core/src/utils/filesearch/fileSearch.ts b/packages/core/src/utils/filesearch/fileSearch.ts
new file mode 100644
index 00000000..5915821a
--- /dev/null
+++ b/packages/core/src/utils/filesearch/fileSearch.ts
@@ -0,0 +1,269 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import path from 'node:path';
+import fs from 'node:fs';
+import { fdir } from 'fdir';
+import picomatch from 'picomatch';
+import { Ignore } from './ignore.js';
+import { ResultCache } from './result-cache.js';
+import * as cache from './crawlCache.js';
+
+export type FileSearchOptions = {
+ projectRoot: string;
+ ignoreDirs: string[];
+ useGitignore: boolean;
+ useGeminiignore: boolean;
+ cache: boolean;
+ cacheTtl: number;
+};
+
+export class AbortError extends Error {
+ constructor(message = 'Search aborted') {
+ super(message);
+ this.name = 'AbortError';
+ }
+}
+
+/**
+ * Filters a list of paths based on a given pattern.
+ * @param allPaths The list of all paths to filter.
+ * @param pattern The picomatch pattern to filter by.
+ * @param signal An AbortSignal to cancel the operation.
+ * @returns A promise that resolves to the filtered and sorted list of paths.
+ */
+export async function filter(
+ allPaths: string[],
+ pattern: string,
+ signal: AbortSignal | undefined,
+): Promise<string[]> {
+ const patternFilter = picomatch(pattern, {
+ dot: true,
+ contains: true,
+ nocase: true,
+ });
+
+ const results: string[] = [];
+ for (const [i, p] of allPaths.entries()) {
+ // Yield control to the event loop periodically to prevent blocking.
+ if (i % 1000 === 0) {
+ await new Promise((resolve) => setImmediate(resolve));
+ if (signal?.aborted) {
+ throw new AbortError();
+ }
+ }
+
+ if (patternFilter(p)) {
+ results.push(p);
+ }
+ }
+
+ results.sort((a, b) => {
+ const aIsDir = a.endsWith('/');
+ const bIsDir = b.endsWith('/');
+
+ if (aIsDir && !bIsDir) return -1;
+ if (!aIsDir && bIsDir) return 1;
+
+    // This is 40% faster than localeCompare, and the only thing we would
+    // really gain from localeCompare is locale-aware ordering.
+ return a < b ? -1 : a > b ? 1 : 0;
+ });
+
+ return results;
+}
+
+export type SearchOptions = {
+ signal?: AbortSignal;
+ maxResults?: number;
+};
+
+/**
+ * Provides a fast and efficient way to search for files within a project,
+ * respecting .gitignore and .geminiignore rules, and utilizing caching
+ * for improved performance.
+ */
+export class FileSearch {
+ private readonly absoluteDir: string;
+ private readonly ignore: Ignore = new Ignore();
+ private resultCache: ResultCache | undefined;
+ private allFiles: string[] = [];
+
+ /**
+ * Constructs a new `FileSearch` instance.
+ * @param options Configuration options for the file search.
+ */
+ constructor(private readonly options: FileSearchOptions) {
+ this.absoluteDir = path.resolve(options.projectRoot);
+ }
+
+ /**
+ * Initializes the file search engine by loading ignore rules, crawling the
+ * file system, and building the in-memory cache. This method must be called
+ * before performing any searches.
+ */
+ async initialize(): Promise<void> {
+ this.loadIgnoreRules();
+ await this.crawlFiles();
+ this.buildResultCache();
+ }
+
+ /**
+ * Searches for files matching a given pattern.
+ * @param pattern The picomatch pattern to search for (e.g., '*.js', 'src/**').
+ * @param options Search options, including an AbortSignal and maxResults.
+ * @returns A promise that resolves to a list of matching file paths, relative
+ * to the project root.
+ */
+ async search(
+ pattern: string,
+ options: SearchOptions = {},
+ ): Promise<string[]> {
+ if (!this.resultCache) {
+ throw new Error('Engine not initialized. Call initialize() first.');
+ }
+
+ pattern = pattern || '*';
+
+ const { files: candidates, isExactMatch } =
+ await this.resultCache!.get(pattern);
+
+ let filteredCandidates;
+ if (isExactMatch) {
+ filteredCandidates = candidates;
+ } else {
+ // Apply the user's picomatch pattern filter
+ filteredCandidates = await filter(candidates, pattern, options.signal);
+ this.resultCache!.set(pattern, filteredCandidates);
+ }
+
+ // Trade-off: We apply a two-stage filtering process.
+ // 1. During the file system crawl (`performCrawl`), we only apply directory-level
+ // ignore rules (e.g., `node_modules/`, `dist/`). This is because applying
+ // a full ignore filter (which includes file-specific patterns like `*.log`)
+ // during the crawl can significantly slow down `fdir`.
+ // 2. Here, in the `search` method, we apply the full ignore filter
+ // (including file patterns) to the `filteredCandidates` (which have already
+ // been filtered by the user's search pattern and sorted). For autocomplete,
+ // the number of displayed results is small (MAX_SUGGESTIONS_TO_SHOW),
+ // so applying the full filter to this truncated list is much more efficient
+ // than applying it to every file during the initial crawl.
+ const fileFilter = this.ignore.getFileFilter();
+ const results: string[] = [];
+ for (const [i, candidate] of filteredCandidates.entries()) {
+ // Yield to the event loop to avoid blocking on large result sets.
+ if (i % 1000 === 0) {
+ await new Promise((resolve) => setImmediate(resolve));
+ if (options.signal?.aborted) {
+ throw new AbortError();
+ }
+ }
+
+ if (results.length >= (options.maxResults ?? Infinity)) {
+ break;
+ }
+ // The `ignore` library throws an error if the path is '.', so we skip it.
+ if (candidate === '.') {
+ continue;
+ }
+ if (!fileFilter(candidate)) {
+ results.push(candidate);
+ }
+ }
+ return results;
+ }
+
+ /**
+ * Loads ignore rules from .gitignore and .geminiignore files, and applies
+ * any additional ignore directories specified in the options.
+ */
+ private loadIgnoreRules(): void {
+ if (this.options.useGitignore) {
+ const gitignorePath = path.join(this.absoluteDir, '.gitignore');
+ if (fs.existsSync(gitignorePath)) {
+ this.ignore.add(fs.readFileSync(gitignorePath, 'utf8'));
+ }
+ }
+
+ if (this.options.useGeminiignore) {
+ const geminiignorePath = path.join(this.absoluteDir, '.geminiignore');
+ if (fs.existsSync(geminiignorePath)) {
+ this.ignore.add(fs.readFileSync(geminiignorePath, 'utf8'));
+ }
+ }
+
+ const ignoreDirs = ['.git', ...this.options.ignoreDirs];
+ this.ignore.add(
+ ignoreDirs.map((dir) => {
+ if (dir.endsWith('/')) {
+ return dir;
+ }
+ return `${dir}/`;
+ }),
+ );
+ }
+
+ /**
+ * Crawls the file system to get a list of all files and directories,
+ * optionally using a cache for faster initialization.
+ */
+ private async crawlFiles(): Promise<void> {
+ if (this.options.cache) {
+ const cacheKey = cache.getCacheKey(
+ this.absoluteDir,
+ this.ignore.getFingerprint(),
+ );
+ const cachedResults = cache.read(cacheKey);
+
+ if (cachedResults) {
+ this.allFiles = cachedResults;
+ return;
+ }
+ }
+
+ this.allFiles = await this.performCrawl();
+
+ if (this.options.cache) {
+ const cacheKey = cache.getCacheKey(
+ this.absoluteDir,
+ this.ignore.getFingerprint(),
+ );
+ cache.write(cacheKey, this.allFiles, this.options.cacheTtl * 1000);
+ }
+ }
+
+ /**
+ * Performs the actual file system crawl using `fdir`, applying directory
+ * ignore rules.
+ * @returns A promise that resolves to a list of all files and directories.
+ */
+ private async performCrawl(): Promise<string[]> {
+ const dirFilter = this.ignore.getDirectoryFilter();
+
+ // We use `fdir` for fast file system traversal. A key performance
+ // optimization for large workspaces is to exclude entire directories
+ // early in the traversal process. This is why we apply directory-specific
+ // ignore rules (e.g., `node_modules/`, `dist/`) directly to `fdir`'s
+ // exclude filter.
+ const api = new fdir()
+ .withRelativePaths()
+ .withDirs()
+ .withPathSeparator('/') // Always use unix style paths
+ .exclude((_, dirPath) => {
+ const relativePath = path.relative(this.absoluteDir, dirPath);
+ return dirFilter(`${relativePath}/`);
+ });
+
+ return api.crawl(this.absoluteDir).withPromise();
+ }
+
+ /**
+ * Builds the in-memory cache for fast pattern matching.
+ */
+ private buildResultCache(): void {
+ this.resultCache = new ResultCache(this.allFiles, this.absoluteDir);
+ }
+}
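As a quick illustration of the cancellation path in search() above (a sketch only; the project path and timeout are arbitrary):

  import { FileSearch, AbortError } from './fileSearch.js';

  const fileSearch = new FileSearch({
    projectRoot: '/big/repo',
    useGitignore: true,
    useGeminiignore: false,
    ignoreDirs: [],
    cache: false,
    cacheTtl: 0,
  });
  await fileSearch.initialize();

  const controller = new AbortController();
  const pending = fileSearch.search('**/*', { signal: controller.signal });
  setTimeout(() => controller.abort(), 50); // give up after 50ms

  try {
    console.log(await pending);
  } catch (err) {
    if (err instanceof AbortError) {
      console.log('search cancelled');
    } else {
      throw err;
    }
  }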
diff --git a/packages/core/src/utils/filesearch/ignore.test.ts b/packages/core/src/utils/filesearch/ignore.test.ts
new file mode 100644
index 00000000..ff375e3f
--- /dev/null
+++ b/packages/core/src/utils/filesearch/ignore.test.ts
@@ -0,0 +1,65 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect } from 'vitest';
+import { Ignore } from './ignore.js';
+
+describe('Ignore', () => {
+ describe('getDirectoryFilter', () => {
+ it('should ignore directories matching directory patterns', () => {
+ const ig = new Ignore().add(['foo/', 'bar/']);
+ const dirFilter = ig.getDirectoryFilter();
+ expect(dirFilter('foo/')).toBe(true);
+ expect(dirFilter('bar/')).toBe(true);
+ expect(dirFilter('baz/')).toBe(false);
+ });
+
+ it('should not ignore directories with file patterns', () => {
+ const ig = new Ignore().add(['foo.js', '*.log']);
+ const dirFilter = ig.getDirectoryFilter();
+ expect(dirFilter('foo.js')).toBe(false);
+ expect(dirFilter('foo.log')).toBe(false);
+ });
+ });
+
+ describe('getFileFilter', () => {
+ it('should not ignore files with directory patterns', () => {
+ const ig = new Ignore().add(['foo/', 'bar/']);
+ const fileFilter = ig.getFileFilter();
+ expect(fileFilter('foo')).toBe(false);
+ expect(fileFilter('foo/file.txt')).toBe(false);
+ });
+
+ it('should ignore files matching file patterns', () => {
+ const ig = new Ignore().add(['*.log', 'foo.js']);
+ const fileFilter = ig.getFileFilter();
+ expect(fileFilter('foo.log')).toBe(true);
+ expect(fileFilter('foo.js')).toBe(true);
+ expect(fileFilter('bar.txt')).toBe(false);
+ });
+ });
+
+ it('should accumulate patterns across multiple add() calls', () => {
+ const ig = new Ignore().add('foo.js');
+ ig.add('bar.js');
+ const fileFilter = ig.getFileFilter();
+ expect(fileFilter('foo.js')).toBe(true);
+ expect(fileFilter('bar.js')).toBe(true);
+ expect(fileFilter('baz.js')).toBe(false);
+ });
+
+ it('should return a stable and consistent fingerprint', () => {
+ const ig1 = new Ignore().add(['foo', '!bar']);
+ const ig2 = new Ignore().add('foo\n!bar');
+
+ // Fingerprints should be identical for the same rules.
+ expect(ig1.getFingerprint()).toBe(ig2.getFingerprint());
+
+ // Adding a new rule should change the fingerprint.
+ ig2.add('baz');
+ expect(ig1.getFingerprint()).not.toBe(ig2.getFingerprint());
+ });
+});
diff --git a/packages/core/src/utils/filesearch/ignore.ts b/packages/core/src/utils/filesearch/ignore.ts
new file mode 100644
index 00000000..9f756f93
--- /dev/null
+++ b/packages/core/src/utils/filesearch/ignore.ts
@@ -0,0 +1,93 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import ignore from 'ignore';
+import picomatch from 'picomatch';
+
+const hasFileExtension = picomatch('**/*[*.]*');
+
+export class Ignore {
+ private readonly allPatterns: string[] = [];
+ private dirIgnorer = ignore();
+ private fileIgnorer = ignore();
+
+ /**
+ * Adds one or more ignore patterns.
+ * @param patterns A single pattern string or an array of pattern strings.
+ * Each pattern can be a glob-like string similar to .gitignore rules.
+ * @returns The `Ignore` instance for chaining.
+ */
+ add(patterns: string | string[]): this {
+ if (typeof patterns === 'string') {
+ patterns = patterns.split(/\r?\n/);
+ }
+
+ for (const p of patterns) {
+ const pattern = p.trim();
+
+ if (pattern === '' || pattern.startsWith('#')) {
+ continue;
+ }
+
+ this.allPatterns.push(pattern);
+
+ const isPositiveDirPattern =
+ pattern.endsWith('/') && !pattern.startsWith('!');
+
+ if (isPositiveDirPattern) {
+ this.dirIgnorer.add(pattern);
+ } else {
+ // An ambiguous pattern (e.g., "build") could match a file or a
+ // directory. To optimize the file system crawl, we use a heuristic:
+ // patterns without a dot in the last segment are included in the
+ // directory exclusion check.
+ //
+ // This heuristic can fail. For example, an ignore pattern of "my.assets"
+ // intended to exclude a directory will not be treated as a directory
+ // pattern because it contains a ".". This results in crawling a
+ // directory that should have been excluded, reducing efficiency.
+ // Correctness is still maintained. The incorrectly crawled directory
+ // will be filtered out by the final ignore check.
+ //
+ // For maximum crawl efficiency, users should explicitly mark directory
+ // patterns with a trailing slash (e.g., "my.assets/").
+ this.fileIgnorer.add(pattern);
+ if (!hasFileExtension(pattern)) {
+ this.dirIgnorer.add(pattern);
+ }
+ }
+ }
+
+ return this;
+ }
+
+ /**
+ * Returns a predicate that matches explicit directory ignore patterns (patterns ending with '/').
+ * @returns {(dirPath: string) => boolean}
+ */
+ getDirectoryFilter(): (dirPath: string) => boolean {
+ return (dirPath: string) => this.dirIgnorer.ignores(dirPath);
+ }
+
+ /**
+ * Returns a predicate that matches file ignore patterns (all patterns not ending with '/').
+ * Note: This may also match directories if a file pattern matches a directory name, but all explicit directory patterns are handled by getDirectoryFilter.
+ * @returns {(filePath: string) => boolean}
+ */
+ getFileFilter(): (filePath: string) => boolean {
+ return (filePath: string) => this.fileIgnorer.ignores(filePath);
+ }
+
+ /**
+ * Returns a string representing the current set of ignore patterns.
+ * This can be used to generate a unique identifier for the ignore configuration,
+ * useful for caching purposes.
+ * @returns A string fingerprint of the ignore patterns.
+ */
+ getFingerprint(): string {
+ return this.allPatterns.join('\n');
+ }
+}
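A short sketch of the directory/file split this class implements (values illustrative; behaviour follows the rules above and ignore.test.ts earlier in the diff):

  import { Ignore } from './ignore.js';

  const ig = new Ignore().add(['node_modules/', '*.log']);

  const dirFilter = ig.getDirectoryFilter();
  dirFilter('node_modules/'); // true  - explicit directory pattern, pruned during the crawl
  dirFilter('src/');          // false

  const fileFilter = ig.getFileFilter();
  fileFilter('debug.log');    // true  - matches *.log at search time
  fileFilter('src/main.ts');  // false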
diff --git a/packages/core/src/utils/filesearch/result-cache.test.ts b/packages/core/src/utils/filesearch/result-cache.test.ts
new file mode 100644
index 00000000..0b1b4e17
--- /dev/null
+++ b/packages/core/src/utils/filesearch/result-cache.test.ts
@@ -0,0 +1,56 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import path from 'node:path';
+import { test, expect } from 'vitest';
+import { ResultCache } from './result-cache.js';
+
+test('ResultCache basic usage', async () => {
+ const files = [
+ 'foo.txt',
+ 'bar.js',
+ 'baz.md',
+ 'subdir/file.txt',
+ 'subdir/other.js',
+ 'subdir/nested/file.md',
+ ];
+ const cache = new ResultCache(files, path.resolve('.'));
+ const { files: resultFiles, isExactMatch } = await cache.get('*.js');
+ expect(resultFiles).toEqual(files);
+ expect(isExactMatch).toBe(false);
+});
+
+test('ResultCache cache hit/miss', async () => {
+ const files = ['foo.txt', 'bar.js', 'baz.md'];
+ const cache = new ResultCache(files, path.resolve('.'));
+ // First call: miss
+ const { files: result1Files, isExactMatch: isExactMatch1 } =
+ await cache.get('*.js');
+ expect(result1Files).toEqual(files);
+ expect(isExactMatch1).toBe(false);
+
+ // Simulate FileSearch applying the filter and setting the result
+ cache.set('*.js', ['bar.js']);
+
+ // Second call: hit
+ const { files: result2Files, isExactMatch: isExactMatch2 } =
+ await cache.get('*.js');
+ expect(result2Files).toEqual(['bar.js']);
+ expect(isExactMatch2).toBe(true);
+});
+
+test('ResultCache best base query', async () => {
+ const files = ['foo.txt', 'foobar.js', 'baz.md'];
+ const cache = new ResultCache(files, path.resolve('.'));
+
+ // Cache a broader query
+ cache.set('foo', ['foo.txt', 'foobar.js']);
+
+ // Search for a more specific query that starts with the broader one
+ const { files: resultFiles, isExactMatch } = await cache.get('foobar');
+ expect(resultFiles).toEqual(['foo.txt', 'foobar.js']);
+ expect(isExactMatch).toBe(false);
+});
diff --git a/packages/core/src/utils/filesearch/result-cache.ts b/packages/core/src/utils/filesearch/result-cache.ts
new file mode 100644
index 00000000..77b99aec
--- /dev/null
+++ b/packages/core/src/utils/filesearch/result-cache.ts
@@ -0,0 +1,70 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Implements an in-memory cache for file search results.
+ * This cache optimizes subsequent searches by leveraging previously computed results.
+ */
+export class ResultCache {
+ private readonly cache: Map<string, string[]>;
+ private hits = 0;
+ private misses = 0;
+
+ constructor(
+ private readonly allFiles: string[],
+ private readonly absoluteDir: string,
+ ) {
+ this.cache = new Map();
+ }
+
+ /**
+ * Retrieves cached search results for a given query, or provides a base set
+ * of files to search from.
+ * @param query The search query pattern.
+ * @returns An object containing the files to search and a boolean indicating
+ * if the result is an exact cache hit.
+ */
+ async get(
+ query: string,
+ ): Promise<{ files: string[]; isExactMatch: boolean }> {
+ const isCacheHit = this.cache.has(query);
+
+ if (isCacheHit) {
+ this.hits++;
+ return { files: this.cache.get(query)!, isExactMatch: true };
+ }
+
+ this.misses++;
+
+ // This is the core optimization of the memory cache.
+ // If a user first searches for "foo", and then for "foobar",
+ // we don't need to search through all files again. We can start
+ // from the results of the "foo" search.
+ // This finds the most specific, already-cached query that is a prefix
+ // of the current query.
+ let bestBaseQuery = '';
+ for (const key of this.cache?.keys?.() ?? []) {
+ if (query.startsWith(key) && key.length > bestBaseQuery.length) {
+ bestBaseQuery = key;
+ }
+ }
+
+ const filesToSearch = bestBaseQuery
+ ? this.cache.get(bestBaseQuery)!
+ : this.allFiles;
+
+ return { files: filesToSearch, isExactMatch: false };
+ }
+
+ /**
+ * Stores search results in the cache.
+ * @param query The search query pattern.
+ * @param results The matching file paths to cache.
+ */
+ set(query: string, results: string[]): void {
+ this.cache.set(query, results);
+ }
+}
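Finally, a sketch of the prefix-reuse behaviour documented in get() (it mirrors result-cache.test.ts above; values illustrative):

  import { ResultCache } from './result-cache.js';

  const cache = new ResultCache(['foo.txt', 'foobar.js', 'baz.md'], '/project');

  // Cold cache: every crawled file is a candidate.
  let { files, isExactMatch } = await cache.get('foo');  // files -> all three, isExactMatch -> false
  cache.set('foo', ['foo.txt', 'foobar.js']);            // caller stores the filtered result

  // A longer query starting with 'foo' reuses the cached result as its candidate set.
  ({ files, isExactMatch } = await cache.get('foobar')); // files -> ['foo.txt', 'foobar.js'], false

  // The identical query is an exact hit, so the caller can skip pattern filtering.
  ({ files, isExactMatch } = await cache.get('foo'));    // files -> ['foo.txt', 'foobar.js'], true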