feat(filesearch): Introduce non-recursive file search strategy (#6087)

Co-authored-by: Jacob Richman <[email protected]> Co-authored-by: Bryant Chandler <[email protected]>
author: Bryant Chandler <[email protected]> 2025-08-18 13:43:24 -0700
committer: GitHub <[email protected]> 2025-08-18 20:43:24 +0000
commit: 465ac9f547d0d684439886d1466c1a1133da611d (patch)
tree: b94f00730118784b5b07800db71224816b444bfe /packages/core/src/utils/filesearch/fileSearch.ts
parent: d66ddcd82e09d7b6fbc0226e31d73d38db5cff2a (diff)
1 files changed, 77 insertions, 140 deletions
diff --git a/packages/core/src/utils/filesearch/fileSearch.ts b/packages/core/src/utils/filesearch/fileSearch.ts
index dff8d0ec..fa36dab4 100644
--- a/packages/core/src/utils/filesearch/fileSearch.ts
+++ b/packages/core/src/utils/filesearch/fileSearch.ts
@@ -5,23 +5,22 @@
  */
 
 import path from 'node:path';
-import fs from 'node:fs';
-import { fdir } from 'fdir';
 import picomatch from 'picomatch';
-import { Ignore } from './ignore.js';
+import { Ignore, loadIgnoreRules } from './ignore.js';
 import { ResultCache } from './result-cache.js';
-import * as cache from './crawlCache.js';
+import { crawl } from './crawler.js';
 import { AsyncFzf, FzfResultItem } from 'fzf';
 
-export type FileSearchOptions = {
+export interface FileSearchOptions {
   projectRoot: string;
   ignoreDirs: string[];
   useGitignore: boolean;
   useGeminiignore: boolean;
   cache: boolean;
   cacheTtl: number;
+  enableRecursiveFileSearch: boolean;
   maxDepth?: number;
-};
+}
 
 export class AbortError extends Error {
   constructor(message = 'Search aborted') {
@@ -78,54 +77,42 @@ export async function filter(
   return results;
 }
 
-export type SearchOptions = {
+export interface SearchOptions {
   signal?: AbortSignal;
   maxResults?: number;
-};
+}
 
-/**
- * Provides a fast and efficient way to search for files within a project,
- * respecting .gitignore and .geminiignore rules, and utilizing caching
- * for improved performance.
- */
-export class FileSearch {
-  private readonly absoluteDir: string;
-  private readonly ignore: Ignore = new Ignore();
+export interface FileSearch {
+  initialize(): Promise<void>;
+  search(pattern: string, options?: SearchOptions): Promise<string[]>;
+}
+
+class RecursiveFileSearch implements FileSearch {
+  private ignore: Ignore | undefined;
   private resultCache: ResultCache | undefined;
   private allFiles: string[] = [];
   private fzf: AsyncFzf<string[]> | undefined;
 
-  /**
-   * Constructs a new `FileSearch` instance.
-   * @param options Configuration options for the file search.
-   */
-  constructor(private readonly options: FileSearchOptions) {
-    this.absoluteDir = path.resolve(options.projectRoot);
-  }
+  constructor(private readonly options: FileSearchOptions) {}
 
-  /**
-   * Initializes the file search engine by loading ignore rules, crawling the
-   * file system, and building the in-memory cache. This method must be called
-   * before performing any searches.
-   */
   async initialize(): Promise<void> {
-    this.loadIgnoreRules();
-    await this.crawlFiles();
+    this.ignore = loadIgnoreRules(this.options);
+    this.allFiles = await crawl({
+      crawlDirectory: this.options.projectRoot,
+      cwd: this.options.projectRoot,
+      ignore: this.ignore,
+      cache: this.options.cache,
+      cacheTtl: this.options.cacheTtl,
+      maxDepth: this.options.maxDepth,
+    });
     this.buildResultCache();
   }
 
-  /**
-   * Searches for files matching a given pattern.
-   * @param pattern The picomatch pattern to search for (e.g., '*.js', 'src/**').
-   * @param options Search options, including an AbortSignal and maxResults.
-   * @returns A promise that resolves to a list of matching file paths, relative
-   *          to the project root.
-   */
   async search(
     pattern: string,
     options: SearchOptions = {},
   ): Promise<string[]> {
-    if (!this.resultCache || !this.fzf) {
+    if (!this.resultCache || !this.fzf || !this.ignore) {
       throw new Error('Engine not initialized. Call initialize() first.');
     }
 
@@ -159,21 +146,9 @@ export class FileSearch {
       }
     }
 
-    // Trade-off: We apply a two-stage filtering process.
-    // 1. During the file system crawl (`performCrawl`), we only apply directory-level
-    //    ignore rules (e.g., `node_modules/`, `dist/`). This is because applying
-    //    a full ignore filter (which includes file-specific patterns like `*.log`)
-    //    during the crawl can significantly slow down `fdir`.
-    // 2. Here, in the `search` method, we apply the full ignore filter
-    //    (including file patterns) to the `filteredCandidates` (which have already
-    //    been filtered by the user's search pattern and sorted). For autocomplete,
-    //    the number of displayed results is small (MAX_SUGGESTIONS_TO_SHOW),
-    //    so applying the full filter to this truncated list is much more efficient
-    //    than applying it to every file during the initial crawl.
     const fileFilter = this.ignore.getFileFilter();
     const results: string[] = [];
     for (const [i, candidate] of filteredCandidates.entries()) {
-      // Yield to the event loop to avoid blocking on large result sets.
       if (i % 1000 === 0) {
         await new Promise((resolve) => setImmediate(resolve));
         if (options.signal?.aborted) {
@@ -184,7 +159,6 @@ export class FileSearch {
       if (results.length >= (options.maxResults ?? Infinity)) {
         break;
       }
-      // The `ignore` library throws an error if the path is '.', so we skip it.
       if (candidate === '.') {
         continue;
       }
@@ -195,106 +169,69 @@ export class FileSearch {
     return results;
   }
 
-  /**
-   * Loads ignore rules from .gitignore and .geminiignore files, and applies
-   * any additional ignore directories specified in the options.
-   */
-  private loadIgnoreRules(): void {
-    if (this.options.useGitignore) {
-      const gitignorePath = path.join(this.absoluteDir, '.gitignore');
-      if (fs.existsSync(gitignorePath)) {
-        this.ignore.add(fs.readFileSync(gitignorePath, 'utf8'));
-      }
-    }
-
-    if (this.options.useGeminiignore) {
-      const geminiignorePath = path.join(this.absoluteDir, '.geminiignore');
-      if (fs.existsSync(geminiignorePath)) {
-        this.ignore.add(fs.readFileSync(geminiignorePath, 'utf8'));
-      }
-    }
-
-    const ignoreDirs = ['.git', ...this.options.ignoreDirs];
-    this.ignore.add(
-      ignoreDirs.map((dir) => {
-        if (dir.endsWith('/')) {
-          return dir;
-        }
-        return `${dir}/`;
-      }),
-    );
+  private buildResultCache(): void {
+    this.resultCache = new ResultCache(this.allFiles);
+    // The v1 algorithm is much faster since it only looks at the first
+    // occurence of the pattern. We use it for search spaces that have >20k
+    // files, because the v2 algorithm is just too slow in those cases.
+    this.fzf = new AsyncFzf(this.allFiles, {
+      fuzzy: this.allFiles.length > 20000 ? 'v1' : 'v2',
+    });
   }
+}
 
-  /**
-   * Crawls the file system to get a list of all files and directories,
-   * optionally using a cache for faster initialization.
-   */
-  private async crawlFiles(): Promise<void> {
-    if (this.options.cache) {
-      const cacheKey = cache.getCacheKey(
-        this.absoluteDir,
-        this.ignore.getFingerprint(),
-        this.options.maxDepth,
-      );
-      const cachedResults = cache.read(cacheKey);
+class DirectoryFileSearch implements FileSearch {
+  private ignore: Ignore | undefined;
 
-      if (cachedResults) {
-        this.allFiles = cachedResults;
-        return;
-      }
-    }
+  constructor(private readonly options: FileSearchOptions) {}
 
-    this.allFiles = await this.performCrawl();
+  async initialize(): Promise<void> {
+    this.ignore = loadIgnoreRules(this.options);
+  }
 
-    if (this.options.cache) {
-      const cacheKey = cache.getCacheKey(
-        this.absoluteDir,
-        this.ignore.getFingerprint(),
-        this.options.maxDepth,
-      );
-      cache.write(cacheKey, this.allFiles, this.options.cacheTtl * 1000);
+  async search(
+    pattern: string,
+    options: SearchOptions = {},
+  ): Promise<string[]> {
+    if (!this.ignore) {
+      throw new Error('Engine not initialized. Call initialize() first.');
     }
-  }
+    pattern = pattern || '*';
 
-  /**
-   * Performs the actual file system crawl using `fdir`, applying directory
-   * ignore rules.
-   * @returns A promise that resolves to a list of all files and directories.
-   */
-  private async performCrawl(): Promise<string[]> {
-    const dirFilter = this.ignore.getDirectoryFilter();
+    const dir = pattern.endsWith('/') ? pattern : path.dirname(pattern);
+    const results = await crawl({
+      crawlDirectory: path.join(this.options.projectRoot, dir),
+      cwd: this.options.projectRoot,
+      maxDepth: 0,
+      ignore: this.ignore,
+      cache: this.options.cache,
+      cacheTtl: this.options.cacheTtl,
+    });
 
-    // We use `fdir` for fast file system traversal. A key performance
-    // optimization for large workspaces is to exclude entire directories
-    // early in the traversal process. This is why we apply directory-specific
-    // ignore rules (e.g., `node_modules/`, `dist/`) directly to `fdir`'s
-    // exclude filter.
-    const api = new fdir()
-      .withRelativePaths()
-      .withDirs()
-      .withPathSeparator('/') // Always use unix style paths
-      .exclude((_, dirPath) => {
-        const relativePath = path.relative(this.absoluteDir, dirPath);
-        return dirFilter(`${relativePath}/`);
-      });
+    const filteredResults = await filter(results, pattern, options.signal);
 
-    if (this.options.maxDepth !== undefined) {
-      api.withMaxDepth(this.options.maxDepth);
+    const fileFilter = this.ignore.getFileFilter();
+    const finalResults: string[] = [];
+    for (const candidate of filteredResults) {
+      if (finalResults.length >= (options.maxResults ?? Infinity)) {
+        break;
+      }
+      if (candidate === '.') {
+        continue;
+      }
+      if (!fileFilter(candidate)) {
+        finalResults.push(candidate);
+      }
     }
-
-    return api.crawl(this.absoluteDir).withPromise();
+    return finalResults;
   }
+}
 
-  /**
-   * Builds the in-memory cache for fast pattern matching.
-   */
-  private buildResultCache(): void {
-    this.resultCache = new ResultCache(this.allFiles);
-    // The v1 algorithm is much faster since it only looks at the first
-    // occurence of the pattern. We use it for search spaces that have >20k
-    // files, because the v2 algorithm is just too slow in those cases.
-    this.fzf = new AsyncFzf(this.allFiles, {
-      fuzzy: this.allFiles.length > 20000 ? 'v1' : 'v2',
-    });
+export class FileSearchFactory {
+  static create(options: FileSearchOptions): FileSearch {
+    if (options.enableRecursiveFileSearch) {
+      return new RecursiveFileSearch(options);
+    }
+    return new DirectoryFileSearch(options);
   }
 }
author	Bryant Chandler <[email protected]>	2025-08-18 13:43:24 -0700
committer	GitHub <[email protected]>	2025-08-18 20:43:24 +0000
commit	465ac9f547d0d684439886d1466c1a1133da611d (patch)
tree	b94f00730118784b5b07800db71224816b444bfe /packages/core/src/utils/filesearch/fileSearch.ts
parent	d66ddcd82e09d7b6fbc0226e31d73d38db5cff2a (diff)