diff --git a/docs/codebase_search.md b/docs/codebase_search.md index 78d6c15..d64bdeb 100644 --- a/docs/codebase_search.md +++ b/docs/codebase_search.md @@ -36,7 +36,9 @@ A `retry_on_error` decorator is applied to these tools to enhance their resilien * `line_content` (str): The content of the line containing the match. * **Implementation Notes:** * Recursively searches directories starting from the project root. - * Skips hidden files and common non-code directories (e.g., `__pycache__`, `venv`, `node_modules`). + * Skips hidden files (starting with '.'). + * Skips directories listed in `CodebaseContext.excluded_dirs`. By default, this includes common SCM, IDE, virtual environment, build, and documentation folders (see "Configuring Search Behavior" section for defaults). + * Skips files matching glob patterns in `CodebaseContext.excluded_file_patterns` (e.g., `*.log`, `*.pyc` by default). * Uses a helper function `_is_binary_file` to attempt to skip searching in binary files (based on extension and content sniffing). * Limits the number of returned matches (currently to 100) to avoid excessive output. * Handles `re.error` for invalid regex patterns. @@ -82,6 +84,24 @@ A `retry_on_error` decorator is applied to these tools to enhance their resilien * Reads the specified file and extracts the relevant lines. * Formats the output to include line numbers and highlight the `frame_line_number`. +## Configuring Search Behavior + +The behavior of search tools like `find_in_files` and `find_symbol_references` can be customized by providing specific exclusion lists when creating an instance of `CodebaseContext` (typically available as `ctx.deps` within tool execution). These settings allow for more fine-grained control over which parts of the codebase are searched. 
+ +The `CodebaseContext` model (in `gemini_stacktrace/models/config.py`) includes the following fields for configuration: + +* `excluded_dirs: List[str]` + * **Description:** A list of directory names that should be completely ignored during searches. Each entry is compared against a directory's own name (not its path), so a listed name is skipped wherever it appears in the tree. + * **Default Values:** `["__pycache__", "venv", ".venv", "node_modules", "dist", "build", ".git", ".hg", ".svn", ".vscode", ".idea", "docs"]` + * **Customization:** When creating a `CodebaseContext` instance, you can provide your own list to this field to override the defaults. The list you provide replaces the defaults rather than extending them, so repeat any default entries you still want excluded. An empty list `[]` would mean no directories are excluded by name (though hidden directories starting with `.` are always skipped). + +* `excluded_file_patterns: List[str]` + * **Description:** A list of glob-style file patterns (e.g., `*.log`, `temp_*.*`) for files that should be ignored during searches, even if they are in directories that are otherwise being searched. Patterns are matched against the file name only, not its path, and they take precedence over a tool's `file_pattern` argument: a file that matches both is still skipped. + * **Default Values:** `["*.pyc", "*.pyo", "*.log"]` + * **Customization:** Provide your own list of patterns to override the defaults; as with `excluded_dirs`, the list you provide replaces the defaults. An empty list `[]` would mean no files are excluded based on these patterns (though the `_is_binary_file` check will still apply). + +These configurations are used by the search tools to filter out irrelevant files and directories, leading to more focused and efficient search results. 
+ ## Potential Improvements While the current set of tools provides a solid foundation for codebase interaction and search, several areas could be enhanced for better performance, accuracy, and broader capabilities: diff --git a/gemini_stacktrace/models/config.py b/gemini_stacktrace/models/config.py index d18b90b..4f3bfc7 100644 --- a/gemini_stacktrace/models/config.py +++ b/gemini_stacktrace/models/config.py @@ -3,7 +3,7 @@ """ from pathlib import Path -from typing import Annotated, Optional +from typing import Annotated, Optional, List # Make sure List is imported import os from pydantic import ( @@ -107,6 +107,20 @@ class CodebaseContext(BaseModel): description="Absolute path to the project directory" ) + excluded_dirs: List[str] = Field( + default_factory=lambda: [ + "__pycache__", "venv", ".venv", "node_modules", + "dist", "build", ".git", ".hg", ".svn", + ".vscode", ".idea", "docs", + ], + description="List of directory names to exclude from searches. Default includes common virtual env, SCM, build, and IDE folders." + ) + + excluded_file_patterns: List[str] = Field( + default_factory=lambda: ["*.pyc", "*.pyo", "*.log"], + description="List of glob file patterns to exclude from searches (e.g., '*.log', '*.tmp'). Default includes Python bytecode files and logs." 
+ ) + model_config = ConfigDict(arbitrary_types_allowed=True) def validate_file_path(self, file_path: str) -> str: diff --git a/gemini_stacktrace/tools/codebase_tools.py b/gemini_stacktrace/tools/codebase_tools.py index 5e1c6d0..eb1cf27 100644 --- a/gemini_stacktrace/tools/codebase_tools.py +++ b/gemini_stacktrace/tools/codebase_tools.py @@ -153,14 +153,7 @@ def search_directory(dir_path: str) -> None: continue # Skip common directories that shouldn't contain Python code - if os.path.isdir(item_path) and item in { - "__pycache__", - "venv", - ".venv", - "node_modules", - "dist", - "build", - }: + if os.path.isdir(item_path) and item in ctx.deps.excluded_dirs: continue if os.path.isdir(item_path): @@ -171,6 +164,11 @@ def search_directory(dir_path: str) -> None: if file_pattern and not fnmatch.fnmatch(item, file_pattern): continue + # Skip if file matches any excluded patterns + if ctx.deps.excluded_file_patterns and \ + any(fnmatch.fnmatch(item, pattern) for pattern in ctx.deps.excluded_file_patterns): + continue + # Skip if file has been visited if item_path in visited_files: continue @@ -232,14 +230,7 @@ def search_directory(dir_path: str) -> None: continue # Skip common directories that shouldn't contain Python code - if os.path.isdir(item_path) and item in { - "__pycache__", - "venv", - ".venv", - "node_modules", - "dist", - "build", - }: + if os.path.isdir(item_path) and item in ctx.deps.excluded_dirs: continue if os.path.isdir(item_path): @@ -250,6 +241,11 @@ def search_directory(dir_path: str) -> None: if file_pattern and not fnmatch.fnmatch(item, file_pattern): continue + # Skip if file matches any excluded patterns + if ctx.deps.excluded_file_patterns and \ + any(fnmatch.fnmatch(item, pattern) for pattern in ctx.deps.excluded_file_patterns): + continue + # Skip if file has been visited if item_path in visited_files: continue diff --git a/tests/test_codebase_tools.py b/tests/test_codebase_tools.py index 6b8949e..8e7ee3f 100644 --- 
a/tests/test_codebase_tools.py +++ b/tests/test_codebase_tools.py @@ -72,6 +72,27 @@ def process_numbers(numbers: List[int]) -> Dict[str, int]: print("This is a helper function") """) + # Create a directory that should be ignored by default excluded_dirs + os.makedirs(project_dir / "docs", exist_ok=True) + with open(project_dir / "docs" / "documentation.txt", "w") as f: + f.write("Searchable content in a default excluded directory.") + + # Create a file that should be ignored by default excluded_file_patterns + with open(project_dir / "system.log", "w") as f: + f.write("Searchable log content.") + + with open(project_dir / "src" / "another.log", "w") as f: # For testing interaction with file_pattern + f.write("Searchable log content in src.") + + # Create a directory and a file for testing custom directory exclusion + os.makedirs(project_dir / "custom_exclude_dir", exist_ok=True) + with open(project_dir / "custom_exclude_dir" / "custom_file.txt", "w") as f: + f.write("Searchable content in custom excluded directory.") + + # Create a file for testing custom file pattern exclusion + with open(project_dir / "data.custom_pattern", "w") as f: + f.write("Searchable content in custom pattern file.") + yield project_dir @@ -269,3 +290,101 @@ async def test_get_stack_frame_context(self, mock_agent, test_codebase_context, # Check that the error line is marked with '>' assert "> " in context + + @pytest.mark.asyncio + async def test_find_in_files_default_dir_exclusion(self, mock_agent, test_codebase_context, test_project): + register_tools(mock_agent) + find_in_files = next(call[1][0] for call in mock_agent.tool.mock_calls if call[1][0].__name__ == "find_in_files") + ctx = AsyncMock() + ctx.deps = test_codebase_context # Uses default CodebaseContext exclusions + + matches = await find_in_files(ctx, "Searchable content in a default excluded directory") + assert not any(match["file_path"] == str(Path("docs") / "documentation.txt") for match in matches) + # Also check it finds 
something NOT in an excluded dir to be sure search works generally + matches_main = await find_in_files(ctx, "def add") + assert any(match["file_path"] == "main.py" for match in matches_main) + + @pytest.mark.asyncio + async def test_find_in_files_default_file_pattern_exclusion(self, mock_agent, test_codebase_context, test_project): + register_tools(mock_agent) + find_in_files = next(call[1][0] for call in mock_agent.tool.mock_calls if call[1][0].__name__ == "find_in_files") + ctx = AsyncMock() + ctx.deps = test_codebase_context # Uses default CodebaseContext exclusions + + matches = await find_in_files(ctx, "Searchable log content") + # Check it's not found in system.log (root) or src/another.log + assert not any("system.log" in match["file_path"] for match in matches) + assert not any(str(Path("src") / "another.log") in match["file_path"] for match in matches) + + # Check it finds it if we search specifically with a file_pattern that overrides (e.g. whitelisting) + # The current find_in_files logic applies exclusion before _is_binary check, and after whitelist file_pattern + # So, excluded_file_patterns should take precedence over the input file_pattern. + # Let's confirm: if we search for *.log, it should still be excluded by default excluded_file_patterns. 
+ matches_specific_log = await find_in_files(ctx, "Searchable log content", file_pattern="*.log") + assert not any("system.log" in match["file_path"] for match in matches_specific_log) + assert not any(str(Path("src") / "another.log") in match["file_path"] for match in matches_specific_log) + + @pytest.mark.asyncio + async def test_find_in_files_custom_dir_exclusion(self, mock_agent, test_project): + register_tools(mock_agent) + find_in_files = next(call[1][0] for call in mock_agent.tool.mock_calls if call[1][0].__name__ == "find_in_files") + + # CodebaseContext with custom exclusion + custom_context = CodebaseContext( + project_dir=str(test_project), + excluded_dirs=["custom_exclude_dir"] # Override defaults by providing a new list + ) + ctx = AsyncMock() + ctx.deps = custom_context + + matches = await find_in_files(ctx, "Searchable content in custom excluded directory") + assert not any(match["file_path"] == str(Path("custom_exclude_dir") / "custom_file.txt") for match in matches) + + # Verify it IS found if not excluded + # Use a context that doesn't exclude 'custom_exclude_dir' but keeps other defaults to avoid matching everything + default_dirs = CodebaseContext(project_dir=str(test_project)).excluded_dirs # project_dir is passed explicitly, like every other construction in these tests; a bare CodebaseContext() presumably fails validation - confirm against the model + custom_dirs_without_custom_exclude = [d for d in default_dirs if d != "custom_exclude_dir"] + + non_excluding_context = CodebaseContext(project_dir=str(test_project), excluded_dirs=custom_dirs_without_custom_exclude) + ctx.deps = non_excluding_context + matches_found = await find_in_files(ctx, "Searchable content in custom excluded directory") + assert any(match["file_path"] == str(Path("custom_exclude_dir") / "custom_file.txt") for match in matches_found) + + @pytest.mark.asyncio + async def test_find_in_files_custom_file_pattern_exclusion(self, mock_agent, test_project): + register_tools(mock_agent) + find_in_files = next(call[1][0] for call in mock_agent.tool.mock_calls if call[1][0].__name__ == "find_in_files") + + custom_context = CodebaseContext( + project_dir=str(test_project), 
+ excluded_file_patterns=["*.custom_pattern"] # Override defaults + ) + ctx = AsyncMock() + ctx.deps = custom_context + + matches = await find_in_files(ctx, "Searchable content in custom pattern file") + assert not any(match["file_path"] == "data.custom_pattern" for match in matches) + + # Verify it IS found if not excluded + non_excluding_context = CodebaseContext(project_dir=str(test_project), excluded_file_patterns=[]) # No file pattern exclusions + ctx.deps = non_excluding_context + matches_found = await find_in_files(ctx, "Searchable content in custom pattern file") + assert any(match["file_path"] == "data.custom_pattern" for match in matches_found) + + @pytest.mark.asyncio + async def test_find_in_files_whitelist_and_dir_exclusion(self, mock_agent, test_project): + register_tools(mock_agent) + find_in_files = next(call[1][0] for call in mock_agent.tool.mock_calls if call[1][0].__name__ == "find_in_files") + + # Use default context which excludes "docs" + default_context_with_exclusions = CodebaseContext(project_dir=str(test_project)) + ctx = AsyncMock() + ctx.deps = default_context_with_exclusions + + # Search for *.txt files. documentation.txt is in 'docs' which is excluded by default. + matches = await find_in_files(ctx, "Searchable content", file_pattern="*.txt") + assert not any(match["file_path"] == str(Path("docs") / "documentation.txt") for match in matches) + + # Check it finds other .txt files not in excluded dirs (e.g. custom_exclude_dir/custom_file.txt) + # custom_exclude_dir is not in the default exclusion list of CodebaseContext + assert any(match["file_path"] == str(Path("custom_exclude_dir") / "custom_file.txt") for match in matches)