Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion docs/codebase_search.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ A `retry_on_error` decorator is applied to these tools to enhance their resilien
* `line_content` (str): The content of the line containing the match.
* **Implementation Notes:**
* Recursively searches directories starting from the project root.
* Skips hidden files and common non-code directories (e.g., `__pycache__`, `venv`, `node_modules`).
* Skips hidden files and directories (names starting with `.`).
* Skips directories listed in `CodebaseContext.excluded_dirs`. By default, this includes common SCM, IDE, virtual environment, build, and documentation folders (see "Configuring Search Behavior" section for defaults).
* Skips files matching glob patterns in `CodebaseContext.excluded_file_patterns` (e.g., `*.log`, `*.pyc` by default).
* Uses a helper function `_is_binary_file` to attempt to skip searching in binary files (based on extension and content sniffing).
* Limits the number of returned matches (currently to 100) to avoid excessive output.
* Handles `re.error` for invalid regex patterns.
Expand Down Expand Up @@ -82,6 +84,24 @@ A `retry_on_error` decorator is applied to these tools to enhance their resilien
* Reads the specified file and extracts the relevant lines.
* Formats the output to include line numbers and highlight the `frame_line_number`.

## Configuring Search Behavior

The behavior of search tools like `find_in_files` and `find_symbol_references` can be customized by providing specific exclusion lists when creating an instance of `CodebaseContext` (typically available as `ctx.deps` within tool execution). These settings allow for more fine-grained control over which parts of the codebase are searched.

The `CodebaseContext` model (in `gemini_stacktrace/models/config.py`) includes the following fields for configuration:

* `excluded_dirs: List[str]`
* **Description:** A list of directory names that should be completely ignored during searches.
* **Default Values:** `["__pycache__", "venv", ".venv", "node_modules", "dist", "build", ".git", ".hg", ".svn", ".vscode", ".idea", "docs"]`
* **Customization:** When creating a `CodebaseContext` instance, you can provide your own list to this field to override the defaults. An empty list `[]` would mean no directories are excluded by name (though hidden directories starting with `.` are always skipped).

* `excluded_file_patterns: List[str]`
* **Description:** A list of glob-style file patterns (e.g., `*.log`, `temp_*.*`) for files that should be ignored during searches, even if they are in directories that are otherwise being searched.
* **Default Values:** `["*.pyc", "*.pyo", "*.log"]`
* **Customization:** Provide your own list of patterns to override the defaults. An empty list `[]` would mean no files are excluded based on these patterns (though the `_is_binary_file` check will still apply).

These configurations are used by the search tools to filter out irrelevant files and directories, leading to more focused and efficient search results.

## Potential Improvements

While the current set of tools provides a solid foundation for codebase interaction and search, several areas could be enhanced for better performance, accuracy, and broader capabilities:
Expand Down
16 changes: 15 additions & 1 deletion gemini_stacktrace/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from pathlib import Path
from typing import Annotated, Optional
from typing import Annotated, Optional, List # Make sure List is imported
import os

from pydantic import (
Expand Down Expand Up @@ -107,6 +107,20 @@ class CodebaseContext(BaseModel):
description="Absolute path to the project directory"
)

excluded_dirs: List[str] = Field(
default_factory=lambda: [
"__pycache__", "venv", ".venv", "node_modules",
"dist", "build", ".git", ".hg", ".svn",
".vscode", ".idea", "docs",
],
description="List of directory names to exclude from searches. Default includes common virtual env, SCM, build, and IDE folders."
)

excluded_file_patterns: List[str] = Field(
default_factory=lambda: ["*.pyc", "*.pyo", "*.log"],
description="List of glob file patterns to exclude from searches (e.g., '*.log', '*.tmp'). Default includes Python bytecode files and logs."
)

model_config = ConfigDict(arbitrary_types_allowed=True)

def validate_file_path(self, file_path: str) -> str:
Expand Down
28 changes: 12 additions & 16 deletions gemini_stacktrace/tools/codebase_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,7 @@ def search_directory(dir_path: str) -> None:
continue

# Skip common directories that shouldn't contain Python code
if os.path.isdir(item_path) and item in {
"__pycache__",
"venv",
".venv",
"node_modules",
"dist",
"build",
}:
if os.path.isdir(item_path) and item in ctx.deps.excluded_dirs:
continue

if os.path.isdir(item_path):
Expand All @@ -171,6 +164,11 @@ def search_directory(dir_path: str) -> None:
if file_pattern and not fnmatch.fnmatch(item, file_pattern):
continue

# Skip if file matches any excluded patterns
if ctx.deps.excluded_file_patterns and \
any(fnmatch.fnmatch(item, pattern) for pattern in ctx.deps.excluded_file_patterns):
continue

# Skip if file has been visited
if item_path in visited_files:
continue
Expand Down Expand Up @@ -232,14 +230,7 @@ def search_directory(dir_path: str) -> None:
continue

# Skip common directories that shouldn't contain Python code
if os.path.isdir(item_path) and item in {
"__pycache__",
"venv",
".venv",
"node_modules",
"dist",
"build",
}:
if os.path.isdir(item_path) and item in ctx.deps.excluded_dirs:
continue

if os.path.isdir(item_path):
Expand All @@ -250,6 +241,11 @@ def search_directory(dir_path: str) -> None:
if file_pattern and not fnmatch.fnmatch(item, file_pattern):
continue

# Skip if file matches any excluded patterns
if ctx.deps.excluded_file_patterns and \
any(fnmatch.fnmatch(item, pattern) for pattern in ctx.deps.excluded_file_patterns):
continue

# Skip if file has been visited
if item_path in visited_files:
continue
Expand Down
119 changes: 119 additions & 0 deletions tests/test_codebase_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ def process_numbers(numbers: List[int]) -> Dict[str, int]:
print("This is a helper function")
""")

# Create a directory that should be ignored by default excluded_dirs
os.makedirs(project_dir / "docs", exist_ok=True)
with open(project_dir / "docs" / "documentation.txt", "w") as f:
f.write("Searchable content in a default excluded directory.")

# Create a file that should be ignored by default excluded_file_patterns
with open(project_dir / "system.log", "w") as f:
f.write("Searchable log content.")

with open(project_dir / "src" / "another.log", "w") as f: # For testing interaction with file_pattern
f.write("Searchable log content in src.")

# Create a directory and a file for testing custom directory exclusion
os.makedirs(project_dir / "custom_exclude_dir", exist_ok=True)
with open(project_dir / "custom_exclude_dir" / "custom_file.txt", "w") as f:
f.write("Searchable content in custom excluded directory.")

# Create a file for testing custom file pattern exclusion
with open(project_dir / "data.custom_pattern", "w") as f:
f.write("Searchable content in custom pattern file.")

yield project_dir


Expand Down Expand Up @@ -269,3 +290,101 @@ async def test_get_stack_frame_context(self, mock_agent, test_codebase_context,

# Check that the error line is marked with '>'
assert "> " in context

@pytest.mark.asyncio
async def test_find_in_files_default_dir_exclusion(self, mock_agent, test_codebase_context, test_project):
    """Check that ``find_in_files`` does not report files under the default-excluded ``docs`` directory."""
    register_tools(mock_agent)
    # Each recorded call to ``mock_agent.tool`` carries the registered tool as its first positional arg.
    registered = (recorded[1][0] for recorded in mock_agent.tool.mock_calls)
    find_in_files = next(tool for tool in registered if tool.__name__ == "find_in_files")

    ctx = AsyncMock()
    # The fixture context relies on CodebaseContext's default exclusion lists.
    ctx.deps = test_codebase_context

    excluded_path = str(Path("docs") / "documentation.txt")
    matches = await find_in_files(ctx, "Searchable content in a default excluded directory")
    assert all(match["file_path"] != excluded_path for match in matches)

    # Positive control: a file outside any excluded directory must still be found,
    # otherwise the assertion above could pass simply because the search is broken.
    matches_main = await find_in_files(ctx, "def add")
    assert any(match["file_path"] == "main.py" for match in matches_main)

@pytest.mark.asyncio
async def test_find_in_files_default_file_pattern_exclusion(self, mock_agent, test_codebase_context, test_project):
    """Check that ``*.log`` files are skipped by default, even when ``file_pattern`` asks for them."""
    register_tools(mock_agent)
    # Each recorded call to ``mock_agent.tool`` carries the registered tool as its first positional arg.
    registered = (recorded[1][0] for recorded in mock_agent.tool.mock_calls)
    find_in_files = next(tool for tool in registered if tool.__name__ == "find_in_files")

    ctx = AsyncMock()
    # The fixture context relies on CodebaseContext's default exclusion lists.
    ctx.deps = test_codebase_context

    # One log file at the project root and one inside src/.
    log_paths = ("system.log", str(Path("src") / "another.log"))

    matches = await find_in_files(ctx, "Searchable log content")
    for log_path in log_paths:
        assert not any(log_path in match["file_path"] for match in matches)

    # excluded_file_patterns is expected to take precedence over the caller's
    # file_pattern whitelist: asking explicitly for *.log must still yield no
    # log files while "*.log" is in the default exclusion patterns.
    matches_specific_log = await find_in_files(ctx, "Searchable log content", file_pattern="*.log")
    for log_path in log_paths:
        assert not any(log_path in match["file_path"] for match in matches_specific_log)

@pytest.mark.asyncio
async def test_find_in_files_custom_dir_exclusion(self, mock_agent, test_project):
    """Check that a directory named in a custom ``excluded_dirs`` list is skipped, and searched otherwise."""
    register_tools(mock_agent)
    find_in_files = next(
        call[1][0]
        for call in mock_agent.tool.mock_calls
        if call[1][0].__name__ == "find_in_files"
    )

    search_text = "Searchable content in custom excluded directory"
    target_path = str(Path("custom_exclude_dir") / "custom_file.txt")

    # Passing excluded_dirs replaces the default list with this one.
    excluding_context = CodebaseContext(
        project_dir=str(test_project),
        excluded_dirs=["custom_exclude_dir"],
    )
    ctx = AsyncMock()
    ctx.deps = excluding_context

    matches = await find_in_files(ctx, search_text)
    assert not any(match["file_path"] == target_path for match in matches)

    # Control: with the default exclusions the same file must be found.
    # The context is always built with an explicit project_dir; a bare
    # CodebaseContext() would not point at the test project (and would fail
    # validation if project_dir is a required field).
    default_context = CodebaseContext(project_dir=str(test_project))
    # Make the control's assumption explicit so a future change to the
    # defaults fails here with a clear cause.
    assert "custom_exclude_dir" not in default_context.excluded_dirs

    ctx.deps = default_context
    matches_found = await find_in_files(ctx, search_text)
    assert any(match["file_path"] == target_path for match in matches_found)

@pytest.mark.asyncio
async def test_find_in_files_custom_file_pattern_exclusion(self, mock_agent, test_project):
    """Check that a custom ``excluded_file_patterns`` entry hides matching files, and an empty list does not."""
    register_tools(mock_agent)
    # Each recorded call to ``mock_agent.tool`` carries the registered tool as its first positional arg.
    registered = (recorded[1][0] for recorded in mock_agent.tool.mock_calls)
    find_in_files = next(tool for tool in registered if tool.__name__ == "find_in_files")

    project_root = str(test_project)
    ctx = AsyncMock()

    # Passing excluded_file_patterns replaces the default pattern list.
    ctx.deps = CodebaseContext(
        project_dir=project_root,
        excluded_file_patterns=["*.custom_pattern"],
    )
    matches = await find_in_files(ctx, "Searchable content in custom pattern file")
    assert all(match["file_path"] != "data.custom_pattern" for match in matches)

    # Control: with no pattern exclusions at all the same file must be found.
    ctx.deps = CodebaseContext(project_dir=project_root, excluded_file_patterns=[])
    matches_found = await find_in_files(ctx, "Searchable content in custom pattern file")
    assert any(match["file_path"] == "data.custom_pattern" for match in matches_found)

@pytest.mark.asyncio
async def test_find_in_files_whitelist_and_dir_exclusion(self, mock_agent, test_project):
    """Check that a ``file_pattern`` whitelist does not bring back files from excluded directories."""
    register_tools(mock_agent)
    # Each recorded call to ``mock_agent.tool`` carries the registered tool as its first positional arg.
    registered = (recorded[1][0] for recorded in mock_agent.tool.mock_calls)
    find_in_files = next(tool for tool in registered if tool.__name__ == "find_in_files")

    ctx = AsyncMock()
    # No overrides: the default exclusion lists apply, and they include "docs".
    ctx.deps = CodebaseContext(project_dir=str(test_project))

    matches = await find_in_files(ctx, "Searchable content", file_pattern="*.txt")
    found_paths = [match["file_path"] for match in matches]

    # docs/documentation.txt matches *.txt but lives in an excluded directory.
    assert str(Path("docs") / "documentation.txt") not in found_paths

    # custom_exclude_dir is not in the default exclusion list, so its .txt file
    # must still be returned.
    assert str(Path("custom_exclude_dir") / "custom_file.txt") in found_paths
Loading