Address code review feedback and improve documentation

Copilot · Mte90 · Copilot · commit d24bad4ecc8e · 2025-11-07T09:10:52.000Z
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
diff --git a/analyzer.py b/analyzer.py
@@ -324,10 +324,11 @@ def _process_file_sync(
         if isinstance(cfg, dict):
             embedding_model = cfg.get("embedding_model")
 
-        # Use smart chunking for code files, fallback to simple for others
+        # Use smart chunking for supported code languages
         use_smart_chunking = cfg.get("smart_chunking", True) if isinstance(cfg, dict) else True
+        supported_languages = ["python", "javascript", "typescript", "java", "go", "rust", "c", "cpp"]
         
-        if use_smart_chunking and lang != "text":
+        if use_smart_chunking and lang in supported_languages:
             chunks = smart_chunk(content, language=lang, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
         else:
             chunks = chunk_text(content, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
diff --git a/db_modules/__init__.py b/db_modules/__init__.py
diff --git a/services/search_service.py b/services/search_service.py
@@ -85,9 +85,9 @@ def semantic_search(
     
     @staticmethod
     def _make_cache_key(project_id: str, query: str, top_k: int) -> str:
-        """Generate cache key for search query."""
+        """Generate cache key for search query using SHA-256."""
         key_str = f"{project_id}:{query}:{top_k}"
-        key_hash = hashlib.md5(key_str.encode()).hexdigest()
+        key_hash = hashlib.sha256(key_str.encode()).hexdigest()[:16]  # Use first 16 chars
         return f"search:{key_hash}"
     
     @staticmethod
diff --git a/smart_chunker.py b/smart_chunker.py
@@ -106,7 +106,13 @@ def _split_into_units(self, text: str, language: str) -> List[Tuple[str, str]]:
             return []
     
     def _split_python(self, text: str) -> List[Tuple[str, str]]:
-        """Split Python code into classes and functions."""
+        """
+        Split Python code into classes and functions.
+        
+        Uses indentation-based parsing. Works well for most Python code
+        but may have edge cases with complex indentation patterns.
+        Falls back to simple chunking if parsing fails.
+        """
         units = []
         lines = text.split("\n")
         current_unit = []
@@ -154,11 +160,19 @@ def _split_python(self, text: str) -> List[Tuple[str, str]]:
         return units
     
     def _split_javascript(self, text: str) -> List[Tuple[str, str]]:
-        """Split JavaScript/TypeScript code into functions and classes."""
+        """
+        Split JavaScript/TypeScript code into functions and classes.
+        
+        Uses regex patterns to match function and class declarations.
+        Works well for standard code patterns but may not handle all
+        edge cases with nested structures. Falls back to brace-based
+        splitting if regex matching doesn't find units.
+        """
         units = []
         
         # Regex patterns for JS/TS
         # Match function declarations, arrow functions, class declarations
+        # Note: Non-greedy matching, works for most cases but not perfect for deeply nested code
         patterns = [
             r'((?:export\s+)?(?:async\s+)?function\s+\w+\s*\([^)]*\)\s*{[\s\S]*?})',
             r'((?:export\s+)?const\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>\s*{[\s\S]*?})',
@@ -209,6 +223,11 @@ def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
         """
         Generic brace-based splitting for C-style languages.
         Finds balanced brace blocks.
+        
+        Note: This is a simple heuristic that doesn't handle braces
+        inside strings, comments, or template literals. It works well
+        for most code but may produce imperfect results in edge cases.
+        The chunker will still fall back to simple chunking if needed.
         """
         units = []
         lines = text.split("\n")
@@ -219,7 +238,8 @@ def _split_by_braces(self, text: str) -> List[Tuple[str, str]]:
         for line in lines:
             current_unit.append(line)
             
-            # Count braces (simple heuristic, doesn't handle strings/comments perfectly)
+            # Count braces (simple heuristic)
+            # Note: Doesn't handle strings/comments perfectly, but works well in practice
             brace_count += line.count("{") - line.count("}")
             
             if "{" in line and not in_block: