Commit ec368ff

Copilot and Mte90 committed
Implement incremental indexing and project metadata tracking
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 7067f19 commit ec368ff

File tree

2 files changed (+184, -17 lines)


analyzer.py

Lines changed: 48 additions & 10 deletions
@@ -4,13 +4,14 @@
 import traceback
 import sqlite3
 import importlib.resources
+import hashlib
 from pathlib import Path
 from typing import Optional, Dict, Any, List

 import concurrent.futures
 import threading

-from db import store_file
+from db import store_file, needs_reindex, set_project_metadata, get_project_metadata
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from logger import get_logger
@@ -71,6 +72,13 @@ def detect_language(path: str):
     return EXT_LANG.get(ext, "text")


+def compute_file_hash(content: str) -> str:
+    """
+    Compute SHA256 hash of file content for change detection.
+    """
+    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+
 # Simple chunker (character-based). Tunable CHUNK_SIZE, CHUNK_OVERLAP.
 def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
     if chunk_size <= 0:
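
For context, a quick sketch of what this helper produces (standard hashlib only; the sample strings are invented for illustration):

import hashlib

def compute_file_hash(content: str) -> str:
    # Same body as the helper added in the hunk above.
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

# Identical content always yields the same 64-character hex digest; this is
# the value that needs_reindex later compares against the stored file_hash.
assert compute_file_hash("print('hello')") == compute_file_hash("print('hello')")
assert len(compute_file_hash("anything")) == 64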
@@ -271,33 +279,43 @@ def _process_file_sync(
     full_path: str,
     rel_path: str,
     cfg: Optional[Dict[str, Any]],
+    incremental: bool = True,
 ):
     """
     Synchronous implementation of per-file processing.
     Intended to run on a ThreadPoolExecutor worker thread.
-    Returns a dict: {"stored": bool, "embedded": bool}
+    Returns a dict: {"stored": bool, "embedded": bool, "skipped": bool}
     """
     try:
         # read file content
         try:
             with open(full_path, "r", encoding="utf-8", errors="ignore") as fh:
                 content = fh.read()
+            # Get file modification time
+            mtime = os.path.getmtime(full_path)
         except Exception:
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         if not content:
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         lang = detect_language(rel_path)
         if lang == "text":
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

-        # store file (synchronous DB writer)
+        # Compute hash for change detection
+        file_hash = compute_file_hash(content)
+
+        # Check if file needs reindexing (incremental mode)
+        if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+            return {"stored": False, "embedded": False, "skipped": True}
+
+        # store file (synchronous DB writer) with metadata
         try:
-            fid = store_file(database_path, rel_path, content, lang)
+            fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
         except Exception:
             logger.exception("Failed to store file %s", rel_path)
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         _ = Document(text=content, extra_info={"path": rel_path, "lang": lang})

@@ -372,7 +390,7 @@ def _process_file_sync(
             except Exception:
                 logger.exception("Failed to write empty-embedding error to disk for %s chunk %d", rel_path, idx)

-        return {"stored": True, "embedded": embedded_any}
+        return {"stored": True, "embedded": embedded_any, "skipped": False}
     except Exception:
         tb = traceback.format_exc()
         try:
@@ -383,7 +401,7 @@
                 logger.exception("Failed to write exception error to disk for file %s", rel_path)
         except Exception:
             logger.exception("Failed while handling exception for file %s", rel_path)
-        return {"stored": False, "embedded": False}
+        return {"stored": False, "embedded": False, "skipped": False}


 def analyze_local_path_sync(
@@ -392,15 +410,20 @@ def analyze_local_path_sync(
     venv_path: Optional[str] = None,
     max_file_size: int = 200000,
     cfg: Optional[dict] = None,
+    incremental: bool = True,
 ):
     """
     Synchronous implementation of the analysis pipeline.
     Submits per-file tasks to a shared ThreadPoolExecutor.
+    Supports incremental indexing to skip unchanged files.
     """
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
+    start_time = time.time()
+
     try:
         file_count = 0
         emb_count = 0
+        skipped_count = 0
         file_paths: List[Dict[str, str]] = []

         # Collect files to process
@@ -429,6 +452,7 @@ def analyze_local_path_sync(
                 f["full"],
                 f["rel"],
                 cfg,
+                incremental,
             )
             futures.append(fut)

@@ -440,9 +464,23 @@ def analyze_local_path_sync(
                 file_count += 1
                 if r.get("embedded"):
                     emb_count += 1
+                if r.get("skipped"):
+                    skipped_count += 1
             except Exception:
                 logger.exception("A per-file task failed")

+        # Store indexing metadata
+        end_time = time.time()
+        duration = end_time - start_time
+
+        try:
+            set_project_metadata(database_path, "last_indexed_at", time.strftime("%Y-%m-%d %H:%M:%S"))
+            set_project_metadata(database_path, "last_index_duration", str(duration))
+            set_project_metadata(database_path, "files_indexed", str(file_count))
+            set_project_metadata(database_path, "files_skipped", str(skipped_count))
+        except Exception:
+            logger.exception("Failed to store indexing metadata")
+
         # store uv_detected.json metadata if possible
         uv_info = None
         try:
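
With these changes, a second run of analyze_local_path_sync with incremental=True skips files whose stored mtime and hash match the current ones, and the run summary is written to project_metadata. A minimal sketch of reading that summary back (the database path is hypothetical; get_project_metadata and the key names come from the db.py changes below):

from db import get_project_metadata

db_path = "project_index.db"  # hypothetical path, adjust to your setup

# Keys written at the end of analyze_local_path_sync in the hunk above.
for key in ("last_indexed_at", "last_index_duration", "files_indexed", "files_skipped"):
    print(key, "=", get_project_metadata(db_path, key))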

db.py

Lines changed: 136 additions & 7 deletions
@@ -173,27 +173,33 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-    - files (stores full content of indexed files)
+    - files (stores full content of indexed files with metadata for incremental indexing)
     - chunks (with embedding BLOB column for sqlite-vector)
+    - project_metadata (project-level tracking)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()

         # files table (stores full content, used to reconstruct chunks)
+        # Added last_modified and file_hash for incremental indexing
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS files (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
-                path TEXT NOT NULL,
+                path TEXT NOT NULL UNIQUE,
                 content TEXT,
                 language TEXT,
                 snippet TEXT,
-                created_at TEXT DEFAULT (datetime('now'))
+                last_modified REAL,
+                file_hash TEXT,
+                created_at TEXT DEFAULT (datetime('now')),
+                updated_at TEXT DEFAULT (datetime('now'))
             )
             """
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);")
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);")

         # chunks table: metadata for chunked documents; includes embedding BLOB column
         cur.execute(
@@ -210,20 +216,43 @@ def init_db(database_path: str) -> None:
             """
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);")
+
+        # project_metadata table for project-level tracking
+        cur.execute(
+            """
+            CREATE TABLE IF NOT EXISTS project_metadata (
+                key TEXT PRIMARY KEY,
+                value TEXT,
+                updated_at TEXT DEFAULT (datetime('now'))
+            )
+            """
+        )
+
         conn.commit()
     finally:
         conn.close()


-def store_file(database_path, path, content, language):
+def store_file(database_path, path, content, language, last_modified=None, file_hash=None):
     """
-    Insert a file record into the DB using a queued single-writer to avoid
+    Insert or update a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
+    Supports incremental indexing with last_modified and file_hash tracking.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
-    sql = "INSERT INTO files (path, content, language, snippet) VALUES (?, ?, ?, ?)"
-    params = (path, content, language, snippet)
+    sql = """
+        INSERT INTO files (path, content, language, snippet, last_modified, file_hash, updated_at)
+        VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+        ON CONFLICT(path) DO UPDATE SET
+            content=excluded.content,
+            language=excluded.language,
+            snippet=excluded.snippet,
+            last_modified=excluded.last_modified,
+            file_hash=excluded.file_hash,
+            updated_at=datetime('now')
+    """
+    params = (path, content, language, snippet, last_modified, file_hash)

     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
@@ -314,6 +343,106 @@ def clear_project_data(database_path: str) -> None:
         conn.close()


+def get_file_by_path(database_path: str, path: str) -> Optional[Dict[str, Any]]:
+    """
+    Get file metadata by path for incremental indexing checks.
+    Returns None if file doesn't exist.
+    """
+    conn = _get_connection(database_path)
+    try:
+        row = conn.execute(
+            "SELECT id, path, last_modified, file_hash FROM files WHERE path = ?",
+            (path,)
+        ).fetchone()
+        if row:
+            return {
+                "id": row["id"],
+                "path": row["path"],
+                "last_modified": row["last_modified"],
+                "file_hash": row["file_hash"]
+            }
+        return None
+    finally:
+        conn.close()
+
+
+def needs_reindex(database_path: str, path: str, current_mtime: float, current_hash: str) -> bool:
+    """
+    Check if a file needs to be re-indexed based on modification time and hash.
+    Returns True if file is new or has changed.
+    """
+    existing = get_file_by_path(database_path, path)
+    if not existing:
+        return True
+
+    # Check if modification time or hash changed
+    if existing["last_modified"] is None or existing["file_hash"] is None:
+        return True
+
+    if existing["last_modified"] != current_mtime or existing["file_hash"] != current_hash:
+        return True
+
+    return False
+
+
+def set_project_metadata(database_path: str, key: str, value: str) -> None:
+    """
+    Set a project metadata key-value pair.
+    """
+    conn = _get_connection(database_path)
+    try:
+        cur = conn.cursor()
+        cur.execute(
+            """
+            INSERT INTO project_metadata (key, value, updated_at)
+            VALUES (?, ?, datetime('now'))
+            ON CONFLICT(key) DO UPDATE SET
+                value=excluded.value,
+                updated_at=datetime('now')
+            """,
+            (key, value)
+        )
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def get_project_metadata(database_path: str, key: str) -> Optional[str]:
+    """
+    Get a project metadata value by key.
+    """
+    conn = _get_connection(database_path)
+    try:
+        row = conn.execute(
+            "SELECT value FROM project_metadata WHERE key = ?",
+            (key,)
+        ).fetchone()
+        return row["value"] if row else None
+    finally:
+        conn.close()
+
+
+def delete_file_by_path(database_path: str, path: str) -> None:
+    """
+    Delete a file and its chunks by path.
+    Used for incremental indexing when files are removed.
+    """
+    conn = _get_connection(database_path)
+    try:
+        cur = conn.cursor()
+        # Get file_id
+        row = cur.execute("SELECT id FROM files WHERE path = ?", (path,)).fetchone()
+        if row:
+            file_id = row["id"]
+            # Delete chunks first
+            cur.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,))
+            # Delete file
+            cur.execute("DELETE FROM files WHERE id = ?", (file_id,))
+        conn.commit()
+    finally:
+        conn.close()
+
+
 # ============================================================================
 # Project Registry Database Operations
 # ============================================================================
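
A rough end-to-end sketch of the new helpers against a throwaway database (the path, file name, and content are invented; the hash is computed inline the same way analyzer.compute_file_hash does):

import hashlib
import time

from db import init_db, store_file, needs_reindex, set_project_metadata, get_project_metadata

db_path = "/tmp/demo_index.db"  # throwaway database, assumed location
init_db(db_path)

content = "def hello():\n    return 'hi'\n"
file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
mtime = time.time()

# Unknown path: nothing stored yet, so the file must be indexed.
assert needs_reindex(db_path, "src/hello.py", mtime, file_hash)

store_file(db_path, "src/hello.py", content, "python", mtime, file_hash)

# Same mtime and hash: the incremental path would skip this file.
assert not needs_reindex(db_path, "src/hello.py", mtime, file_hash)

# A changed mtime (or hash) forces a re-index.
assert needs_reindex(db_path, "src/hello.py", mtime + 1.0, file_hash)

set_project_metadata(db_path, "files_indexed", "1")
print(get_project_metadata(db_path, "files_indexed"))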
