Commit ec368ff

Copilot and Mte90 committed
Implement incremental indexing and project metadata tracking
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 7067f19 commit ec368ff

File tree

2 files changed (+184, -17 lines)


analyzer.py

Lines changed: 48 additions & 10 deletions
@@ -4,13 +4,14 @@
 import traceback
 import sqlite3
 import importlib.resources
+import hashlib
 from pathlib import Path
 from typing import Optional, Dict, Any, List

 import concurrent.futures
 import threading

-from db import store_file
+from db import store_file, needs_reindex, set_project_metadata, get_project_metadata
 from external_api import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from logger import get_logger
@@ -71,6 +72,13 @@ def detect_language(path: str):
     return EXT_LANG.get(ext, "text")


+def compute_file_hash(content: str) -> str:
+    """
+    Compute SHA256 hash of file content for change detection.
+    """
+    return hashlib.sha256(content.encode('utf-8')).hexdigest()
+
+
 # Simple chunker (character-based). Tunable CHUNK_SIZE, CHUNK_OVERLAP.
 def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
     if chunk_size <= 0:
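
For context, a quick sketch of what this helper produces (standard hashlib only; the sample strings are invented for illustration):

import hashlib

def compute_file_hash(content: str) -> str:
    # Same body as the helper added in the hunk above.
    return hashlib.sha256(content.encode('utf-8')).hexdigest()

# Identical content always yields the same 64-character hex digest; this is
# the value that needs_reindex later compares against the stored file_hash.
assert compute_file_hash("print('hello')") == compute_file_hash("print('hello')")
assert len(compute_file_hash("anything")) == 64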
@@ -271,33 +279,43 @@ def _process_file_sync(
     full_path: str,
     rel_path: str,
     cfg: Optional[Dict[str, Any]],
+    incremental: bool = True,
 ):
     """
     Synchronous implementation of per-file processing.
     Intended to run on a ThreadPoolExecutor worker thread.
-    Returns a dict: {"stored": bool, "embedded": bool}
+    Returns a dict: {"stored": bool, "embedded": bool, "skipped": bool}
     """
     try:
         # read file content
         try:
             with open(full_path, "r", encoding="utf-8", errors="ignore") as fh:
                 content = fh.read()
+            # Get file modification time
+            mtime = os.path.getmtime(full_path)
         except Exception:
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         if not content:
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         lang = detect_language(rel_path)
         if lang == "text":
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

-        # store file (synchronous DB writer)
+        # Compute hash for change detection
+        file_hash = compute_file_hash(content)
+
+        # Check if file needs reindexing (incremental mode)
+        if incremental and not needs_reindex(database_path, rel_path, mtime, file_hash):
+            return {"stored": False, "embedded": False, "skipped": True}
+
+        # store file (synchronous DB writer) with metadata
         try:
-            fid = store_file(database_path, rel_path, content, lang)
+            fid = store_file(database_path, rel_path, content, lang, mtime, file_hash)
         except Exception:
             logger.exception("Failed to store file %s", rel_path)
-            return {"stored": False, "embedded": False}
+            return {"stored": False, "embedded": False, "skipped": False}

         _ = Document(text=content, extra_info={"path": rel_path, "lang": lang})

@@ -372,7 +390,7 @@ def _process_file_sync(
             except Exception:
                 logger.exception("Failed to write empty-embedding error to disk for %s chunk %d", rel_path, idx)

-        return {"stored": True, "embedded": embedded_any}
+        return {"stored": True, "embedded": embedded_any, "skipped": False}
     except Exception:
         tb = traceback.format_exc()
         try:
@@ -383,7 +401,7 @@
                 logger.exception("Failed to write exception error to disk for file %s", rel_path)
         except Exception:
             logger.exception("Failed while handling exception for file %s", rel_path)
-        return {"stored": False, "embedded": False}
+        return {"stored": False, "embedded": False, "skipped": False}


 def analyze_local_path_sync(
@@ -392,15 +410,20 @@ def analyze_local_path_sync(
     venv_path: Optional[str] = None,
     max_file_size: int = 200000,
     cfg: Optional[dict] = None,
+    incremental: bool = True,
 ):
     """
     Synchronous implementation of the analysis pipeline.
     Submits per-file tasks to a shared ThreadPoolExecutor.
+    Supports incremental indexing to skip unchanged files.
     """
     semaphore = threading.Semaphore(EMBEDDING_CONCURRENCY)
+    start_time = time.time()
+
     try:
         file_count = 0
         emb_count = 0
+        skipped_count = 0
         file_paths: List[Dict[str, str]] = []

         # Collect files to process
@@ -429,6 +452,7 @@ def analyze_local_path_sync(
                 f["full"],
                 f["rel"],
                 cfg,
+                incremental,
             )
             futures.append(fut)

@@ -440,9 +464,23 @@ def analyze_local_path_sync(
                 file_count += 1
                 if r.get("embedded"):
                     emb_count += 1
+                if r.get("skipped"):
+                    skipped_count += 1
             except Exception:
                 logger.exception("A per-file task failed")

+        # Store indexing metadata
+        end_time = time.time()
+        duration = end_time - start_time
+
+        try:
+            set_project_metadata(database_path, "last_indexed_at", time.strftime("%Y-%m-%d %H:%M:%S"))
+            set_project_metadata(database_path, "last_index_duration", str(duration))
+            set_project_metadata(database_path, "files_indexed", str(file_count))
+            set_project_metadata(database_path, "files_skipped", str(skipped_count))
+        except Exception:
+            logger.exception("Failed to store indexing metadata")
+
         # store uv_detected.json metadata if possible
         uv_info = None
         try:
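
With these changes, a second run of analyze_local_path_sync with incremental=True skips files whose stored mtime and hash match the current ones, and the run summary is written to project_metadata. A minimal sketch of reading that summary back (the database path is hypothetical; get_project_metadata and the key names come from the db.py changes below):

from db import get_project_metadata

db_path = "project_index.db"  # hypothetical path, adjust to your setup

# Keys written at the end of analyze_local_path_sync in the hunk above.
for key in ("last_indexed_at", "last_index_duration", "files_indexed", "files_skipped"):
    print(key, "=", get_project_metadata(db_path, key))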

db.py

Lines changed: 136 additions & 7 deletions
@@ -173,27 +173,33 @@ def init_db(database_path: str) -> None:
     """
     Initialize database schema. Safe to call multiple times.
     Creates:
-    - files (stores full content of indexed files)
+    - files (stores full content of indexed files with metadata for incremental indexing)
     - chunks (with embedding BLOB column for sqlite-vector)
+    - project_metadata (project-level tracking)
     """
     conn = _get_connection(database_path)
     try:
         cur = conn.cursor()

         # files table (stores full content, used to reconstruct chunks)
+        # Added last_modified and file_hash for incremental indexing
         cur.execute(
             """
             CREATE TABLE IF NOT EXISTS files (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
-                path TEXT NOT NULL,
+                path TEXT NOT NULL UNIQUE,
                 content TEXT,
                 language TEXT,
                 snippet TEXT,
-                created_at TEXT DEFAULT (datetime('now'))
+                last_modified REAL,
+                file_hash TEXT,
+                created_at TEXT DEFAULT (datetime('now')),
+                updated_at TEXT DEFAULT (datetime('now'))
             )
             """
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);")
+        cur.execute("CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);")

         # chunks table: metadata for chunked documents; includes embedding BLOB column
         cur.execute(
@@ -210,20 +216,43 @@ def init_db(database_path: str) -> None:
             """
         )
         cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);")
+
+        # project_metadata table for project-level tracking
+        cur.execute(
+            """
+            CREATE TABLE IF NOT EXISTS project_metadata (
+                key TEXT PRIMARY KEY,
+                value TEXT,
+                updated_at TEXT DEFAULT (datetime('now'))
+            )
+            """
+        )
+
         conn.commit()
     finally:
         conn.close()


-def store_file(database_path, path, content, language):
+def store_file(database_path, path, content, language, last_modified=None, file_hash=None):
     """
-    Insert a file record into the DB using a queued single-writer to avoid
+    Insert or update a file record into the DB using a queued single-writer to avoid
     sqlite 'database is locked' errors in multithreaded scenarios.
+    Supports incremental indexing with last_modified and file_hash tracking.
     Returns lastrowid (same as the previous store_file implementation).
     """
     snippet = (content[:512] if content else "")
-    sql = "INSERT INTO files (path, content, language, snippet) VALUES (?, ?, ?, ?)"
-    params = (path, content, language, snippet)
+    sql = """
+        INSERT INTO files (path, content, language, snippet, last_modified, file_hash, updated_at)
+        VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
+        ON CONFLICT(path) DO UPDATE SET
+            content=excluded.content,
+            language=excluded.language,
+            snippet=excluded.snippet,
+            last_modified=excluded.last_modified,
+            file_hash=excluded.file_hash,
+            updated_at=datetime('now')
+    """
+    params = (path, content, language, snippet, last_modified, file_hash)

     writer = _get_writer(database_path)
     # We wait for the background writer to complete the insert and then return the row id.
@@ -314,6 +343,106 @@ def clear_project_data(database_path: str) -> None:
         conn.close()


+def get_file_by_path(database_path: str, path: str) -> Optional[Dict[str, Any]]:
+    """
+    Get file metadata by path for incremental indexing checks.
+    Returns None if file doesn't exist.
+    """
+    conn = _get_connection(database_path)
+    try:
+        row = conn.execute(
+            "SELECT id, path, last_modified, file_hash FROM files WHERE path = ?",
+            (path,)
+        ).fetchone()
+        if row:
+            return {
+                "id": row["id"],
+                "path": row["path"],
+                "last_modified": row["last_modified"],
+                "file_hash": row["file_hash"]
+            }
+        return None
+    finally:
+        conn.close()
+
+
+def needs_reindex(database_path: str, path: str, current_mtime: float, current_hash: str) -> bool:
+    """
+    Check if a file needs to be re-indexed based on modification time and hash.
+    Returns True if file is new or has changed.
+    """
+    existing = get_file_by_path(database_path, path)
+    if not existing:
+        return True
+
+    # Check if modification time or hash changed
+    if existing["last_modified"] is None or existing["file_hash"] is None:
+        return True
+
+    if existing["last_modified"] != current_mtime or existing["file_hash"] != current_hash:
+        return True
+
+    return False
+
+
+def set_project_metadata(database_path: str, key: str, value: str) -> None:
+    """
+    Set a project metadata key-value pair.
+    """
+    conn = _get_connection(database_path)
+    try:
+        cur = conn.cursor()
+        cur.execute(
+            """
+            INSERT INTO project_metadata (key, value, updated_at)
+            VALUES (?, ?, datetime('now'))
+            ON CONFLICT(key) DO UPDATE SET
+                value=excluded.value,
+                updated_at=datetime('now')
+            """,
+            (key, value)
+        )
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def get_project_metadata(database_path: str, key: str) -> Optional[str]:
+    """
+    Get a project metadata value by key.
+    """
+    conn = _get_connection(database_path)
+    try:
+        row = conn.execute(
+            "SELECT value FROM project_metadata WHERE key = ?",
+            (key,)
+        ).fetchone()
+        return row["value"] if row else None
+    finally:
+        conn.close()
+
+
+def delete_file_by_path(database_path: str, path: str) -> None:
+    """
+    Delete a file and its chunks by path.
+    Used for incremental indexing when files are removed.
+    """
+    conn = _get_connection(database_path)
+    try:
+        cur = conn.cursor()
+        # Get file_id
+        row = cur.execute("SELECT id FROM files WHERE path = ?", (path,)).fetchone()
+        if row:
+            file_id = row["id"]
+            # Delete chunks first
+            cur.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,))
+            # Delete file
+            cur.execute("DELETE FROM files WHERE id = ?", (file_id,))
+        conn.commit()
+    finally:
+        conn.close()
+
+
 # ============================================================================
 # Project Registry Database Operations
 # ============================================================================
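
A rough end-to-end sketch of the new helpers against a throwaway database (the path, file name, and content are invented; the hash is computed inline the same way analyzer.compute_file_hash does):

import hashlib
import time

from db import init_db, store_file, needs_reindex, set_project_metadata, get_project_metadata

db_path = "/tmp/demo_index.db"  # throwaway database, assumed location
init_db(db_path)

content = "def hello():\n    return 'hi'\n"
file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
mtime = time.time()

# Unknown path: nothing stored yet, so the file must be indexed.
assert needs_reindex(db_path, "src/hello.py", mtime, file_hash)

store_file(db_path, "src/hello.py", content, "python", mtime, file_hash)

# Same mtime and hash: the incremental path would skip this file.
assert not needs_reindex(db_path, "src/hello.py", mtime, file_hash)

# A changed mtime (or hash) forces a re-index.
assert needs_reindex(db_path, "src/hello.py", mtime + 1.0, file_hash)

set_project_metadata(db_path, "files_indexed", "1")
print(get_project_metadata(db_path, "files_indexed"))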
