@@ -2,7 +2,6 @@
 import json
 import time
 import traceback
-import hashlib
 import math
 from pathlib import Path
 from typing import Optional, Dict, Any, List
@@ -22,6 +21,7 @@
 from .openai import get_embedding_for_text, call_coding_api
 from llama_index.core import Document
 from utils.logger import get_logger
+from utils import compute_file_hash, chunk_text, norm, cosine
 from .smart_chunker import smart_chunk
 import logging
 
@@ -60,6 +60,18 @@
 logger = get_logger(__name__)
 
 
+def _get_embedding_with_semaphore(semaphore: threading.Semaphore, text: str, model: Optional[str] = None):
+    """
+    Wrapper to acquire semaphore inside executor task to avoid deadlock.
+    The semaphore is acquired in the worker thread, not the main thread.
+    """
+    semaphore.acquire()
+    try:
+        return get_embedding_for_text(text, model)
+    finally:
+        semaphore.release()
+
+
 def detect_language(path: str):
     if "LICENSE.md" in path:
         return "text"
@@ -73,26 +85,6 @@ def detect_language(path: str):
     return EXT_LANG.get(ext, "text")
 
 
-def compute_file_hash(content: str) -> str:
-    """
-    Compute SHA256 hash of file content for change detection.
-    """
-    return hashlib.sha256(content.encode('utf-8')).hexdigest()
-
-
-def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
-    if chunk_size <= 0:
-        return [text]
-    step = max(1, chunk_size - overlap)
-    chunks: List[str] = []
-    start = 0
-    L = len(text)
-    while start < L:
-        end = min(start + chunk_size, L)
-        chunks.append(text[start:end])
-        start += step
-    return chunks
-
 
 # Main synchronous processing for a single file
 def _process_file_sync(
@@ -198,15 +190,10 @@ def _process_file_sync(
         embedding_futures = []
 
         for idx, chunk_doc in batch:
-            # Acquire semaphore to bound concurrent embedding requests
-            semaphore.acquire()
-            try:
-                embedding_start_time = time.time()
-                future = _EXECUTOR.submit(get_embedding_for_text, chunk_doc.text, embedding_model)
-                embedding_futures.append((idx, chunk_doc, future, embedding_start_time))
-            except Exception:
-                semaphore.release()
-                raise
+            # Submit task to executor; semaphore will be acquired inside the worker
+            embedding_start_time = time.time()
+            future = _EXECUTOR.submit(_get_embedding_with_semaphore, semaphore, chunk_doc.text, embedding_model)
+            embedding_futures.append((idx, chunk_doc, future, embedding_start_time))
 
         # Wait for batch to complete and store results
         saved_count = 0
@@ -232,8 +219,6 @@ def _process_file_sync(
                 logger.exception("Embedding retrieval failed for %s chunk %d: %s", rel_path, idx, e)
                 emb = None
                 failed_count += 1
-            finally:
-                semaphore.release()
 
             if emb:
                 try:
@@ -443,23 +428,6 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
         logger.exception("Background analysis worker failed for %s", local_path)
 
 
-# Simple synchronous helpers preserved for compatibility --------------------------------
-def dot(a, b):
-    return sum(x * y for x, y in zip(a, b))
-
-
-def norm(a):
-    import math
-    return math.sqrt(sum(x * x for x in a))
-
-
-def cosine(a, b):
-    na = norm(a)
-    nb = norm(b)
-    if na == 0 or nb == 0:
-        return 0.0
-    return sum(x * y for x, y in zip(a, b)) / (na * nb)
-
 
 def search_semantic(query: str, database_path: str, top_k: int = 5):
     """
@@ -479,14 +447,3 @@ def search_semantic(query: str, database_path: str, top_k: int = 5):
 def call_coding_model(prompt: str, context: str = ""):
     combined = f"Context:\n{context}\n\nPrompt:\n{prompt}" if context else prompt
     return call_coding_api(combined)
-
-
-# llama-index helper ---------------------------------------------------------
-def llama_index_retrieve_documents(query: str, database_path: str, top_k: int = 5) -> List[Document]:
-    """
-    Return llama_index.core.Document objects for the top_k matching chunks using sqlite-vector.
-    """
-    from .llama_integration import llama_index_retrieve_documents as _llama_retrieve
-    return _llama_retrieve(query, database_path, top_k,
-                           search_func=_search_vectors,
-                           get_chunk_func=_get_chunk_text)
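
The substantive fix in this diff is moving the semaphore acquisition from the submitting thread into the executor worker. In the old code, `semaphore.acquire()` ran in the main thread while permits were only released in the result-collection loop that runs after all submissions, so any batch larger than the permit count stalled before a single future was awaited. Below is a minimal, self-contained sketch of the resulting pattern; the names `worker` and `MAX_CONCURRENT` and the arithmetic stand-in for the embedding call are illustrative, not from this repository:

import threading
from concurrent.futures import ThreadPoolExecutor

MAX_CONCURRENT = 2  # illustrative permit count, smaller than the batch below
semaphore = threading.Semaphore(MAX_CONCURRENT)
executor = ThreadPoolExecutor(max_workers=4)

def worker(task_id: int) -> int:
    # The permit is taken in the worker thread, so submitting a whole
    # batch at once never blocks the producer.
    with semaphore:
        return task_id * task_id  # stand-in for the embedding call

futures = [executor.submit(worker, i) for i in range(10)]
print([f.result() for f in futures])

With the old ordering, ten submissions against two permits would hang on the third acquire, since the releases lived in a loop the main thread never reached.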