Skip to content

Commit 9cb1905

Browse files
Copilot and Mte90 authored
Fix embedding deadlock and refactor utility functions to utils module (#9)
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
1 parent 68fd02a commit 9cb1905

File tree

3 files changed

+86
-60
lines changed

3 files changed

+86
-60
lines changed

ai/analyzer.py

Lines changed: 17 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import json
33
import time
44
import traceback
5-
import hashlib
65
import math
76
from pathlib import Path
87
from typing import Optional, Dict, Any, List
@@ -22,6 +21,7 @@
2221
from .openai import get_embedding_for_text, call_coding_api
2322
from llama_index.core import Document
2423
from utils.logger import get_logger
24+
from utils import compute_file_hash, chunk_text, norm, cosine
2525
from .smart_chunker import smart_chunk
2626
import logging
2727

@@ -60,6 +60,18 @@
6060
logger = get_logger(__name__)
6161

6262

63+
def _get_embedding_with_semaphore(semaphore: threading.Semaphore, text: str, model: Optional[str] = None):
    """
    Wrapper that acquires the semaphore inside the executor task to avoid deadlock.

    The semaphore is acquired in the worker thread, not the submitting thread,
    so the caller never blocks on the semaphore while queueing work.

    Args:
        semaphore: Bounds the number of concurrent embedding requests.
        text: Text to embed.
        model: Optional embedding model identifier, passed through unchanged.

    Returns:
        The result of get_embedding_for_text(text, model).
    """
    # threading.Semaphore supports the context-manager protocol: acquire on
    # enter, release on exit (also on exception) — equivalent to the manual
    # acquire/try/finally but idiomatic and harder to get wrong.
    with semaphore:
        return get_embedding_for_text(text, model)
73+
74+
6375
def detect_language(path: str):
6476
if "LICENSE.md" in path:
6577
return "text"
@@ -73,26 +85,6 @@ def detect_language(path: str):
7385
return EXT_LANG.get(ext, "text")
7486

7587

76-
def compute_file_hash(content: str) -> str:
77-
"""
78-
Compute SHA256 hash of file content for change detection.
79-
"""
80-
return hashlib.sha256(content.encode('utf-8')).hexdigest()
81-
82-
83-
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
84-
if chunk_size <= 0:
85-
return [text]
86-
step = max(1, chunk_size - overlap)
87-
chunks: List[str] = []
88-
start = 0
89-
L = len(text)
90-
while start < L:
91-
end = min(start + chunk_size, L)
92-
chunks.append(text[start:end])
93-
start += step
94-
return chunks
95-
9688

9789
# Main synchronous processing for a single file
9890
def _process_file_sync(
@@ -198,15 +190,10 @@ def _process_file_sync(
198190
embedding_futures = []
199191

200192
for idx, chunk_doc in batch:
201-
# Acquire semaphore to bound concurrent embedding requests
202-
semaphore.acquire()
203-
try:
204-
embedding_start_time = time.time()
205-
future = _EXECUTOR.submit(get_embedding_for_text, chunk_doc.text, embedding_model)
206-
embedding_futures.append((idx, chunk_doc, future, embedding_start_time))
207-
except Exception:
208-
semaphore.release()
209-
raise
193+
# Submit task to executor; semaphore will be acquired inside the worker
194+
embedding_start_time = time.time()
195+
future = _EXECUTOR.submit(_get_embedding_with_semaphore, semaphore, chunk_doc.text, embedding_model)
196+
embedding_futures.append((idx, chunk_doc, future, embedding_start_time))
210197

211198
# Wait for batch to complete and store results
212199
saved_count = 0
@@ -232,8 +219,6 @@ def _process_file_sync(
232219
logger.exception("Embedding retrieval failed for %s chunk %d: %s", rel_path, idx, e)
233220
emb = None
234221
failed_count += 1
235-
finally:
236-
semaphore.release()
237222

238223
if emb:
239224
try:
@@ -443,23 +428,6 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path
443428
logger.exception("Background analysis worker failed for %s", local_path)
444429

445430

446-
# Simple synchronous helpers preserved for compatibility --------------------------------
447-
def dot(a, b):
448-
return sum(x * y for x, y in zip(a, b))
449-
450-
451-
def norm(a):
452-
import math
453-
return math.sqrt(sum(x * x for x in a))
454-
455-
456-
def cosine(a, b):
457-
na = norm(a)
458-
nb = norm(b)
459-
if na == 0 or nb == 0:
460-
return 0.0
461-
return sum(x * y for x, y in zip(a, b)) / (na * nb)
462-
463431

464432
def search_semantic(query: str, database_path: str, top_k: int = 5):
465433
"""
@@ -479,14 +447,3 @@ def search_semantic(query: str, database_path: str, top_k: int = 5):
479447
def call_coding_model(prompt: str, context: str = ""):
    """
    Forward a prompt to the coding API, optionally prefixed with context.

    When *context* is non-empty the prompt is wrapped in a combined
    "Context:/Prompt:" message; otherwise the prompt is sent as-is.
    """
    if context:
        combined = f"Context:\n{context}\n\nPrompt:\n{prompt}"
    else:
        combined = prompt
    return call_coding_api(combined)
482-
483-
484-
# llama-index helper ---------------------------------------------------------
485-
def llama_index_retrieve_documents(query: str, database_path: str, top_k: int = 5) -> List[Document]:
486-
"""
487-
Return llama_index.core.Document objects for the top_k matching chunks using sqlite-vector.
488-
"""
489-
from .llama_integration import llama_index_retrieve_documents as _llama_retrieve
490-
return _llama_retrieve(query, database_path, top_k,
491-
search_func=_search_vectors,
492-
get_chunk_func=_get_chunk_text)

utils/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
"""
22
Utility modules for configuration and logging.
33
"""
4+
from .utils import compute_file_hash, chunk_text, dot, norm, cosine
5+
6+
__all__ = ['compute_file_hash', 'chunk_text', 'dot', 'norm', 'cosine']
47

utils/utils.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""
2+
Utility functions for text processing and vector operations.
3+
"""
4+
import hashlib
5+
import math
6+
from typing import List
7+
8+
9+
def compute_file_hash(content: str) -> str:
    """
    Return the SHA-256 hex digest of *content*, used for change detection.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()
14+
15+
16+
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks.

    Args:
        text: Text to chunk
        chunk_size: Maximum size of each chunk in characters;
            values <= 0 disable chunking and return the whole text
        overlap: Number of overlapping characters between chunks

    Returns:
        List of text chunks (empty list for empty input)
    """
    if chunk_size <= 0:
        return [text]
    # Advance at least one character per iteration even if overlap >= chunk_size.
    step = max(1, chunk_size - overlap)
    chunks: List[str] = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(text[start:end])
        # Stop once the end of the text is covered; otherwise the overlap
        # stepping would emit redundant tail chunks that are already fully
        # contained in the previous chunk (wasting downstream embedding calls).
        if end >= length:
            break
        start += step
    return chunks
39+
40+
41+
def dot(a, b):
    """
    Compute the dot product of two vectors.

    Vectors are paired element-wise; any excess elements in the longer
    vector are ignored (zip semantics).
    """
    total = 0
    for left, right in zip(a, b):
        total += left * right
    return total
46+
47+
48+
def norm(a):
    """
    Compute the L2 norm (Euclidean magnitude) of a vector.

    Uses math.hypot (multi-argument form, Python 3.8+), which avoids the
    intermediate overflow/underflow that sqrt(sum(x*x)) suffers for very
    large or very small components, while returning 0.0 for an empty vector
    just like the naive formula.
    """
    return math.hypot(*a)
53+
54+
55+
def cosine(a, b):
    """
    Compute cosine similarity between two vectors.

    Returns:
        Similarity in the range [-1.0, 1.0]; 0.0 when either vector has
        zero magnitude (the similarity is undefined in that case).
    """
    na = norm(a)
    nb = norm(b)
    if na == 0 or nb == 0:
        return 0.0
    # Reuse the module's dot() helper instead of duplicating the
    # element-wise product inline.
    return dot(a, b) / (na * nb)

0 commit comments

Comments
 (0)