Commit 0822032

Copilot and Mte90 committed
Optimize total files calculation and remove unused code
Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com>
1 parent 7921f45 commit 0822032

5 files changed: +9 / -550 lines changed

ai/analyzer.py

Lines changed: 3 additions & 1 deletion
@@ -390,11 +390,13 @@ def analyze_local_path_sync(
 
     try:
         # Use batch update for efficiency - single database transaction
+        # Store total_files for performance (avoid re-scanning directory on every request)
         set_project_metadata_batch(database_path, {
             "last_indexed_at": time.strftime("%Y-%m-%d %H:%M:%S"),
             "last_index_duration": str(duration),
             "files_indexed": str(file_count),
-            "files_skipped": str(skipped_count)
+            "files_skipped": str(skipped_count),
+            "total_files": str(total_files)  # Store total files found during indexing
         })
     except Exception:
         logger.exception("Failed to store indexing metadata")

ai/openai.py

Lines changed: 0 additions & 229 deletions
@@ -106,240 +106,11 @@ def _retry_with_backoff(func, *args, **kwargs):
             delay = base_delay * (2 ** attempt)
             time.sleep(delay)
 
-
 class EmbeddingError(Exception):
     """Custom exception for embedding failures"""
     pass
 
 
-class EmbeddingClient:
-    """
-    Embedding client with detailed logging, retry logic, and configurable timeouts.
-    Provides better debugging for embedding API failures.
-    Uses OpenAI SDK for proper API compatibility.
-    """
-    def __init__(self,
-                 api_url: Optional[str] = None,
-                 api_key: Optional[str] = None,
-                 model: Optional[str] = None,
-                 timeout: float = 15.0,
-                 max_retries: int = 2,
-                 backoff: float = 1.5):
-        self.api_url = api_url or CFG.get("api_url")
-        self.api_key = api_key or CFG.get("api_key")
-        self.model = model or DEFAULT_EMBEDDING_MODEL or "text-embedding-3-small"
-        self.timeout = timeout
-        self.max_retries = max_retries
-        self.backoff = backoff
-
-        # Use OpenAI SDK client instead of raw requests
-        # The SDK automatically handles the /embeddings path
-        self.client = _client
-
-    def _generate_curl_command(self, payload: Dict[str, Any]) -> str:
-        """
-        Generate a curl command for debugging purposes.
-        Masks the API key for security.
-        """
-        # Construct the full embeddings URL
-        base_url = self.api_url.rstrip('/')
-        if not base_url.endswith('/embeddings'):
-            url = f"{base_url}/embeddings"
-        else:
-            url = base_url
-
-        # Start with basic curl command
-        curl_parts = ["curl", "-X", "POST", f"'{url}'"]
-
-        # Add standard headers
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer <API_KEY_MASKED>"
-        }
-
-        for key, value in headers.items():
-            curl_parts.append(f"-H '{key}: {value}'")
-
-        # Add data payload
-        payload_json = json.dumps(payload)
-        # Escape single quotes in the JSON for shell compatibility
-        payload_json_escaped = payload_json.replace("'", "'\\''")
-        curl_parts.append(f"-d '{payload_json_escaped}'")
-
-        return " \\\n ".join(curl_parts)
-
-    def _save_curl_script(self, curl_command: str, request_id: str, file_path: str, chunk_index: int) -> Optional[str]:
-        """
-        Save curl command to a bash script in /tmp for debugging.
-        Returns the path to the generated script, or None if save failed.
-        """
-        try:
-            import tempfile
-            # Create a unique filename based on request_id
-            script_name = f"embedding_debug_{request_id[:8]}.sh"
-            script_path = os.path.join("/tmp", script_name)
-
-            # Generate script content with shebang and comments
-            script_content = f"""#!/bin/bash
-# Embedding request debug script
-# Request ID: {request_id}
-# File: {file_path}
-# Chunk: {chunk_index}
-# Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}
-
-{curl_command}
-"""
-
-            with open(script_path, 'w') as f:
-                f.write(script_content)
-
-            # Make the script executable
-            os.chmod(script_path, 0o755)
-
-            return script_path
-        except Exception as e:
-            _embedding_logger.warning(f"Failed to save curl debug script: {e}")
-            return None
-
-
-    def _log_request_start(self, request_id: str, file_path: str, chunk_index: int, chunk_len: int):
-        _embedding_logger.debug(
-            "Embedding request START",
-            extra={
-                "request_id": request_id,
-                "file": file_path,
-                "chunk_index": chunk_index,
-                "chunk_length": chunk_len,
-                "model": self.model,
-                "api_url": self.api_url,
-                "timeout": self.timeout,
-            },
-        )
-
-    def _log_request_end(self, request_id: str, elapsed: float, status: Optional[int], response_body_preview: str):
-        _embedding_logger.debug(
-            "Embedding request END",
-            extra={
-                "request_id": request_id,
-                "elapsed_s": elapsed,
-                "status": status,
-                "response_preview": response_body_preview,
-            },
-        )
-
-    def embed_text(self, text: str, file_path: str = "<unknown>", chunk_index: int = 0) -> List[float]:
-        """
-        Embed a single chunk of text using OpenAI SDK. Returns the embedding vector.
-        Raises EmbeddingError on failure.
-        """
-        request_id = str(uuid.uuid4())
-        chunk_len = len(text)
-        self._log_request_start(request_id, file_path, chunk_index, chunk_len)
-
-        payload = {
-            "model": self.model,
-            "input": text,
-        }
-
-        attempt = 0
-        err_msg = ""
-        while True:
-            attempt += 1
-            start = time.perf_counter()
-            try:
-                # Use OpenAI SDK for embeddings
-                resp = self.client.embeddings.create(
-                    model=self.model,
-                    input=text,
-                    timeout=self.timeout
-                )
-                elapsed = time.perf_counter() - start
-
-                # Log successful response
-                self._log_request_end(request_id, elapsed, 200, "Success")
-
-                # Extract embedding from response
-                # The SDK returns a response object with a data list
-                if resp and hasattr(resp, 'data') and len(resp.data) > 0:
-                    embedding = resp.data[0].embedding
-                    if embedding and isinstance(embedding, list):
-                        return embedding
-                    else:
-                        raise EmbeddingError(f"Invalid embedding format in response")
-                else:
-                    raise EmbeddingError(f"Unexpected embedding response shape from SDK")
-
-            except Exception as e:
-                elapsed = time.perf_counter() - start
-                err_msg = f"Error after {elapsed:.2f}s: {e}"
-
-                # Save debug information for timeout or API errors
-                script_path = None
-                if CFG.get("debug"):
-                    # Generate curl command for debugging
-                    curl_command = self._generate_curl_command(payload)
-                    script_path = self._save_curl_script(curl_command, request_id, file_path, chunk_index)
-                    if script_path:
-                        _embedding_logger.error(f"\nDebug script saved to: {script_path}")
-                        _embedding_logger.error(f"Run with: bash {script_path}")
-                    else:
-                        _embedding_logger.error(f"\nDebug with this curl command:")
-                        _embedding_logger.error(curl_command)
-
-                _embedding_logger.error(
-                    "Embedding API Error",
-                    extra={
-                        "request_id": request_id,
-                        "error": str(e),
-                        "elapsed_s": elapsed,
-                        "attempt": attempt,
-                        "file": file_path,
-                        "chunk_index": chunk_index,
-                    }
-                )
-
-                # Retry logic
-                if attempt > self.max_retries:
-                    _embedding_logger.error(
-                        "Max retries exceeded for embedding request",
-                        extra={"request_id": request_id, "file": file_path, "chunk_index": chunk_index, "attempts": attempt},
-                    )
-                    raise EmbeddingError(f"Failed to get embedding after {attempt} attempts. Last error: {err_msg}")
-
-                # Backoff and retry
-                sleep_for = self.backoff * (2 ** (attempt - 1))
-                _embedding_logger.info(
-                    "Retrying embedding request",
-                    extra={
-                        "request_id": request_id,
-                        "file": file_path,
-                        "chunk_index": chunk_index,
-                        "attempt": attempt,
-                        "sleep_s": sleep_for,
-                    },
-                )
-                time.sleep(sleep_for)
-
-    def embed_multiple(self, chunks: List[str], file_path: str = "<unknown>") -> List[Dict[str, Any]]:
-        """
-        Embed a list of text chunks. Returns list of dicts: {"chunk_index": i, "embedding": [...]}.
-        This method logs progress and errors for each chunk.
-        """
-        results = []
-        for i, chunk in enumerate(chunks):
-            try:
-                emb = self.embed_text(chunk, file_path=file_path, chunk_index=i)
-                results.append({"chunk_index": i, "embedding": emb})
-            except EmbeddingError as e:
-                _embedding_logger.error(
-                    "Failed to embed chunk",
-                    extra={"file": file_path, "chunk_index": i, "error": str(e)},
-                )
-                # append a failure marker or skip depending on desired behavior
-                results.append({"chunk_index": i, "embedding": None, "error": str(e)})
-        return results
-
-
 def call_coding_api(prompt: str, model: Optional[str] = None, max_tokens: int = 1024):
     """
     Call a generative/coding model via the new OpenAI client.
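
The deleted EmbeddingClient was a logging/retry wrapper around the OpenAI SDK's embeddings call. For context, this is roughly the direct SDK call it wrapped; the snippet below is a sketch for illustration, not code from this repository:

    from openai import OpenAI

    client = OpenAI()  # picks up OPENAI_API_KEY from the environment

    def embed_text(text: str, model: str = "text-embedding-3-small") -> list[float]:
        # Single SDK call; the removed class surrounded this with logging,
        # retries, and curl-script debugging that went unused.
        resp = client.embeddings.create(model=model, input=text, timeout=15.0)
        return resp.data[0].embedding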

0 commit comments
