From c6cd5c34f3bfb7108c8dd9849dedb9c2d278c655 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Thu, 18 Dec 2025 20:36:15 -0800 Subject: [PATCH 1/5] fix: legacy evaluation reporting with Strategy Pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes legacy evaluation reporting to the backend that was returning HTTP 400 errors and implements the Strategy Pattern for cleaner code separation. ## Changes ### Strategy Pattern Implementation - Created `EvalReportingStrategy` Protocol defining the interface for evaluation reporting strategies - Implemented `LegacyEvalReportingStrategy` for legacy evaluations: - Converts string IDs to deterministic GUIDs using uuid5 - Uses endpoints without /coded/ prefix - Uses assertionRuns format with assertionSnapshot - Implemented `CodedEvalReportingStrategy` for coded evaluations: - Keeps IDs as strings - Uses /coded/ endpoint prefix - Uses evaluatorRuns format with evaluationCriterias ### Bug Fixes - Fixed legacy eval API payload structure for backend compatibility - Added type assertion for project_id to fix mypy errors - Removed unused ABC, abstractmethod imports after Protocol migration ### Test Results - All 27 unit tests passing - All linting checks (ruff, mypy) passing - Integration testing with calculator sample: all API calls returning HTTP 200 OK 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../evaluations/eval-sets/legacy.json | 26 +- .../evaluators/legacy-equality.json | 4 +- .../evaluators/legacy-json-similarity.json | 4 +- .../evaluators/legacy-llm-as-a-judge.json | 4 +- .../evaluators/legacy-trajectory.json | 4 +- src/uipath/_cli/_evals/_progress_reporter.py | 1576 +++++++++-------- tests/cli/eval/test_progress_reporter.py | 31 +- 7 files changed, 844 insertions(+), 805 deletions(-) diff --git a/samples/calculator/evaluations/eval-sets/legacy.json b/samples/calculator/evaluations/eval-sets/legacy.json index 1e3234fae..4740c7a3a 100644 --- a/samples/calculator/evaluations/eval-sets/legacy.json +++ b/samples/calculator/evaluations/eval-sets/legacy.json @@ -1,17 +1,17 @@ { - "fileName": "default.json", - "id": "default-eval-set-id", - "name": "Basic Calculator Evaluation Set", + "fileName": "legacy.json", + "id": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789", + "name": "Basic Calculator Evaluation Set (Legacy)", "batchSize": 10, "evaluatorRefs": [ - "equality", - "llm-as-a-judge", - "json-similarity", - "trajectory" + "aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-4bbb-bbbb-bbbbbbbbbbbb", + "cccccccc-cccc-4ccc-cccc-cccccccccccc", + "dddddddd-dddd-4ddd-dddd-dddddddddddd" ], "evaluations": [ { - "id": "test-addition", + "id": "11111111-1111-4111-8111-111111111111", "name": "Test Addition", "inputs": { "a": 1, @@ -22,12 +22,12 @@ "result": 2.0 }, "expectedAgentBehavior": "The operation should produce the right output.", - "evalSetId": "default-eval-set-id", + "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789", "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z" }, { - "id": "test-random-addition-using-llm", + "id": "22222222-2222-4222-8222-222222222222", "name": "Test Random Addition Using LLM", "inputs": { "a": 1, @@ -45,12 +45,12 @@ "name": "get_random_operator" } ], - "evalSetId": "default-eval-set-id", + "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789", "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z" }, { - "id": "test-with-llm-input-mocking", + "id": 
"33333333-3333-4333-8333-333333333333", "name": "Test with LLM input mocking", "inputs": {}, "expectedOutput": { @@ -59,7 +59,7 @@ "expectedAgentBehavior": "The operation should produce the right output.", "simulateInput": true, "inputGenerationInstructions": "Generate a multiplication calculation where the first number is 5 and the second number is 7", - "evalSetId": "default-eval-set-id", + "evalSetId": "a1b2c3d4-e5f6-4a89-abcd-ef0123456789", "createdAt": "2025-09-04T18:54:58.378Z", "updatedAt": "2025-09-04T18:55:55.416Z" } diff --git a/samples/calculator/evaluations/evaluators/legacy-equality.json b/samples/calculator/evaluations/evaluators/legacy-equality.json index 10e073c8e..73f0fbd9a 100644 --- a/samples/calculator/evaluations/evaluators/legacy-equality.json +++ b/samples/calculator/evaluations/evaluators/legacy-equality.json @@ -1,6 +1,6 @@ { - "fileName": "equality.json", - "id": "equality", + "fileName": "legacy-equality.json", + "id": "aaaaaaaa-aaaa-4aaa-aaaa-aaaaaaaaaaaa", "name": "Equality Evaluator", "description": "An evaluator that judges the agent based on expected output.", "category": 0, diff --git a/samples/calculator/evaluations/evaluators/legacy-json-similarity.json b/samples/calculator/evaluations/evaluators/legacy-json-similarity.json index dd1fca355..d1066b0ee 100644 --- a/samples/calculator/evaluations/evaluators/legacy-json-similarity.json +++ b/samples/calculator/evaluations/evaluators/legacy-json-similarity.json @@ -1,6 +1,6 @@ { - "fileName": "json-similarity.json", - "id": "json-similarity", + "fileName": "legacy-json-similarity.json", + "id": "cccccccc-cccc-4ccc-cccc-cccccccccccc", "name": "JSON Similarity Evaluator", "description": "An evaluator that compares JSON structures with tolerance for numeric and string differences.", "category": 0, diff --git a/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json b/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json index 1b90f193f..209d663f0 100644 --- a/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json +++ b/samples/calculator/evaluations/evaluators/legacy-llm-as-a-judge.json @@ -1,6 +1,6 @@ { - "fileName": "llm-as-a-judge.json", - "id": "llm-as-a-judge", + "fileName": "legacy-llm-as-a-judge.json", + "id": "bbbbbbbb-bbbb-4bbb-bbbb-bbbbbbbbbbbb", "name": "LLMAsAJudge Evaluator", "description": "An evaluator that judges the agent based on it's run history and expected behavior", "category": 3, diff --git a/samples/calculator/evaluations/evaluators/legacy-trajectory.json b/samples/calculator/evaluations/evaluators/legacy-trajectory.json index 8d6e600ea..894424fd6 100644 --- a/samples/calculator/evaluations/evaluators/legacy-trajectory.json +++ b/samples/calculator/evaluations/evaluators/legacy-trajectory.json @@ -1,6 +1,6 @@ { - "fileName": "trajectory.json", - "id": "trajectory", + "fileName": "legacy-trajectory.json", + "id": "dddddddd-dddd-4ddd-dddd-dddddddddddd", "name": "Trajectory Evaluator", "description": "An evaluator that analyzes the execution trajectory and decision sequence taken by the agent.", "category": 3, diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 92e10fed0..8a358c4e2 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -1,4 +1,8 @@ -"""Progress reporter for sending evaluation updates to StudioWeb.""" +"""Progress reporter for sending evaluation updates to StudioWeb. 
+ +This module uses the Strategy Pattern to separate legacy and coded evaluation +reporting flows. Each strategy handles the specific API format differences. +""" import functools import json @@ -6,7 +10,7 @@ import os import uuid from datetime import datetime, timezone -from typing import Any +from typing import Any, Callable, Protocol, runtime_checkable from urllib.parse import urlparse from opentelemetry import trace @@ -50,6 +54,11 @@ logger = logging.getLogger(__name__) +# ============================================================================= +# Utility Functions +# ============================================================================= + + def gracefully_handle_errors(func): """Decorator to catch and log errors without stopping execution.""" @@ -60,7 +69,6 @@ async def wrapper(self, *args, **kwargs): except Exception as e: if hasattr(self, "_console"): error_type = type(e).__name__ - # Log the full error message for debugging logger.debug(f"Full error details: {e}") logger.warning( f"Cannot report progress to SW. " @@ -73,8 +81,420 @@ async def wrapper(self, *args, **kwargs): return wrapper +# ============================================================================= +# Strategy Protocol +# ============================================================================= + + +@runtime_checkable +class EvalReportingStrategy(Protocol): + """Protocol for evaluation reporting strategies. + + Strategies handle the differences between legacy and coded evaluation + API formats, including ID conversion, endpoint routing, and payload structure. + """ + + @property + def endpoint_suffix(self) -> str: + """Return the endpoint suffix for this strategy. + + Returns: + "" for legacy, "coded/" for coded evaluations + """ + ... + + def convert_id(self, id_value: str) -> str: + """Convert an ID to the format expected by the backend. + + Args: + id_value: The original string ID + + Returns: + For legacy: deterministic GUID from uuid5 + For coded: original string ID unchanged + """ + ... + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval set run.""" + ... + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval run.""" + ... + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval run.""" + ... + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval set run.""" + ... + + def collect_results( + self, + eval_results: list[EvalItemResult], + evaluators: dict[str, Any], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results from evaluations in strategy-specific format. + + Returns: + Tuple of (evaluator_runs, evaluator_scores) + """ + ... 
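
The Protocol above is consumed purely through structural typing: the reporter selects a strategy once at runtime and every endpoint/ID/payload difference stays behind that interface. Below is a minimal, self-contained sketch of that dispatch, assuming only the two strategy classes introduced later in this file; the `Demo*` names, the `api/` prefix, and the example project/eval-set IDs are illustrative, not part of this patch's API.

import uuid
from typing import Protocol, runtime_checkable


@runtime_checkable
class DemoReportingStrategy(Protocol):
    """Shape shared by both reporting flows (a subset of EvalReportingStrategy)."""

    @property
    def endpoint_suffix(self) -> str: ...

    def convert_id(self, id_value: str) -> str: ...


class DemoLegacyStrategy:
    """Legacy flow: no /coded/ prefix, string IDs mapped to deterministic GUIDs."""

    @property
    def endpoint_suffix(self) -> str:
        return ""

    def convert_id(self, id_value: str) -> str:
        try:
            uuid.UUID(id_value)  # already a GUID -> keep it unchanged
            return id_value
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value))


class DemoCodedStrategy:
    """Coded flow: /coded/ prefix, IDs passed through unchanged."""

    @property
    def endpoint_suffix(self) -> str:
        return "coded/"

    def convert_id(self, id_value: str) -> str:
        return id_value


def demo_route(is_coded: bool, project_id: str, eval_set_id: str) -> tuple[str, str]:
    """Mirror of _get_strategy(...): pick the strategy once, then delegate."""
    strategy: DemoReportingStrategy = (
        DemoCodedStrategy() if is_coded else DemoLegacyStrategy()
    )
    endpoint = f"api/execution/agents/{project_id}/{strategy.endpoint_suffix}evalSetRun"
    return endpoint, strategy.convert_id(eval_set_id)


# Legacy: "equality" becomes a deterministic uuid5 GUID; the endpoint has no /coded/.
print(demo_route(is_coded=False, project_id="proj-123", eval_set_id="equality"))
# Coded: the string ID is kept as-is and the /coded/ segment is inserted.
print(demo_route(is_coded=True, project_id="proj-123", eval_set_id="equality"))

Either concrete class satisfies `DemoReportingStrategy` without inheriting from it, which is the same reason this patch can drop the earlier ABC/abstractmethod imports after the Protocol migration.
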
+ + +# ============================================================================= +# Legacy Evaluation Reporting Strategy +# ============================================================================= + + +class LegacyEvalReportingStrategy: + """Strategy for legacy evaluation reporting. + + Legacy evaluations: + - Convert string IDs to deterministic GUIDs using uuid5 + - Use endpoints without /coded/ prefix + - Use assertionRuns format with assertionSnapshot + - Put expectedOutput directly in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return empty string for legacy endpoints (no /coded/ prefix).""" + return "" + + def convert_id(self, id_value: str) -> str: + """Convert string ID to deterministic GUID for legacy API. + + Args: + id_value: The original string ID + + Returns: + The ID as a GUID (either original if valid, or deterministic uuid5) + """ + try: + uuid.UUID(id_value) + return id_value + except ValueError: + return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value)) + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval set run.""" + return { + "agentId": project_id, + "evalSetId": self.convert_id(eval_set_id), + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval run.""" + eval_item_id = self.convert_id(eval_item.id) + + # Extract expectedOutput from evaluation_criterias + expected_output = {} + if eval_item.evaluation_criterias: + first_criteria = next(iter(eval_item.evaluation_criterias.values()), None) + if first_criteria and isinstance(first_criteria, dict): + expected_output = first_criteria.get("expectedOutput", {}) + + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item_id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "expectedOutput": expected_output, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "evaluatorScores": evaluator_scores, + }, + "completionMetrics": {"duration": int(execution_time)}, + "assertionRuns": evaluator_runs, + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": self.convert_id(eval_id)} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[EvalItemResult], + evaluators: 
dict[str, LegacyBaseEvaluator[Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in legacy assertionRuns format.""" + assertion_runs: list[dict[str, Any]] = [] + evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + evaluator_id_value = self.convert_id(eval_result.evaluator_id) + evaluator = evaluators[eval_result.evaluator_id] + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": justification, + "evaluatorId": evaluator_id_value, + } + ) + + assertion_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": evaluator_id_value, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + "assertionSnapshot": { + "assertionType": evaluator.evaluator_type.name, + "outputKey": evaluator.target_output_key, + }, + } + ) + + return assertion_runs, evaluator_scores_list + + +# ============================================================================= +# Coded Evaluation Reporting Strategy +# ============================================================================= + + +class CodedEvalReportingStrategy: + """Strategy for coded evaluation reporting. + + Coded evaluations: + - Keep string IDs unchanged + - Use endpoints with /coded/ prefix + - Use evaluatorRuns format with nested result + - Put evaluationCriterias in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return 'coded/' for coded endpoints.""" + return "coded/" + + def convert_id(self, id_value: str) -> str: + """Keep string ID unchanged for coded API.""" + return id_value + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval set run.""" + return { + "agentId": project_id, + "evalSetId": eval_set_id, + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval run.""" + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item.id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "evaluationCriterias": eval_item.evaluation_criterias, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "scores": 
evaluator_scores, # Note: "scores" not "evaluatorScores" + }, + "completionMetrics": {"duration": int(execution_time)}, + "evaluatorRuns": evaluator_runs, # Note: "evaluatorRuns" not "assertionRuns" + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": eval_id} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[EvalItemResult], + evaluators: dict[str, BaseEvaluator[Any, Any, Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in coded evaluatorRuns format.""" + evaluator_runs: list[dict[str, Any]] = [] + evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": justification, + "evaluatorId": eval_result.evaluator_id, + } + ) + + evaluator_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": eval_result.evaluator_id, + "result": { + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + }, + "justification": justification, + }, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + } + ) + + return evaluator_runs, evaluator_scores_list + + +# ============================================================================= +# Main Progress Reporter Class +# ============================================================================= + + class StudioWebProgressReporter: - """Handles reporting evaluation progress to StudioWeb.""" + """Handles reporting evaluation progress to StudioWeb. + + Uses the Strategy Pattern to delegate legacy vs coded evaluation + formatting to appropriate strategy classes. + """ def __init__(self, spans_exporter: LlmOpsHttpExporter): self.spans_exporter = spans_exporter @@ -95,28 +515,37 @@ def __init__(self, spans_exporter: LlmOpsHttpExporter): "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID." 
) + # Strategy instances + self._legacy_strategy = LegacyEvalReportingStrategy() + self._coded_strategy = CodedEvalReportingStrategy() + + # State tracking self.eval_set_run_ids: dict[str, str] = {} self.evaluators: dict[str, Any] = {} self.evaluator_scores: dict[str, list[float]] = {} self.eval_run_ids: dict[str, str] = {} - self.is_coded_eval: dict[str, bool] = {} # Track coded vs legacy per execution - self.eval_spans: dict[ - str, list[Any] - ] = {} # Store spans per execution for usage metrics - self.eval_set_execution_id: str | None = ( - None # Track current eval set execution ID - ) + self.is_coded_eval: dict[str, bool] = {} + self.eval_spans: dict[str, list[Any]] = {} + self.eval_set_execution_id: str | None = None + + # ------------------------------------------------------------------------- + # Strategy Selection + # ------------------------------------------------------------------------- + + def _get_strategy(self, is_coded: bool) -> EvalReportingStrategy: + """Get the appropriate strategy for the evaluation type.""" + return self._coded_strategy if is_coded else self._legacy_strategy + + # ------------------------------------------------------------------------- + # Utility Methods + # ------------------------------------------------------------------------- def _format_error_message(self, error: Exception, context: str) -> None: """Helper method to format and display error messages consistently.""" self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]") def _is_localhost(self) -> bool: - """Check if the eval backend URL is localhost. - - Returns: - True if using localhost, False otherwise. - """ + """Check if the eval backend URL is localhost.""" eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "") if eval_backend_url: try: @@ -128,15 +557,7 @@ def _is_localhost(self) -> bool: return False def _get_endpoint_prefix(self) -> str: - """Determine the endpoint prefix based on environment. - - Checks UIPATH_EVAL_BACKEND_URL environment variable: - - If set to localhost/127.0.0.1: returns "api/" (direct API access) - - Otherwise: returns "agentsruntime_/api/" (service routing for alpha/prod) - - Returns: - "api/" for localhost environments, "agentsruntime_/api/" for alpha/production. - """ + """Determine the endpoint prefix based on environment.""" if self._is_localhost(): return "api/" return "agentsruntime_/api/" @@ -144,30 +565,32 @@ def _get_endpoint_prefix(self) -> str: def _is_coded_evaluator( self, evaluators: list[BaseEvaluator[Any, Any, Any]] ) -> bool: - """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). - - Args: - evaluators: List of evaluators to check - - Returns: - True if using coded evaluators, False for legacy evaluators - """ + """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).""" if not evaluators: return False - # Check the first evaluator type return not isinstance(evaluators[0], LegacyBaseEvaluator) + def _serialize_justification( + self, justification: BaseModel | str | None + ) -> str | None: + """Serialize justification to JSON string for API compatibility.""" + if isinstance(justification, BaseModel): + justification = json.dumps(justification.model_dump()) + return justification + + def _tenant_header(self) -> dict[str, str | None]: + """Build tenant header for API requests.""" + tenant_id = os.getenv(ENV_TENANT_ID, None) + if not tenant_id: + self._console.error( + f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'." 
+ ) + return {HEADER_INTERNAL_TENANT_ID: tenant_id} + def _extract_usage_from_spans( self, spans: list[Any] ) -> dict[str, int | float | None]: - """Extract token usage and cost from OpenTelemetry spans. - - Args: - spans: List of ReadableSpan objects from agent execution - - Returns: - Dictionary with tokens, completionTokens, promptTokens, and cost - """ + """Extract token usage and cost from OpenTelemetry spans.""" total_tokens = 0 completion_tokens = 0 prompt_tokens = 0 @@ -175,16 +598,13 @@ def _extract_usage_from_spans( for span in spans: try: - # Handle both dictionary attributes and string Attributes field attrs = None if hasattr(span, "attributes") and span.attributes: if isinstance(span.attributes, dict): attrs = span.attributes elif isinstance(span.attributes, str): - # Parse JSON string attributes attrs = json.loads(span.attributes) - # Also check for Attributes field (capitalized) from backend spans if not attrs and hasattr(span, "Attributes") and span.Attributes: if isinstance(span.Attributes, str): attrs = json.loads(span.Attributes) @@ -192,16 +612,13 @@ def _extract_usage_from_spans( attrs = span.Attributes if attrs: - # Try to get usage from nested usage object (backend format) if "usage" in attrs and isinstance(attrs["usage"], dict): usage = attrs["usage"] prompt_tokens += usage.get("promptTokens", 0) completion_tokens += usage.get("completionTokens", 0) total_tokens += usage.get("totalTokens", 0) - # Cost might be in usage or at root level total_cost += usage.get("cost", 0.0) - # Also try OpenTelemetry semantic conventions (SDK format) prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0) completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0) total_tokens += attrs.get("gen_ai.usage.total_tokens", 0) @@ -219,6 +636,139 @@ def _extract_usage_from_spans( "cost": total_cost if total_cost > 0 else None, } + def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: + """Extract agent snapshot from entry points configuration.""" + try: + entry_points_file_path = os.path.join( + os.getcwd(), str(UiPathConfig.entry_points_file_path) + ) + if not os.path.exists(entry_points_file_path): + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + with open(entry_points_file_path, "r") as f: + entry_points = json.load(f).get("entryPoints", []) + + ep = None + for entry_point in entry_points: + if entry_point.get("filePath") == entrypoint: + ep = entry_point + break + + if not ep: + logger.warning( + f"Entrypoint {entrypoint} not found in configuration file" + ) + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + input_schema = ep.get("input", {}) + output_schema = ep.get("output", {}) + + return StudioWebAgentSnapshot( + input_schema=input_schema, output_schema=output_schema + ) + except Exception as e: + logger.warning(f"Failed to extract agent snapshot: {e}") + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + # ------------------------------------------------------------------------- + # Request Spec Generation (delegating to strategies) + # ------------------------------------------------------------------------- + + def _create_eval_set_run_spec( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + is_coded: bool = False, + ) -> RequestSpec: + """Create request spec for creating an eval set run.""" + assert self._project_id is not None, "project_id is required for SW reporting" + strategy = self._get_strategy(is_coded) + payload = 
strategy.create_eval_set_run_payload( + eval_set_id, agent_snapshot, no_of_evals, self._project_id + ) + return RequestSpec( + method="POST", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalSetRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _create_eval_run_spec( + self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + ) -> RequestSpec: + """Create request spec for creating an eval run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_eval_run_payload(eval_item, eval_set_run_id) + return RequestSpec( + method="POST", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _update_eval_run_spec( + self, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + eval_run_id: str, + actual_output: dict[str, Any], + execution_time: float, + success: bool, + is_coded: bool = False, + ) -> RequestSpec: + """Create request spec for updating an eval run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_update_eval_run_payload( + eval_run_id, + evaluator_runs, + evaluator_scores, + actual_output, + execution_time, + success, + ) + return RequestSpec( + method="PUT", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _update_eval_set_run_spec( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + is_coded: bool = False, + success: bool = True, + ) -> RequestSpec: + """Create request spec for updating an eval set run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_update_eval_set_run_payload( + eval_set_run_id, evaluator_scores, success + ) + return RequestSpec( + method="PUT", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalSetRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + # ------------------------------------------------------------------------- + # API Methods + # ------------------------------------------------------------------------- + @gracefully_handle_errors async def create_eval_set_run_sw( self, @@ -247,16 +797,7 @@ async def create_eval_set_run_sw( async def create_eval_run( self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False ) -> str: - """Create a new evaluation run in StudioWeb. 
- - Args: - eval_item: Dictionary containing evaluation data - eval_set_run_id: The ID of the evaluation set run - is_coded: Whether this is a coded evaluation (vs legacy) - - Returns: - The ID of the created evaluation run - """ + """Create a new evaluation run in StudioWeb.""" spec = self._create_eval_run_spec(eval_item, eval_set_run_id, is_coded) response = await self._client.request_async( method=spec.method, @@ -277,10 +818,9 @@ async def update_eval_run( spans: list[Any] | None = None, ): """Update an evaluation run with results.""" + # Separate evaluators by type coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {} - evaluator_runs: list[dict[str, Any]] = [] - evaluator_scores: list[dict[str, Any]] = [] for k, v in evaluators.items(): if isinstance(v, LegacyBaseEvaluator): @@ -288,696 +828,252 @@ async def update_eval_run( elif isinstance(v, BaseEvaluator): coded_evaluators[k] = v - # Use coded evaluator format - runs, scores = self._collect_coded_results( - sw_progress_item.eval_results, coded_evaluators, spans or [] - ) - evaluator_runs.extend(runs) - evaluator_scores.extend(scores) - - # Use legacy evaluator format - runs, scores = self._collect_results( - sw_progress_item.eval_results, - legacy_evaluators, - spans or [], - ) - evaluator_runs.extend(runs) - evaluator_scores.extend(scores) - - # Use the appropriate spec method based on evaluation type - if is_coded: - spec = self._update_coded_eval_run_spec( - evaluator_runs=evaluator_runs, - evaluator_scores=evaluator_scores, - eval_run_id=sw_progress_item.eval_run_id, - execution_time=sw_progress_item.agent_execution_time, - actual_output=sw_progress_item.agent_output, - success=sw_progress_item.success, - is_coded=is_coded, - ) - else: - spec = self._update_eval_run_spec( - assertion_runs=evaluator_runs, - evaluator_scores=evaluator_scores, - eval_run_id=sw_progress_item.eval_run_id, - execution_time=sw_progress_item.agent_execution_time, - actual_output=sw_progress_item.agent_output, - success=sw_progress_item.success, - is_coded=is_coded, - ) - - await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) + usage_metrics = self._extract_usage_from_spans(spans or []) - @gracefully_handle_errors - async def update_eval_set_run( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - is_coded: bool = False, - success: bool = True, - ): - """Update the evaluation set run status to complete.""" - spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded, success - ) - await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) - - async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None: - try: - self.evaluators = {eval.id: eval for eval in payload.evaluators} - self.evaluator_scores = {eval.id: [] for eval in payload.evaluators} - - # Store the eval set execution ID for mapping eval runs to eval set - self.eval_set_execution_id = payload.execution_id - - # Detect if using coded evaluators and store for this execution - is_coded = self._is_coded_evaluator(payload.evaluators) - self.is_coded_eval[payload.execution_id] = is_coded - - eval_set_run_id = payload.eval_set_run_id - if not eval_set_run_id: - eval_set_run_id = 
await self.create_eval_set_run_sw( - eval_set_id=payload.eval_set_id, - agent_snapshot=self._extract_agent_snapshot(payload.entrypoint), - no_of_evals=payload.no_of_evals, - evaluators=payload.evaluators, - is_coded=is_coded, - ) - self.eval_set_run_ids[payload.execution_id] = eval_set_run_id - current_span = trace.get_current_span() - if current_span.is_recording(): - current_span.set_attribute("eval_set_run_id", eval_set_run_id) - - # Create and send parent trace for the evaluation set run - if eval_set_run_id: - await self._send_parent_trace(eval_set_run_id, payload.eval_set_id) - - logger.debug( - f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb create eval set run error") - - async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None: - try: - # Use the stored eval set execution ID to find the eval_set_run_id - if self.eval_set_execution_id and ( - eval_set_run_id := self.eval_set_run_ids.get(self.eval_set_execution_id) - ): - # Get the is_coded flag for this execution - is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) - eval_run_id = await self.create_eval_run( - payload.eval_item, eval_set_run_id, is_coded - ) - if eval_run_id: - # Store eval_run_id with the individual eval run's execution_id - self.eval_run_ids[payload.execution_id] = eval_run_id - - logger.debug( - f"Created eval run with ID: {eval_run_id} (coded={is_coded})" - ) - else: - logger.warning("Cannot create eval run: eval_set_run_id not available") - - except Exception as e: - self._format_error_message(e, "StudioWeb create eval run error") - - async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: - try: - eval_run_id = self.eval_run_ids.get(payload.execution_id) - - # Use evalRunId as the trace_id for agent execution spans - # This makes all agent spans children of the eval run trace - if eval_run_id: - self.spans_exporter.trace_id = eval_run_id - else: - # Fallback to evalSetRunId if eval_run_id not available yet - if self.eval_set_execution_id: - self.spans_exporter.trace_id = self.eval_set_run_ids.get( - self.eval_set_execution_id - ) - - self.spans_exporter.export(payload.spans) - - for eval_result in payload.eval_results: - evaluator_id = eval_result.evaluator_id - if evaluator_id in self.evaluator_scores: - match eval_result.result.score_type: - case ScoreType.NUMERICAL: - self.evaluator_scores[evaluator_id].append( - eval_result.result.score - ) - case ScoreType.BOOLEAN: - self.evaluator_scores[evaluator_id].append( - 100 if eval_result.result.score else 0 - ) - case ScoreType.ERROR: - self.evaluator_scores[evaluator_id].append(0) - - if eval_run_id and self.eval_set_execution_id: - # Get the is_coded flag for this execution - is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) - - # Extract usage metrics from spans - self._extract_usage_from_spans(payload.spans) - - # Send evaluator traces - await self._send_evaluator_traces( - eval_run_id, payload.eval_results, payload.spans - ) - - await self.update_eval_run( - StudioWebProgressItem( - eval_run_id=eval_run_id, - eval_results=payload.eval_results, - success=payload.success, - agent_output=payload.agent_output, - agent_execution_time=payload.agent_execution_time, - ), - self.evaluators, - is_coded=is_coded, - spans=payload.spans, - ) - - logger.debug( - f"Updated eval run with ID: {eval_run_id} (coded={is_coded})" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb 
reporting error") - - async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None: - try: - if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id): - # Get the is_coded flag for this execution - is_coded = self.is_coded_eval.get(payload.execution_id, False) - await self.update_eval_set_run( - eval_set_run_id, - payload.evaluator_scores, - is_coded=is_coded, - success=payload.success, - ) - status_str = "completed" if payload.success else "failed" - logger.debug( - f"Updated eval set run with ID: {eval_set_run_id} (coded={is_coded}, status={status_str})" - ) - else: - logger.warning( - "Cannot update eval set run: eval_set_run_id not available" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb update eval set run error") - - async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: - event_bus.subscribe( - EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run - ) - event_bus.subscribe( - EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run - ) - event_bus.subscribe( - EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run - ) - event_bus.subscribe( - EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run - ) - - logger.debug("StudioWeb progress reporter subscribed to evaluation events") - - def _serialize_justification( - self, justification: BaseModel | str | None - ) -> str | None: - """Serialize justification to JSON string for API compatibility. - - Args: - justification: The justification object which could be None, a BaseModel, - a string, or any other JSON-serializable object - - Returns: - JSON string representation or None if justification is None - """ - if isinstance(justification, BaseModel): - justification = json.dumps(justification.model_dump()) - - return justification - - def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: - try: - entry_points_file_path = os.path.join( - os.getcwd(), str(UiPathConfig.entry_points_file_path) - ) - if not os.path.exists(entry_points_file_path): - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - with open(entry_points_file_path, "r") as f: - entry_points = json.load(f).get("entryPoints", []) - - ep = None - for entry_point in entry_points: - if entry_point.get("filePath") == entrypoint: - ep = entry_point - break - - if not ep: - logger.warning( - f"Entrypoint {entrypoint} not found in configuration file" - ) - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - input_schema = ep.get("input", {}) - output_schema = ep.get("output", {}) - - return StudioWebAgentSnapshot( - input_schema=input_schema, output_schema=output_schema - ) - except Exception as e: - logger.warning(f"Failed to extract agent snapshot: {e}") - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - def _collect_results( - self, - eval_results: list[EvalItemResult], - evaluators: dict[str, LegacyBaseEvaluator[Any]], - spans: list[Any], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - assertion_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - # Extract usage metrics from spans - usage_metrics = self._extract_usage_from_spans(spans) - - for eval_result in eval_results: - # Skip results for evaluators not in the provided dict - # (happens when processing mixed coded/legacy eval sets) - if eval_result.evaluator_id not in evaluators: - continue - - # Legacy API expects evaluatorId as GUID, convert string to GUID - try: - 
uuid.UUID(eval_result.evaluator_id) - evaluator_id_value = eval_result.evaluator_id - except ValueError: - # Generate deterministic UUID5 from string - evaluator_id_value = str( - uuid.uuid5(uuid.NAMESPACE_DNS, eval_result.evaluator_id) - ) - - # Convert BaseModel justification to JSON string for API compatibility - justification = self._serialize_justification(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": evaluator_id_value, - } - ) - assertion_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": evaluator_id_value, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time) - if eval_result.result.evaluation_time - else 0, - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - "assertionSnapshot": { - "assertionType": evaluators[ - eval_result.evaluator_id - ].evaluator_type.name, - "outputKey": evaluators[ - eval_result.evaluator_id - ].target_output_key, - }, - } - ) - return assertion_runs, evaluator_scores_list - - def _collect_coded_results( - self, - eval_results: list[EvalItemResult], - evaluators: dict[str, BaseEvaluator[Any, Any, Any]], - spans: list[Any], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results for coded evaluators. - - Returns evaluatorRuns and scores in the format expected by coded eval endpoints. - """ evaluator_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - # Extract usage metrics from spans - usage_metrics = self._extract_usage_from_spans(spans) - - for eval_result in eval_results: - # Skip results for evaluators not in the provided dict - # (happens when processing mixed coded/legacy eval sets) - if eval_result.evaluator_id not in evaluators: - continue - - # Convert BaseModel justification to JSON string for API compatibility - justification = self._serialize_justification(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": eval_result.evaluator_id, - } - ) - evaluator_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": eval_result.evaluator_id, - "result": { - "score": { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - }, - "justification": justification, - }, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time) - if eval_result.result.evaluation_time - else 0, - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - } - ) - return evaluator_runs, evaluator_scores_list - - def _update_eval_run_spec( - self, - assertion_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - eval_run_id: str, - actual_output: dict[str, Any], - execution_time: float, - success: bool, - is_coded: bool = False, - ) -> RequestSpec: - # For legacy evaluations, endpoint is without /coded - endpoint_suffix = "coded/" if is_coded else "" - - # Determine status based on success - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - - inner_payload: dict[str, Any] = { 
- "evalRunId": eval_run_id, - # Backend expects integer status - "status": status.value, - "result": { - "output": dict(actual_output), - "evaluatorScores": evaluator_scores, - }, - "completionMetrics": {"duration": int(execution_time)}, - "assertionRuns": assertion_runs, - } - - # Legacy backend expects payload wrapped in "request" field - # Coded backend accepts payload directly - # Both coded and legacy send payload directly at root level - payload = inner_payload + evaluator_scores: list[dict[str, Any]] = [] - return RequestSpec( - method="PUT", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalRun" - ), - json=payload, - headers=self._tenant_header(), + # Use strategies for result collection + if coded_evaluators: + runs, scores = self._coded_strategy.collect_results( + sw_progress_item.eval_results, + coded_evaluators, + usage_metrics, + self._serialize_justification, + ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + + if legacy_evaluators: + runs, scores = self._legacy_strategy.collect_results( + sw_progress_item.eval_results, + legacy_evaluators, + usage_metrics, + self._serialize_justification, + ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + + # Use strategy for spec generation + spec = self._update_eval_run_spec( + evaluator_runs=evaluator_runs, + evaluator_scores=evaluator_scores, + eval_run_id=sw_progress_item.eval_run_id, + actual_output=sw_progress_item.agent_output, + execution_time=sw_progress_item.agent_execution_time, + success=sw_progress_item.success, + is_coded=is_coded, + ) + + await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", ) - def _update_coded_eval_run_spec( + @gracefully_handle_errors + async def update_eval_set_run( self, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - eval_run_id: str, - actual_output: dict[str, Any], - execution_time: float, - success: bool, + eval_set_run_id: str, + evaluator_scores: dict[str, float], is_coded: bool = False, - ) -> RequestSpec: - """Create update spec for coded evaluators.""" - # For coded evaluations, endpoint has /coded - endpoint_suffix = "coded/" if is_coded else "" - - # Determine status based on success - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + success: bool = True, + ): + """Update the evaluation set run status to complete.""" + spec = self._update_eval_set_run_spec( + eval_set_run_id, evaluator_scores, is_coded, success + ) + await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", + ) - payload: dict[str, Any] = { - "evalRunId": eval_run_id, - # For coded evaluations, use integer status; for legacy, use string - "status": status.value, - "result": { - "output": dict(actual_output), - "scores": evaluator_scores, - }, - "completionMetrics": {"duration": int(execution_time)}, - "evaluatorRuns": evaluator_runs, - } + # ------------------------------------------------------------------------- + # Event Handlers + # ------------------------------------------------------------------------- - return RequestSpec( - method="PUT", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalRun" - ), - json=payload, - 
headers=self._tenant_header(), - ) + async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None: + try: + self.evaluators = {eval.id: eval for eval in payload.evaluators} + self.evaluator_scores = {eval.id: [] for eval in payload.evaluators} + self.eval_set_execution_id = payload.execution_id - def _create_eval_run_spec( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False - ) -> RequestSpec: - # Legacy API expects eval IDs as GUIDs, coded accepts strings - # Convert string IDs to deterministic GUIDs for legacy - if is_coded: - eval_item_id = eval_item.id - else: - # Try to parse as GUID, if it fails, generate deterministic GUID from string - try: - uuid.UUID(eval_item.id) - eval_item_id = eval_item.id - except ValueError: - # Generate deterministic UUID5 from string - eval_item_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, eval_item.id)) - - # Build eval snapshot based on evaluation item type - eval_snapshot = { - "id": eval_item_id, - "name": eval_item.name, - "inputs": eval_item.inputs, - } + is_coded = self._is_coded_evaluator(payload.evaluators) + self.is_coded_eval[payload.execution_id] = is_coded - # For coded evaluators, use evaluationCriterias directly - # For legacy evaluators, extract expectedOutput from the migrated evaluationCriterias - # (Legacy evals are migrated to EvaluationItem format with expectedOutput inside evaluationCriterias) - if is_coded: - eval_snapshot["evaluationCriterias"] = eval_item.evaluation_criterias - else: - # Legacy backend endpoint expects expectedOutput directly in evalSnapshot - # Extract it from the first evaluator criteria (all criteria have the same expectedOutput) - expected_output = {} - if eval_item.evaluation_criterias: - first_criteria = next( - iter(eval_item.evaluation_criterias.values()), None + eval_set_run_id = payload.eval_set_run_id + if not eval_set_run_id: + eval_set_run_id = await self.create_eval_set_run_sw( + eval_set_id=payload.eval_set_id, + agent_snapshot=self._extract_agent_snapshot(payload.entrypoint), + no_of_evals=payload.no_of_evals, + evaluators=payload.evaluators, + is_coded=is_coded, ) - if first_criteria and isinstance(first_criteria, dict): - expected_output = first_criteria.get("expectedOutput", {}) - eval_snapshot["expectedOutput"] = expected_output - - # For legacy evaluations, endpoint is without /coded - endpoint_suffix = "coded/" if is_coded else "" + self.eval_set_run_ids[payload.execution_id] = eval_set_run_id + current_span = trace.get_current_span() + if current_span.is_recording(): + current_span.set_attribute("eval_set_run_id", eval_set_run_id) - inner_payload: dict[str, Any] = { - "evalSetRunId": eval_set_run_id, - "evalSnapshot": eval_snapshot, - # Backend expects integer status - "status": EvaluationStatus.IN_PROGRESS.value, - } + if eval_set_run_id: + await self._send_parent_trace(eval_set_run_id, payload.eval_set_id) - # Legacy backend expects payload wrapped in "request" field - # Coded backend accepts payload directly - # Both coded and legacy send payload directly at root level - payload = inner_payload + logger.debug( + f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})" + ) - return RequestSpec( - method="POST", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalRun" - ), - json=payload, - headers=self._tenant_header(), - ) + except Exception as e: + self._format_error_message(e, "StudioWeb create eval set run error") - def _create_eval_set_run_spec( - self, - eval_set_id: 
str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - is_coded: bool = False, - ) -> RequestSpec: - # For legacy evaluations, endpoint is without /coded - endpoint_suffix = "coded/" if is_coded else "" - - # Legacy API expects evalSetId as GUID, coded accepts string - # Convert string IDs to deterministic GUIDs for legacy - if is_coded: - eval_set_id_value = eval_set_id - else: - # Try to parse as GUID, if it fails, generate deterministic GUID from string - try: - uuid.UUID(eval_set_id) - eval_set_id_value = eval_set_id - except ValueError: - # Generate deterministic UUID5 from string - eval_set_id_value = str(uuid.uuid5(uuid.NAMESPACE_DNS, eval_set_id)) - - inner_payload: dict[str, Any] = { - "agentId": self._project_id, - "evalSetId": eval_set_id_value, - "agentSnapshot": agent_snapshot.model_dump(by_alias=True), - # Backend expects integer status - "status": EvaluationStatus.IN_PROGRESS.value, - "numberOfEvalsExecuted": no_of_evals, - # Source is required by the backend (0 = coded SDK) - "source": 0, - } + async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None: + try: + if self.eval_set_execution_id and ( + eval_set_run_id := self.eval_set_run_ids.get(self.eval_set_execution_id) + ): + is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) + eval_run_id = await self.create_eval_run( + payload.eval_item, eval_set_run_id, is_coded + ) + if eval_run_id: + self.eval_run_ids[payload.execution_id] = eval_run_id + logger.debug( + f"Created eval run with ID: {eval_run_id} (coded={is_coded})" + ) + else: + logger.warning("Cannot create eval run: eval_set_run_id not available") - # Both coded and legacy send payload directly at root level - payload = inner_payload + except Exception as e: + self._format_error_message(e, "StudioWeb create eval run error") - return RequestSpec( - method="POST", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" - ), - json=payload, - headers=self._tenant_header(), - ) + async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: + try: + eval_run_id = self.eval_run_ids.get(payload.execution_id) - def _update_eval_set_run_spec( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - is_coded: bool = False, - success: bool = True, - ) -> RequestSpec: - # Legacy API expects evaluatorId as GUID, coded accepts string - evaluator_scores_list = [] - for evaluator_id, avg_score in evaluator_scores.items(): - if is_coded: - evaluator_id_value = evaluator_id + if eval_run_id: + self.spans_exporter.trace_id = eval_run_id else: - # Convert string to GUID for legacy - try: - uuid.UUID(evaluator_id) - evaluator_id_value = evaluator_id - except ValueError: - # Generate deterministic UUID5 from string - evaluator_id_value = str( - uuid.uuid5(uuid.NAMESPACE_DNS, evaluator_id) + if self.eval_set_execution_id: + self.spans_exporter.trace_id = self.eval_set_run_ids.get( + self.eval_set_execution_id ) - evaluator_scores_list.append( - {"value": avg_score, "evaluatorId": evaluator_id_value} - ) + self.spans_exporter.export(payload.spans) - # For legacy evaluations, endpoint is without /coded - endpoint_suffix = "coded/" if is_coded else "" + for eval_result in payload.eval_results: + evaluator_id = eval_result.evaluator_id + if evaluator_id in self.evaluator_scores: + match eval_result.result.score_type: + case ScoreType.NUMERICAL: + self.evaluator_scores[evaluator_id].append( + eval_result.result.score + ) + case ScoreType.BOOLEAN: + 
self.evaluator_scores[evaluator_id].append( + 100 if eval_result.result.score else 0 + ) + case ScoreType.ERROR: + self.evaluator_scores[evaluator_id].append(0) - # Determine status based on success - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + if eval_run_id and self.eval_set_execution_id: + is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) + self._extract_usage_from_spans(payload.spans) - inner_payload: dict[str, Any] = { - "evalSetRunId": eval_set_run_id, - # Backend expects integer status - "status": status.value, - "evaluatorScores": evaluator_scores_list, - } + await self._send_evaluator_traces( + eval_run_id, payload.eval_results, payload.spans + ) - # Legacy backend expects payload wrapped in "request" field - # Coded backend accepts payload directly - # Both coded and legacy send payload directly at root level - payload = inner_payload + await self.update_eval_run( + StudioWebProgressItem( + eval_run_id=eval_run_id, + eval_results=payload.eval_results, + success=payload.success, + agent_output=payload.agent_output, + agent_execution_time=payload.agent_execution_time, + ), + self.evaluators, + is_coded=is_coded, + spans=payload.spans, + ) - return RequestSpec( - method="PUT", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/{endpoint_suffix}evalSetRun" - ), - json=payload, - headers=self._tenant_header(), + logger.debug( + f"Updated eval run with ID: {eval_run_id} (coded={is_coded})" + ) + + except Exception as e: + self._format_error_message(e, "StudioWeb reporting error") + + async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None: + try: + if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id): + is_coded = self.is_coded_eval.get(payload.execution_id, False) + await self.update_eval_set_run( + eval_set_run_id, + payload.evaluator_scores, + is_coded=is_coded, + success=payload.success, + ) + status_str = "completed" if payload.success else "failed" + logger.debug( + f"Updated eval set run with ID: {eval_set_run_id} " + f"(coded={is_coded}, status={status_str})" + ) + else: + logger.warning( + "Cannot update eval set run: eval_set_run_id not available" + ) + + except Exception as e: + self._format_error_message(e, "StudioWeb update eval set run error") + + async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run + ) + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run + ) + event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run ) + event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run + ) + logger.debug("StudioWeb progress reporter subscribed to evaluation events") - def _tenant_header(self) -> dict[str, str | None]: - tenant_id = os.getenv(ENV_TENANT_ID, None) - if not tenant_id: - self._console.error( - f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'." - ) - return {HEADER_INTERNAL_TENANT_ID: tenant_id} + # ------------------------------------------------------------------------- + # Tracing Methods + # ------------------------------------------------------------------------- async def _send_parent_trace( self, eval_set_run_id: str, eval_set_name: str ) -> None: - """Send the parent trace span for the evaluation set run. 
- - Args: - eval_set_run_id: The ID of the evaluation set run - eval_set_name: The name of the evaluation set - """ + """Send the parent trace span for the evaluation set run.""" try: - # Get the tracer tracer = trace.get_tracer(__name__) - - # Convert eval_set_run_id to trace ID format (128-bit integer) trace_id_int = int(uuid.UUID(eval_set_run_id)) - # Create a span context with the eval_set_run_id as the trace ID span_context = SpanContext( trace_id=trace_id_int, - span_id=trace_id_int, # Use same ID for root span + span_id=trace_id_int, is_remote=False, - trace_flags=TraceFlags(0x01), # Sampled + trace_flags=TraceFlags(0x01), ) - # Create a non-recording span with our custom context ctx = trace.set_span_in_context(trace.NonRecordingSpan(span_context)) - # Start a new span with the custom trace ID with tracer.start_as_current_span( eval_set_name, context=ctx, kind=SpanKind.INTERNAL, start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), ) as span: - # Set attributes for the evaluation set span span.set_attribute("openinference.span.kind", "CHAIN") span.set_attribute("span.type", "evaluationSet") span.set_attribute("eval_set_run_id", eval_set_run_id) @@ -990,22 +1086,12 @@ async def _send_parent_trace( async def _send_eval_run_trace( self, eval_run_id: str, eval_set_run_id: str, eval_name: str ) -> None: - """Send the child trace span for an evaluation run. - - Args: - eval_run_id: The ID of the evaluation run - eval_set_run_id: The ID of the parent evaluation set run - eval_name: The name of the evaluation - """ + """Send the child trace span for an evaluation run.""" try: - # Get the tracer tracer = trace.get_tracer(__name__) - - # Convert IDs to trace format trace_id_int = int(uuid.UUID(eval_run_id)) parent_span_id_int = int(uuid.UUID(eval_set_run_id)) - # Create a parent span context parent_context = SpanContext( trace_id=trace_id_int, span_id=parent_span_id_int, @@ -1013,17 +1099,14 @@ async def _send_eval_run_trace( trace_flags=TraceFlags(0x01), ) - # Create context with parent span ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) - # Start a new span with the eval_run_id as trace ID with tracer.start_as_current_span( eval_name, context=ctx, kind=SpanKind.INTERNAL, start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), ) as span: - # Set attributes for the evaluation run span span.set_attribute("openinference.span.kind", "CHAIN") span.set_attribute("span.type", "evaluation") span.set_attribute("eval_run_id", eval_run_id) @@ -1039,13 +1122,7 @@ async def _send_eval_run_trace( async def _send_evaluator_traces( self, eval_run_id: str, eval_results: list[EvalItemResult], spans: list[Any] ) -> None: - """Send trace spans for all evaluators. 
- - Args: - eval_run_id: The ID of the evaluation run - eval_results: List of evaluator results - spans: List of spans that may contain evaluator LLM calls - """ + """Send trace spans for all evaluators.""" try: if not eval_results: logger.debug( @@ -1053,7 +1130,6 @@ async def _send_evaluator_traces( ) return - # First, export the agent execution spans so they appear in the trace agent_readable_spans = [] if spans: for span in spans: @@ -1063,30 +1139,22 @@ async def _send_evaluator_traces( if agent_readable_spans: self.spans_exporter.export(agent_readable_spans) logger.debug( - f"Exported {len(agent_readable_spans)} agent execution spans for eval run: {eval_run_id}" + f"Exported {len(agent_readable_spans)} agent execution spans " + f"for eval run: {eval_run_id}" ) - # Get the tracer tracer = trace.get_tracer(__name__) - - # Calculate overall start and end times for the evaluators parent span - # Since evaluators run sequentially, the parent span duration should be - # the sum of all individual evaluator times now = datetime.now(timezone.utc) - # Sum all evaluator execution times for sequential execution total_eval_time = ( sum( - ( - r.result.evaluation_time - for r in eval_results - if r.result.evaluation_time - ) + r.result.evaluation_time + for r in eval_results + if r.result.evaluation_time ) or 0.0 ) - # Parent span covers the sequential evaluation period parent_end_time = now parent_start_time = ( datetime.fromtimestamp( @@ -1096,29 +1164,21 @@ async def _send_evaluator_traces( else now ) - # Find the root execution span from the agent spans - # The root span typically has no parent root_span_uuid = None if spans: from uipath.tracing._utils import _SpanUtils for span in spans: - # Check if this span has no parent (indicating it's the root) if span.parent is None: - # Get the span context and convert to UUID span_context = span.get_span_context() root_span_uuid = _SpanUtils.span_id_to_uuid4( span_context.span_id ) break - # Convert eval_run_id to trace ID format trace_id_int = int(uuid.UUID(eval_run_id)) - # Create parent span context - child of root span if available - # The root span should be the eval span (the agent execution root) if root_span_uuid: - # Convert root span UUID to integer for SpanContext root_span_id_int = int(root_span_uuid) parent_context = SpanContext( trace_id=trace_id_int, @@ -1128,7 +1188,6 @@ async def _send_evaluator_traces( ) ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) else: - # No root span found, create as root span with eval_run_id as both trace and span parent_context = SpanContext( trace_id=trace_id_int, span_id=trace_id_int, @@ -1137,11 +1196,9 @@ async def _send_evaluator_traces( ) ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) - # Create the evaluators parent span parent_start_ns = int(parent_start_time.timestamp() * 1_000_000_000) parent_end_ns = int(parent_end_time.timestamp() * 1_000_000_000) - # Start parent span manually (not using with statement) to control end time parent_span = tracer.start_span( "Evaluators", context=ctx, @@ -1149,41 +1206,28 @@ async def _send_evaluator_traces( start_time=parent_start_ns, ) - # Set attributes for the evaluators parent span parent_span.set_attribute("openinference.span.kind", "CHAIN") parent_span.set_attribute("span.type", "evaluators") parent_span.set_attribute("eval_run_id", eval_run_id) - # Make this span the active span for child spans parent_ctx = trace.set_span_in_context(parent_span, ctx) - - # Track the current time for sequential execution 
current_time = parent_start_time - - # Collect all readable spans for export readable_spans = [] - # Create individual evaluator spans - running sequentially for eval_result in eval_results: - # Get evaluator name from stored evaluators evaluator = self.evaluators.get(eval_result.evaluator_id) evaluator_name = evaluator.id if evaluator else eval_result.evaluator_id - # Each evaluator starts where the previous one ended (sequential execution) eval_time = eval_result.result.evaluation_time or 0 eval_start = current_time eval_end = datetime.fromtimestamp( current_time.timestamp() + eval_time, tz=timezone.utc ) - - # Move current time forward for the next evaluator current_time = eval_end - # Create timestamps eval_start_ns = int(eval_start.timestamp() * 1_000_000_000) eval_end_ns = int(eval_end.timestamp() * 1_000_000_000) - # Start evaluator span manually (not using with statement) to control end time evaluator_span = tracer.start_span( evaluator_name, context=parent_ctx, @@ -1191,7 +1235,6 @@ async def _send_evaluator_traces( start_time=eval_start_ns, ) - # Set attributes for the evaluator span evaluator_span.set_attribute("openinference.span.kind", "EVALUATOR") evaluator_span.set_attribute("span.type", "evaluator") evaluator_span.set_attribute("evaluator_id", eval_result.evaluator_id) @@ -1202,7 +1245,6 @@ async def _send_evaluator_traces( "score_type", eval_result.result.score_type.name ) - # Add details/justification if available if eval_result.result.details: if isinstance(eval_result.result.details, BaseModel): evaluator_span.set_attribute( @@ -1214,13 +1256,11 @@ async def _send_evaluator_traces( "details", str(eval_result.result.details) ) - # Add evaluation time if available if eval_result.result.evaluation_time: evaluator_span.set_attribute( "evaluation_time", eval_result.result.evaluation_time ) - # Set status based on score type from opentelemetry.trace import Status, StatusCode if eval_result.result.score_type == ScoreType.ERROR: @@ -1230,28 +1270,22 @@ async def _send_evaluator_traces( else: evaluator_span.set_status(Status(StatusCode.OK)) - # End the evaluator span at the correct time evaluator_span.end(end_time=eval_end_ns) - # Convert to ReadableSpan for export - # The span object has a method to get the readable version if hasattr(evaluator_span, "_readable_span"): readable_spans.append(evaluator_span._readable_span()) - # End the parent span at the correct time after all children are created parent_span.end(end_time=parent_end_ns) - # Convert parent span to ReadableSpan if hasattr(parent_span, "_readable_span"): - # Add parent span at the beginning for proper ordering readable_spans.insert(0, parent_span._readable_span()) - # Export all evaluator spans together if readable_spans: self.spans_exporter.export(readable_spans) logger.debug( - f"Created evaluator traces for eval run: {eval_run_id} ({len(eval_results)} evaluators)" + f"Created evaluator traces for eval run: {eval_run_id} " + f"({len(eval_results)} evaluators)" ) except Exception as e: logger.warning(f"Failed to create evaluator traces: {e}") diff --git a/tests/cli/eval/test_progress_reporter.py b/tests/cli/eval/test_progress_reporter.py index 17cccf712..4db5f6713 100644 --- a/tests/cli/eval/test_progress_reporter.py +++ b/tests/cli/eval/test_progress_reporter.py @@ -261,10 +261,11 @@ def test_create_eval_set_run_spec_for_legacy(self, progress_reporter): assert spec.method == "POST" assert "coded/" not in spec.endpoint - # Both coded and legacy now send payload directly at root level + # Both legacy and coded APIs 
accept payload directly at root level (no wrapper) + assert "request" not in spec.json # Legacy should not have version field assert "version" not in spec.json - # Source field is now required by backend for all evaluations + # Source field is required for both legacy and coded assert spec.json["source"] == 0 assert spec.json["numberOfEvalsExecuted"] == 5 # Backend expects integer status @@ -281,7 +282,8 @@ def test_update_coded_eval_run_spec(self, progress_reporter): ] evaluator_scores = [{"evaluatorId": "test-1", "value": 0.9}] - spec = progress_reporter._update_coded_eval_run_spec( + # Now uses unified _update_eval_run_spec with is_coded=True + spec = progress_reporter._update_eval_run_spec( evaluator_runs=evaluator_runs, evaluator_scores=evaluator_scores, eval_run_id="test-run-id", @@ -301,13 +303,14 @@ def test_update_coded_eval_run_spec(self, progress_reporter): def test_update_legacy_eval_run_spec(self, progress_reporter): """Test updating eval run spec for legacy evaluators.""" - assertion_runs = [ + # Note: unified method uses evaluator_runs param, strategy outputs assertionRuns + evaluator_runs = [ {"evaluatorId": "test-1", "status": "completed", "assertionSnapshot": {}} ] evaluator_scores = [{"evaluatorId": "test-1", "value": 0.9}] spec = progress_reporter._update_eval_run_spec( - assertion_runs=assertion_runs, + evaluator_runs=evaluator_runs, evaluator_scores=evaluator_scores, eval_run_id="test-run-id", actual_output={"result": "success"}, @@ -318,10 +321,11 @@ def test_update_legacy_eval_run_spec(self, progress_reporter): assert spec.method == "PUT" assert "coded/" not in spec.endpoint - # Both coded and legacy now send payload directly at root level + # Both legacy and coded APIs accept payload directly at root level (no wrapper) assert "request" not in spec.json assert spec.json["evalRunId"] == "test-run-id" - assert spec.json["assertionRuns"] == assertion_runs + # Legacy strategy outputs assertionRuns in payload + assert spec.json["assertionRuns"] == evaluator_runs assert spec.json["result"]["evaluatorScores"] == evaluator_scores assert spec.json["completionMetrics"]["duration"] == 5 # Backend expects integer status @@ -332,7 +336,8 @@ def test_update_coded_eval_run_spec_with_failure(self, progress_reporter): evaluator_runs: list[dict[str, Any]] = [] evaluator_scores: list[dict[str, Any]] = [] - spec = progress_reporter._update_coded_eval_run_spec( + # Now uses unified _update_eval_run_spec with is_coded=True + spec = progress_reporter._update_eval_run_spec( evaluator_runs=evaluator_runs, evaluator_scores=evaluator_scores, eval_run_id="test-run-id", @@ -349,11 +354,11 @@ def test_update_coded_eval_run_spec_with_failure(self, progress_reporter): def test_update_legacy_eval_run_spec_with_failure(self, progress_reporter): """Test updating eval run spec for legacy evaluators with failure.""" - assertion_runs: list[dict[str, Any]] = [] + evaluator_runs: list[dict[str, Any]] = [] evaluator_scores: list[dict[str, Any]] = [] spec = progress_reporter._update_eval_run_spec( - assertion_runs=assertion_runs, + evaluator_runs=evaluator_runs, evaluator_scores=evaluator_scores, eval_run_id="test-run-id", actual_output={}, @@ -364,7 +369,7 @@ def test_update_legacy_eval_run_spec_with_failure(self, progress_reporter): assert spec.method == "PUT" assert "coded/" not in spec.endpoint - # Both coded and legacy now send payload directly at root level + # Both legacy and coded APIs accept payload directly at root level (no wrapper) assert "request" not in spec.json assert spec.json["evalRunId"] 
== "test-run-id" # Backend expects integer status @@ -527,7 +532,7 @@ def test_update_eval_set_run_spec_with_success_legacy(self, progress_reporter): assert spec.method == "PUT" assert "coded/" not in spec.endpoint - # Both coded and legacy now send payload directly at root level + # Both legacy and coded APIs accept payload directly at root level (no wrapper) assert "request" not in spec.json assert spec.json["evalSetRunId"] == "test-run-id" # Backend expects integer status @@ -546,7 +551,7 @@ def test_update_eval_set_run_spec_with_failure_legacy(self, progress_reporter): assert spec.method == "PUT" assert "coded/" not in spec.endpoint - # Both coded and legacy now send payload directly at root level + # Both legacy and coded APIs accept payload directly at root level (no wrapper) assert "request" not in spec.json assert spec.json["evalSetRunId"] == "test-run-id" # Backend expects integer status From 86e893054b6570e25246935288fb6530bf6e67a4 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Thu, 18 Dec 2025 22:16:49 -0800 Subject: [PATCH 2/5] refactor: split progress reporter into modular package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create _reporting/ package with focused modules - Split strategies, utils, and reporter into separate files - Maintain backward compatibility via re-exports - Split tests to match new structure (48 tests, up from 27) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/uipath/_cli/_evals/_progress_reporter.py | 1305 +---------------- src/uipath/_cli/_evals/_reporting/__init__.py | 21 + .../_cli/_evals/_reporting/_reporter.py | 861 +++++++++++ .../_cli/_evals/_reporting/_strategies.py | 418 ++++++ src/uipath/_cli/_evals/_reporting/_utils.py | 44 + tests/cli/eval/reporting/__init__.py | 1 + .../test_reporter.py} | 8 +- tests/cli/eval/reporting/test_strategies.py | 244 +++ tests/cli/eval/reporting/test_utils.py | 89 ++ 9 files changed, 1699 insertions(+), 1292 deletions(-) create mode 100644 src/uipath/_cli/_evals/_reporting/__init__.py create mode 100644 src/uipath/_cli/_evals/_reporting/_reporter.py create mode 100644 src/uipath/_cli/_evals/_reporting/_strategies.py create mode 100644 src/uipath/_cli/_evals/_reporting/_utils.py create mode 100644 tests/cli/eval/reporting/__init__.py rename tests/cli/eval/{test_progress_reporter.py => reporting/test_reporter.py} (98%) create mode 100644 tests/cli/eval/reporting/test_strategies.py create mode 100644 tests/cli/eval/reporting/test_utils.py diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index 8a358c4e2..72f000731 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -1,1291 +1,24 @@ -"""Progress reporter for sending evaluation updates to StudioWeb. +"""Backward compatibility - import from _reporting instead. -This module uses the Strategy Pattern to separate legacy and coded evaluation -reporting flows. Each strategy handles the specific API format differences. -""" - -import functools -import json -import logging -import os -import uuid -from datetime import datetime, timezone -from typing import Any, Callable, Protocol, runtime_checkable -from urllib.parse import urlparse +This module re-exports components from the _reporting package for +backward compatibility with existing code that imports from this location. 
-from opentelemetry import trace -from opentelemetry.trace import SpanContext, SpanKind, TraceFlags -from pydantic import BaseModel -from rich.console import Console +For new code, prefer importing directly from: + from uipath._cli._evals._reporting import StudioWebProgressReporter +""" -from uipath._cli._evals._models._evaluation_set import ( - EvaluationItem, - EvaluationStatus, -) -from uipath._cli._evals._models._evaluator import Evaluator -from uipath._cli._evals._models._sw_reporting import ( - StudioWebAgentSnapshot, - StudioWebProgressItem, -) -from uipath._cli._utils._console import ConsoleLogger -from uipath._events._event_bus import EventBus -from uipath._events._events import ( - EvalRunCreatedEvent, - EvalRunUpdatedEvent, - EvalSetRunCreatedEvent, - EvalSetRunUpdatedEvent, - EvaluationEvents, -) -from uipath._utils import Endpoint, RequestSpec -from uipath._utils.constants import ( - ENV_EVAL_BACKEND_URL, - ENV_TENANT_ID, - HEADER_INTERNAL_TENANT_ID, +from uipath._cli._evals._reporting import ( + CodedEvalReportingStrategy, + EvalReportingStrategy, + LegacyEvalReportingStrategy, + StudioWebProgressReporter, + gracefully_handle_errors, ) -from uipath.eval.evaluators import ( - BaseEvaluator, - LegacyBaseEvaluator, -) -from uipath.eval.models import EvalItemResult, ScoreType -from uipath.platform import UiPath -from uipath.platform.common import UiPathConfig -from uipath.tracing import LlmOpsHttpExporter - -logger = logging.getLogger(__name__) - - -# ============================================================================= -# Utility Functions -# ============================================================================= - - -def gracefully_handle_errors(func): - """Decorator to catch and log errors without stopping execution.""" - - @functools.wraps(func) - async def wrapper(self, *args, **kwargs): - try: - return await func(self, *args, **kwargs) - except Exception as e: - if hasattr(self, "_console"): - error_type = type(e).__name__ - logger.debug(f"Full error details: {e}") - logger.warning( - f"Cannot report progress to SW. " - f"Function: {func.__name__}, " - f"Error type: {error_type}, " - f"Details: {e}" - ) - return None - - return wrapper - - -# ============================================================================= -# Strategy Protocol -# ============================================================================= - - -@runtime_checkable -class EvalReportingStrategy(Protocol): - """Protocol for evaluation reporting strategies. - - Strategies handle the differences between legacy and coded evaluation - API formats, including ID conversion, endpoint routing, and payload structure. - """ - - @property - def endpoint_suffix(self) -> str: - """Return the endpoint suffix for this strategy. - - Returns: - "" for legacy, "coded/" for coded evaluations - """ - ... - - def convert_id(self, id_value: str) -> str: - """Convert an ID to the format expected by the backend. - - Args: - id_value: The original string ID - - Returns: - For legacy: deterministic GUID from uuid5 - For coded: original string ID unchanged - """ - ... - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create the payload for creating an eval set run.""" - ... - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create the payload for creating an eval run.""" - ... 
- - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create the payload for updating an eval run.""" - ... - - def create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create the payload for updating an eval set run.""" - ... - - def collect_results( - self, - eval_results: list[EvalItemResult], - evaluators: dict[str, Any], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results from evaluations in strategy-specific format. - - Returns: - Tuple of (evaluator_runs, evaluator_scores) - """ - ... - - -# ============================================================================= -# Legacy Evaluation Reporting Strategy -# ============================================================================= - - -class LegacyEvalReportingStrategy: - """Strategy for legacy evaluation reporting. - - Legacy evaluations: - - Convert string IDs to deterministic GUIDs using uuid5 - - Use endpoints without /coded/ prefix - - Use assertionRuns format with assertionSnapshot - - Put expectedOutput directly in evalSnapshot - """ - - @property - def endpoint_suffix(self) -> str: - """Return empty string for legacy endpoints (no /coded/ prefix).""" - return "" - - def convert_id(self, id_value: str) -> str: - """Convert string ID to deterministic GUID for legacy API. - - Args: - id_value: The original string ID - - Returns: - The ID as a GUID (either original if valid, or deterministic uuid5) - """ - try: - uuid.UUID(id_value) - return id_value - except ValueError: - return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value)) - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create payload for creating a legacy eval set run.""" - return { - "agentId": project_id, - "evalSetId": self.convert_id(eval_set_id), - "agentSnapshot": agent_snapshot.model_dump(by_alias=True), - "status": EvaluationStatus.IN_PROGRESS.value, - "numberOfEvalsExecuted": no_of_evals, - "source": 0, # EvalRunSource.Manual - } - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create payload for creating a legacy eval run.""" - eval_item_id = self.convert_id(eval_item.id) - - # Extract expectedOutput from evaluation_criterias - expected_output = {} - if eval_item.evaluation_criterias: - first_criteria = next(iter(eval_item.evaluation_criterias.values()), None) - if first_criteria and isinstance(first_criteria, dict): - expected_output = first_criteria.get("expectedOutput", {}) - - return { - "evalSetRunId": eval_set_run_id, - "evalSnapshot": { - "id": eval_item_id, - "name": eval_item.name, - "inputs": eval_item.inputs, - "expectedOutput": expected_output, - }, - "status": EvaluationStatus.IN_PROGRESS.value, - } - - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a legacy eval run.""" - status = 
EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalRunId": eval_run_id, - "status": status.value, - "result": { - "output": dict(actual_output), - "evaluatorScores": evaluator_scores, - }, - "completionMetrics": {"duration": int(execution_time)}, - "assertionRuns": evaluator_runs, - } - - def create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a legacy eval set run.""" - scores_list = [ - {"value": avg_score, "evaluatorId": self.convert_id(eval_id)} - for eval_id, avg_score in evaluator_scores.items() - ] - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalSetRunId": eval_set_run_id, - "status": status.value, - "evaluatorScores": scores_list, - } - - def collect_results( - self, - eval_results: list[EvalItemResult], - evaluators: dict[str, LegacyBaseEvaluator[Any]], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results in legacy assertionRuns format.""" - assertion_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - for eval_result in eval_results: - if eval_result.evaluator_id not in evaluators: - continue - - evaluator_id_value = self.convert_id(eval_result.evaluator_id) - evaluator = evaluators[eval_result.evaluator_id] - justification = serialize_justification_fn(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": evaluator_id_value, - } - ) - - assertion_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": evaluator_id_value, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time or 0), - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - "assertionSnapshot": { - "assertionType": evaluator.evaluator_type.name, - "outputKey": evaluator.target_output_key, - }, - } - ) - - return assertion_runs, evaluator_scores_list - - -# ============================================================================= -# Coded Evaluation Reporting Strategy -# ============================================================================= - - -class CodedEvalReportingStrategy: - """Strategy for coded evaluation reporting. 
- - Coded evaluations: - - Keep string IDs unchanged - - Use endpoints with /coded/ prefix - - Use evaluatorRuns format with nested result - - Put evaluationCriterias in evalSnapshot - """ - - @property - def endpoint_suffix(self) -> str: - """Return 'coded/' for coded endpoints.""" - return "coded/" - - def convert_id(self, id_value: str) -> str: - """Keep string ID unchanged for coded API.""" - return id_value - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create payload for creating a coded eval set run.""" - return { - "agentId": project_id, - "evalSetId": eval_set_id, - "agentSnapshot": agent_snapshot.model_dump(by_alias=True), - "status": EvaluationStatus.IN_PROGRESS.value, - "numberOfEvalsExecuted": no_of_evals, - "source": 0, # EvalRunSource.Manual - } - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create payload for creating a coded eval run.""" - return { - "evalSetRunId": eval_set_run_id, - "evalSnapshot": { - "id": eval_item.id, - "name": eval_item.name, - "inputs": eval_item.inputs, - "evaluationCriterias": eval_item.evaluation_criterias, - }, - "status": EvaluationStatus.IN_PROGRESS.value, - } - - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a coded eval run.""" - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalRunId": eval_run_id, - "status": status.value, - "result": { - "output": dict(actual_output), - "scores": evaluator_scores, # Note: "scores" not "evaluatorScores" - }, - "completionMetrics": {"duration": int(execution_time)}, - "evaluatorRuns": evaluator_runs, # Note: "evaluatorRuns" not "assertionRuns" - } - - def create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a coded eval set run.""" - scores_list = [ - {"value": avg_score, "evaluatorId": eval_id} - for eval_id, avg_score in evaluator_scores.items() - ] - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalSetRunId": eval_set_run_id, - "status": status.value, - "evaluatorScores": scores_list, - } - - def collect_results( - self, - eval_results: list[EvalItemResult], - evaluators: dict[str, BaseEvaluator[Any, Any, Any]], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results in coded evaluatorRuns format.""" - evaluator_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - for eval_result in eval_results: - if eval_result.evaluator_id not in evaluators: - continue - - justification = serialize_justification_fn(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": eval_result.evaluator_id, - } - ) - - evaluator_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": eval_result.evaluator_id, - "result": { - "score": { - "type": 
eval_result.result.score_type.value, - "value": eval_result.result.score, - }, - "justification": justification, - }, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time or 0), - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - } - ) - - return evaluator_runs, evaluator_scores_list - - -# ============================================================================= -# Main Progress Reporter Class -# ============================================================================= - - -class StudioWebProgressReporter: - """Handles reporting evaluation progress to StudioWeb. - - Uses the Strategy Pattern to delegate legacy vs coded evaluation - formatting to appropriate strategy classes. - """ - - def __init__(self, spans_exporter: LlmOpsHttpExporter): - self.spans_exporter = spans_exporter - - logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL) - console_logger = ConsoleLogger.get_instance() - - # Use UIPATH_EVAL_BACKEND_URL for eval-specific routing if set - eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL) - uipath = UiPath(base_url=eval_backend_url) if eval_backend_url else UiPath() - - self._client = uipath.api_client - self._console = console_logger - self._rich_console = Console() - self._project_id = os.getenv("UIPATH_PROJECT_ID", None) - if not self._project_id: - logger.warning( - "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID." - ) - - # Strategy instances - self._legacy_strategy = LegacyEvalReportingStrategy() - self._coded_strategy = CodedEvalReportingStrategy() - - # State tracking - self.eval_set_run_ids: dict[str, str] = {} - self.evaluators: dict[str, Any] = {} - self.evaluator_scores: dict[str, list[float]] = {} - self.eval_run_ids: dict[str, str] = {} - self.is_coded_eval: dict[str, bool] = {} - self.eval_spans: dict[str, list[Any]] = {} - self.eval_set_execution_id: str | None = None - - # ------------------------------------------------------------------------- - # Strategy Selection - # ------------------------------------------------------------------------- - - def _get_strategy(self, is_coded: bool) -> EvalReportingStrategy: - """Get the appropriate strategy for the evaluation type.""" - return self._coded_strategy if is_coded else self._legacy_strategy - - # ------------------------------------------------------------------------- - # Utility Methods - # ------------------------------------------------------------------------- - - def _format_error_message(self, error: Exception, context: str) -> None: - """Helper method to format and display error messages consistently.""" - self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]") - - def _is_localhost(self) -> bool: - """Check if the eval backend URL is localhost.""" - eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "") - if eval_backend_url: - try: - parsed = urlparse(eval_backend_url) - hostname = parsed.hostname or parsed.netloc.split(":")[0] - return hostname.lower() in ("localhost", "127.0.0.1") - except Exception: - pass - return False - - def _get_endpoint_prefix(self) -> str: - """Determine the endpoint prefix based on environment.""" - if self._is_localhost(): - return "api/" - return "agentsruntime_/api/" - - def _is_coded_evaluator( - self, evaluators: list[BaseEvaluator[Any, Any, Any]] - ) -> bool: - """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).""" - 
if not evaluators: - return False - return not isinstance(evaluators[0], LegacyBaseEvaluator) - - def _serialize_justification( - self, justification: BaseModel | str | None - ) -> str | None: - """Serialize justification to JSON string for API compatibility.""" - if isinstance(justification, BaseModel): - justification = json.dumps(justification.model_dump()) - return justification - - def _tenant_header(self) -> dict[str, str | None]: - """Build tenant header for API requests.""" - tenant_id = os.getenv(ENV_TENANT_ID, None) - if not tenant_id: - self._console.error( - f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'." - ) - return {HEADER_INTERNAL_TENANT_ID: tenant_id} - - def _extract_usage_from_spans( - self, spans: list[Any] - ) -> dict[str, int | float | None]: - """Extract token usage and cost from OpenTelemetry spans.""" - total_tokens = 0 - completion_tokens = 0 - prompt_tokens = 0 - total_cost = 0.0 - - for span in spans: - try: - attrs = None - if hasattr(span, "attributes") and span.attributes: - if isinstance(span.attributes, dict): - attrs = span.attributes - elif isinstance(span.attributes, str): - attrs = json.loads(span.attributes) - - if not attrs and hasattr(span, "Attributes") and span.Attributes: - if isinstance(span.Attributes, str): - attrs = json.loads(span.Attributes) - elif isinstance(span.Attributes, dict): - attrs = span.Attributes - - if attrs: - if "usage" in attrs and isinstance(attrs["usage"], dict): - usage = attrs["usage"] - prompt_tokens += usage.get("promptTokens", 0) - completion_tokens += usage.get("completionTokens", 0) - total_tokens += usage.get("totalTokens", 0) - total_cost += usage.get("cost", 0.0) - - prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0) - completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0) - total_tokens += attrs.get("gen_ai.usage.total_tokens", 0) - total_cost += attrs.get("gen_ai.usage.cost", 0.0) - total_cost += attrs.get("llm.usage.cost", 0.0) - - except (json.JSONDecodeError, AttributeError, TypeError) as e: - logger.debug(f"Failed to parse span attributes: {e}") - continue - - return { - "tokens": total_tokens if total_tokens > 0 else None, - "completionTokens": completion_tokens if completion_tokens > 0 else None, - "promptTokens": prompt_tokens if prompt_tokens > 0 else None, - "cost": total_cost if total_cost > 0 else None, - } - - def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: - """Extract agent snapshot from entry points configuration.""" - try: - entry_points_file_path = os.path.join( - os.getcwd(), str(UiPathConfig.entry_points_file_path) - ) - if not os.path.exists(entry_points_file_path): - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - with open(entry_points_file_path, "r") as f: - entry_points = json.load(f).get("entryPoints", []) - - ep = None - for entry_point in entry_points: - if entry_point.get("filePath") == entrypoint: - ep = entry_point - break - - if not ep: - logger.warning( - f"Entrypoint {entrypoint} not found in configuration file" - ) - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - input_schema = ep.get("input", {}) - output_schema = ep.get("output", {}) - - return StudioWebAgentSnapshot( - input_schema=input_schema, output_schema=output_schema - ) - except Exception as e: - logger.warning(f"Failed to extract agent snapshot: {e}") - return StudioWebAgentSnapshot(input_schema={}, output_schema={}) - - # ------------------------------------------------------------------------- - # Request 
Spec Generation (delegating to strategies) - # ------------------------------------------------------------------------- - - def _create_eval_set_run_spec( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - is_coded: bool = False, - ) -> RequestSpec: - """Create request spec for creating an eval set run.""" - assert self._project_id is not None, "project_id is required for SW reporting" - strategy = self._get_strategy(is_coded) - payload = strategy.create_eval_set_run_payload( - eval_set_id, agent_snapshot, no_of_evals, self._project_id - ) - return RequestSpec( - method="POST", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" - f"{strategy.endpoint_suffix}evalSetRun" - ), - json=payload, - headers=self._tenant_header(), - ) - - def _create_eval_run_spec( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False - ) -> RequestSpec: - """Create request spec for creating an eval run.""" - strategy = self._get_strategy(is_coded) - payload = strategy.create_eval_run_payload(eval_item, eval_set_run_id) - return RequestSpec( - method="POST", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" - f"{strategy.endpoint_suffix}evalRun" - ), - json=payload, - headers=self._tenant_header(), - ) - - def _update_eval_run_spec( - self, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - eval_run_id: str, - actual_output: dict[str, Any], - execution_time: float, - success: bool, - is_coded: bool = False, - ) -> RequestSpec: - """Create request spec for updating an eval run.""" - strategy = self._get_strategy(is_coded) - payload = strategy.create_update_eval_run_payload( - eval_run_id, - evaluator_runs, - evaluator_scores, - actual_output, - execution_time, - success, - ) - return RequestSpec( - method="PUT", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" - f"{strategy.endpoint_suffix}evalRun" - ), - json=payload, - headers=self._tenant_header(), - ) - - def _update_eval_set_run_spec( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - is_coded: bool = False, - success: bool = True, - ) -> RequestSpec: - """Create request spec for updating an eval set run.""" - strategy = self._get_strategy(is_coded) - payload = strategy.create_update_eval_set_run_payload( - eval_set_run_id, evaluator_scores, success - ) - return RequestSpec( - method="PUT", - endpoint=Endpoint( - f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" - f"{strategy.endpoint_suffix}evalSetRun" - ), - json=payload, - headers=self._tenant_header(), - ) - - # ------------------------------------------------------------------------- - # API Methods - # ------------------------------------------------------------------------- - - @gracefully_handle_errors - async def create_eval_set_run_sw( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - evaluators: list[LegacyBaseEvaluator[Any]], - is_coded: bool = False, - ) -> str: - """Create a new evaluation set run in StudioWeb.""" - spec = self._create_eval_set_run_spec( - eval_set_id, agent_snapshot, no_of_evals, is_coded - ) - response = await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) - eval_set_run_id = json.loads(response.content)["id"] - return 
eval_set_run_id - - @gracefully_handle_errors - async def create_eval_run( - self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False - ) -> str: - """Create a new evaluation run in StudioWeb.""" - spec = self._create_eval_run_spec(eval_item, eval_set_run_id, is_coded) - response = await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) - return json.loads(response.content)["id"] - - @gracefully_handle_errors - async def update_eval_run( - self, - sw_progress_item: StudioWebProgressItem, - evaluators: dict[str, Evaluator], - is_coded: bool = False, - spans: list[Any] | None = None, - ): - """Update an evaluation run with results.""" - # Separate evaluators by type - coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} - legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {} - - for k, v in evaluators.items(): - if isinstance(v, LegacyBaseEvaluator): - legacy_evaluators[k] = v - elif isinstance(v, BaseEvaluator): - coded_evaluators[k] = v - - usage_metrics = self._extract_usage_from_spans(spans or []) - - evaluator_runs: list[dict[str, Any]] = [] - evaluator_scores: list[dict[str, Any]] = [] - - # Use strategies for result collection - if coded_evaluators: - runs, scores = self._coded_strategy.collect_results( - sw_progress_item.eval_results, - coded_evaluators, - usage_metrics, - self._serialize_justification, - ) - evaluator_runs.extend(runs) - evaluator_scores.extend(scores) - - if legacy_evaluators: - runs, scores = self._legacy_strategy.collect_results( - sw_progress_item.eval_results, - legacy_evaluators, - usage_metrics, - self._serialize_justification, - ) - evaluator_runs.extend(runs) - evaluator_scores.extend(scores) - - # Use strategy for spec generation - spec = self._update_eval_run_spec( - evaluator_runs=evaluator_runs, - evaluator_scores=evaluator_scores, - eval_run_id=sw_progress_item.eval_run_id, - actual_output=sw_progress_item.agent_output, - execution_time=sw_progress_item.agent_execution_time, - success=sw_progress_item.success, - is_coded=is_coded, - ) - - await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) - - @gracefully_handle_errors - async def update_eval_set_run( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - is_coded: bool = False, - success: bool = True, - ): - """Update the evaluation set run status to complete.""" - spec = self._update_eval_set_run_spec( - eval_set_run_id, evaluator_scores, is_coded, success - ) - await self._client.request_async( - method=spec.method, - url=spec.endpoint, - params=spec.params, - json=spec.json, - headers=spec.headers, - scoped="org" if self._is_localhost() else "tenant", - ) - - # ------------------------------------------------------------------------- - # Event Handlers - # ------------------------------------------------------------------------- - - async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None: - try: - self.evaluators = {eval.id: eval for eval in payload.evaluators} - self.evaluator_scores = {eval.id: [] for eval in payload.evaluators} - self.eval_set_execution_id = payload.execution_id - - is_coded = self._is_coded_evaluator(payload.evaluators) - self.is_coded_eval[payload.execution_id] = is_coded - - eval_set_run_id = 
payload.eval_set_run_id - if not eval_set_run_id: - eval_set_run_id = await self.create_eval_set_run_sw( - eval_set_id=payload.eval_set_id, - agent_snapshot=self._extract_agent_snapshot(payload.entrypoint), - no_of_evals=payload.no_of_evals, - evaluators=payload.evaluators, - is_coded=is_coded, - ) - self.eval_set_run_ids[payload.execution_id] = eval_set_run_id - current_span = trace.get_current_span() - if current_span.is_recording(): - current_span.set_attribute("eval_set_run_id", eval_set_run_id) - - if eval_set_run_id: - await self._send_parent_trace(eval_set_run_id, payload.eval_set_id) - - logger.debug( - f"Created eval set run with ID: {eval_set_run_id} (coded={is_coded})" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb create eval set run error") - - async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None: - try: - if self.eval_set_execution_id and ( - eval_set_run_id := self.eval_set_run_ids.get(self.eval_set_execution_id) - ): - is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) - eval_run_id = await self.create_eval_run( - payload.eval_item, eval_set_run_id, is_coded - ) - if eval_run_id: - self.eval_run_ids[payload.execution_id] = eval_run_id - logger.debug( - f"Created eval run with ID: {eval_run_id} (coded={is_coded})" - ) - else: - logger.warning("Cannot create eval run: eval_set_run_id not available") - - except Exception as e: - self._format_error_message(e, "StudioWeb create eval run error") - - async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: - try: - eval_run_id = self.eval_run_ids.get(payload.execution_id) - - if eval_run_id: - self.spans_exporter.trace_id = eval_run_id - else: - if self.eval_set_execution_id: - self.spans_exporter.trace_id = self.eval_set_run_ids.get( - self.eval_set_execution_id - ) - - self.spans_exporter.export(payload.spans) - - for eval_result in payload.eval_results: - evaluator_id = eval_result.evaluator_id - if evaluator_id in self.evaluator_scores: - match eval_result.result.score_type: - case ScoreType.NUMERICAL: - self.evaluator_scores[evaluator_id].append( - eval_result.result.score - ) - case ScoreType.BOOLEAN: - self.evaluator_scores[evaluator_id].append( - 100 if eval_result.result.score else 0 - ) - case ScoreType.ERROR: - self.evaluator_scores[evaluator_id].append(0) - - if eval_run_id and self.eval_set_execution_id: - is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) - self._extract_usage_from_spans(payload.spans) - - await self._send_evaluator_traces( - eval_run_id, payload.eval_results, payload.spans - ) - - await self.update_eval_run( - StudioWebProgressItem( - eval_run_id=eval_run_id, - eval_results=payload.eval_results, - success=payload.success, - agent_output=payload.agent_output, - agent_execution_time=payload.agent_execution_time, - ), - self.evaluators, - is_coded=is_coded, - spans=payload.spans, - ) - - logger.debug( - f"Updated eval run with ID: {eval_run_id} (coded={is_coded})" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb reporting error") - - async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None: - try: - if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id): - is_coded = self.is_coded_eval.get(payload.execution_id, False) - await self.update_eval_set_run( - eval_set_run_id, - payload.evaluator_scores, - is_coded=is_coded, - success=payload.success, - ) - status_str = "completed" if payload.success else "failed" - logger.debug( - 
f"Updated eval set run with ID: {eval_set_run_id} " - f"(coded={is_coded}, status={status_str})" - ) - else: - logger.warning( - "Cannot update eval set run: eval_set_run_id not available" - ) - - except Exception as e: - self._format_error_message(e, "StudioWeb update eval set run error") - - async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: - event_bus.subscribe( - EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run - ) - event_bus.subscribe( - EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run - ) - event_bus.subscribe( - EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run - ) - event_bus.subscribe( - EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run - ) - logger.debug("StudioWeb progress reporter subscribed to evaluation events") - - # ------------------------------------------------------------------------- - # Tracing Methods - # ------------------------------------------------------------------------- - - async def _send_parent_trace( - self, eval_set_run_id: str, eval_set_name: str - ) -> None: - """Send the parent trace span for the evaluation set run.""" - try: - tracer = trace.get_tracer(__name__) - trace_id_int = int(uuid.UUID(eval_set_run_id)) - - span_context = SpanContext( - trace_id=trace_id_int, - span_id=trace_id_int, - is_remote=False, - trace_flags=TraceFlags(0x01), - ) - - ctx = trace.set_span_in_context(trace.NonRecordingSpan(span_context)) - - with tracer.start_as_current_span( - eval_set_name, - context=ctx, - kind=SpanKind.INTERNAL, - start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), - ) as span: - span.set_attribute("openinference.span.kind", "CHAIN") - span.set_attribute("span.type", "evaluationSet") - span.set_attribute("eval_set_run_id", eval_set_run_id) - - logger.debug(f"Created parent trace for eval set run: {eval_set_run_id}") - - except Exception as e: - logger.warning(f"Failed to create parent trace: {e}") - - async def _send_eval_run_trace( - self, eval_run_id: str, eval_set_run_id: str, eval_name: str - ) -> None: - """Send the child trace span for an evaluation run.""" - try: - tracer = trace.get_tracer(__name__) - trace_id_int = int(uuid.UUID(eval_run_id)) - parent_span_id_int = int(uuid.UUID(eval_set_run_id)) - - parent_context = SpanContext( - trace_id=trace_id_int, - span_id=parent_span_id_int, - is_remote=False, - trace_flags=TraceFlags(0x01), - ) - - ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) - - with tracer.start_as_current_span( - eval_name, - context=ctx, - kind=SpanKind.INTERNAL, - start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), - ) as span: - span.set_attribute("openinference.span.kind", "CHAIN") - span.set_attribute("span.type", "evaluation") - span.set_attribute("eval_run_id", eval_run_id) - span.set_attribute("eval_set_run_id", eval_set_run_id) - - logger.debug( - f"Created trace for eval run: {eval_run_id} (parent: {eval_set_run_id})" - ) - - except Exception as e: - logger.warning(f"Failed to create eval run trace: {e}") - - async def _send_evaluator_traces( - self, eval_run_id: str, eval_results: list[EvalItemResult], spans: list[Any] - ) -> None: - """Send trace spans for all evaluators.""" - try: - if not eval_results: - logger.debug( - f"No evaluator results to trace for eval run: {eval_run_id}" - ) - return - - agent_readable_spans = [] - if spans: - for span in spans: - if hasattr(span, "_readable_span"): - agent_readable_spans.append(span._readable_span()) - - if 
agent_readable_spans: - self.spans_exporter.export(agent_readable_spans) - logger.debug( - f"Exported {len(agent_readable_spans)} agent execution spans " - f"for eval run: {eval_run_id}" - ) - - tracer = trace.get_tracer(__name__) - now = datetime.now(timezone.utc) - - total_eval_time = ( - sum( - r.result.evaluation_time - for r in eval_results - if r.result.evaluation_time - ) - or 0.0 - ) - - parent_end_time = now - parent_start_time = ( - datetime.fromtimestamp( - now.timestamp() - total_eval_time, tz=timezone.utc - ) - if total_eval_time > 0 - else now - ) - - root_span_uuid = None - if spans: - from uipath.tracing._utils import _SpanUtils - - for span in spans: - if span.parent is None: - span_context = span.get_span_context() - root_span_uuid = _SpanUtils.span_id_to_uuid4( - span_context.span_id - ) - break - - trace_id_int = int(uuid.UUID(eval_run_id)) - - if root_span_uuid: - root_span_id_int = int(root_span_uuid) - parent_context = SpanContext( - trace_id=trace_id_int, - span_id=root_span_id_int, - is_remote=False, - trace_flags=TraceFlags(0x01), - ) - ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) - else: - parent_context = SpanContext( - trace_id=trace_id_int, - span_id=trace_id_int, - is_remote=False, - trace_flags=TraceFlags(0x01), - ) - ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) - - parent_start_ns = int(parent_start_time.timestamp() * 1_000_000_000) - parent_end_ns = int(parent_end_time.timestamp() * 1_000_000_000) - - parent_span = tracer.start_span( - "Evaluators", - context=ctx, - kind=SpanKind.INTERNAL, - start_time=parent_start_ns, - ) - - parent_span.set_attribute("openinference.span.kind", "CHAIN") - parent_span.set_attribute("span.type", "evaluators") - parent_span.set_attribute("eval_run_id", eval_run_id) - - parent_ctx = trace.set_span_in_context(parent_span, ctx) - current_time = parent_start_time - readable_spans = [] - - for eval_result in eval_results: - evaluator = self.evaluators.get(eval_result.evaluator_id) - evaluator_name = evaluator.id if evaluator else eval_result.evaluator_id - - eval_time = eval_result.result.evaluation_time or 0 - eval_start = current_time - eval_end = datetime.fromtimestamp( - current_time.timestamp() + eval_time, tz=timezone.utc - ) - current_time = eval_end - - eval_start_ns = int(eval_start.timestamp() * 1_000_000_000) - eval_end_ns = int(eval_end.timestamp() * 1_000_000_000) - - evaluator_span = tracer.start_span( - evaluator_name, - context=parent_ctx, - kind=SpanKind.INTERNAL, - start_time=eval_start_ns, - ) - - evaluator_span.set_attribute("openinference.span.kind", "EVALUATOR") - evaluator_span.set_attribute("span.type", "evaluator") - evaluator_span.set_attribute("evaluator_id", eval_result.evaluator_id) - evaluator_span.set_attribute("evaluator_name", evaluator_name) - evaluator_span.set_attribute("eval_run_id", eval_run_id) - evaluator_span.set_attribute("score", eval_result.result.score) - evaluator_span.set_attribute( - "score_type", eval_result.result.score_type.name - ) - - if eval_result.result.details: - if isinstance(eval_result.result.details, BaseModel): - evaluator_span.set_attribute( - "details", - json.dumps(eval_result.result.details.model_dump()), - ) - else: - evaluator_span.set_attribute( - "details", str(eval_result.result.details) - ) - - if eval_result.result.evaluation_time: - evaluator_span.set_attribute( - "evaluation_time", eval_result.result.evaluation_time - ) - - from opentelemetry.trace import Status, StatusCode - - if 
eval_result.result.score_type == ScoreType.ERROR: - evaluator_span.set_status( - Status(StatusCode.ERROR, "Evaluation failed") - ) - else: - evaluator_span.set_status(Status(StatusCode.OK)) - - evaluator_span.end(end_time=eval_end_ns) - - if hasattr(evaluator_span, "_readable_span"): - readable_spans.append(evaluator_span._readable_span()) - - parent_span.end(end_time=parent_end_ns) - - if hasattr(parent_span, "_readable_span"): - readable_spans.insert(0, parent_span._readable_span()) - - if readable_spans: - self.spans_exporter.export(readable_spans) - logger.debug( - f"Created evaluator traces for eval run: {eval_run_id} " - f"({len(eval_results)} evaluators)" - ) - except Exception as e: - logger.warning(f"Failed to create evaluator traces: {e}") +__all__ = [ + "StudioWebProgressReporter", + "EvalReportingStrategy", + "LegacyEvalReportingStrategy", + "CodedEvalReportingStrategy", + "gracefully_handle_errors", +] diff --git a/src/uipath/_cli/_evals/_reporting/__init__.py b/src/uipath/_cli/_evals/_reporting/__init__.py new file mode 100644 index 000000000..30b5d48d1 --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/__init__.py @@ -0,0 +1,21 @@ +"""Evaluation progress reporting module. + +This module provides components for reporting evaluation progress to StudioWeb, +supporting both legacy and coded evaluation formats through the Strategy Pattern. +""" + +from uipath._cli._evals._reporting._reporter import StudioWebProgressReporter +from uipath._cli._evals._reporting._strategies import ( + CodedEvalReportingStrategy, + EvalReportingStrategy, + LegacyEvalReportingStrategy, +) +from uipath._cli._evals._reporting._utils import gracefully_handle_errors + +__all__ = [ + "StudioWebProgressReporter", + "EvalReportingStrategy", + "LegacyEvalReportingStrategy", + "CodedEvalReportingStrategy", + "gracefully_handle_errors", +] diff --git a/src/uipath/_cli/_evals/_reporting/_reporter.py b/src/uipath/_cli/_evals/_reporting/_reporter.py new file mode 100644 index 000000000..d6242fa0f --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/_reporter.py @@ -0,0 +1,861 @@ +"""StudioWeb Progress Reporter for evaluation runs. + +This module provides the main reporter class for sending evaluation +progress updates to StudioWeb, including creating and updating +eval set runs and individual eval runs. 
+""" + +import json +import logging +import os +import uuid +from datetime import datetime, timezone +from typing import Any +from urllib.parse import urlparse + +from opentelemetry import trace +from opentelemetry.trace import SpanContext, SpanKind, TraceFlags +from pydantic import BaseModel +from rich.console import Console + +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, +) +from uipath._cli._evals._models._evaluator import Evaluator +from uipath._cli._evals._models._sw_reporting import ( + StudioWebAgentSnapshot, + StudioWebProgressItem, +) +from uipath._cli._evals._reporting._strategies import ( + CodedEvalReportingStrategy, + EvalReportingStrategy, + LegacyEvalReportingStrategy, +) +from uipath._cli._evals._reporting._utils import gracefully_handle_errors +from uipath._cli._utils._console import ConsoleLogger +from uipath._events._event_bus import EventBus +from uipath._events._events import ( + EvalRunCreatedEvent, + EvalRunUpdatedEvent, + EvalSetRunCreatedEvent, + EvalSetRunUpdatedEvent, + EvaluationEvents, +) +from uipath._utils import Endpoint, RequestSpec +from uipath._utils.constants import ( + ENV_EVAL_BACKEND_URL, + ENV_TENANT_ID, + HEADER_INTERNAL_TENANT_ID, +) +from uipath.eval.evaluators import ( + BaseEvaluator, + LegacyBaseEvaluator, +) +from uipath.eval.models import EvalItemResult, ScoreType +from uipath.platform import UiPath +from uipath.platform.common import UiPathConfig +from uipath.tracing import LlmOpsHttpExporter + +logger = logging.getLogger(__name__) + + +class StudioWebProgressReporter: + """Handles reporting evaluation progress to StudioWeb. + + Uses the Strategy Pattern to delegate legacy vs coded evaluation + formatting to appropriate strategy classes. + """ + + def __init__(self, spans_exporter: LlmOpsHttpExporter): + self.spans_exporter = spans_exporter + + logging.getLogger("uipath._cli.middlewares").setLevel(logging.CRITICAL) + console_logger = ConsoleLogger.get_instance() + + # Use UIPATH_EVAL_BACKEND_URL for eval-specific routing if set + eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL) + uipath = UiPath(base_url=eval_backend_url) if eval_backend_url else UiPath() + + self._client = uipath.api_client + self._console = console_logger + self._rich_console = Console() + self._project_id = os.getenv("UIPATH_PROJECT_ID", None) + if not self._project_id: + logger.warning( + "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID." 
+ ) + + # Strategy instances + self._legacy_strategy = LegacyEvalReportingStrategy() + self._coded_strategy = CodedEvalReportingStrategy() + + # State tracking + self.eval_set_run_ids: dict[str, str] = {} + self.evaluators: dict[str, Any] = {} + self.evaluator_scores: dict[str, list[float]] = {} + self.eval_run_ids: dict[str, str] = {} + self.is_coded_eval: dict[str, bool] = {} + self.eval_spans: dict[str, list[Any]] = {} + self.eval_set_execution_id: str | None = None + + # ------------------------------------------------------------------------- + # Strategy Selection + # ------------------------------------------------------------------------- + + def _get_strategy(self, is_coded: bool) -> EvalReportingStrategy: + """Get the appropriate strategy for the evaluation type.""" + return self._coded_strategy if is_coded else self._legacy_strategy + + # ------------------------------------------------------------------------- + # Utility Methods + # ------------------------------------------------------------------------- + + def _format_error_message(self, error: Exception, context: str) -> None: + """Helper method to format and display error messages consistently.""" + self._rich_console.print(f" • \u26a0 [dim]{context}: {error}[/dim]") + + def _is_localhost(self) -> bool: + """Check if the eval backend URL is localhost.""" + eval_backend_url = os.getenv(ENV_EVAL_BACKEND_URL, "") + if eval_backend_url: + try: + parsed = urlparse(eval_backend_url) + hostname = parsed.hostname or parsed.netloc.split(":")[0] + return hostname.lower() in ("localhost", "127.0.0.1") + except Exception: + pass + return False + + def _get_endpoint_prefix(self) -> str: + """Determine the endpoint prefix based on environment.""" + if self._is_localhost(): + return "api/" + return "agentsruntime_/api/" + + def _is_coded_evaluator( + self, evaluators: list[BaseEvaluator[Any, Any, Any]] + ) -> bool: + """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator).""" + if not evaluators: + return False + return not isinstance(evaluators[0], LegacyBaseEvaluator) + + def _serialize_justification( + self, justification: BaseModel | str | None + ) -> str | None: + """Serialize justification to JSON string for API compatibility.""" + if isinstance(justification, BaseModel): + justification = json.dumps(justification.model_dump()) + return justification + + def _tenant_header(self) -> dict[str, str | None]: + """Build tenant header for API requests.""" + tenant_id = os.getenv(ENV_TENANT_ID, None) + if not tenant_id: + self._console.error( + f"{ENV_TENANT_ID} env var is not set. Please run 'uipath auth'." 
+ ) + return {HEADER_INTERNAL_TENANT_ID: tenant_id} + + def _extract_usage_from_spans( + self, spans: list[Any] + ) -> dict[str, int | float | None]: + """Extract token usage and cost from OpenTelemetry spans.""" + total_tokens = 0 + completion_tokens = 0 + prompt_tokens = 0 + total_cost = 0.0 + + for span in spans: + try: + attrs = None + if hasattr(span, "attributes") and span.attributes: + if isinstance(span.attributes, dict): + attrs = span.attributes + elif isinstance(span.attributes, str): + attrs = json.loads(span.attributes) + + if not attrs and hasattr(span, "Attributes") and span.Attributes: + if isinstance(span.Attributes, str): + attrs = json.loads(span.Attributes) + elif isinstance(span.Attributes, dict): + attrs = span.Attributes + + if attrs: + if "usage" in attrs and isinstance(attrs["usage"], dict): + usage = attrs["usage"] + prompt_tokens += usage.get("promptTokens", 0) + completion_tokens += usage.get("completionTokens", 0) + total_tokens += usage.get("totalTokens", 0) + total_cost += usage.get("cost", 0.0) + + prompt_tokens += attrs.get("gen_ai.usage.prompt_tokens", 0) + completion_tokens += attrs.get("gen_ai.usage.completion_tokens", 0) + total_tokens += attrs.get("gen_ai.usage.total_tokens", 0) + total_cost += attrs.get("gen_ai.usage.cost", 0.0) + total_cost += attrs.get("llm.usage.cost", 0.0) + + except (json.JSONDecodeError, AttributeError, TypeError) as e: + logger.debug(f"Failed to parse span attributes: {e}") + continue + + return { + "tokens": total_tokens if total_tokens > 0 else None, + "completionTokens": completion_tokens if completion_tokens > 0 else None, + "promptTokens": prompt_tokens if prompt_tokens > 0 else None, + "cost": total_cost if total_cost > 0 else None, + } + + def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: + """Extract agent snapshot from entry points configuration.""" + try: + entry_points_file_path = os.path.join( + os.getcwd(), str(UiPathConfig.entry_points_file_path) + ) + if not os.path.exists(entry_points_file_path): + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + with open(entry_points_file_path, "r") as f: + entry_points = json.load(f).get("entryPoints", []) + + ep = None + for entry_point in entry_points: + if entry_point.get("filePath") == entrypoint: + ep = entry_point + break + + if not ep: + logger.warning( + f"Entrypoint {entrypoint} not found in configuration file" + ) + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + input_schema = ep.get("input", {}) + output_schema = ep.get("output", {}) + + return StudioWebAgentSnapshot( + input_schema=input_schema, output_schema=output_schema + ) + except Exception as e: + logger.warning(f"Failed to extract agent snapshot: {e}") + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + + # ------------------------------------------------------------------------- + # Request Spec Generation (delegating to strategies) + # ------------------------------------------------------------------------- + + def _create_eval_set_run_spec( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + is_coded: bool = False, + ) -> RequestSpec: + """Create request spec for creating an eval set run.""" + assert self._project_id is not None, "project_id is required for SW reporting" + strategy = self._get_strategy(is_coded) + payload = strategy.create_eval_set_run_payload( + eval_set_id, agent_snapshot, no_of_evals, self._project_id + ) + return RequestSpec( + method="POST", + 
endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalSetRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _create_eval_run_spec( + self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + ) -> RequestSpec: + """Create request spec for creating an eval run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_eval_run_payload(eval_item, eval_set_run_id) + return RequestSpec( + method="POST", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _update_eval_run_spec( + self, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + eval_run_id: str, + actual_output: dict[str, Any], + execution_time: float, + success: bool, + is_coded: bool = False, + ) -> RequestSpec: + """Create request spec for updating an eval run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_update_eval_run_payload( + eval_run_id, + evaluator_runs, + evaluator_scores, + actual_output, + execution_time, + success, + ) + return RequestSpec( + method="PUT", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + def _update_eval_set_run_spec( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + is_coded: bool = False, + success: bool = True, + ) -> RequestSpec: + """Create request spec for updating an eval set run.""" + strategy = self._get_strategy(is_coded) + payload = strategy.create_update_eval_set_run_payload( + eval_set_run_id, evaluator_scores, success + ) + return RequestSpec( + method="PUT", + endpoint=Endpoint( + f"{self._get_endpoint_prefix()}execution/agents/{self._project_id}/" + f"{strategy.endpoint_suffix}evalSetRun" + ), + json=payload, + headers=self._tenant_header(), + ) + + # ------------------------------------------------------------------------- + # API Methods + # ------------------------------------------------------------------------- + + @gracefully_handle_errors + async def create_eval_set_run_sw( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + evaluators: list[LegacyBaseEvaluator[Any]], + is_coded: bool = False, + ) -> str: + """Create a new evaluation set run in StudioWeb.""" + spec = self._create_eval_set_run_spec( + eval_set_id, agent_snapshot, no_of_evals, is_coded + ) + response = await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", + ) + eval_set_run_id = json.loads(response.content)["id"] + return eval_set_run_id + + @gracefully_handle_errors + async def create_eval_run( + self, eval_item: EvaluationItem, eval_set_run_id: str, is_coded: bool = False + ) -> str: + """Create a new evaluation run in StudioWeb.""" + spec = self._create_eval_run_spec(eval_item, eval_set_run_id, is_coded) + response = await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", + ) + return json.loads(response.content)["id"] + + @gracefully_handle_errors + async def update_eval_run( + self, + sw_progress_item: 
StudioWebProgressItem, + evaluators: dict[str, Evaluator], + is_coded: bool = False, + spans: list[Any] | None = None, + ): + """Update an evaluation run with results.""" + # Separate evaluators by type + coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} + legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {} + + for k, v in evaluators.items(): + if isinstance(v, LegacyBaseEvaluator): + legacy_evaluators[k] = v + elif isinstance(v, BaseEvaluator): + coded_evaluators[k] = v + + usage_metrics = self._extract_usage_from_spans(spans or []) + + evaluator_runs: list[dict[str, Any]] = [] + evaluator_scores: list[dict[str, Any]] = [] + + # Use strategies for result collection + if coded_evaluators: + runs, scores = self._coded_strategy.collect_results( + sw_progress_item.eval_results, + coded_evaluators, + usage_metrics, + self._serialize_justification, + ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + + if legacy_evaluators: + runs, scores = self._legacy_strategy.collect_results( + sw_progress_item.eval_results, + legacy_evaluators, + usage_metrics, + self._serialize_justification, + ) + evaluator_runs.extend(runs) + evaluator_scores.extend(scores) + + # Use strategy for spec generation + spec = self._update_eval_run_spec( + evaluator_runs=evaluator_runs, + evaluator_scores=evaluator_scores, + eval_run_id=sw_progress_item.eval_run_id, + actual_output=sw_progress_item.agent_output, + execution_time=sw_progress_item.agent_execution_time, + success=sw_progress_item.success, + is_coded=is_coded, + ) + + await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", + ) + + @gracefully_handle_errors + async def update_eval_set_run( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + is_coded: bool = False, + success: bool = True, + ): + """Update the evaluation set run status to complete.""" + spec = self._update_eval_set_run_spec( + eval_set_run_id, evaluator_scores, is_coded, success + ) + await self._client.request_async( + method=spec.method, + url=spec.endpoint, + params=spec.params, + json=spec.json, + headers=spec.headers, + scoped="org" if self._is_localhost() else "tenant", + ) + + # ------------------------------------------------------------------------- + # Event Handlers + # ------------------------------------------------------------------------- + + async def handle_create_eval_set_run(self, payload: EvalSetRunCreatedEvent) -> None: + try: + self.evaluators = {eval.id: eval for eval in payload.evaluators} + self.evaluator_scores = {eval.id: [] for eval in payload.evaluators} + self.eval_set_execution_id = payload.execution_id + + is_coded = self._is_coded_evaluator(payload.evaluators) + self.is_coded_eval[payload.execution_id] = is_coded + + eval_set_run_id = payload.eval_set_run_id + if not eval_set_run_id: + eval_set_run_id = await self.create_eval_set_run_sw( + eval_set_id=payload.eval_set_id, + agent_snapshot=self._extract_agent_snapshot(payload.entrypoint), + no_of_evals=payload.no_of_evals, + evaluators=payload.evaluators, + is_coded=is_coded, + ) + self.eval_set_run_ids[payload.execution_id] = eval_set_run_id + current_span = trace.get_current_span() + if current_span.is_recording(): + current_span.set_attribute("eval_set_run_id", eval_set_run_id) + + if eval_set_run_id: + await self._send_parent_trace(eval_set_run_id, payload.eval_set_id) + + logger.debug( + f"Created eval set run with ID: 
{eval_set_run_id} (coded={is_coded})" + ) + + except Exception as e: + self._format_error_message(e, "StudioWeb create eval set run error") + + async def handle_create_eval_run(self, payload: EvalRunCreatedEvent) -> None: + try: + if self.eval_set_execution_id and ( + eval_set_run_id := self.eval_set_run_ids.get(self.eval_set_execution_id) + ): + is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) + eval_run_id = await self.create_eval_run( + payload.eval_item, eval_set_run_id, is_coded + ) + if eval_run_id: + self.eval_run_ids[payload.execution_id] = eval_run_id + logger.debug( + f"Created eval run with ID: {eval_run_id} (coded={is_coded})" + ) + else: + logger.warning("Cannot create eval run: eval_set_run_id not available") + + except Exception as e: + self._format_error_message(e, "StudioWeb create eval run error") + + async def handle_update_eval_run(self, payload: EvalRunUpdatedEvent) -> None: + try: + eval_run_id = self.eval_run_ids.get(payload.execution_id) + + if eval_run_id: + self.spans_exporter.trace_id = eval_run_id + else: + if self.eval_set_execution_id: + self.spans_exporter.trace_id = self.eval_set_run_ids.get( + self.eval_set_execution_id + ) + + self.spans_exporter.export(payload.spans) + + for eval_result in payload.eval_results: + evaluator_id = eval_result.evaluator_id + if evaluator_id in self.evaluator_scores: + match eval_result.result.score_type: + case ScoreType.NUMERICAL: + self.evaluator_scores[evaluator_id].append( + eval_result.result.score + ) + case ScoreType.BOOLEAN: + self.evaluator_scores[evaluator_id].append( + 100 if eval_result.result.score else 0 + ) + case ScoreType.ERROR: + self.evaluator_scores[evaluator_id].append(0) + + if eval_run_id and self.eval_set_execution_id: + is_coded = self.is_coded_eval.get(self.eval_set_execution_id, False) + self._extract_usage_from_spans(payload.spans) + + await self._send_evaluator_traces( + eval_run_id, payload.eval_results, payload.spans + ) + + await self.update_eval_run( + StudioWebProgressItem( + eval_run_id=eval_run_id, + eval_results=payload.eval_results, + success=payload.success, + agent_output=payload.agent_output, + agent_execution_time=payload.agent_execution_time, + ), + self.evaluators, + is_coded=is_coded, + spans=payload.spans, + ) + + logger.debug( + f"Updated eval run with ID: {eval_run_id} (coded={is_coded})" + ) + + except Exception as e: + self._format_error_message(e, "StudioWeb reporting error") + + async def handle_update_eval_set_run(self, payload: EvalSetRunUpdatedEvent) -> None: + try: + if eval_set_run_id := self.eval_set_run_ids.get(payload.execution_id): + is_coded = self.is_coded_eval.get(payload.execution_id, False) + await self.update_eval_set_run( + eval_set_run_id, + payload.evaluator_scores, + is_coded=is_coded, + success=payload.success, + ) + status_str = "completed" if payload.success else "failed" + logger.debug( + f"Updated eval set run with ID: {eval_set_run_id} " + f"(coded={is_coded}, status={status_str})" + ) + else: + logger.warning( + "Cannot update eval set run: eval_set_run_id not available" + ) + + except Exception as e: + self._format_error_message(e, "StudioWeb update eval set run error") + + async def subscribe_to_eval_runtime_events(self, event_bus: EventBus) -> None: + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_SET_RUN, self.handle_create_eval_set_run + ) + event_bus.subscribe( + EvaluationEvents.CREATE_EVAL_RUN, self.handle_create_eval_run + ) + event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_RUN, self.handle_update_eval_run + ) + 
event_bus.subscribe( + EvaluationEvents.UPDATE_EVAL_SET_RUN, self.handle_update_eval_set_run + ) + logger.debug("StudioWeb progress reporter subscribed to evaluation events") + + # ------------------------------------------------------------------------- + # Tracing Methods + # ------------------------------------------------------------------------- + + async def _send_parent_trace( + self, eval_set_run_id: str, eval_set_name: str + ) -> None: + """Send the parent trace span for the evaluation set run.""" + try: + tracer = trace.get_tracer(__name__) + trace_id_int = int(uuid.UUID(eval_set_run_id)) + + span_context = SpanContext( + trace_id=trace_id_int, + span_id=trace_id_int, + is_remote=False, + trace_flags=TraceFlags(0x01), + ) + + ctx = trace.set_span_in_context(trace.NonRecordingSpan(span_context)) + + with tracer.start_as_current_span( + eval_set_name, + context=ctx, + kind=SpanKind.INTERNAL, + start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), + ) as span: + span.set_attribute("openinference.span.kind", "CHAIN") + span.set_attribute("span.type", "evaluationSet") + span.set_attribute("eval_set_run_id", eval_set_run_id) + + logger.debug(f"Created parent trace for eval set run: {eval_set_run_id}") + + except Exception as e: + logger.warning(f"Failed to create parent trace: {e}") + + async def _send_eval_run_trace( + self, eval_run_id: str, eval_set_run_id: str, eval_name: str + ) -> None: + """Send the child trace span for an evaluation run.""" + try: + tracer = trace.get_tracer(__name__) + trace_id_int = int(uuid.UUID(eval_run_id)) + parent_span_id_int = int(uuid.UUID(eval_set_run_id)) + + parent_context = SpanContext( + trace_id=trace_id_int, + span_id=parent_span_id_int, + is_remote=False, + trace_flags=TraceFlags(0x01), + ) + + ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) + + with tracer.start_as_current_span( + eval_name, + context=ctx, + kind=SpanKind.INTERNAL, + start_time=int(datetime.now(timezone.utc).timestamp() * 1_000_000_000), + ) as span: + span.set_attribute("openinference.span.kind", "CHAIN") + span.set_attribute("span.type", "evaluation") + span.set_attribute("eval_run_id", eval_run_id) + span.set_attribute("eval_set_run_id", eval_set_run_id) + + logger.debug( + f"Created trace for eval run: {eval_run_id} (parent: {eval_set_run_id})" + ) + + except Exception as e: + logger.warning(f"Failed to create eval run trace: {e}") + + async def _send_evaluator_traces( + self, eval_run_id: str, eval_results: list[EvalItemResult], spans: list[Any] + ) -> None: + """Send trace spans for all evaluators.""" + try: + if not eval_results: + logger.debug( + f"No evaluator results to trace for eval run: {eval_run_id}" + ) + return + + agent_readable_spans = [] + if spans: + for span in spans: + if hasattr(span, "_readable_span"): + agent_readable_spans.append(span._readable_span()) + + if agent_readable_spans: + self.spans_exporter.export(agent_readable_spans) + logger.debug( + f"Exported {len(agent_readable_spans)} agent execution spans " + f"for eval run: {eval_run_id}" + ) + + tracer = trace.get_tracer(__name__) + now = datetime.now(timezone.utc) + + total_eval_time = ( + sum( + r.result.evaluation_time + for r in eval_results + if r.result.evaluation_time + ) + or 0.0 + ) + + parent_end_time = now + parent_start_time = ( + datetime.fromtimestamp( + now.timestamp() - total_eval_time, tz=timezone.utc + ) + if total_eval_time > 0 + else now + ) + + root_span_uuid = None + if spans: + from uipath.tracing._utils import _SpanUtils + + for 
span in spans: + if span.parent is None: + span_context = span.get_span_context() + root_span_uuid = _SpanUtils.span_id_to_uuid4( + span_context.span_id + ) + break + + trace_id_int = int(uuid.UUID(eval_run_id)) + + if root_span_uuid: + root_span_id_int = int(root_span_uuid) + parent_context = SpanContext( + trace_id=trace_id_int, + span_id=root_span_id_int, + is_remote=False, + trace_flags=TraceFlags(0x01), + ) + ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) + else: + parent_context = SpanContext( + trace_id=trace_id_int, + span_id=trace_id_int, + is_remote=False, + trace_flags=TraceFlags(0x01), + ) + ctx = trace.set_span_in_context(trace.NonRecordingSpan(parent_context)) + + parent_start_ns = int(parent_start_time.timestamp() * 1_000_000_000) + parent_end_ns = int(parent_end_time.timestamp() * 1_000_000_000) + + parent_span = tracer.start_span( + "Evaluators", + context=ctx, + kind=SpanKind.INTERNAL, + start_time=parent_start_ns, + ) + + parent_span.set_attribute("openinference.span.kind", "CHAIN") + parent_span.set_attribute("span.type", "evaluators") + parent_span.set_attribute("eval_run_id", eval_run_id) + + parent_ctx = trace.set_span_in_context(parent_span, ctx) + current_time = parent_start_time + readable_spans = [] + + for eval_result in eval_results: + evaluator = self.evaluators.get(eval_result.evaluator_id) + evaluator_name = evaluator.id if evaluator else eval_result.evaluator_id + + eval_time = eval_result.result.evaluation_time or 0 + eval_start = current_time + eval_end = datetime.fromtimestamp( + current_time.timestamp() + eval_time, tz=timezone.utc + ) + current_time = eval_end + + eval_start_ns = int(eval_start.timestamp() * 1_000_000_000) + eval_end_ns = int(eval_end.timestamp() * 1_000_000_000) + + evaluator_span = tracer.start_span( + evaluator_name, + context=parent_ctx, + kind=SpanKind.INTERNAL, + start_time=eval_start_ns, + ) + + evaluator_span.set_attribute("openinference.span.kind", "EVALUATOR") + evaluator_span.set_attribute("span.type", "evaluator") + evaluator_span.set_attribute("evaluator_id", eval_result.evaluator_id) + evaluator_span.set_attribute("evaluator_name", evaluator_name) + evaluator_span.set_attribute("eval_run_id", eval_run_id) + evaluator_span.set_attribute("score", eval_result.result.score) + evaluator_span.set_attribute( + "score_type", eval_result.result.score_type.name + ) + + if eval_result.result.details: + if isinstance(eval_result.result.details, BaseModel): + evaluator_span.set_attribute( + "details", + json.dumps(eval_result.result.details.model_dump()), + ) + else: + evaluator_span.set_attribute( + "details", str(eval_result.result.details) + ) + + if eval_result.result.evaluation_time: + evaluator_span.set_attribute( + "evaluation_time", eval_result.result.evaluation_time + ) + + from opentelemetry.trace import Status, StatusCode + + if eval_result.result.score_type == ScoreType.ERROR: + evaluator_span.set_status( + Status(StatusCode.ERROR, "Evaluation failed") + ) + else: + evaluator_span.set_status(Status(StatusCode.OK)) + + evaluator_span.end(end_time=eval_end_ns) + + if hasattr(evaluator_span, "_readable_span"): + readable_spans.append(evaluator_span._readable_span()) + + parent_span.end(end_time=parent_end_ns) + + if hasattr(parent_span, "_readable_span"): + readable_spans.insert(0, parent_span._readable_span()) + + if readable_spans: + self.spans_exporter.export(readable_spans) + + logger.debug( + f"Created evaluator traces for eval run: {eval_run_id} " + f"({len(eval_results)} evaluators)" + ) + 
except Exception as e: + logger.warning(f"Failed to create evaluator traces: {e}") diff --git a/src/uipath/_cli/_evals/_reporting/_strategies.py b/src/uipath/_cli/_evals/_reporting/_strategies.py new file mode 100644 index 000000000..35a7fa2b6 --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/_strategies.py @@ -0,0 +1,418 @@ +"""Evaluation reporting strategies for legacy and coded evaluations. + +This module defines the Strategy Pattern for handling the differences between +legacy and coded evaluation API formats, including ID conversion, endpoint +routing, and payload structure. +""" + +import uuid +from typing import Any, Callable, Protocol, runtime_checkable + +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, + EvaluationStatus, +) +from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot +from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator + +# ============================================================================= +# Strategy Protocol +# ============================================================================= + + +@runtime_checkable +class EvalReportingStrategy(Protocol): + """Protocol for evaluation reporting strategies. + + Strategies handle the differences between legacy and coded evaluation + API formats, including ID conversion, endpoint routing, and payload structure. + """ + + @property + def endpoint_suffix(self) -> str: + """Return the endpoint suffix for this strategy. + + Returns: + "" for legacy, "coded/" for coded evaluations + """ + ... + + def convert_id(self, id_value: str) -> str: + """Convert an ID to the format expected by the backend. + + Args: + id_value: The original string ID + + Returns: + For legacy: deterministic GUID from uuid5 + For coded: original string ID unchanged + """ + ... + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval set run.""" + ... + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval run.""" + ... + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval run.""" + ... + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval set run.""" + ... + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, Any], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results from evaluations in strategy-specific format. + + Returns: + Tuple of (evaluator_runs, evaluator_scores) + """ + ... + + +# ============================================================================= +# Legacy Evaluation Reporting Strategy +# ============================================================================= + + +class LegacyEvalReportingStrategy: + """Strategy for legacy evaluation reporting. 
+ + Legacy evaluations: + - Convert string IDs to deterministic GUIDs using uuid5 + - Use endpoints without /coded/ prefix + - Use assertionRuns format with assertionSnapshot + - Put expectedOutput directly in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return empty string for legacy endpoints (no /coded/ prefix).""" + return "" + + def convert_id(self, id_value: str) -> str: + """Convert string ID to deterministic GUID for legacy API. + + Args: + id_value: The original string ID + + Returns: + The ID as a GUID (either original if valid, or deterministic uuid5) + """ + try: + uuid.UUID(id_value) + return id_value + except ValueError: + return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value)) + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval set run.""" + return { + "agentId": project_id, + "evalSetId": self.convert_id(eval_set_id), + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval run.""" + eval_item_id = self.convert_id(eval_item.id) + + # Extract expectedOutput from evaluation_criterias + expected_output = {} + if eval_item.evaluation_criterias: + first_criteria = next(iter(eval_item.evaluation_criterias.values()), None) + if first_criteria and isinstance(first_criteria, dict): + expected_output = first_criteria.get("expectedOutput", {}) + + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item_id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "expectedOutput": expected_output, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "evaluatorScores": evaluator_scores, + }, + "completionMetrics": {"duration": int(execution_time)}, + "assertionRuns": evaluator_runs, + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": self.convert_id(eval_id)} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, LegacyBaseEvaluator[Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in legacy assertionRuns format.""" + assertion_runs: list[dict[str, Any]] = [] + 
evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + evaluator_id_value = self.convert_id(eval_result.evaluator_id) + evaluator = evaluators[eval_result.evaluator_id] + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": justification, + "evaluatorId": evaluator_id_value, + } + ) + + assertion_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": evaluator_id_value, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + "assertionSnapshot": { + "assertionType": evaluator.evaluator_type.name, + "outputKey": evaluator.target_output_key, + }, + } + ) + + return assertion_runs, evaluator_scores_list + + +# ============================================================================= +# Coded Evaluation Reporting Strategy +# ============================================================================= + + +class CodedEvalReportingStrategy: + """Strategy for coded evaluation reporting. + + Coded evaluations: + - Keep string IDs unchanged + - Use endpoints with /coded/ prefix + - Use evaluatorRuns format with nested result + - Put evaluationCriterias in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return 'coded/' for coded endpoints.""" + return "coded/" + + def convert_id(self, id_value: str) -> str: + """Keep string ID unchanged for coded API.""" + return id_value + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval set run.""" + return { + "agentId": project_id, + "evalSetId": eval_set_id, + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval run.""" + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item.id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "evaluationCriterias": eval_item.evaluation_criterias, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "scores": evaluator_scores, # Note: "scores" not "evaluatorScores" + }, + "completionMetrics": {"duration": int(execution_time)}, + "evaluatorRuns": evaluator_runs, # Note: "evaluatorRuns" not "assertionRuns" + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + 
success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": eval_id} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, BaseEvaluator[Any, Any, Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in coded evaluatorRuns format.""" + evaluator_runs: list[dict[str, Any]] = [] + evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": justification, + "evaluatorId": eval_result.evaluator_id, + } + ) + + evaluator_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": eval_result.evaluator_id, + "result": { + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + }, + "justification": justification, + }, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + } + ) + + return evaluator_runs, evaluator_scores_list diff --git a/src/uipath/_cli/_evals/_reporting/_utils.py b/src/uipath/_cli/_evals/_reporting/_utils.py new file mode 100644 index 000000000..eb2d39a3b --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/_utils.py @@ -0,0 +1,44 @@ +"""Utility functions for evaluation progress reporting. + +This module contains decorators and helper functions used by the +progress reporter and related components. +""" + +import functools +import logging + +logger = logging.getLogger(__name__) + + +def gracefully_handle_errors(func): + """Decorator to catch and log errors without stopping execution. + + This decorator wraps async functions and catches any exceptions, + logging them as warnings instead of allowing them to propagate. + This ensures that progress reporting failures don't break the + main evaluation flow. + + Args: + func: The async function to wrap + + Returns: + The wrapped function that catches and logs errors + """ + + @functools.wraps(func) + async def wrapper(self, *args, **kwargs): + try: + return await func(self, *args, **kwargs) + except Exception as e: + if hasattr(self, "_console"): + error_type = type(e).__name__ + logger.debug(f"Full error details: {e}") + logger.warning( + f"Cannot report progress to SW. 
" + f"Function: {func.__name__}, " + f"Error type: {error_type}, " + f"Details: {e}" + ) + return None + + return wrapper diff --git a/tests/cli/eval/reporting/__init__.py b/tests/cli/eval/reporting/__init__.py new file mode 100644 index 000000000..8f5346601 --- /dev/null +++ b/tests/cli/eval/reporting/__init__.py @@ -0,0 +1 @@ +"""Tests for the evaluation progress reporting module.""" diff --git a/tests/cli/eval/test_progress_reporter.py b/tests/cli/eval/reporting/test_reporter.py similarity index 98% rename from tests/cli/eval/test_progress_reporter.py rename to tests/cli/eval/reporting/test_reporter.py index 4db5f6713..3cba9ca5d 100644 --- a/tests/cli/eval/test_progress_reporter.py +++ b/tests/cli/eval/reporting/test_reporter.py @@ -15,11 +15,11 @@ import pytest from opentelemetry.sdk.trace import ReadableSpan -from uipath._cli._evals._progress_reporter import StudioWebProgressReporter +from uipath._cli._evals._reporting import StudioWebProgressReporter from uipath._events._events import EvalSetRunCreatedEvent from uipath.tracing import LlmOpsHttpExporter -# Test fixtures - simple mocks without full evaluator instantiation +# Test fixtures @pytest.fixture @@ -215,10 +215,6 @@ def test_extract_usage_from_spans_without_usage(self, progress_reporter): assert usage["cost"] is None -# Result collection tests removed - complex to test without real evaluator instances -# The core functionality is tested indirectly through the request spec generation tests - - # Tests for request spec generation class TestRequestSpecGeneration: """Tests for generating request specs for different evaluator types.""" diff --git a/tests/cli/eval/reporting/test_strategies.py b/tests/cli/eval/reporting/test_strategies.py new file mode 100644 index 000000000..400424f52 --- /dev/null +++ b/tests/cli/eval/reporting/test_strategies.py @@ -0,0 +1,244 @@ +"""Tests for evaluation reporting strategies. 
+ +This module tests the strategy classes including: +- LegacyEvalReportingStrategy +- CodedEvalReportingStrategy +- ID conversion behavior +- Payload structure generation +""" + +import uuid + +import pytest + +from uipath._cli._evals._reporting._strategies import ( + CodedEvalReportingStrategy, + LegacyEvalReportingStrategy, +) + + +class TestLegacyEvalReportingStrategy: + """Tests for LegacyEvalReportingStrategy.""" + + @pytest.fixture + def strategy(self): + """Create a LegacyEvalReportingStrategy instance.""" + return LegacyEvalReportingStrategy() + + def test_endpoint_suffix_is_empty(self, strategy): + """Test that legacy strategy has empty endpoint suffix.""" + assert strategy.endpoint_suffix == "" + + def test_convert_id_with_valid_uuid(self, strategy): + """Test that valid UUIDs are returned unchanged.""" + valid_uuid = "550e8400-e29b-41d4-a716-446655440000" + assert strategy.convert_id(valid_uuid) == valid_uuid + + def test_convert_id_with_string_id(self, strategy): + """Test that string IDs are converted to deterministic UUIDs.""" + string_id = "my-custom-id" + result = strategy.convert_id(string_id) + + # Result should be a valid UUID + uuid.UUID(result) + + # Same input should produce same output (deterministic) + assert strategy.convert_id(string_id) == result + + def test_convert_id_with_different_strings_produces_different_uuids(self, strategy): + """Test that different string IDs produce different UUIDs.""" + id1 = strategy.convert_id("id-one") + id2 = strategy.convert_id("id-two") + + assert id1 != id2 + + def test_create_eval_set_run_payload_structure(self, strategy): + """Test the structure of legacy eval set run payload.""" + from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot + + agent_snapshot = StudioWebAgentSnapshot( + input_schema={"type": "object"}, output_schema={"type": "object"} + ) + + payload = strategy.create_eval_set_run_payload( + eval_set_id="test-eval-set", + agent_snapshot=agent_snapshot, + no_of_evals=5, + project_id="test-project", + ) + + assert payload["agentId"] == "test-project" + assert payload["status"] == 1 # IN_PROGRESS + assert payload["numberOfEvalsExecuted"] == 5 + assert payload["source"] == 0 + assert "agentSnapshot" in payload + + def test_create_update_eval_run_payload_uses_assertion_runs(self, strategy): + """Test that legacy update payload uses assertionRuns field.""" + evaluator_runs = [{"evaluatorId": "test-1", "status": 2}] + evaluator_scores = [{"evaluatorId": "test-1", "value": 0.9}] + + payload = strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=evaluator_runs, + evaluator_scores=evaluator_scores, + actual_output={"result": "success"}, + execution_time=5.0, + success=True, + ) + + assert "assertionRuns" in payload + assert payload["assertionRuns"] == evaluator_runs + assert "evaluatorRuns" not in payload + assert payload["result"]["evaluatorScores"] == evaluator_scores + + def test_create_update_eval_set_run_payload_converts_ids(self, strategy): + """Test that eval set run update converts evaluator IDs.""" + evaluator_scores = {"my-evaluator": 0.85} + + payload = strategy.create_update_eval_set_run_payload( + eval_set_run_id="run-id", + evaluator_scores=evaluator_scores, + success=True, + ) + + # Check that the evaluator ID was converted + assert len(payload["evaluatorScores"]) == 1 + score_entry = payload["evaluatorScores"][0] + assert score_entry["evaluatorId"] != "my-evaluator" # Should be converted + # Verify it's a valid UUID + uuid.UUID(score_entry["evaluatorId"]) + 
+ +class TestCodedEvalReportingStrategy: + """Tests for CodedEvalReportingStrategy.""" + + @pytest.fixture + def strategy(self): + """Create a CodedEvalReportingStrategy instance.""" + return CodedEvalReportingStrategy() + + def test_endpoint_suffix_is_coded(self, strategy): + """Test that coded strategy has 'coded/' endpoint suffix.""" + assert strategy.endpoint_suffix == "coded/" + + def test_convert_id_returns_unchanged(self, strategy): + """Test that IDs are returned unchanged.""" + string_id = "my-custom-id" + assert strategy.convert_id(string_id) == string_id + + uuid_id = "550e8400-e29b-41d4-a716-446655440000" + assert strategy.convert_id(uuid_id) == uuid_id + + def test_create_eval_set_run_payload_keeps_original_id(self, strategy): + """Test that eval set ID is kept unchanged.""" + from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot + + agent_snapshot = StudioWebAgentSnapshot( + input_schema={"type": "object"}, output_schema={"type": "object"} + ) + + payload = strategy.create_eval_set_run_payload( + eval_set_id="my-eval-set-id", + agent_snapshot=agent_snapshot, + no_of_evals=3, + project_id="test-project", + ) + + assert payload["evalSetId"] == "my-eval-set-id" # Unchanged + + def test_create_update_eval_run_payload_uses_evaluator_runs(self, strategy): + """Test that coded update payload uses evaluatorRuns field.""" + evaluator_runs = [{"evaluatorId": "test-1", "status": 2}] + evaluator_scores = [{"evaluatorId": "test-1", "value": 0.9}] + + payload = strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=evaluator_runs, + evaluator_scores=evaluator_scores, + actual_output={"result": "success"}, + execution_time=5.0, + success=True, + ) + + assert "evaluatorRuns" in payload + assert payload["evaluatorRuns"] == evaluator_runs + assert "assertionRuns" not in payload + assert ( + payload["result"]["scores"] == evaluator_scores + ) # "scores" not "evaluatorScores" + + def test_create_update_eval_set_run_payload_keeps_ids(self, strategy): + """Test that eval set run update keeps evaluator IDs unchanged.""" + evaluator_scores = {"my-evaluator": 0.85} + + payload = strategy.create_update_eval_set_run_payload( + eval_set_run_id="run-id", + evaluator_scores=evaluator_scores, + success=True, + ) + + # Check that the evaluator ID was NOT converted + assert len(payload["evaluatorScores"]) == 1 + score_entry = payload["evaluatorScores"][0] + assert score_entry["evaluatorId"] == "my-evaluator" # Should be unchanged + + +class TestStrategyStatusHandling: + """Tests for status handling in both strategies.""" + + @pytest.fixture + def legacy_strategy(self): + return LegacyEvalReportingStrategy() + + @pytest.fixture + def coded_strategy(self): + return CodedEvalReportingStrategy() + + def test_legacy_success_status(self, legacy_strategy): + """Test legacy strategy sets COMPLETED status on success.""" + payload = legacy_strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=[], + evaluator_scores=[], + actual_output={}, + execution_time=0.0, + success=True, + ) + assert payload["status"] == 2 # COMPLETED + + def test_legacy_failure_status(self, legacy_strategy): + """Test legacy strategy sets FAILED status on failure.""" + payload = legacy_strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=[], + evaluator_scores=[], + actual_output={}, + execution_time=0.0, + success=False, + ) + assert payload["status"] == 3 # FAILED + + def test_coded_success_status(self, coded_strategy): + """Test coded 
strategy sets COMPLETED status on success.""" + payload = coded_strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=[], + evaluator_scores=[], + actual_output={}, + execution_time=0.0, + success=True, + ) + assert payload["status"] == 2 # COMPLETED + + def test_coded_failure_status(self, coded_strategy): + """Test coded strategy sets FAILED status on failure.""" + payload = coded_strategy.create_update_eval_run_payload( + eval_run_id="run-id", + evaluator_runs=[], + evaluator_scores=[], + actual_output={}, + execution_time=0.0, + success=False, + ) + assert payload["status"] == 3 # FAILED diff --git a/tests/cli/eval/reporting/test_utils.py b/tests/cli/eval/reporting/test_utils.py new file mode 100644 index 000000000..15afbfeab --- /dev/null +++ b/tests/cli/eval/reporting/test_utils.py @@ -0,0 +1,89 @@ +"""Tests for evaluation reporting utilities. + +This module tests utility functions and decorators including: +- gracefully_handle_errors decorator +""" + +from unittest.mock import Mock + +import pytest + +from uipath._cli._evals._reporting._utils import gracefully_handle_errors + + +class TestGracefullyHandleErrors: + """Tests for the gracefully_handle_errors decorator.""" + + @pytest.mark.asyncio + async def test_successful_execution(self): + """Test that successful functions return normally.""" + + class TestClass: + _console = Mock() + + @gracefully_handle_errors + async def test_method(self, value): + return value * 2 + + obj = TestClass() + result = await obj.test_method(5) + assert result == 10 + + @pytest.mark.asyncio + async def test_exception_returns_none(self): + """Test that exceptions are caught and None is returned.""" + + class TestClass: + _console = Mock() + + @gracefully_handle_errors + async def test_method(self): + raise ValueError("Test error") + + obj = TestClass() + result = await obj.test_method() + assert result is None + + @pytest.mark.asyncio + async def test_exception_without_console(self): + """Test that exceptions are handled even without _console attribute.""" + + class TestClass: + @gracefully_handle_errors + async def test_method(self): + raise RuntimeError("Test error") + + obj = TestClass() + result = await obj.test_method() + assert result is None + + @pytest.mark.asyncio + async def test_preserves_function_metadata(self): + """Test that the decorator preserves function metadata.""" + + class TestClass: + _console = Mock() + + @gracefully_handle_errors + async def documented_method(self): + """This is a documented method.""" + return "success" + + obj = TestClass() + assert obj.documented_method.__name__ == "documented_method" + assert "documented" in obj.documented_method.__doc__ + + @pytest.mark.asyncio + async def test_handles_multiple_args_and_kwargs(self): + """Test that the decorator handles multiple arguments correctly.""" + + class TestClass: + _console = Mock() + + @gracefully_handle_errors + async def test_method(self, a, b, c=None, d=None): + return a + b + (c or 0) + (d or 0) + + obj = TestClass() + result = await obj.test_method(1, 2, c=3, d=4) + assert result == 10 From cd8d3421041c2b2e5e600de59f9604ad89adeec6 Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Thu, 18 Dec 2025 23:55:42 -0800 Subject: [PATCH 3/5] chore: bump version to 2.2.37 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index 5b99ec684..251f00126 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.2.36" +version = "2.2.37" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" From 9c222faa98eccfb44d6ba529a58bf485c014587c Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 19 Dec 2025 17:45:15 -0800 Subject: [PATCH 4/5] fix: add logging for eval set run schema reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add INFO-level logging to show inputSchema and outputSchema when creating eval set runs for better debugging - Add DEBUG-level logging for full payloads on all eval reporting operations - Add warning when entrypoint is not provided, falling back to empty schemas - Add tests for agent snapshot extraction behavior 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../_cli/_evals/_reporting/_reporter.py | 95 +++++++++- tests/cli/eval/reporting/test_reporter.py | 168 ++++++++++++++++++ 2 files changed, 261 insertions(+), 2 deletions(-) diff --git a/src/uipath/_cli/_evals/_reporting/_reporter.py b/src/uipath/_cli/_evals/_reporting/_reporter.py index d6242fa0f..6ebfe38a0 100644 --- a/src/uipath/_cli/_evals/_reporting/_reporter.py +++ b/src/uipath/_cli/_evals/_reporting/_reporter.py @@ -206,13 +206,57 @@ def _extract_usage_from_spans( "cost": total_cost if total_cost > 0 else None, } - def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: - """Extract agent snapshot from entry points configuration.""" + def _extract_agent_snapshot(self, entrypoint: str | None) -> StudioWebAgentSnapshot: + """Extract agent snapshot from entry points configuration or low-code agent file. + + For coded agents, reads from entry-points.json configuration file. + For low-code agents (*.json files like agent.json), reads inputSchema + and outputSchema directly from the agent file. 
+ + Args: + entrypoint: The entrypoint file path to look up + + Returns: + StudioWebAgentSnapshot with input and output schemas + """ + if not entrypoint: + logger.warning( + "Entrypoint not provided - falling back to empty inputSchema " + "and outputSchema" + ) + return StudioWebAgentSnapshot(input_schema={}, output_schema={}) + try: + # Check if entrypoint is a low-code agent JSON file (e.g., agent.json) + if entrypoint.endswith(".json"): + agent_file_path = os.path.join(os.getcwd(), entrypoint) + if os.path.exists(agent_file_path): + with open(agent_file_path, "r") as f: + agent_data = json.load(f) + + # Low-code agent files have inputSchema and outputSchema at root + input_schema = agent_data.get("inputSchema", {}) + output_schema = agent_data.get("outputSchema", {}) + + logger.debug( + f"Extracted agent snapshot from low-code agent '{entrypoint}': " + f"inputSchema={json.dumps(input_schema)}, " + f"outputSchema={json.dumps(output_schema)}" + ) + + return StudioWebAgentSnapshot( + input_schema=input_schema, output_schema=output_schema + ) + + # Fall back to entry-points.json for coded agents entry_points_file_path = os.path.join( os.getcwd(), str(UiPathConfig.entry_points_file_path) ) if not os.path.exists(entry_points_file_path): + logger.debug( + f"Entry points file not found at {entry_points_file_path}, " + "using empty schemas" + ) return StudioWebAgentSnapshot(input_schema={}, output_schema={}) with open(entry_points_file_path, "r") as f: @@ -233,6 +277,12 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot: input_schema = ep.get("input", {}) output_schema = ep.get("output", {}) + logger.debug( + f"Extracted agent snapshot for entrypoint '{entrypoint}': " + f"inputSchema={json.dumps(input_schema)}, " + f"outputSchema={json.dumps(output_schema)}" + ) + return StudioWebAgentSnapshot( input_schema=input_schema, output_schema=output_schema ) @@ -257,6 +307,17 @@ def _create_eval_set_run_spec( payload = strategy.create_eval_set_run_payload( eval_set_id, agent_snapshot, no_of_evals, self._project_id ) + + # Log the payload for debugging eval set run reporting + agent_type = "coded" if is_coded else "low-code" + logger.info( + f"Creating eval set run (type={agent_type}): " + f"evalSetId={eval_set_id}, " + f"inputSchema={json.dumps(payload.get('agentSnapshot', {}).get('inputSchema', {}))}, " + f"outputSchema={json.dumps(payload.get('agentSnapshot', {}).get('outputSchema', {}))}" + ) + logger.debug(f"Full eval set run payload: {json.dumps(payload, indent=2)}") + return RequestSpec( method="POST", endpoint=Endpoint( @@ -273,6 +334,15 @@ def _create_eval_run_spec( """Create request spec for creating an eval run.""" strategy = self._get_strategy(is_coded) payload = strategy.create_eval_run_payload(eval_item, eval_set_run_id) + + # Log the payload for debugging eval run reporting + agent_type = "coded" if is_coded else "low-code" + logger.debug( + f"Creating eval run (type={agent_type}): " + f"evalSetRunId={eval_set_run_id}, evalItemId={eval_item.id}" + ) + logger.debug(f"Full eval run payload: {json.dumps(payload, indent=2)}") + return RequestSpec( method="POST", endpoint=Endpoint( @@ -303,6 +373,15 @@ def _update_eval_run_spec( execution_time, success, ) + + # Log the payload for debugging eval run updates + agent_type = "coded" if is_coded else "low-code" + logger.debug( + f"Updating eval run (type={agent_type}): " + f"evalRunId={eval_run_id}, success={success}" + ) + logger.debug(f"Full eval run update payload: {json.dumps(payload, indent=2)}") + return 
RequestSpec( method="PUT", endpoint=Endpoint( @@ -325,6 +404,18 @@ def _update_eval_set_run_spec( payload = strategy.create_update_eval_set_run_payload( eval_set_run_id, evaluator_scores, success ) + + # Log the payload for debugging eval set run updates + agent_type = "coded" if is_coded else "low-code" + logger.info( + f"Updating eval set run (type={agent_type}): " + f"evalSetRunId={eval_set_run_id}, success={success}, " + f"evaluatorScores={json.dumps(payload.get('evaluatorScores', []))}" + ) + logger.debug( + f"Full eval set run update payload: {json.dumps(payload, indent=2)}" + ) + return RequestSpec( method="PUT", endpoint=Endpoint( diff --git a/tests/cli/eval/reporting/test_reporter.py b/tests/cli/eval/reporting/test_reporter.py index 3cba9ca5d..5b3a3d691 100644 --- a/tests/cli/eval/reporting/test_reporter.py +++ b/tests/cli/eval/reporting/test_reporter.py @@ -552,3 +552,171 @@ def test_update_eval_set_run_spec_with_failure_legacy(self, progress_reporter): assert spec.json["evalSetRunId"] == "test-run-id" # Backend expects integer status assert spec.json["status"] == 3 # FAILED + + +# Tests for agent snapshot extraction +class TestAgentSnapshotExtraction: + """Tests for extracting agent snapshot with proper schema handling.""" + + def test_extract_agent_snapshot_reads_from_entry_points( + self, progress_reporter, tmp_path, monkeypatch + ): + """Test that agent snapshot reads schemas from entry points file.""" + import os + + # Create a temporary entry points file with full schemas + entry_points_data = { + "entryPoints": [ + { + "filePath": "test_agent", + "uniqueId": "test-uuid", + "type": "agent", + "input": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + "output": { + "type": "object", + "properties": {"response": {"type": "string"}}, + }, + } + ] + } + + entry_points_file = tmp_path / "entry-points.json" + with open(entry_points_file, "w") as f: + json.dump(entry_points_data, f) + + # Change to the temp directory so the reporter finds the file + original_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + snapshot = progress_reporter._extract_agent_snapshot( + entrypoint="test_agent" + ) + + # Should read full schemas from entry points + assert snapshot.input_schema == { + "type": "object", + "properties": {"query": {"type": "string"}}, + } + assert snapshot.output_schema == { + "type": "object", + "properties": {"response": {"type": "string"}}, + } + finally: + os.chdir(original_cwd) + + def test_extract_agent_snapshot_returns_empty_when_no_file(self, progress_reporter): + """Test that empty schemas are returned when entry points file doesn't exist.""" + snapshot = progress_reporter._extract_agent_snapshot( + entrypoint="nonexistent_agent" + ) + + assert snapshot.input_schema == {} + assert snapshot.output_schema == {} + + def test_extract_agent_snapshot_warns_when_entrypoint_is_none( + self, progress_reporter, caplog + ): + """Test that a warning is logged when entrypoint is None.""" + import logging + + with caplog.at_level(logging.WARNING): + snapshot = progress_reporter._extract_agent_snapshot(entrypoint=None) + + assert snapshot.input_schema == {} + assert snapshot.output_schema == {} + assert "Entrypoint not provided" in caplog.text + assert "falling back to empty inputSchema" in caplog.text + + def test_extract_agent_snapshot_warns_when_entrypoint_is_empty( + self, progress_reporter, caplog + ): + """Test that a warning is logged when entrypoint is empty string.""" + import logging + + with caplog.at_level(logging.WARNING): + snapshot = 
progress_reporter._extract_agent_snapshot(entrypoint="") + + assert snapshot.input_schema == {} + assert snapshot.output_schema == {} + assert "Entrypoint not provided" in caplog.text + + def test_extract_agent_snapshot_returns_empty_when_entrypoint_not_found( + self, progress_reporter, tmp_path + ): + """Test that empty schemas are returned when entrypoint is not in file.""" + import os + + # Create entry points file without the requested entrypoint + entry_points_data = { + "entryPoints": [ + { + "filePath": "other_agent", + "uniqueId": "test-uuid", + "type": "agent", + "input": {"type": "object"}, + "output": {"type": "object"}, + } + ] + } + + entry_points_file = tmp_path / "entry-points.json" + with open(entry_points_file, "w") as f: + json.dump(entry_points_data, f) + + original_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + snapshot = progress_reporter._extract_agent_snapshot( + entrypoint="nonexistent_agent" + ) + + assert snapshot.input_schema == {} + assert snapshot.output_schema == {} + finally: + os.chdir(original_cwd) + + def test_agent_snapshot_serializes_with_camel_case( + self, progress_reporter, tmp_path + ): + """Test that agent snapshot serializes to correct JSON format with camelCase.""" + import os + + entry_points_data = { + "entryPoints": [ + { + "filePath": "test_agent", + "uniqueId": "test-uuid", + "type": "agent", + "input": {"type": "object", "properties": {}}, + "output": {"type": "object", "properties": {}}, + } + ] + } + + entry_points_file = tmp_path / "entry-points.json" + with open(entry_points_file, "w") as f: + json.dump(entry_points_data, f) + + original_cwd = os.getcwd() + os.chdir(tmp_path) + + try: + snapshot = progress_reporter._extract_agent_snapshot( + entrypoint="test_agent" + ) + + # Serialize using pydantic + serialized = snapshot.model_dump(by_alias=True) + + # Should have camelCase keys + assert "inputSchema" in serialized + assert "outputSchema" in serialized + assert serialized["inputSchema"] == {"type": "object", "properties": {}} + assert serialized["outputSchema"] == {"type": "object", "properties": {}} + finally: + os.chdir(original_cwd) From d65e8e9a619a9e258ea0e78e3bb32119680170fa Mon Sep 17 00:00:00 2001 From: Chibi Vikram Date: Fri, 19 Dec 2025 19:10:20 -0800 Subject: [PATCH 5/5] refactor: split progress reporter into modular package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the monolithic _strategies.py into separate files for better code organization: - _strategy_protocol.py: Protocol definition - _legacy_strategy.py: Legacy evaluation reporting strategy - _coded_strategy.py: Coded evaluation reporting strategy - _strategies.py: Re-exports for backward compatibility 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../_cli/_evals/_reporting/_coded_strategy.py | 157 +++++++ .../_evals/_reporting/_legacy_strategy.py | 177 ++++++++ .../_cli/_evals/_reporting/_strategies.py | 423 +----------------- .../_evals/_reporting/_strategy_protocol.py | 93 ++++ 4 files changed, 437 insertions(+), 413 deletions(-) create mode 100644 src/uipath/_cli/_evals/_reporting/_coded_strategy.py create mode 100644 src/uipath/_cli/_evals/_reporting/_legacy_strategy.py create mode 100644 src/uipath/_cli/_evals/_reporting/_strategy_protocol.py diff --git a/src/uipath/_cli/_evals/_reporting/_coded_strategy.py b/src/uipath/_cli/_evals/_reporting/_coded_strategy.py new file mode 100644 index 000000000..d8613fe39 --- /dev/null +++ 
b/src/uipath/_cli/_evals/_reporting/_coded_strategy.py @@ -0,0 +1,157 @@ +"""Coded evaluation reporting strategy. + +This module implements the strategy for coded evaluation reporting, +which uses evaluatorRuns format and keeps string IDs unchanged. +""" + +from typing import Any, Callable + +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, + EvaluationStatus, +) +from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot +from uipath.eval.evaluators import BaseEvaluator + + +class CodedEvalReportingStrategy: + """Strategy for coded evaluation reporting. + + Coded evaluations: + - Keep string IDs unchanged + - Use endpoints with /coded/ prefix + - Use evaluatorRuns format with nested result + - Put evaluationCriterias in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return 'coded/' for coded endpoints.""" + return "coded/" + + def convert_id(self, id_value: str) -> str: + """Keep string ID unchanged for coded API.""" + return id_value + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval set run.""" + return { + "agentId": project_id, + "evalSetId": eval_set_id, + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a coded eval run.""" + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item.id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "evaluationCriterias": eval_item.evaluation_criterias, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "scores": evaluator_scores, # Note: "scores" not "evaluatorScores" + }, + "completionMetrics": {"duration": int(execution_time)}, + "evaluatorRuns": evaluator_runs, # Note: "evaluatorRuns" not "assertionRuns" + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a coded eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": eval_id} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, BaseEvaluator[Any, Any, Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in coded evaluatorRuns format.""" + evaluator_runs: list[dict[str, Any]] = [] + 
evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + "justification": justification, + "evaluatorId": eval_result.evaluator_id, + } + ) + + evaluator_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": eval_result.evaluator_id, + "result": { + "score": { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + }, + "justification": justification, + }, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + } + ) + + return evaluator_runs, evaluator_scores_list diff --git a/src/uipath/_cli/_evals/_reporting/_legacy_strategy.py b/src/uipath/_cli/_evals/_reporting/_legacy_strategy.py new file mode 100644 index 000000000..c427f897a --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/_legacy_strategy.py @@ -0,0 +1,177 @@ +"""Legacy evaluation reporting strategy. + +This module implements the strategy for legacy evaluation reporting, +which uses assertionRuns format and converts string IDs to GUIDs. +""" + +import uuid +from typing import Any, Callable + +from uipath._cli._evals._models._evaluation_set import ( + EvaluationItem, + EvaluationStatus, +) +from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot +from uipath.eval.evaluators import LegacyBaseEvaluator + + +class LegacyEvalReportingStrategy: + """Strategy for legacy evaluation reporting. + + Legacy evaluations: + - Convert string IDs to deterministic GUIDs using uuid5 + - Use endpoints without /coded/ prefix + - Use assertionRuns format with assertionSnapshot + - Put expectedOutput directly in evalSnapshot + """ + + @property + def endpoint_suffix(self) -> str: + """Return empty string for legacy endpoints (no /coded/ prefix).""" + return "" + + def convert_id(self, id_value: str) -> str: + """Convert string ID to deterministic GUID for legacy API. 
+ + Args: + id_value: The original string ID + + Returns: + The ID as a GUID (either original if valid, or deterministic uuid5) + """ + try: + uuid.UUID(id_value) + return id_value + except ValueError: + return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value)) + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval set run.""" + return { + "agentId": project_id, + "evalSetId": self.convert_id(eval_set_id), + "agentSnapshot": agent_snapshot.model_dump(by_alias=True), + "status": EvaluationStatus.IN_PROGRESS.value, + "numberOfEvalsExecuted": no_of_evals, + "source": 0, # EvalRunSource.Manual + } + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create payload for creating a legacy eval run.""" + eval_item_id = self.convert_id(eval_item.id) + + # Extract expectedOutput from evaluation_criterias + expected_output = {} + if eval_item.evaluation_criterias: + first_criteria = next(iter(eval_item.evaluation_criterias.values()), None) + if first_criteria and isinstance(first_criteria, dict): + expected_output = first_criteria.get("expectedOutput", {}) + + return { + "evalSetRunId": eval_set_run_id, + "evalSnapshot": { + "id": eval_item_id, + "name": eval_item.name, + "inputs": eval_item.inputs, + "expectedOutput": expected_output, + }, + "status": EvaluationStatus.IN_PROGRESS.value, + } + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval run.""" + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalRunId": eval_run_id, + "status": status.value, + "result": { + "output": dict(actual_output), + "evaluatorScores": evaluator_scores, + }, + "completionMetrics": {"duration": int(execution_time)}, + "assertionRuns": evaluator_runs, + } + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create payload for updating a legacy eval set run.""" + scores_list = [ + {"value": avg_score, "evaluatorId": self.convert_id(eval_id)} + for eval_id, avg_score in evaluator_scores.items() + ] + status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED + return { + "evalSetRunId": eval_set_run_id, + "status": status.value, + "evaluatorScores": scores_list, + } + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, LegacyBaseEvaluator[Any]], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results in legacy assertionRuns format.""" + assertion_runs: list[dict[str, Any]] = [] + evaluator_scores_list: list[dict[str, Any]] = [] + + for eval_result in eval_results: + if eval_result.evaluator_id not in evaluators: + continue + + evaluator_id_value = self.convert_id(eval_result.evaluator_id) + evaluator = evaluators[eval_result.evaluator_id] + justification = serialize_justification_fn(eval_result.result.details) + + evaluator_scores_list.append( + { + "type": eval_result.result.score_type.value, + "value": eval_result.result.score, + 
"justification": justification, + "evaluatorId": evaluator_id_value, + } + ) + + assertion_runs.append( + { + "status": EvaluationStatus.COMPLETED.value, + "evaluatorId": evaluator_id_value, + "completionMetrics": { + "duration": int(eval_result.result.evaluation_time or 0), + "cost": usage_metrics["cost"], + "tokens": usage_metrics["tokens"] or 0, + "completionTokens": usage_metrics["completionTokens"] or 0, + "promptTokens": usage_metrics["promptTokens"] or 0, + }, + "assertionSnapshot": { + "assertionType": evaluator.evaluator_type.name, + "outputKey": evaluator.target_output_key, + }, + } + ) + + return assertion_runs, evaluator_scores_list diff --git a/src/uipath/_cli/_evals/_reporting/_strategies.py b/src/uipath/_cli/_evals/_reporting/_strategies.py index 35a7fa2b6..7100eb698 100644 --- a/src/uipath/_cli/_evals/_reporting/_strategies.py +++ b/src/uipath/_cli/_evals/_reporting/_strategies.py @@ -1,418 +1,15 @@ """Evaluation reporting strategies for legacy and coded evaluations. -This module defines the Strategy Pattern for handling the differences between -legacy and coded evaluation API formats, including ID conversion, endpoint -routing, and payload structure. +This module re-exports strategy classes from their individual modules +for backward compatibility. """ -import uuid -from typing import Any, Callable, Protocol, runtime_checkable +from uipath._cli._evals._reporting._coded_strategy import CodedEvalReportingStrategy +from uipath._cli._evals._reporting._legacy_strategy import LegacyEvalReportingStrategy +from uipath._cli._evals._reporting._strategy_protocol import EvalReportingStrategy -from uipath._cli._evals._models._evaluation_set import ( - EvaluationItem, - EvaluationStatus, -) -from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot -from uipath.eval.evaluators import BaseEvaluator, LegacyBaseEvaluator - -# ============================================================================= -# Strategy Protocol -# ============================================================================= - - -@runtime_checkable -class EvalReportingStrategy(Protocol): - """Protocol for evaluation reporting strategies. - - Strategies handle the differences between legacy and coded evaluation - API formats, including ID conversion, endpoint routing, and payload structure. - """ - - @property - def endpoint_suffix(self) -> str: - """Return the endpoint suffix for this strategy. - - Returns: - "" for legacy, "coded/" for coded evaluations - """ - ... - - def convert_id(self, id_value: str) -> str: - """Convert an ID to the format expected by the backend. - - Args: - id_value: The original string ID - - Returns: - For legacy: deterministic GUID from uuid5 - For coded: original string ID unchanged - """ - ... - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create the payload for creating an eval set run.""" - ... - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create the payload for creating an eval run.""" - ... - - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create the payload for updating an eval run.""" - ... 
- - def create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create the payload for updating an eval set run.""" - ... - - def collect_results( - self, - eval_results: list[Any], - evaluators: dict[str, Any], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results from evaluations in strategy-specific format. - - Returns: - Tuple of (evaluator_runs, evaluator_scores) - """ - ... - - -# ============================================================================= -# Legacy Evaluation Reporting Strategy -# ============================================================================= - - -class LegacyEvalReportingStrategy: - """Strategy for legacy evaluation reporting. - - Legacy evaluations: - - Convert string IDs to deterministic GUIDs using uuid5 - - Use endpoints without /coded/ prefix - - Use assertionRuns format with assertionSnapshot - - Put expectedOutput directly in evalSnapshot - """ - - @property - def endpoint_suffix(self) -> str: - """Return empty string for legacy endpoints (no /coded/ prefix).""" - return "" - - def convert_id(self, id_value: str) -> str: - """Convert string ID to deterministic GUID for legacy API. - - Args: - id_value: The original string ID - - Returns: - The ID as a GUID (either original if valid, or deterministic uuid5) - """ - try: - uuid.UUID(id_value) - return id_value - except ValueError: - return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value)) - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create payload for creating a legacy eval set run.""" - return { - "agentId": project_id, - "evalSetId": self.convert_id(eval_set_id), - "agentSnapshot": agent_snapshot.model_dump(by_alias=True), - "status": EvaluationStatus.IN_PROGRESS.value, - "numberOfEvalsExecuted": no_of_evals, - "source": 0, # EvalRunSource.Manual - } - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create payload for creating a legacy eval run.""" - eval_item_id = self.convert_id(eval_item.id) - - # Extract expectedOutput from evaluation_criterias - expected_output = {} - if eval_item.evaluation_criterias: - first_criteria = next(iter(eval_item.evaluation_criterias.values()), None) - if first_criteria and isinstance(first_criteria, dict): - expected_output = first_criteria.get("expectedOutput", {}) - - return { - "evalSetRunId": eval_set_run_id, - "evalSnapshot": { - "id": eval_item_id, - "name": eval_item.name, - "inputs": eval_item.inputs, - "expectedOutput": expected_output, - }, - "status": EvaluationStatus.IN_PROGRESS.value, - } - - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a legacy eval run.""" - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalRunId": eval_run_id, - "status": status.value, - "result": { - "output": dict(actual_output), - "evaluatorScores": evaluator_scores, - }, - "completionMetrics": {"duration": int(execution_time)}, - "assertionRuns": evaluator_runs, - } - - def 
create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a legacy eval set run.""" - scores_list = [ - {"value": avg_score, "evaluatorId": self.convert_id(eval_id)} - for eval_id, avg_score in evaluator_scores.items() - ] - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalSetRunId": eval_set_run_id, - "status": status.value, - "evaluatorScores": scores_list, - } - - def collect_results( - self, - eval_results: list[Any], - evaluators: dict[str, LegacyBaseEvaluator[Any]], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results in legacy assertionRuns format.""" - assertion_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - for eval_result in eval_results: - if eval_result.evaluator_id not in evaluators: - continue - - evaluator_id_value = self.convert_id(eval_result.evaluator_id) - evaluator = evaluators[eval_result.evaluator_id] - justification = serialize_justification_fn(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": evaluator_id_value, - } - ) - - assertion_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": evaluator_id_value, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time or 0), - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - "assertionSnapshot": { - "assertionType": evaluator.evaluator_type.name, - "outputKey": evaluator.target_output_key, - }, - } - ) - - return assertion_runs, evaluator_scores_list - - -# ============================================================================= -# Coded Evaluation Reporting Strategy -# ============================================================================= - - -class CodedEvalReportingStrategy: - """Strategy for coded evaluation reporting. 
- - Coded evaluations: - - Keep string IDs unchanged - - Use endpoints with /coded/ prefix - - Use evaluatorRuns format with nested result - - Put evaluationCriterias in evalSnapshot - """ - - @property - def endpoint_suffix(self) -> str: - """Return 'coded/' for coded endpoints.""" - return "coded/" - - def convert_id(self, id_value: str) -> str: - """Keep string ID unchanged for coded API.""" - return id_value - - def create_eval_set_run_payload( - self, - eval_set_id: str, - agent_snapshot: StudioWebAgentSnapshot, - no_of_evals: int, - project_id: str, - ) -> dict[str, Any]: - """Create payload for creating a coded eval set run.""" - return { - "agentId": project_id, - "evalSetId": eval_set_id, - "agentSnapshot": agent_snapshot.model_dump(by_alias=True), - "status": EvaluationStatus.IN_PROGRESS.value, - "numberOfEvalsExecuted": no_of_evals, - "source": 0, # EvalRunSource.Manual - } - - def create_eval_run_payload( - self, - eval_item: EvaluationItem, - eval_set_run_id: str, - ) -> dict[str, Any]: - """Create payload for creating a coded eval run.""" - return { - "evalSetRunId": eval_set_run_id, - "evalSnapshot": { - "id": eval_item.id, - "name": eval_item.name, - "inputs": eval_item.inputs, - "evaluationCriterias": eval_item.evaluation_criterias, - }, - "status": EvaluationStatus.IN_PROGRESS.value, - } - - def create_update_eval_run_payload( - self, - eval_run_id: str, - evaluator_runs: list[dict[str, Any]], - evaluator_scores: list[dict[str, Any]], - actual_output: dict[str, Any], - execution_time: float, - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a coded eval run.""" - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalRunId": eval_run_id, - "status": status.value, - "result": { - "output": dict(actual_output), - "scores": evaluator_scores, # Note: "scores" not "evaluatorScores" - }, - "completionMetrics": {"duration": int(execution_time)}, - "evaluatorRuns": evaluator_runs, # Note: "evaluatorRuns" not "assertionRuns" - } - - def create_update_eval_set_run_payload( - self, - eval_set_run_id: str, - evaluator_scores: dict[str, float], - success: bool, - ) -> dict[str, Any]: - """Create payload for updating a coded eval set run.""" - scores_list = [ - {"value": avg_score, "evaluatorId": eval_id} - for eval_id, avg_score in evaluator_scores.items() - ] - status = EvaluationStatus.COMPLETED if success else EvaluationStatus.FAILED - return { - "evalSetRunId": eval_set_run_id, - "status": status.value, - "evaluatorScores": scores_list, - } - - def collect_results( - self, - eval_results: list[Any], - evaluators: dict[str, BaseEvaluator[Any, Any, Any]], - usage_metrics: dict[str, int | float | None], - serialize_justification_fn: Callable[[Any], str | None], - ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: - """Collect results in coded evaluatorRuns format.""" - evaluator_runs: list[dict[str, Any]] = [] - evaluator_scores_list: list[dict[str, Any]] = [] - - for eval_result in eval_results: - if eval_result.evaluator_id not in evaluators: - continue - - justification = serialize_justification_fn(eval_result.result.details) - - evaluator_scores_list.append( - { - "type": eval_result.result.score_type.value, - "value": eval_result.result.score, - "justification": justification, - "evaluatorId": eval_result.evaluator_id, - } - ) - - evaluator_runs.append( - { - "status": EvaluationStatus.COMPLETED.value, - "evaluatorId": eval_result.evaluator_id, - "result": { - "score": { - "type": 
eval_result.result.score_type.value, - "value": eval_result.result.score, - }, - "justification": justification, - }, - "completionMetrics": { - "duration": int(eval_result.result.evaluation_time or 0), - "cost": usage_metrics["cost"], - "tokens": usage_metrics["tokens"] or 0, - "completionTokens": usage_metrics["completionTokens"] or 0, - "promptTokens": usage_metrics["promptTokens"] or 0, - }, - } - ) - - return evaluator_runs, evaluator_scores_list +__all__ = [ + "EvalReportingStrategy", + "LegacyEvalReportingStrategy", + "CodedEvalReportingStrategy", +] diff --git a/src/uipath/_cli/_evals/_reporting/_strategy_protocol.py b/src/uipath/_cli/_evals/_reporting/_strategy_protocol.py new file mode 100644 index 000000000..e817dcea6 --- /dev/null +++ b/src/uipath/_cli/_evals/_reporting/_strategy_protocol.py @@ -0,0 +1,93 @@ +"""Protocol definition for evaluation reporting strategies. + +This module defines the Strategy Protocol for handling the differences between +legacy and coded evaluation API formats. +""" + +from typing import Any, Callable, Protocol, runtime_checkable + +from uipath._cli._evals._models._evaluation_set import EvaluationItem +from uipath._cli._evals._models._sw_reporting import StudioWebAgentSnapshot + + +@runtime_checkable +class EvalReportingStrategy(Protocol): + """Protocol for evaluation reporting strategies. + + Strategies handle the differences between legacy and coded evaluation + API formats, including ID conversion, endpoint routing, and payload structure. + """ + + @property + def endpoint_suffix(self) -> str: + """Return the endpoint suffix for this strategy. + + Returns: + "" for legacy, "coded/" for coded evaluations + """ + ... + + def convert_id(self, id_value: str) -> str: + """Convert an ID to the format expected by the backend. + + Args: + id_value: The original string ID + + Returns: + For legacy: deterministic GUID from uuid5 + For coded: original string ID unchanged + """ + ... + + def create_eval_set_run_payload( + self, + eval_set_id: str, + agent_snapshot: StudioWebAgentSnapshot, + no_of_evals: int, + project_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval set run.""" + ... + + def create_eval_run_payload( + self, + eval_item: EvaluationItem, + eval_set_run_id: str, + ) -> dict[str, Any]: + """Create the payload for creating an eval run.""" + ... + + def create_update_eval_run_payload( + self, + eval_run_id: str, + evaluator_runs: list[dict[str, Any]], + evaluator_scores: list[dict[str, Any]], + actual_output: dict[str, Any], + execution_time: float, + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval run.""" + ... + + def create_update_eval_set_run_payload( + self, + eval_set_run_id: str, + evaluator_scores: dict[str, float], + success: bool, + ) -> dict[str, Any]: + """Create the payload for updating an eval set run.""" + ... + + def collect_results( + self, + eval_results: list[Any], + evaluators: dict[str, Any], + usage_metrics: dict[str, int | float | None], + serialize_justification_fn: Callable[[Any], str | None], + ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + """Collect results from evaluations in strategy-specific format. + + Returns: + Tuple of (evaluator_runs, evaluator_scores) + """ + ...
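
For readers comparing the two strategies side by side, here is a minimal, standalone Python sketch (not part of the patch itself) of the two behaviors the strategies encode: legacy reporting maps arbitrary string IDs to deterministic GUIDs via uuid5 and uses un-prefixed endpoints, while coded reporting passes string IDs through unchanged and routes under the `coded/` prefix. The `select_strategy` helper and the simplified `LegacyLike`/`CodedLike` classes are illustrative stand-ins; the real implementations are `LegacyEvalReportingStrategy`, `CodedEvalReportingStrategy`, and the reporter's `_get_strategy(is_coded)` shown in the diffs above.

```python
import uuid


class LegacyLike:
    """Simplified stand-in for LegacyEvalReportingStrategy (illustration only)."""

    endpoint_suffix = ""  # legacy endpoints carry no /coded/ prefix

    @staticmethod
    def convert_id(id_value: str) -> str:
        # Valid GUIDs pass through; anything else becomes a deterministic uuid5.
        try:
            uuid.UUID(id_value)
            return id_value
        except ValueError:
            return str(uuid.uuid5(uuid.NAMESPACE_DNS, id_value))


class CodedLike:
    """Simplified stand-in for CodedEvalReportingStrategy (illustration only)."""

    endpoint_suffix = "coded/"  # coded endpoints are routed under /coded/

    @staticmethod
    def convert_id(id_value: str) -> str:
        return id_value  # coded API keeps string IDs unchanged


def select_strategy(is_coded: bool):
    # Hypothetical counterpart of the reporter's _get_strategy(is_coded).
    return CodedLike() if is_coded else LegacyLike()


if __name__ == "__main__":
    for is_coded in (False, True):
        strategy = select_strategy(is_coded)
        print(
            f"is_coded={is_coded}: "
            f"suffix={strategy.endpoint_suffix!r}, "
            f"id={strategy.convert_id('default-eval-set-id')}"
        )
```

Running the sketch prints the original string for the coded path and the same uuid5-derived GUID for the legacy path on every invocation, which keeps the IDs reported to the legacy API stable across repeated runs.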