2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath"
version = "2.4.8"
version = "2.4.9"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
40 changes: 37 additions & 3 deletions src/uipath/_cli/_evals/_runtime.py
@@ -38,6 +38,11 @@
from uipath.runtime.logging import UiPathRuntimeExecutionLogHandler
from uipath.runtime.schema import UiPathRuntimeSchema

from uipath._cli._evals._span_utils import (
configure_eval_set_run_span,
configure_evaluation_span,
set_evaluation_output_span_output,
)
from uipath._cli._evals.mocks.cache_manager import CacheManager
from uipath._cli._evals.mocks.input_mocker import (
generate_llm_input,
@@ -355,6 +360,17 @@ async def execute(self) -> UiPathRuntimeResult:
evaluator_averages[eval_id] = (
evaluator_averages[eval_id] / evaluator_count[eval_id]
)

# Configure span with output and metadata
await configure_eval_set_run_span(
span=span,
evaluator_averages=evaluator_averages,
execution_id=self.execution_id,
runtime=runtime,
get_schema_func=self.get_schema,
success=not any_failed,
)

await self.event_bus.publish(
EvaluationEvents.UPDATE_EVAL_SET_RUN,
EvalSetRunUpdatedEvent(
@@ -422,7 +438,7 @@ async def _execute_eval(
"eval_item_id": eval_item.id,
"eval_item_name": eval_item.name,
},
):
) as span:
evaluation_run_results = EvaluationRunResult(
evaluation_name=eval_item.name, evaluation_run_results=[]
)
@@ -583,6 +599,17 @@ async def _execute_eval(
finally:
clear_execution_context()

# Configure span with output and metadata
await configure_evaluation_span(
span=span,
evaluation_run_results=evaluation_run_results,
execution_id=execution_id,
input_data=eval_item.inputs,
agent_execution_output=agent_execution_output
if "agent_execution_output" in locals()
else None,
)

return evaluation_run_results

async def _generate_input_for_eval(
@@ -766,6 +793,7 @@ async def run_evaluator(
}

# Add justification if available
justification = None
if result.details:
if isinstance(result.details, BaseModel):
details_dict = result.details.model_dump()
@@ -779,8 +807,14 @@
with tracer.start_as_current_span(
"Evaluation output",
attributes=eval_output_attrs,
):
pass # Span just records the output, no work needed
) as span:
# Set output using utility function
set_evaluation_output_span_output(
span=span,
score=result.score,
evaluator_id=evaluator.id,
justification=justification,
)

return result

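Taken together, the _runtime.py changes follow one pattern: each evaluation span is now bound with `as span` and enriched after the work completes by a helper from `_span_utils`. The following is a minimal illustrative sketch of that pattern, not part of the diff; `execute_single_eval` and the `SimpleNamespace` stand-ins for the real eval item and `EvaluationRunResult` objects are made up for the example.

import asyncio
from types import SimpleNamespace

from opentelemetry import trace

from uipath._cli._evals._span_utils import configure_evaluation_span

tracer = trace.get_tracer(__name__)


async def execute_single_eval(eval_item, execution_id, agent_results):
    # Bind the span (the `as span` change above) so it can be enriched with
    # the average score, input data, and status once the evaluation finishes.
    with tracer.start_as_current_span(
        "Evaluation",
        attributes={"eval_item_id": eval_item.id, "eval_item_name": eval_item.name},
    ) as span:
        await configure_evaluation_span(
            span=span,
            evaluation_run_results=agent_results,
            execution_id=execution_id,
            input_data=eval_item.inputs,
            agent_execution_output=None,  # error propagation omitted in this sketch
        )
    return agent_results


# Illustrative stand-ins for the real eval item and EvaluationRunResult objects.
eval_item = SimpleNamespace(id="eval-1", name="greeting", inputs={"name": "Ada"})
agent_results = SimpleNamespace(
    evaluation_run_results=[SimpleNamespace(result=SimpleNamespace(score=1.0))]
)
asyncio.run(execute_single_eval(eval_item, "exec-123", agent_results))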
280 changes: 280 additions & 0 deletions src/uipath/_cli/_evals/_span_utils.py
@@ -0,0 +1,280 @@
"""Utility functions for setting evaluation span attributes."""

import json
from typing import Any, Dict, Optional

from opentelemetry.trace import Span, Status, StatusCode
from pydantic import BaseModel, ConfigDict, Field

# Type hint for runtime protocol (avoids circular imports)
try:
from uipath.runtime import UiPathRuntimeProtocol
except ImportError:
UiPathRuntimeProtocol = Any # type: ignore


class EvalSetRunOutput(BaseModel):
"""Output model for Evaluation Set Run span."""

model_config = ConfigDict(populate_by_name=True)

score: int = Field(..., alias="score")


class EvaluationOutput(BaseModel):
"""Output model for Evaluation span."""

model_config = ConfigDict(populate_by_name=True)

score: int = Field(..., alias="score")


class EvaluationOutputSpanOutput(BaseModel):
"""Output model for Evaluation output span."""

model_config = ConfigDict(populate_by_name=True)

type: int = Field(1, alias="type")
value: float = Field(..., alias="value")
evaluator_id: Optional[str] = Field(None, alias="evaluatorId")
justification: Optional[str] = Field(None, alias="justification")


def calculate_overall_score(evaluator_averages: Dict[str, float]) -> float:
"""Calculate overall average score from evaluator averages.

Args:
evaluator_averages: Dictionary mapping evaluator IDs to their average scores

Returns:
Overall average score across all evaluators, or 0.0 if no evaluators
"""
if not evaluator_averages:
return 0.0
return sum(evaluator_averages.values()) / len(evaluator_averages)


def calculate_evaluation_average_score(evaluation_run_results: Any) -> float:
"""Calculate average score from evaluation run results.

Args:
evaluation_run_results: EvaluationRunResult object containing evaluation results

Returns:
Average score across all evaluators, or 0.0 if no results
"""
if not evaluation_run_results.evaluation_run_results:
return 0.0

total_score = sum(
result.result.score for result in evaluation_run_results.evaluation_run_results
)
return total_score / len(evaluation_run_results.evaluation_run_results)


def set_eval_set_run_output_and_metadata(
span: Span,
overall_score: float,
execution_id: str,
input_schema: Optional[Dict[str, Any]],
output_schema: Optional[Dict[str, Any]],
success: bool = True,
) -> None:
"""Set output and metadata attributes for Evaluation Set Run span.

Args:
span: The OpenTelemetry span to set attributes on
overall_score: The overall average score across all evaluators
execution_id: The execution ID for the evaluation set run
input_schema: The input schema from the runtime
output_schema: The output schema from the runtime
success: Whether the evaluation set run was successful
"""
# Set span output with overall score using Pydantic model (formatted for UI rendering)
output = EvalSetRunOutput(score=int(overall_score))
span.set_attribute("output", output.model_dump_json(by_alias=True, indent=2))

# Set metadata attributes
span.set_attribute("agentId", execution_id)
span.set_attribute("agentName", "N/A")

# Set schemas as formatted JSON strings for proper rendering in UI
if input_schema:
try:
span.set_attribute("inputSchema", json.dumps(input_schema, indent=2))
except (TypeError, ValueError):
span.set_attribute("inputSchema", "{}")
else:
span.set_attribute("inputSchema", "{}")

if output_schema:
try:
span.set_attribute("outputSchema", json.dumps(output_schema, indent=2))
except (TypeError, ValueError):
span.set_attribute("outputSchema", "{}")
else:
span.set_attribute("outputSchema", "{}")

# Set span status
if success:
span.set_status(Status(StatusCode.OK))


def set_evaluation_output_and_metadata(
span: Span,
avg_score: float,
execution_id: str,
input_data: Optional[Dict[str, Any]] = None,
has_error: bool = False,
error_message: Optional[str] = None,
) -> None:
"""Set output and metadata attributes for Evaluation span.

Args:
span: The OpenTelemetry span to set attributes on
avg_score: The average score for this evaluation across all evaluators
execution_id: The execution ID for this evaluation
input_data: The input data for this evaluation
has_error: Whether the evaluation had an error
error_message: Optional error message if has_error is True
"""
# Set span output with average score using Pydantic model (formatted for UI rendering)
output = EvaluationOutput(score=int(avg_score))
span.set_attribute("output", output.model_dump_json(by_alias=True, indent=2))

# Set input data if provided (formatted JSON for UI rendering)
if input_data is not None:
try:
span.set_attribute("input", json.dumps(input_data, indent=2))
except (TypeError, ValueError):
span.set_attribute("input", "{}")

# Set metadata attributes
span.set_attribute("agentId", execution_id)
span.set_attribute("agentName", "N/A")

# Set span status based on success
if has_error and error_message:
span.set_status(Status(StatusCode.ERROR, error_message))
elif not has_error:
span.set_status(Status(StatusCode.OK))


def set_evaluation_output_span_output(
span: Span,
score: float,
evaluator_id: Optional[str] = None,
justification: Optional[str] = None,
) -> None:
"""Set output attribute for Evaluation output span.

Args:
span: The OpenTelemetry span to set attributes on
score: The evaluation score
evaluator_id: The ID of the evaluator that produced this score
justification: Optional justification text for the score
"""
# Set output using Pydantic model (formatted for UI rendering)
output = EvaluationOutputSpanOutput(
value=score,
evaluator_id=evaluator_id,
justification=justification,
)
span.set_attribute(
"output", output.model_dump_json(by_alias=True, exclude_none=True, indent=2)
)


# High-level wrapper functions that handle complete flow


async def configure_eval_set_run_span(
span: Span,
evaluator_averages: Dict[str, float],
execution_id: str,
runtime: Any,
get_schema_func: Any,
success: bool = True,
) -> None:
"""Configure Evaluation Set Run span with output and metadata.

This high-level function handles:
- Calculating overall score from evaluator averages
- Getting runtime schemas
- Setting all span attributes

Args:
span: The OpenTelemetry span to configure
evaluator_averages: Dictionary mapping evaluator IDs to their average scores
execution_id: The execution ID for the evaluation set run
runtime: The runtime instance
get_schema_func: Async function to get schema from runtime
success: Whether the evaluation set run was successful
"""
# Calculate overall score
overall_score = calculate_overall_score(evaluator_averages)

# Get runtime schemas
try:
schema = await get_schema_func(runtime)
input_schema = schema.input
output_schema = schema.output
except Exception:
input_schema = None
output_schema = None

# Set span output and metadata
set_eval_set_run_output_and_metadata(
span=span,
overall_score=overall_score,
execution_id=execution_id,
input_schema=input_schema,
output_schema=output_schema,
success=success,
)


async def configure_evaluation_span(
span: Span,
evaluation_run_results: Any,
execution_id: str,
input_data: Optional[Dict[str, Any]] = None,
agent_execution_output: Optional[Any] = None,
) -> None:
"""Configure Evaluation span with output and metadata.

This high-level function handles:
- Calculating average score from evaluation results
- Determining error status
- Setting all span attributes

Args:
span: The OpenTelemetry span to configure
evaluation_run_results: EvaluationRunResult object containing evaluation results
execution_id: The execution ID for this evaluation
input_data: The input data for this evaluation
agent_execution_output: Optional agent execution output for error checking
"""
# Calculate average score
avg_score = calculate_evaluation_average_score(evaluation_run_results)

# Determine error status
has_error = False
error_message = None
if agent_execution_output is not None:
try:
if agent_execution_output.result.error:
has_error = True
error_message = str(agent_execution_output.result.error)
except (AttributeError, NameError, UnboundLocalError):
pass

# Set span output and metadata
set_evaluation_output_and_metadata(
span=span,
avg_score=avg_score,
execution_id=execution_id,
input_data=input_data,
has_error=has_error,
error_message=error_message,
)
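For reference, a short sketch of the lower-level helpers in isolation, wired to a console exporter so the resulting span attributes can be inspected. This assumes the opentelemetry-sdk package is available in the environment; the evaluator IDs and scores are made-up values.

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

from uipath._cli._evals._span_utils import (
    calculate_overall_score,
    set_evaluation_output_span_output,
)

provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)
tracer = trace.get_tracer(__name__)

# Overall score is the mean of the per-evaluator averages: (1.0 + 0.5) / 2 == 0.75.
averages = {"exact-match": 1.0, "llm-judge": 0.5}
assert calculate_overall_score(averages) == 0.75

with tracer.start_as_current_span("Evaluation output") as span:
    # Serializes a camelCase JSON payload into the span's "output" attribute,
    # with keys "type", "value", "evaluatorId", and "justification".
    set_evaluation_output_span_output(
        span=span,
        score=averages["llm-judge"],
        evaluator_id="llm-judge",
        justification="Answer covers the reference points.",
    )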