Commit 0dce05e

jsondai authored and copybara-github committed
fix: GenAI Client(evals) - Support direct pandas DataFrame dataset in evaluate()
PiperOrigin-RevId: 823070911
1 parent 02ab764 commit 0dce05e

2 files changed: +52 −4 lines changed


tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 39 additions & 0 deletions
@@ -54,6 +54,45 @@ def test_evaluation_result(client):
         assert case_result.response_candidate_results is not None


+def test_evaluation_byor(client):
+    """Tests that evaluate() with BYOR produces a correctly structured EvaluationResult."""
+    byor_df = pd.DataFrame({
+        "prompt": [
+            "Write a simple story about a dinosaur",
+            "Generate a poem about Vertex AI",
+        ],
+        "response": [
+            "Once upon a time, there was a T-Rex named Rexy.",
+            "In clouds of code, a mind of silicon born...",
+        ],
+    })
+
+    metrics_to_run = [
+        types.RubricMetric.GENERAL_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=byor_df,
+        metrics=metrics_to_run,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+        assert summary.mean_score is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),

vertexai/_genai/evals.py

Lines changed: 13 additions & 4 deletions
@@ -970,7 +970,9 @@ def evaluate(
         self,
         *,
         dataset: Union[
-            types.EvaluationDatasetOrDict, list[types.EvaluationDatasetOrDict]
+            pd.DataFrame,
+            types.EvaluationDatasetOrDict,
+            list[types.EvaluationDatasetOrDict],
         ],
         metrics: list[types.MetricOrDict] = None,
         config: Optional[types.EvaluateMethodConfigOrDict] = None,
@@ -979,10 +981,13 @@ def evaluate(
         """Evaluates candidate responses in the provided dataset(s) using the specified metrics.

         Args:
-            dataset: The dataset(s) to evaluate. Can be a single `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
+            dataset: The dataset(s) to evaluate. Can be a pandas DataFrame, a single
+                `types.EvaluationDataset` or a list of `types.EvaluationDataset`.
             metrics: The list of metrics to use for evaluation.
-            config: Optional configuration for the evaluation. Can be a dictionary or a `types.EvaluateMethodConfig` object.
-                - dataset_schema: Schema to use for the dataset. If not specified, the dataset schema will be inferred from the dataset automatically.
+            config: Optional configuration for the evaluation. Can be a dictionary or a
+                `types.EvaluateMethodConfig` object.
+                - dataset_schema: Schema to use for the dataset. If not specified, the
+                    dataset schema will be inferred from the dataset automatically.
                 - dest: Destination path for storing evaluation results.
             **kwargs: Extra arguments to pass to evaluation, such as `agent_info`.
@@ -993,6 +998,10 @@ def evaluate(
             config = types.EvaluateMethodConfig()
         if isinstance(config, dict):
             config = types.EvaluateMethodConfig.model_validate(config)
+
+        if isinstance(dataset, pd.DataFrame):
+            dataset = types.EvaluationDataset(eval_dataset_df=dataset)
+
         if isinstance(dataset, list):
             dataset = [
                 (
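
With this change, evaluate() also accepts a bare pandas DataFrame: the new isinstance check in the last hunk wraps it in types.EvaluationDataset(eval_dataset_df=...) before the existing dict/list handling runs. The sketch below illustrates how that path could be used; it is not part of the commit, and the client construction plus the types import path are assumptions inferred from the repository layout (vertexai/_genai), so adjust them to your installed SDK.

# Minimal usage sketch of the new DataFrame path (illustrative, not from the commit).
# Assumptions: the vertexai.Client(...) construction and the vertexai._genai.types
# import path are inferred from the repository layout; project/location are placeholders.
import pandas as pd

import vertexai
from vertexai._genai import types

client = vertexai.Client(project="my-project", location="us-central1")

# A plain DataFrame can now be passed directly; evaluate() wraps it as
# types.EvaluationDataset(eval_dataset_df=df) before scoring the responses.
df = pd.DataFrame({
    "prompt": ["Write a haiku about the ocean"],
    "response": ["Waves fold into foam, the tide keeps its own slow time."],
})

result = client.evals.evaluate(
    dataset=df,
    metrics=[types.RubricMetric.GENERAL_QUALITY],
)
print(result.summary_metrics)

Before this commit the caller had to wrap the DataFrame in types.EvaluationDataset(eval_dataset_df=df) themselves; the conversion now happens inside evaluate(), as the third hunk above shows.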
