Skip to content
64 changes: 64 additions & 0 deletions .github/workflows/evals-nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Runs the evaluation suite once per Gemini model, nightly or on demand, and
# publishes each model's JSON results as an artifact and in the job summary.
name: 'Nightly Evaluations'

on:
  schedule:
    - cron: '0 1 * * *' # 1 AM UTC
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads `inputs.iterations`;
      # wire it through to the eval runner or drop the input.
      iterations:
        description: 'Number of iterations per test case'
        required: true
        default: '1'

jobs:
  evaluate:
    runs-on: 'ubuntu-latest'
    permissions:
      contents: 'read'
    strategy:
      # Keep evaluating the remaining models when one matrix job fails. The
      # default (`true`) cancels every in-progress and queued matrix job,
      # which would leave the nightly comparison incomplete.
      fail-fast: false
      matrix:
        model:
          - 'gemini-3-pro-preview'
          - 'gemini-3-flash-preview'
          - 'gemini-2.5-pro'
          - 'gemini-2.5-flash'
          - 'gemini-2.5-flash-lite'
    name: 'Evaluate ${{ matrix.model }}'

    steps:
      - name: 'Checkout code'
        uses: 'actions/checkout@v4' # ratchet:exclude

      - name: 'Set up Node.js'
        uses: 'actions/setup-node@v4' # ratchet:exclude
        with:
          node-version: '20'
          cache: 'npm'

      - name: 'Install dependencies'
        run: |
          npm ci

      - name: 'Install Gemini CLI'
        run: 'npm install -g @google/gemini-cli@latest'

      # The model under test is passed to the eval run via GEMINI_MODEL; the
      # JSON report is written to a per-model file.
      - name: 'Run Evaluations'
        env:
          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
          GEMINI_MODEL: '${{ matrix.model }}'
        run: |
          npm run test:evals -- --reporter=json --outputFile=eval-results-${{ matrix.model }}.json

      # `always()` so the report is still uploaded when the eval step fails.
      - name: 'Upload Results'
        if: 'always()'
        uses: 'actions/upload-artifact@v4' # ratchet:exclude
        with:
          name: 'eval-results-${{ matrix.model }}'
          path: 'eval-results-${{ matrix.model }}.json'

      # Appends the output of scripts/aggregate_evals.ts for this model's
      # report to the job summary, even when an earlier step failed.
      - name: 'Job Summary'
        if: 'always()'
        run: |
          npx tsx scripts/aggregate_evals.ts "eval-results-${{ matrix.model }}.json" >> "$GITHUB_STEP_SUMMARY"
48 changes: 48 additions & 0 deletions evals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Gemini CLI Workflow Evaluations

This directory contains resources for evaluating and improving the example workflows using a TypeScript + Vitest framework.

## Goals

1. **Systematic Testing:** Ensure changes to prompts or configurations improve quality.
2. **Regression Testing:** Catch degradations in performance.
3. **Benchmarking:** Compare different models (e.g., `gemini-2.5-pro` vs `gemini-2.5-flash`).

## Structure

- `evals/`:
- `test-rig.ts`: Utility to setup a temporary environment for the CLI.
- `issue-triage.eval.ts`: Benchmark for the Issue Triage workflow.
- `pr-review.eval.ts`: Benchmark for the PR Review workflow.
- `issue-fixer.eval.ts`: Benchmark for the autonomous Issue Fixer.
- `gemini-assistant.eval.ts`: Benchmark for the interactive Assistant.
- `gemini-scheduled-triage.eval.ts`: Benchmark for batch triage.
- `data/*.json`: Gold-standard datasets for each workflow.
- `vitest.config.ts`: Configuration for the evaluation runner.

## How to Run

### Prerequisites

- Project dependencies installed (`npm install`).
- Gemini CLI installed (e.g. `npm install -g @google/gemini-cli`) and available in your `PATH`.
- `GEMINI_API_KEY` environment variable set.

### Run Locally

```bash
npm run test:evals
```

To run against a specific model:

```bash
GEMINI_MODEL=gemini-2.5-flash npm run test:evals
```

## Adding New Evals

1. Create a new file in `evals/` ending in `.eval.ts`.
2. Add corresponding test data in `evals/data/`.
3. Use the `TestRig` to set up files, environment variables, and run the CLI.
4. Assert the expected behavior (e.g., check `GITHUB_ENV` output or tool calls captured in telemetry).
36 changes: 36 additions & 0 deletions evals/data/gemini-assistant.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[
{
"id": "fix-typo",
"inputs": {
"TITLE": "Fix typo in utils.js",
"DESCRIPTION": "There is a typo in the helper function name.",
"EVENT_NAME": "issues",
"IS_PULL_REQUEST": "false",
"ISSUE_NUMBER": "10",
"REPOSITORY": "owner/repo",
"ADDITIONAL_CONTEXT": "Please fix it."
},
"expected_actions": ["AI Assistant: Plan of Action"],
"expected_plan_keywords": ["search", "grep", "read", "replace", "utils.js"]
},
{
"id": "add-feature",
"inputs": {
"TITLE": "Add login page",
"DESCRIPTION": "We need a login page.",
"EVENT_NAME": "issues",
"IS_PULL_REQUEST": "false",
"ISSUE_NUMBER": "11",
"REPOSITORY": "owner/repo",
"ADDITIONAL_CONTEXT": "Make it pretty."
},
"expected_actions": ["AI Assistant: Plan of Action"],
"expected_plan_keywords": [
"create",
"component",
"structure",
"design",
"implement"
]
}
]
19 changes: 19 additions & 0 deletions evals/data/gemini-scheduled-triage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[
{
"id": "batch-1",
"inputs": {
"AVAILABLE_LABELS": "bug,enhancement,priority/p0",
"ISSUES_TO_TRIAGE": "[{\"number\": 1, \"title\": \"Crash on start\", \"body\": \"It crashes immediately.\"}, {\"number\": 2, \"title\": \"Add help button\", \"body\": \"Users need help.\"}]"
},
"expected": [
{
"issue_number": 1,
"labels_to_set": ["bug", "priority/p0"]
},
{
"issue_number": 2,
"labels_to_set": ["enhancement"]
}
]
}
]
47 changes: 47 additions & 0 deletions evals/data/issue-fixer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
[
{
"id": "new-page-request",
"inputs": {
"REPOSITORY": "owner/repo",
"ISSUE_NUMBER": "1",
"ISSUE_TITLE": "Add a new landing page",
"ISSUE_BODY": "We need a landing page for the new product launch."
},
"expected_actions": ["update_issue", "gh issue comment"],
"expected_plan_keywords": ["explore", "create", "file", "add", "content"]
},
{
"id": "bug-fix-request",
"inputs": {
"REPOSITORY": "owner/repo",
"ISSUE_NUMBER": "2",
"ISSUE_TITLE": "Fix login crash",
"ISSUE_BODY": "The app crashes when the user clicks 'forgot password'."
},
"expected_actions": ["update_issue", "gh issue comment"],
"expected_plan_keywords": [
"search",
"reproduce",
"investigate",
"fix",
"logic"
]
},
{
"id": "dependency-update",
"inputs": {
"REPOSITORY": "owner/repo",
"ISSUE_NUMBER": "5",
"ISSUE_TITLE": "Update lodash to the latest version",
"ISSUE_BODY": "We need to update lodash to address a known security vulnerability in older versions."
},
"expected_actions": ["update_issue", "gh issue comment"],
"expected_plan_keywords": [
"npm",
"install",
"update",
"package.json",
"verify"
]
}
]
72 changes: 72 additions & 0 deletions evals/data/issue-triage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
[
{
"id": "bug-1",
"inputs": {
"ISSUE_TITLE": "Application crashes on startup",
"ISSUE_BODY": "When I launch the app, it immediately closes with a segfault.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["bug"],
"reason": "Explicit mention of crash and segfault."
},
{
"id": "feature-1",
"inputs": {
"ISSUE_TITLE": "Add dark mode",
"ISSUE_BODY": "It would be great to have a dark mode for better visibility at night.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["enhancement"],
"reason": "Request for a new feature (dark mode)."
},
{
"id": "question-1",
"inputs": {
"ISSUE_TITLE": "How to run tests?",
"ISSUE_BODY": "I cannot find instructions on running the unit tests.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["question", "documentation"],
"reason": "Asking for information/instructions regarding documentation."
},
{
"id": "security-1",
"inputs": {
"ISSUE_TITLE": "SQL Injection vulnerability in login form",
"ISSUE_BODY": "I found a way to bypass login using SQL injection on the username field.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["bug", "security"],
"reason": "Specific security vulnerability mentioned."
},
{
"id": "empty-body",
"inputs": {
"ISSUE_TITLE": "Feature request: support pnpm",
"ISSUE_BODY": "",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["enhancement"],
"reason": "Title clearly indicates a feature request despite empty body."
},
{
"id": "vague-bug",
"inputs": {
"ISSUE_TITLE": "It broke",
"ISSUE_BODY": "I was using it and then it just stopped working. No error message.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["bug"],
"reason": "Functional failure reported."
},
{
"id": "translation-req",
"inputs": {
"ISSUE_TITLE": "Traducción al español",
"ISSUE_BODY": "Necesitamos traducir la documentación al español.",
"AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
},
"expected": ["documentation", "enhancement"],
"reason": "Request for documentation work in another language."
}
]
41 changes: 41 additions & 0 deletions evals/data/pr-review.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
[
{
"id": "logic-error",
"inputs": {
"REPOSITORY": "google-github-actions/run-gemini-cli",
"PULL_REQUEST_NUMBER": "454",
"ADDITIONAL_CONTEXT": "Focus on logic errors and edge cases."
},
"expected_tools": [
"pull_request_read.get_diff",
"add_comment_to_pending_review"
],
"expected_findings": ["eval", "untrusted", "calculation", "input"]
},
{
"id": "security-vulnerability",
"inputs": {
"REPOSITORY": "google-github-actions/run-gemini-cli",
"PULL_REQUEST_NUMBER": "454",
"ADDITIONAL_CONTEXT": "Security review requested. Check for injection and data exposure."
},
"expected_tools": [
"pull_request_read.get_diff",
"add_comment_to_pending_review"
],
"expected_findings": ["eval", "injection", "arbitrary", "execution"]
},
{
"id": "performance-optimization",
"inputs": {
"REPOSITORY": "google-github-actions/run-gemini-cli",
"PULL_REQUEST_NUMBER": "454",
"ADDITIONAL_CONTEXT": "The current implementation is slow on large datasets. Look for performance bottlenecks."
},
"expected_tools": [
"pull_request_read.get_diff",
"add_comment_to_pending_review"
],
"expected_findings": ["nested", "loop", "quadratic", "n^2"]
}
]
Loading
Loading