google-github-actions · cocosheng-g · Feb 4, 2026 · Feb 4, 2026 · Feb 5, 2026 · Feb 5, 2026
@@ -0,0 +1,64 @@
+# Nightly evaluation workflow.
+#
+# Runs `npm run test:evals` once per Gemini model listed in the matrix, uploads
+# each model's JSON report as an artifact, and appends the output of
+# `scripts/aggregate_evals.ts` to the job summary.
+name: 'Nightly Evaluations'
+
+on:
+  schedule:
+    - cron: '0 1 * * *' # 1 AM UTC
+  workflow_dispatch:
+    inputs:
+      # NOTE(review): no step in this workflow references `inputs.iterations`,
+      # so the value entered on manual dispatch currently has no effect here.
+      # Confirm whether the eval runner is meant to receive it (e.g. via env).
+      iterations:
+        description: 'Number of iterations per test case'
+        required: true
+        default: '1'
+
+jobs:
+  evaluate:
+    runs-on: 'ubuntu-latest'
+    permissions:
+      contents: 'read'
+    strategy:
+      # Keep evaluating the remaining models when one model's job fails; the
+      # default (`fail-fast: true`) would cancel every other matrix job and
+      # leave the nightly comparison incomplete.
+      fail-fast: false
+      matrix:
+        model:
+          [
+            'gemini-3-pro-preview',
+            'gemini-3-flash-preview',
+            'gemini-2.5-pro',
+            'gemini-2.5-flash',
+            'gemini-2.5-flash-lite',
+          ]
+    name: 'Evaluate ${{ matrix.model }}'
+
+    steps:
+      - name: 'Checkout code'
+        uses: 'actions/checkout@v4' # ratchet:exclude
+
+      - name: 'Set up Node.js'
+        uses: 'actions/setup-node@v4' # ratchet:exclude
+        with:
+          node-version: '20'
+          cache: 'npm'
+
+      - name: 'Install dependencies'
+        run: |
+          npm ci
+
+      - name: 'Install Gemini CLI'
+        run: 'npm install -g @google/gemini-cli@latest'
+
+      # The model under test is handed to the eval run through GEMINI_MODEL;
+      # results are written to a per-model JSON file.
+      - name: 'Run Evaluations'
+        env:
+          GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
+          GEMINI_MODEL: '${{ matrix.model }}'
+        run: |
+          npm run test:evals -- --reporter=json --outputFile=eval-results-${{ matrix.model }}.json
+
+      # `always()` so the report is still published when the evaluation step
+      # fails.
+      - name: 'Upload Results'
+        if: 'always()'
+        uses: 'actions/upload-artifact@v4' # ratchet:exclude
+        with:
+          name: 'eval-results-${{ matrix.model }}'
+          path: 'eval-results-${{ matrix.model }}.json'
+
+      # `always()` so a summary is attempted even after a failed evaluation.
+      - name: 'Job Summary'
+        if: 'always()'
+        run: |
+          npx tsx scripts/aggregate_evals.ts "eval-results-${{ matrix.model }}.json" >> "$GITHUB_STEP_SUMMARY"
@@ -0,0 +1,48 @@
+# Gemini CLI Workflow Evaluations
+
+This directory contains resources for evaluating and improving the example workflows using a TypeScript + Vitest framework.
+
+## Goals
+
+1.  **Systematic Testing:** Ensure changes to prompts or configurations improve quality.
+2.  **Regression Testing:** Catch degradations in performance.
+3.  **Benchmarking:** Compare different models (e.g., `gemini-2.5-pro` vs `gemini-2.5-flash`).
+
+## Structure
+
+- `evals/`:
+  - `test-rig.ts`: Utility to set up a temporary environment for the CLI.
+  - `issue-triage.eval.ts`: Benchmark for the Issue Triage workflow.
+  - `pr-review.eval.ts`: Benchmark for the PR Review workflow.
+  - `issue-fixer.eval.ts`: Benchmark for the autonomous Issue Fixer.
+  - `gemini-assistant.eval.ts`: Benchmark for the interactive Assistant.
+  - `gemini-scheduled-triage.eval.ts`: Benchmark for batch triage.
+  - `data/*.jsonl`: Gold-standard datasets for each workflow.
+  - `vitest.config.ts`: Configuration for the evaluation runner.
+
+## How to Run
+
+### Prerequisites
+
+- Project dependencies installed (`npm install`).
+- Gemini CLI installed (`npm install -g @google/gemini-cli`) and available in your PATH.
+- `GEMINI_API_KEY` environment variable set.
+
+### Run Locally
+
+```bash
+npm run test:evals
+```
+
+To run against a specific model:
+
+```bash
+GEMINI_MODEL=gemini-2.5-flash npm run test:evals
+```
+
+## Adding New Evals
+
+1. Create a new file in `evals/` ending in `.eval.ts`.
+2. Add corresponding test data in `evals/data/`.
+3. Use the `TestRig` to set up files, environment variables, and run the CLI.
+4. Assert the expected behavior (e.g., check `GITHUB_ENV` output or tool calls captured in telemetry).
@@ -0,0 +1,36 @@
+[
+  {
+    "id": "fix-typo",
+    "inputs": {
+      "TITLE": "Fix typo in utils.js",
+      "DESCRIPTION": "There is a typo in the helper function name.",
+      "EVENT_NAME": "issues",
+      "IS_PULL_REQUEST": "false",
+      "ISSUE_NUMBER": "10",
+      "REPOSITORY": "owner/repo",
+      "ADDITIONAL_CONTEXT": "Please fix it."
+    },
+    "expected_actions": ["AI Assistant: Plan of Action"],
+    "expected_plan_keywords": ["search", "grep", "read", "replace", "utils.js"]
+  },
+  {
+    "id": "add-feature",
+    "inputs": {
+      "TITLE": "Add login page",
+      "DESCRIPTION": "We need a login page.",
+      "EVENT_NAME": "issues",
+      "IS_PULL_REQUEST": "false",
+      "ISSUE_NUMBER": "11",
+      "REPOSITORY": "owner/repo",
+      "ADDITIONAL_CONTEXT": "Make it pretty."
+    },
+    "expected_actions": ["AI Assistant: Plan of Action"],
+    "expected_plan_keywords": [
+      "create",
+      "component",
+      "structure",
+      "design",
+      "implement"
+    ]
+  }
+]
@@ -0,0 +1,19 @@
+[
+  {
+    "id": "batch-1",
+    "inputs": {
+      "AVAILABLE_LABELS": "bug,enhancement,priority/p0",
+      "ISSUES_TO_TRIAGE": "[{\"number\": 1, \"title\": \"Crash on start\", \"body\": \"It crashes immediately.\"}, {\"number\": 2, \"title\": \"Add help button\", \"body\": \"Users need help.\"}]"
+    },
+    "expected": [
+      {
+        "issue_number": 1,
+        "labels_to_set": ["bug", "priority/p0"]
+      },
+      {
+        "issue_number": 2,
+        "labels_to_set": ["enhancement"]
+      }
+    ]
+  }
+]
@@ -0,0 +1,47 @@
+[
+  {
+    "id": "new-page-request",
+    "inputs": {
+      "REPOSITORY": "owner/repo",
+      "ISSUE_NUMBER": "1",
+      "ISSUE_TITLE": "Add a new landing page",
+      "ISSUE_BODY": "We need a landing page for the new product launch."
+    },
+    "expected_actions": ["update_issue", "gh issue comment"],
+    "expected_plan_keywords": ["explore", "create", "file", "add", "content"]
+  },
+  {
+    "id": "bug-fix-request",
+    "inputs": {
+      "REPOSITORY": "owner/repo",
+      "ISSUE_NUMBER": "2",
+      "ISSUE_TITLE": "Fix login crash",
+      "ISSUE_BODY": "The app crashes when the user clicks 'forgot password'."
+    },
+    "expected_actions": ["update_issue", "gh issue comment"],
+    "expected_plan_keywords": [
+      "search",
+      "reproduce",
+      "investigate",
+      "fix",
+      "logic"
+    ]
+  },
+  {
+    "id": "dependency-update",
+    "inputs": {
+      "REPOSITORY": "owner/repo",
+      "ISSUE_NUMBER": "5",
+      "ISSUE_TITLE": "Update lodash to the latest version",
+      "ISSUE_BODY": "We need to update lodash to address a known security vulnerability in older versions."
+    },
+    "expected_actions": ["update_issue", "gh issue comment"],
+    "expected_plan_keywords": [
+      "npm",
+      "install",
+      "update",
+      "package.json",
+      "verify"
+    ]
+  }
+]
@@ -0,0 +1,72 @@
+[
+  {
+    "id": "bug-1",
+    "inputs": {
+      "ISSUE_TITLE": "Application crashes on startup",
+      "ISSUE_BODY": "When I launch the app, it immediately closes with a segfault.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["bug"],
+    "reason": "Explicit mention of crash and segfault."
+  },
+  {
+    "id": "feature-1",
+    "inputs": {
+      "ISSUE_TITLE": "Add dark mode",
+      "ISSUE_BODY": "It would be great to have a dark mode for better visibility at night.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["enhancement"],
+    "reason": "Request for a new feature (dark mode)."
+  },
+  {
+    "id": "question-1",
+    "inputs": {
+      "ISSUE_TITLE": "How to run tests?",
+      "ISSUE_BODY": "I cannot find instructions on running the unit tests.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["question", "documentation"],
+    "reason": "Asking for information/instructions regarding documentation."
+  },
+  {
+    "id": "security-1",
+    "inputs": {
+      "ISSUE_TITLE": "SQL Injection vulnerability in login form",
+      "ISSUE_BODY": "I found a way to bypass login using SQL injection on the username field.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["bug", "security"],
+    "reason": "Specific security vulnerability mentioned."
+  },
+  {
+    "id": "empty-body",
+    "inputs": {
+      "ISSUE_TITLE": "Feature request: support pnpm",
+      "ISSUE_BODY": "",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["enhancement"],
+    "reason": "Title clearly indicates a feature request despite empty body."
+  },
+  {
+    "id": "vague-bug",
+    "inputs": {
+      "ISSUE_TITLE": "It broke",
+      "ISSUE_BODY": "I was using it and then it just stopped working. No error message.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["bug"],
+    "reason": "Functional failure reported."
+  },
+  {
+    "id": "translation-req",
+    "inputs": {
+      "ISSUE_TITLE": "Traducción al español",
+      "ISSUE_BODY": "Necesitamos traducir la documentación al español.",
+      "AVAILABLE_LABELS": "bug,enhancement,question,documentation,security,duplicate,wontfix"
+    },
+    "expected": ["documentation", "enhancement"],
+    "reason": "Request for documentation work in another language."
+  }
+]
@@ -0,0 +1,41 @@
+[
+  {
+    "id": "logic-error",
+    "inputs": {
+      "REPOSITORY": "google-github-actions/run-gemini-cli",
+      "PULL_REQUEST_NUMBER": "454",
+      "ADDITIONAL_CONTEXT": "Focus on logic errors and edge cases."
+    },
+    "expected_tools": [
+      "pull_request_read.get_diff",
+      "add_comment_to_pending_review"
+    ],
+    "expected_findings": ["eval", "untrusted", "calculation", "input"]
+  },
+  {
+    "id": "security-vulnerability",
+    "inputs": {
+      "REPOSITORY": "google-github-actions/run-gemini-cli",
+      "PULL_REQUEST_NUMBER": "454",
+      "ADDITIONAL_CONTEXT": "Security review requested. Check for injection and data exposure."
+    },
+    "expected_tools": [
+      "pull_request_read.get_diff",
+      "add_comment_to_pending_review"
+    ],
+    "expected_findings": ["eval", "injection", "arbitrary", "execution"]
+  },
+  {
+    "id": "performance-optimization",
+    "inputs": {
+      "REPOSITORY": "google-github-actions/run-gemini-cli",
+      "PULL_REQUEST_NUMBER": "454",
+      "ADDITIONAL_CONTEXT": "The current implementation is slow on large datasets. Look for performance bottlenecks."
+    },
+    "expected_tools": [
+      "pull_request_read.get_diff",
+      "add_comment_to_pending_review"
+    ],
+    "expected_findings": ["nested", "loop", "quadratic", "n^2"]
+  }
+]