diff --git a/app/en/home/evaluate-tools/_meta.tsx b/app/en/home/evaluate-tools/_meta.tsx index bc4929294..eafc38c4c 100644 --- a/app/en/home/evaluate-tools/_meta.tsx +++ b/app/en/home/evaluate-tools/_meta.tsx @@ -1,5 +1,28 @@ -export default { - "why-evaluate-tools": "Why evaluate tools?", - "create-an-evaluation-suite": "Create an evaluation suite", - "run-evaluations": "Run evaluations", +import type { MetaRecord } from "nextra"; + +const meta: MetaRecord = { + "*": { + theme: { + breadcrumb: true, + toc: true, + copyPage: true, + }, + }, + "why-evaluate-tools": { + title: "Why evaluate tools?", + }, + "create-an-evaluation-suite": { + title: "Create an evaluation suite", + }, + "run-evaluations": { + title: "Run evaluations", + }, + "capture-mode": { + title: "Capture mode", + }, + "comparative-evaluations": { + title: "Comparative evaluations", + }, }; + +export default meta; diff --git a/app/en/home/evaluate-tools/capture-mode/page.mdx b/app/en/home/evaluate-tools/capture-mode/page.mdx new file mode 100644 index 000000000..16bf5f7a5 --- /dev/null +++ b/app/en/home/evaluate-tools/capture-mode/page.mdx @@ -0,0 +1,414 @@ +--- +title: "Capture mode" +description: "Record tool calls without scoring to bootstrap test expectations" +--- + +import { Callout, Steps } from "nextra/components"; + +# Capture mode + +Capture mode records tool calls without evaluating them. Use it to bootstrap test expectations or debug model behavior. + + + **Backward compatibility**: Capture mode works with existing evaluation + suites. Simply add the `--capture` flag to any `arcade evals` command. No code + changes needed. + + +## When to use capture mode + +**Bootstrapping test expectations**: When you don't know what tool calls to expect, run capture mode to see what the model actually calls. + +**Debugging model behavior**: When evaluations fail unexpectedly, capture mode shows exactly what the model is doing. + +**Exploring new tools**: When adding new tools, capture mode helps you understand how models interpret them. + +**Documenting tool usage**: Create examples of how models use your tools in different scenarios. + +### Typical workflow + +``` +1. Create suite with empty expected_tool_calls + ↓ +2. Run: arcade evals . --capture --format json + ↓ +3. Review captured tool calls in output file + ↓ +4. Copy tool calls into expected_tool_calls + ↓ +5. Add critics for validation + ↓ +6. Run: arcade evals . --details +``` + +## Basic usage + + + +### Create an evaluation suite without expectations + +Create a suite with test cases but empty `expected_tool_calls`: + +```python +from arcade_evals import EvalSuite, tool_eval + +@tool_eval() +async def capture_weather_suite(): + suite = EvalSuite( + name="Weather Capture", + system_message="You are a weather assistant.", + ) + + await suite.add_mcp_stdio_server(["python", "weather_server.py"]) + + # Add cases without expected tool calls + suite.add_case( + name="Simple weather query", + user_message="What's the weather in Seattle?", + expected_tool_calls=[], # Empty for capture + ) + + suite.add_case( + name="Multi-city comparison", + user_message="Compare the weather in Seattle and Portland", + expected_tool_calls=[], + ) + + return suite +``` + +### Run in capture mode + +Run evaluations with the `--capture` flag: + +```bash +arcade evals . --capture --file captures/weather --format json +``` + +This creates `captures/weather.json` with all tool calls. 
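To scan the results from a terminal before opening the file, a short script can list the captured calls. This is a minimal sketch that assumes only the output path used above and the JSON structure shown in the next step:

```python
import json
from pathlib import Path

# Load the capture written by the command above
capture = json.loads(Path("captures/weather.json").read_text())

# Print every captured tool call, grouped by case
for case in capture["captured_cases"]:
    print(f"Case: {case['case_name']}")
    for call in case["tool_calls"]:
        print(f"  {call['name']}({call['args']})")
```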
+ +### Review captured output + +Open the JSON file to see what the model called: + +```json +{ + "suite_name": "Weather Capture", + "model": "gpt-4o", + "provider": "openai", + "captured_cases": [ + { + "case_name": "Simple weather query", + "user_message": "What's the weather in Seattle?", + "tool_calls": [ + { + "name": "Weather_GetCurrent", + "args": { + "location": "Seattle", + "units": "fahrenheit" + } + } + ] + } + ] +} +``` + +### Convert to test expectations + +Copy the captured calls into your evaluation suite: + +```python +from arcade_evals import ExpectedMCPToolCall, BinaryCritic + +suite.add_case( + name="Simple weather query", + user_message="What's the weather in Seattle?", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "fahrenheit"} + ) + ], + critics=[ + BinaryCritic(critic_field="location", weight=0.7), + BinaryCritic(critic_field="units", weight=0.3), + ], +) +``` + + + +## CLI options + +### Basic capture + +Record tool calls to JSON: + +```bash +arcade evals . --capture --file captures/baseline --format json +``` + +### Include conversation context + +Capture system messages and conversation history: + +```bash +arcade evals . --capture --add-context --file captures/detailed --format json +``` + +Output includes: + +```json +{ + "case_name": "Weather with context", + "user_message": "What about the weather there?", + "system_message": "You are a weather assistant.", + "additional_messages": [ + {"role": "user", "content": "I'm traveling to Tokyo"}, + {"role": "assistant", "content": "Tokyo is a great city!"} + ], + "tool_calls": [...] +} +``` + +### Multiple formats + +Save captures in multiple formats: + +```bash +arcade evals . --capture --file captures/out --format json,md +``` + +Markdown format is more readable for quick review: + +```markdown +## Weather Capture + +### Model: gpt-4o + +#### Case: Simple weather query + +**Input:** What's the weather in Seattle? + +**Tool Calls:** + +- `Weather_GetCurrent` + - location: Seattle + - units: fahrenheit +``` + +### Multiple providers + +Capture from multiple providers to compare behavior: + +```bash +arcade evals . --capture \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --file captures/comparison --format json +``` + +## Capture with comparative tracks + +Capture from multiple tool sources to see how different implementations behave: + +```python +@tool_eval() +async def capture_comparative(): + suite = EvalSuite( + name="Weather Comparison", + system_message="You are a weather assistant.", + ) + + # Register different tool sources + await suite.add_mcp_server( + "http://weather-api-1.example/mcp", + track="Weather API v1" + ) + + await suite.add_mcp_server( + "http://weather-api-2.example/mcp", + track="Weather API v2" + ) + + # Capture will run against each track + suite.add_case( + name="get_weather", + user_message="What's the weather in Seattle?", + expected_tool_calls=[], + ) + + return suite +``` + +Run capture: + +```bash +arcade evals . 
--capture --file captures/apis --format json +``` + +Output shows captures per track: + +```json +{ + "captured_cases": [ + { + "case_name": "get_weather", + "track_name": "Weather API v1", + "tool_calls": [ + {"name": "GetCurrentWeather", "args": {...}} + ] + }, + { + "case_name": "get_weather", + "track_name": "Weather API v2", + "tool_calls": [ + {"name": "Weather_Current", "args": {...}} + ] + } + ] +} +``` + +## Best practices + +### Start with broad queries + +Begin with open-ended prompts to see natural model behavior: + +```python +suite.add_case( + name="explore_weather_tools", + user_message="Show me everything you can do with weather", + expected_tool_calls=[], +) +``` + +### Capture edge cases + +Record model behavior on unusual inputs: + +```python +suite.add_case( + name="ambiguous_location", + user_message="What's the weather in Portland?", # OR or ME? + expected_tool_calls=[], +) +``` + +### Include context variations + +Capture with different conversation contexts: + +```python +suite.add_case( + name="weather_from_context", + user_message="How about the weather there?", + additional_messages=[ + {"role": "user", "content": "I'm going to Seattle"}, + ], + expected_tool_calls=[], +) +``` + +### Capture multiple providers + +Compare how different models interpret your tools: + +```bash +arcade evals . --capture \ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --file captures/models --format json,md +``` + +## Converting captures to tests + +### Step 1: Identify patterns + +Review captured tool calls to find patterns: + +```json +// Most queries use "fahrenheit" +{"location": "Seattle", "units": "fahrenheit"} +{"location": "Portland", "units": "fahrenheit"} + +// Some use "celsius" +{"location": "Tokyo", "units": "celsius"} +``` + +### Step 2: Create base expectations + +Create expected tool calls based on patterns: + +```python +# Default to fahrenheit for US cities +ExpectedMCPToolCall("GetWeather", {"location": "Seattle", "units": "fahrenheit"}) + +# Use celsius for international cities +ExpectedMCPToolCall("GetWeather", {"location": "Tokyo", "units": "celsius"}) +``` + +### Step 3: Add critics + +Add critics to validate parameters. See [Critics](/home/evaluate-tools/create-an-evaluation-suite#critics) for options. + +### Step 4: Run evaluations + +Test with real evaluations: + +```bash +arcade evals . --details +``` + +### Step 5: Iterate + +Use failures to refine: + +- Adjust expected values +- Change critic weights +- Modify tool descriptions +- Add more test cases + +## Troubleshooting + +### No tool calls captured + +**Symptom:** Empty `tool_calls` arrays + +**Possible causes:** + +1. Model didn't call any tools +2. Tools not properly registered +3. System message doesn't encourage tool use + +**Solution:** + +```python +suite = EvalSuite( + name="Weather", + system_message="You are a weather assistant. Use the available weather tools to answer questions.", +) +``` + +### Missing parameters + +**Symptom:** Some parameters are missing from captured calls + +**Explanation:** Models may omit optional parameters. + +**Solution:** Check if parameters have defaults in your schema. The evaluation framework applies defaults automatically. + +### Different results per provider + +**Symptom:** OpenAI and Anthropic capture different tool calls + +**Explanation:** Providers interpret tool descriptions differently. + +**Solution:** This is expected. 
Use captures to understand provider-specific behavior, then create provider-agnostic tests. + +## Next steps + +- Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) to compare tool sources +- [Create evaluation suites](/home/evaluate-tools/create-an-evaluation-suite) with expectations diff --git a/app/en/home/evaluate-tools/comparative-evaluations/page.mdx b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx new file mode 100644 index 000000000..4358d0884 --- /dev/null +++ b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx @@ -0,0 +1,757 @@ +--- +title: "Comparative evaluations" +description: "Compare different tool implementations with the same test cases" +--- + +# Comparative evaluations + +Comparative evaluations let you test how well AI models select and use tools from different, isolated tool sources. Each "track" represents a separate tool registry, allowing you to compare implementations side-by-side. + +import { Callout, Steps } from "nextra/components"; + +## What are tracks? + +**Tracks are isolated tool registries** within a single evaluation suite. Each track has its own set of tools that are **not shared** with other tracks. This isolation lets you test how models perform when given different tool options for the same task. + +**Key concept**: Comparative evaluations test tool **selection** across different tool sets. Each track provides a different context (set of tools) to the model. + +**Common use cases:** + +- **Compare tool providers**: Test Google Weather vs OpenWeather API +- **Implementation comparison**: Test different MCP servers offering similar functionality +- **A/B testing**: Evaluate alternative tool designs + +### When to use comparative evaluations + +Use **comparative evaluations** when: + +- ✅ Testing multiple implementations of the same functionality +- ✅ Comparing different tool providers +- ✅ Evaluating how models choose between different tool sets + +Use **regular evaluations** when: + +- ✅ Testing a single tool implementation +- ✅ Testing mixed tools from multiple sources in the same context +- ✅ Regression testing + +### Testing mixed tool sources + +To test how multiple MCP servers work **together** in the same context (not isolated), use a regular evaluation and load multiple sources: + +```python +@tool_eval() +async def mixed_tools_eval(): + suite = EvalSuite(name="Mixed Tools", system_message="You are helpful.") + + # All tools available to the model in the same context + await suite.add_mcp_server("http://server1.example") + await suite.add_mcp_server("http://server2.example") + suite.add_tool_definitions([{"name": "CustomTool", ...}]) + + # Model can use any tool from any source + suite.add_case(...) + return suite +``` + +Alternatively, use an Arcade Gateway which aggregates tools from multiple sources. 
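If you go the gateway route, the suite setup collapses to a single registration. This is a minimal sketch, assuming a gateway with the placeholder slug `my-gateway` that already fronts the servers you want to test together:

```python
@tool_eval()
async def gateway_mixed_eval():
    suite = EvalSuite(name="Mixed Tools", system_message="You are helpful.")

    # One gateway registration exposes every aggregated tool in the same context,
    # equivalent to loading the individual servers above.
    await suite.add_arcade_gateway(gateway_slug="my-gateway")

    suite.add_case(...)
    return suite
```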
+ +## Basic comparative evaluation + + + +### Register tools per track + +Create a suite and register tools for each track: + +```python +from arcade_evals import EvalSuite, tool_eval, ExpectedMCPToolCall, BinaryCritic + +@tool_eval() +async def weather_comparison(): + suite = EvalSuite( + name="Weather API Comparison", + system_message="You are a weather assistant.", + ) + + # Track A: Weather API v1 + await suite.add_mcp_server( + "http://weather-v1.example/mcp", + track="Weather v1" + ) + + # Track B: Weather API v2 + await suite.add_mcp_server( + "http://weather-v2.example/mcp", + track="Weather v2" + ) + + return suite +``` + +### Create comparative test case + +Add a test case with track-specific expectations: + +```python +suite.add_comparative_case( + name="get_current_weather", + user_message="What's the weather in Seattle?", +).for_track( + "Weather v1", + expected_tool_calls=[ + ExpectedMCPToolCall( + "GetWeather", + {"city": "Seattle", "type": "current"} + ) + ], + critics=[ + BinaryCritic(critic_field="city", weight=0.7), + BinaryCritic(critic_field="type", weight=0.3), + ], +).for_track( + "Weather v2", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle"} + ) + ], + critics=[ + BinaryCritic(critic_field="location", weight=1.0), + ], +) +``` + +### Run comparative evaluation + +```bash +arcade evals . +``` + +Results show per-track scores: + +``` +Suite: Weather API Comparison + Case: get_current_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 1.00 -- PASSED +``` + + + +## Track registration + +### From MCP HTTP server + +```python +await suite.add_mcp_server( + url="http://localhost:8000", + headers={"Authorization": "Bearer token"}, + track="Production API", +) +``` + +### From MCP stdio server + +```python +await suite.add_mcp_stdio_server( + command=["python", "server_v2.py"], + env={"API_KEY": "secret"}, + track="Version 2", +) +``` + +### From Arcade Gateway + +```python +await suite.add_arcade_gateway( + gateway_slug="weather-gateway", + track="Arcade Gateway", +) +``` + +### Manual tool definitions + +```python +suite.add_tool_definitions( + tools=[ + { + "name": "GetWeather", + "description": "Get weather for a location", + "inputSchema": {...}, + } + ], + track="Custom Tools", +) +``` + + + Tools must be registered before creating comparative cases that reference + their tracks. + + +## Comparative case builder + +The `add_comparative_case()` method returns a builder for defining track-specific expectations. + +### Basic structure + +```python +suite.add_comparative_case( + name="test_case", + user_message="Do something", +).for_track( + "Track A", + expected_tool_calls=[...], + critics=[...], +).for_track( + "Track B", + expected_tool_calls=[...], + critics=[...], +) +``` + +### Optional parameters + +Add conversation context to comparative cases: + +```python +suite.add_comparative_case( + name="weather_with_context", + user_message="What about the weather there?", + system_message="You are helpful.", # Optional override + additional_messages=[ + {"role": "user", "content": "I'm going to Seattle"}, + ], +).for_track("Weather v1", ...).for_track("Weather v2", ...) +``` + +**Bias-aware message design:** + +Design `additional_messages` to avoid leading the model. 
Keep them neutral so you measure tool behavior, not prompt hints: + +```python +# ✅ Good - Neutral +additional_messages=[ + {"role": "user", "content": "I need weather information"}, + {"role": "assistant", "content": "I can help with that. Which location?"}, +] + +# ❌ Avoid - Tells the model which tool to call +additional_messages=[ + {"role": "user", "content": "Use the GetWeather tool for Seattle"}, +] +``` + +Keep messages generic so the model chooses tools naturally based on what is available in the track. + +### Different expectations per track + +Tracks can expose different tools and schemas. Because of that, you may need different critics per track: + +```python +suite.add_comparative_case( + name="search_query", + user_message="Search for Python tutorials", +).for_track( + "Google Search", + expected_tool_calls=[ + ExpectedMCPToolCall("Google_Search", {"query": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="query", weight=1.0)], +).for_track( + "Bing Search", + expected_tool_calls=[ + ExpectedMCPToolCall("Bing_WebSearch", {"q": "Python tutorials"}) + ], + # Different schema, so validate the matching field for this track + critics=[BinaryCritic(critic_field="q", weight=1.0)], +) +``` + +## Complete example + +Here's a full comparative evaluation: + +```python +from arcade_evals import ( + EvalSuite, + tool_eval, + ExpectedMCPToolCall, + BinaryCritic, + SimilarityCritic, +) + +@tool_eval() +async def search_comparison(): + """Compare different search APIs.""" + suite = EvalSuite( + name="Search API Comparison", + system_message="You are a search assistant. Use the available tools to search for information.", + ) + + # Register search providers (MCP servers) + await suite.add_mcp_server( + "http://google-search.example/mcp", + track="Google", + ) + + await suite.add_mcp_server( + "http://bing-search.example/mcp", + track="Bing", + ) + + # Mix with manual tool definitions + suite.add_tool_definitions( + tools=[{ + "name": "DDG_Search", + "description": "Search using DuckDuckGo", + "inputSchema": { + "type": "object", + "properties": { + "query": {"type": "string"} + }, + "required": ["query"] + } + }], + track="DuckDuckGo", + ) + + # Simple query + suite.add_comparative_case( + name="basic_search", + user_message="Search for Python tutorials", + ).for_track( + "Google", + expected_tool_calls=[ + ExpectedMCPToolCall("Search", {"query": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="query", weight=1.0)], + ).for_track( + "Bing", + expected_tool_calls=[ + ExpectedMCPToolCall("WebSearch", {"q": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="q", weight=1.0)], + ).for_track( + "DuckDuckGo", + expected_tool_calls=[ + ExpectedMCPToolCall("DDG_Search", {"query": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="query", weight=1.0)], + ) + + # Query with filters + suite.add_comparative_case( + name="search_with_filters", + user_message="Search for Python tutorials from the last month", + ).for_track( + "Google", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Search", + {"query": "Python tutorials", "time_range": "month"} + ) + ], + critics=[ + SimilarityCritic(critic_field="query", weight=0.7), + BinaryCritic(critic_field="time_range", weight=0.3), + ], + ).for_track( + "Bing", + expected_tool_calls=[ + ExpectedMCPToolCall( + "WebSearch", + {"q": "Python tutorials", "freshness": "Month"} + ) + ], + critics=[ + SimilarityCritic(critic_field="q", weight=0.7), + BinaryCritic(critic_field="freshness", weight=0.3), + ], + 
).for_track( + "DuckDuckGo", + expected_tool_calls=[ + ExpectedMCPToolCall( + "DDG_Search", + {"query": "Python tutorials"} + ) + ], + critics=[ + SimilarityCritic(critic_field="query", weight=1.0), + ], + ) + + return suite +``` + +Run the comparison: + +```bash +arcade evals . --details +``` + +Output shows side-by-side results: + +``` +Suite: Search API Comparison + +Case: basic_search + Track: Google -- Score: 1.00 -- PASSED + Track: Bing -- Score: 1.00 -- PASSED + Track: DuckDuckGo -- Score: 1.00 -- PASSED + +Case: search_with_filters + Track: Google -- Score: 1.00 -- PASSED + Track: Bing -- Score: 0.85 -- WARNED + Track: DuckDuckGo -- Score: 0.90 -- WARNED +``` + +## Result structure + +Comparative results are organized by track: + +```python +{ + "Google": { + "model": "gpt-4o", + "suite_name": "Search API Comparison", + "track_name": "Google", + "rubric": {...}, + "cases": [ + { + "name": "basic_search", + "track": "Google", + "input": "Search for Python tutorials", + "expected_tool_calls": [...], + "predicted_tool_calls": [...], + "evaluation": { + "score": 1.0, + "result": "passed", + ... + } + } + ] + }, + "Bing": {...}, + "DuckDuckGo": {...} +} +``` + +## Mixing regular and comparative cases + +A suite can have both regular and comparative cases: + +```python +@tool_eval() +async def mixed_suite(): + suite = EvalSuite( + name="Mixed Evaluation", + system_message="You are helpful.", + ) + + # Register default tools + await suite.add_mcp_stdio_server(["python", "server.py"]) + + # Regular case (uses default tools) + suite.add_case( + name="regular_test", + user_message="Do something", + expected_tool_calls=[...], + ) + + # Register track-specific tools + await suite.add_mcp_server("http://api-v2.example", track="v2") + + # Comparative case + suite.add_comparative_case( + name="compare_versions", + user_message="Do something else", + ).for_track( + "default", # Uses default tools + expected_tool_calls=[...], + ).for_track( + "v2", # Uses v2 tools + expected_tool_calls=[...], + ) + + return suite +``` + + + Use track name `"default"` to reference tools registered without a track. + + +## Capture mode with tracks + +Capture tool calls from each track separately: + +```bash +arcade evals . --capture --file captures/comparison --format json +``` + +Output includes track names: + +```json +{ + "captured_cases": [ + { + "case_name": "get_weather", + "track_name": "Weather v1", + "tool_calls": [ + {"name": "GetWeather", "args": {...}} + ] + }, + { + "case_name": "get_weather", + "track_name": "Weather v2", + "tool_calls": [ + {"name": "Weather_GetCurrent", "args": {...}} + ] + } + ] +} +``` + +## Multi-model comparative evaluations + +Combine comparative tracks with multiple models: + +```bash +arcade evals . 
\ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` + +Results show: + +- Per-track scores for each model +- Cross-track comparisons for each model +- Cross-model comparisons for each track + +Example output: + +``` +Suite: Weather API Comparison + +Model: gpt-4o + Case: get_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 1.00 -- PASSED + +Model: gpt-4o-mini + Case: get_weather + Track: Weather v1 -- Score: 0.90 -- WARNED + Track: Weather v2 -- Score: 0.95 -- PASSED + +Model: claude-sonnet-4-5-20250929 + Case: get_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 0.85 -- WARNED +``` + +## Best practices + +### Use descriptive track names + +Choose clear names that indicate what's being compared: + +```python +# ✅ Good +track="Weather API v1" +track="OpenWeather Production" +track="Google Weather (Staging)" + +# ❌ Avoid +track="A" +track="Test1" +track="Track2" +``` + +### Keep test cases consistent + +Use the same user message and context across tracks: + +```python +suite.add_comparative_case( + name="get_weather", + user_message="What's the weather in Seattle?", # Same for all tracks +).for_track("v1", ...).for_track("v2", ...) +``` + +### Adjust critics to track differences + +Different tools may have different parameter names or types: + +```python +.for_track( + "Weather v1", + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"city": "Seattle"}) + ], + critics=[ + BinaryCritic(critic_field="city", weight=1.0), # v1 uses "city" + ], +).for_track( + "Weather v2", + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"location": "Seattle"}) + ], + critics=[ + BinaryCritic(critic_field="location", weight=1.0), # v2 uses "location" + ], +) +``` + +### Start with capture mode + +Use capture mode to discover track-specific tool signatures: + +```bash +arcade evals . --capture +``` + +Then create expectations based on captured calls. + +### Test edge cases per track + +Different implementations may handle edge cases differently: + +```python +suite.add_comparative_case( + name="ambiguous_location", + user_message="What's the weather in Portland?", # OR or ME? +).for_track( + "Weather v1", + # v1 defaults to most populous + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"city": "Portland", "state": "OR"}) + ], +).for_track( + "Weather v2", + # v2 requires disambiguation + expected_tool_calls=[ + ExpectedMCPToolCall("DisambiguateLocation", {"city": "Portland"}), + ExpectedMCPToolCall("GetWeather", {"city": "Portland", "state": "OR"}), + ], +) +``` + +## Troubleshooting + +### Track not found + +**Symptom:** `ValueError: Track 'TrackName' not registered` + +**Solution:** Register the track before adding comparative cases: + +```python +# ✅ Correct order +await suite.add_mcp_server(url, track="TrackName") +suite.add_comparative_case(...).for_track("TrackName", ...) + +# ❌ Wrong order - will fail +suite.add_comparative_case(...).for_track("TrackName", ...) +await suite.add_mcp_server(url, track="TrackName") +``` + +### Missing track expectations + +**Symptom:** Case runs against some tracks but not others + +**Explanation:** Comparative cases only run against tracks with `.for_track()` defined. + +**Solution:** Add expectations for all registered tracks: + +```python +suite.add_comparative_case( + name="test", + user_message="...", +).for_track("Track A", ...).for_track("Track B", ...) 
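# Any registered track without its own .for_track() entry is skipped for this case.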
+``` + +### Tool name mismatches + +**Symptom:** "Tool not found" errors in specific tracks + +**Solution:** Check tool names in each track: + +```python +# List tools per track +print(suite.list_tool_names(track="Track A")) +print(suite.list_tool_names(track="Track B")) +``` + +Use the exact tool names from the output. + +### Inconsistent results across tracks + +**Symptom:** Same user message produces different scores across tracks + +**Explanation:** This is expected. Different tool implementations may work differently. + +**Solution:** Adjust expectations and critics per track to account for implementation differences. + +## Advanced patterns + +### Baseline comparison + +Compare new implementations against a baseline: + +```python +await suite.add_mcp_server( + "http://production.example/mcp", + track="Production (Baseline)" +) + +await suite.add_mcp_server( + "http://staging.example/mcp", + track="Staging (New)" +) +``` + +Results show deviations from baseline. + +### Progressive feature testing + +Test feature support across versions: + +```python +suite.add_comparative_case( + name="advanced_filters", + user_message="Search with advanced filters", +).for_track( + "v1", + expected_tool_calls=[], # Not supported +).for_track( + "v2", + expected_tool_calls=[ + ExpectedMCPToolCall("SearchWithFilters", {...}) + ], +) +``` + +### Tool catalog comparison + +Compare Arcade tool catalogs: + +```python +from arcade_core import ToolCatalog +from my_tools import weather_v1, weather_v2 + +catalog_v1 = ToolCatalog() +catalog_v1.add_tool(weather_v1, "Weather") + +catalog_v2 = ToolCatalog() +catalog_v2.add_tool(weather_v2, "Weather") + +suite.add_tool_catalog(catalog_v1, track="Python v1") +suite.add_tool_catalog(catalog_v2, track="Python v2") +``` + +## Next steps + +- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) with tracks +- Use [capture mode](/home/evaluate-tools/capture-mode) to discover track-specific tool calls +- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple models and tracks diff --git a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx index c82f8e626..838cb86bd 100644 --- a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx +++ b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx @@ -3,100 +3,88 @@ title: "Create an evaluation suite" description: "Learn how to evaluate your tools using Arcade" --- -# Evaluate tools +# Create an evaluation suite -In this guide, you'll learn how to evaluate your tools to ensure they are selected and used correctly by an AI model. You'll define evaluation cases and use different critics to assess the outcome of your evaluations. - -We'll create evaluation cases to test the `greet` tool and measure its performance. +Evaluation suites help you test whether AI models use your tools correctly. This guide shows you how to create test cases that measure tool selection and parameter accuracy. 
import { Steps, Tabs, Callout } from "nextra/components"; -### Prerequisites +### Install dependencies -- [Create an MCP Server](/home/build-tools/create-a-mcp-server) -- Install the evaluation dependencies: +Install Arcade with evaluation support: - + - ```bash - uv tool install 'arcade-mcp[evals]' - ``` +```bash +uv tool install 'arcade-mcp[evals]' +``` - ```bash - pip install 'arcade-mcp[evals]' - ``` +```bash +pip install 'arcade-mcp[evals]' +``` -### Create an evaluation suite +### Create an evaluation file -Navigate to your MCP Server's directory +Navigate to your MCP server directory and create a file starting with `eval_`: ```bash cd my_server +touch eval_server.py ``` -Create a new Python file for your evaluations, e.g., `eval_server.py`. - - For evals, the file name should start with `eval_` and be a Python script - (using the `.py` extension). + Evaluation files must start with `eval_` and use the `.py` extension. The CLI + automatically discovers these files. -### Define your evaluation cases +### Define your evaluation suite -Open `eval_server.py` and add the following code: +Create an evaluation suite that loads tools from your MCP server and defines test cases: ```python from arcade_evals import ( - EvalSuite, tool_eval, EvalRubric, - ExpectedToolCall, BinaryCritic -) -from arcade_core import ToolCatalog - -from server import greet - -# Create a catalog of tools to include in the evaluation -catalog = ToolCatalog() -catalog.add_tool(greet, "Greet") - -# Create rubric with tool calls -rubric = EvalRubric( - fail_threshold=0.8, - warn_threshold=0.9, + EvalSuite, + tool_eval, + ExpectedMCPToolCall, + BinaryCritic, ) @tool_eval() -def hello_eval_suite() -> EvalSuite: - """Create an evaluation suite for the hello tool.""" +async def weather_eval_suite() -> EvalSuite: + """Evaluate weather tool usage.""" suite = EvalSuite( - name="MCP Server Evaluation", - catalog=catalog, - system_message="You are a helpful assistant.", - rubric=rubric, + name="Weather Tools", + system_message="You are a helpful weather assistant.", + ) + + # Load tools from your MCP server + await suite.add_mcp_stdio_server( + command=["python", "server.py"], ) + # Add a test case suite.add_case( - name="Simple Greeting", - user_message="Greet Alice", + name="Get weather for city", + user_message="What's the weather in Seattle?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Alice", - }, + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "celsius"} ) ], critics=[ - BinaryCritic(critic_field="name", weight=1.0), + BinaryCritic(critic_field="location", weight=0.7), + BinaryCritic(critic_field="units", weight=0.3), ], ) @@ -105,215 +93,244 @@ def hello_eval_suite() -> EvalSuite: ### Run the evaluation -From the server directory, ensure you have an OpenAI API key set in the `OPENAI_API_KEY` environment variable. Then run: +Set your OpenAI API key and run the evaluation: ```bash -export OPENAI_API_KEY= +export OPENAI_API_KEY= arcade evals . ``` -This command executes your evaluation suite and provides a report. +The command discovers all `eval_*.py` files and executes them using OpenAI's `gpt-4o` model by default. - - By default, the evaluation suite will use the `gpt-4o` model. You can specify - a different model and provider using the `--models` and `--provider` options. - If you are using a different provider, you will need to set the appropriate - API key in an environment variable, or use the `--provider-api-key` option. 
- For more information, see the [Run - evaluations](/home/evaluate-tools/run-evaluations) guide. - +**Using different providers:** -### How it works +```bash +# Anthropic +export ANTHROPIC_API_KEY= +arcade evals . --use-provider anthropic -The evaluation framework in Arcade allows you to define test cases (`EvalCase`) with expected tool calls and use critics to assess an AI model's performance. +# Or specify API key directly +arcade evals . --use-provider anthropic --anthropic-key -Similar to how a unit test suite measures the validity and performance of a function, an eval suite measures how well an AI model understands and uses your tools. +# Multiple models +arcade evals . --use-provider openai:gpt-4o,gpt-4o-mini +``` -### Next steps +See [Run evaluations](/home/evaluate-tools/run-evaluations) for all available options. -- Explore [different types of critics](#critic-classes) and [more complex evaluation cases](#advanced-evaluation-cases) to thoroughly test your tools. -- Understand [how to specify options for your evaluation runs](/home/evaluate-tools/run-evaluations). +### Understand the results - +Evaluation results show: -## Critic classes +- **Passed**: Score meets or exceeds the fail threshold (default: 0.8) +- **Failed**: Score falls below the fail threshold +- **Warned**: Score is between warn and fail thresholds (default: 0.9) -Critics are used to evaluate the correctness of tool calls. For simple tools, "correct" might be binary: is it exactly what we expected? For more complex tools, we might need to evaluate the similarity between expected and actual values, or measure numeric values within an acceptable range. +Example output: -Arcade's evaluation framework provides several critic classes to help you evaluate both exact and "fuzzy" matches between expected and actual values when a model predicts the parameters of a tool call. +``` +Suite: Weather Tools + Model: gpt-4o + PASSED Get weather for city -- Score: 1.00 -### BinaryCritic +Summary -- Total: 1 -- Passed: 1 -- Failed: 0 +``` -Checks if a parameter value matches exactly. +Use `--details` to see critic feedback: -```python -BinaryCritic(critic_field="name", weight=1.0) +```bash +arcade evals . --details ``` -### SimilarityCritic +Detailed output includes per-critic scores: -Evaluates the similarity between expected and actual values. +``` +PASSED Get weather for city -- Score: 1.00 + Details: + location: + Match: True, Score: 0.70/0.70 + units: + Match: True, Score: 0.30/0.30 +``` -```python -from arcade_evals import SimilarityCritic + + +## Loading tools -SimilarityCritic(critic_field="message", weight=1.0) +You can load tools from different sources. All methods are async and must be awaited in your `@tool_eval()` decorated function. + +### From MCP HTTP server + +Load tools from an HTTP or SSE MCP server: + +```python +await suite.add_mcp_server( + url="http://localhost:8000", + headers={"Authorization": "Bearer token"}, # Optional + timeout=10, # Optional: Connection timeout (default: 10) + use_sse=False, # Optional: Use SSE transport (default: False) +) ``` -### NumericCritic +The URL is automatically normalized (appends `/mcp` if not present). + +### From MCP stdio server -Assesses numeric values within a specified tolerance. 
+Load tools from a stdio MCP server process: ```python -from arcade_evals import NumericCritic +await suite.add_mcp_stdio_server( + command=["python", "server.py"], + env={"API_KEY": "secret"}, # Optional: Environment variables + timeout=10, # Optional: Connection timeout (default: 10) +) +``` -NumericCritic(critic_field="score", tolerance=0.1, weight=1.0) +### From Arcade Gateway + +Load tools from an Arcade MCP Gateway: + +```python +await suite.add_arcade_gateway( + gateway_slug="my-gateway", + arcade_api_key="your-api-key", # Optional: Defaults to ARCADE_API_KEY env var + arcade_user_id="user-id", # Optional: Defaults to ARCADE_USER_ID env var + base_url=None, # Optional: Override gateway URL + timeout=10, # Optional: Connection timeout (default: 10) +) ``` -### DatetimeCritic +### Manual tool definitions + +Define tools manually using MCP format: + +````python +suite.add_tool_definitions([ + { + "name": "Weather.GetCurrent", + "description": "Get current weather for a location", + "inputSchema": { + "type": "object", + "properties": { + "location": {"type": "string"}, + "units": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius" + }, + }, + "required": ["location"], + }, + } +]) -Evaluates the closeness of datetime values within a specified tolerance. +### Mixing tool sources + +You can load tools from multiple sources into the same suite: ```python -from datetime import timedelta -from arcade_evals import DatetimeCritic +# Load from multiple MCP servers +await suite.add_mcp_server("http://server1.example") +await suite.add_mcp_server("http://server2.example") -DatetimeCritic(critic_field="start_time", tolerance=timedelta(seconds=10), weight=1.0) -``` +# Mix with manual definitions +suite.add_tool_definitions([{"name": "CustomTool", ...}]) +```` -## Advanced evaluation cases +All tools are accumulated in the suite's registry and available to the model. -You can add more evaluation cases to test different scenarios. +## Expected tool calls - - Ensure that your `greet` tool and evaluation cases are updated accordingly and - that you rerun `arcade evals .` to test your changes. +Expected tool calls define what the model should predict. Use `ExpectedMCPToolCall` with MCP-style tool names: + +```python +ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "celsius"} +) +``` - If your evals fail, use `--details` to see the detailed feedback from each critic. See [Run evaluations](/home/evaluate-tools/run-evaluations) to understand the options available in `arcade evals`. + + Tool names are normalized for compatibility with model tool calling. Dots + (`.`) become underscores (`_`). For example, `Weather.GetCurrent` becomes + `Weather_GetCurrent`. +## Critics -### Example: Greeting with emotion +Critics validate tool call parameters. 
Each critic type handles different validation needs: -Modify your `hello` tool to accept an `emotion` parameter: +| Critic | Use case | Example | +| ------------------ | --------------- | ------------------------------------------------------------------ | +| `BinaryCritic` | Exact match | `BinaryCritic(critic_field="user_id", weight=1.0)` | +| `SimilarityCritic` | Text similarity | `SimilarityCritic(critic_field="message", weight=0.8)` | +| `NumericCritic` | Numeric range | `NumericCritic(critic_field="temp", tolerance=2.0)` | +| `DatetimeCritic` | Time window | `DatetimeCritic(critic_field="due", tolerance=timedelta(hours=1))` | ```python -from enum import Enum - -class Emotion(str, Enum): - HAPPY = "happy" - SLIGHTLY_HAPPY = "slightly happy" - SAD = "sad" - SLIGHTLY_SAD = "slightly sad" - -@app.tool -def greet( - name: Annotated[str, "The name of the person to greet"], - emotion: Annotated[ - Emotion, "The emotion to convey. Defaults to happy if omitted." - ] = Emotion.HAPPY, -) -> Annotated[str, "A greeting to the user"]: - """ - Greet a person by name, optionally with a specific emotion. - """ - return f"Hello {name}! I'm feeling {emotion.value} today." +from arcade_evals import BinaryCritic, SimilarityCritic + +critics=[ + BinaryCritic(critic_field="location", weight=0.7), + SimilarityCritic(critic_field="message", weight=0.3), +] ``` -Add an evaluation case for this new parameter: +All weights are normalized proportionally to sum to 1.0. Use numeric values or `FuzzyWeight` (`CRITICAL`, `HIGH`, `MEDIUM`, `LOW`). -```python -# At the top of the file: -from server import Emotion -from arcade_evals import SimilarityCritic +## Multiple tool calls + +Test cases can include multiple expected tool calls: -# Inside hello_eval_suite(): +```python suite.add_case( - name="Greeting with Emotion", - user_message="Say hello to Bob sadly", + name="Check weather in multiple cities", + user_message="What's the weather in Seattle and Portland?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.SAD, - }, - ) - ], - critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Seattle"}), + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Portland"}), ], ) ``` -Add an evaluation case with additional conversation context: +## Conversation context + +Add conversation history to test cases that require context: ```python suite.add_case( - name="Greeting with Emotion from Context", - user_message="Say hello to Bob based on my current mood.", + name="Weather based on previous location", + user_message="What about the weather there?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.HAPPY, - }, - ) + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Tokyo"}), ], - critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), + additional_messages=[ + {"role": "user", "content": "I'm planning to visit Tokyo next week."}, + {"role": "assistant", "content": "That sounds exciting! What would you like to know about Tokyo?"}, ], - # Add some context to the evaluation case - additional_messages= [ - {"role": "user", "content": "Hi, I'm so happy!"}, - { - "role": "assistant", - "content": "That's awesome! 
What's got you feeling so happy today?", - }, - ] ) ``` -Add an evaluation case with multiple expected tool calls: +Use OpenAI message format for `additional_messages`. Arcade converts it automatically for Anthropic. + +## Rubrics + +Customize pass/fail thresholds with `EvalRubric`. Default: fail at 0.8, warn at 0.9. ```python -suite.add_case( - name="Multiple Greetings with Emotion from Context", - user_message="Say hello to Bob based on my current mood. And then say hello to Alice with slightly less of that emotion.", - expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.HAPPY, - }, - ), - ExpectedToolCall( - func=greet, - args={ - "name": "Alice", - "emotion": Emotion.SLIGHTLY_HAPPY, - }, - ) - ], - critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), - ], - # Add some context to the evaluation case - additional_messages= [ - {"role": "user", "content": "Hi, I'm so happy!"}, - { - "role": "assistant", - "content": "That's awesome! What's got you feeling so happy today?", - }, - ] +from arcade_evals import EvalRubric + +suite = EvalSuite( + name="Strict Evaluation", + system_message="You are helpful.", + rubric=EvalRubric(fail_threshold=0.85, warn_threshold=0.95), ) ``` +If you want stricter suites, increase thresholds (for example `fail_threshold=0.95`). For exploratory testing, lower them (for example `fail_threshold=0.6`). + ## Next steps -- **See an example MCP server with evaluations**: [Source code of a server with evaluations](https://github.com/ArcadeAI/arcade-mcp/tree/139cc2e54db0e5815f1c79dbe9e3285b4fe2bd66/examples/mcp_servers/server_with_evaluations) -- **Learn how to run evaluations**: [Run evaluations](/home/evaluate-tools/run-evaluations) +- Learn how to [run evaluations with different providers](/home/evaluate-tools/run-evaluations) +- Explore [capture mode](/home/evaluate-tools/capture-mode) to record tool calls +- Compare tool sources with [comparative evaluations](/home/evaluate-tools/comparative-evaluations) diff --git a/app/en/home/evaluate-tools/run-evaluations/page.mdx b/app/en/home/evaluate-tools/run-evaluations/page.mdx index 0920ed24c..abd373494 100644 --- a/app/en/home/evaluate-tools/run-evaluations/page.mdx +++ b/app/en/home/evaluate-tools/run-evaluations/page.mdx @@ -3,215 +3,398 @@ title: "Run evaluations" description: "Learn how to run evaluations using Arcade" --- -# Run evaluations with the Arcade CLI +# Run evaluations -The Arcade Evaluation Framework allows you to run evaluations of your tool-enabled language models conveniently using the Arcade CLI. This enables you to execute your evaluation suites, gather results, and analyze the performance of your models in an efficient and streamlined manner. +The `arcade evals` command discovers and executes evaluation suites with support for multiple providers, models, and output formats. - - +import { Callout } from "nextra/components"; -Run evaluations of your tool-enabled language models using the Arcade CLI. + + **Backward compatibility**: All new features (multi-provider support, capture + mode, output formats) work with existing evaluation suites. No code changes + required. + - +## Basic usage - +Run all evaluations in the current directory: -- [Arcade CLI](/home/arcade-cli) -- [An MCP Server](/home/build-tools/create-a-mcp-server) -- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) +```bash +arcade evals . 
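# You can also point at a specific directory of eval files (example path):
arcade evals my_server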
+``` - +The command searches for files starting with `eval_` and ending with `.py`. - +Show detailed results with critic feedback: -- How to use the `arcade evals` CLI command to run evaluations. +```bash +arcade evals . --details +``` + +Filter to show only failures: + +```bash +arcade evals . --failed-only +``` + +## Multi-provider support + +### Single provider with default model + +Use OpenAI with default model (`gpt-4o`): + +```bash +export OPENAI_API_KEY=sk-... +arcade evals . +``` - - +Use Anthropic with default model (`claude-sonnet-4-5-20250929`): -### Using the `arcade evals` Command +```bash +export ANTHROPIC_API_KEY=sk-ant-... +arcade evals . --use-provider anthropic +``` -To run evaluations, use the `arcade evals` command provided by the Arcade CLI. This command searches for evaluation files in the specified directory, executes any functions decorated with `@tool_eval`, and displays the results. +### Specific models -#### Basic Usage +Specify one or more models for a provider: ```bash -arcade evals +arcade evals . --use-provider openai:gpt-4o,gpt-4o-mini ``` -- ``: The directory containing your evaluation files. By default, it searches the current directory (`.`). +### Multiple providers -For example, to run evaluations in the current directory: +Compare performance across providers: ```bash -arcade evals +arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --openai-key sk-... \ + --anthropic-key sk-ant-... ``` -#### Evaluation File Naming Convention +When you specify multiple models, results show side-by-side comparisons. -The `arcade evals` command looks for Python files that start with `eval_` and end with `.py` (e.g., `eval_math_tools.py`, `eval_slack_messaging.py`). These files should contain your evaluation suites. +## API keys -#### Command Options +API keys are resolved in the following order: -The `arcade evals` command supports several options to customize the evaluation process: +| Priority | OpenAI | Anthropic | +| ---------------- | -------------------- | ----------------------- | +| 1. Explicit flag | `--openai-key` | `--anthropic-key` | +| 2. Environment | `OPENAI_API_KEY` | `ANTHROPIC_API_KEY` | +| 3. `.env` file | `OPENAI_API_KEY=...` | `ANTHROPIC_API_KEY=...` | -- `--details`, `-d`: Show detailed results for each evaluation case, including critic feedback. + + Create a `.env` file in your project directory to avoid setting keys in every + terminal session. + - Example: +## Capture mode - ```bash - arcade evals --details . - ``` +Record tool calls without scoring to bootstrap test expectations: -- `--models`, `-m`: Specify the models to use for evaluation. Provide a comma-separated list of model names. +```bash +arcade evals . --capture --file captures/baseline --format json +``` - Example: +Include conversation context in captured output: - ```bash - arcade evals --models gpt-4o,gpt-5 . - ``` +```bash +arcade evals . --capture --add-context --file captures/detailed +``` -- `--max-concurrent`, `-c`: Set the maximum number of concurrent evaluations to run in parallel. +Capture mode is useful for: - Example: +- Creating initial test expectations +- Debugging model behavior +- Understanding tool call patterns - ```bash - arcade evals --max-concurrent 4 . - ``` +See [Capture mode](/home/evaluate-tools/capture-mode) for details. -- `--provider`, `-p`: The provider of the models to use for evaluation. Uses OpenAI by default. 
+## Output formats - Example: +### Save results to files - ```bash - arcade evals --provider openai . - ``` +Save results in one or more formats: -- `--provider-api-key`, `-k`: The model provider API key. If not provided, will look for the appropriate environment variable based on the provider (e.g., OPENAI_API_KEY for openai provider), first in the current environment, then in the current working directory's .env file. +```bash +arcade evals . --file results/out --format md,html +``` + +Save in all formats: - Example: +```bash +arcade evals . --file results/out --format all +``` - ```bash - arcade evals --provider-api-key my-api-key . - ``` +### Available formats -- `--debug`: Show debug information in the CLI. +| Format | Extension | Description | +| ------ | --------- | --------------------------------------------- | +| `txt` | `.txt` | Plain text, pytest-style output | +| `md` | `.md` | Markdown with tables and collapsible sections | +| `html` | `.html` | Interactive HTML report | +| `json` | `.json` | Structured JSON for programmatic use | - Example: +Multiple formats generate separate files: - ```bash - arcade evals --debug . - ``` +- `results/out.txt` +- `results/out.md` +- `results/out.html` +- `results/out.json` -- `--help`: Show help information and exit. +## Command options - Example: +### Quick reference - ```bash - arcade evals --help - ``` +| Flag | Purpose | Example | +| ------------------ | ---------------------- | ------------------------------ | +| `--use-provider` | Select provider/model | `--use-provider openai:gpt-4o` | +| `--capture` | Record without scoring | `--capture --file out` | +| `--details` | Show critic feedback | `--details` | +| `--failed-only` | Filter failures | `--failed-only` | +| `--format` | Output format(s) | `--format md,html,json` | +| `--max-concurrent` | Parallel limit | `--max-concurrent 10` | -#### Example Command +### `--use-provider` -Running evaluations in the `arcade_my_tools/evals` directory, showing detailed results, using the `gpt-5` model: +Specify which provider(s) and model(s) to use: ```bash -arcade evals arcade_my_tools/evals --details --models gpt-5 -k my-openai-api-key +--use-provider [:,,...] ``` -### Execution Process +**Supported providers:** -When you run the `arcade evals` command, the following steps occur: +- `openai` (default: `gpt-4o`) +- `anthropic` (default: `claude-sonnet-4-5-20250929`) -1. **Preparation**: The CLI loads the evaluation suites from the specified directory, looking for files that match the naming convention. + + Anthropic model names include date stamps. Check [Anthropic's model + documentation](https://docs.anthropic.com/en/docs/about-claude/models) for the + latest model versions. + -2. **Execution**: The evaluation suites are executed asynchronously. Each suite's evaluation function, decorated with `@tool_eval`, is called with the appropriate configuration, including the model and concurrency settings. +**Examples:** -3. **Concurrency**: Evaluations can run concurrently based on the `--max-concurrent` setting, improving efficiency. +```bash +# Default model for provider +arcade evals . --use-provider anthropic -4. **Result Aggregation**: Results from all evaluation cases and models are collected and aggregated. +# Specific model +arcade evals . --use-provider openai:gpt-4o-mini -### Displaying Results +# Multiple models from same provider +arcade evals . 
--use-provider openai:gpt-4o,gpt-4o-mini -After the evaluations are complete, the results are displayed in a concise and informative format, similar to testing frameworks like `pytest`. The output includes: +# Multiple providers +arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` -- **Summary**: Shows the total number of cases, how many passed, failed, or issued warnings. +### `--openai-key`, `--anthropic-key` - Example: +Provide API keys explicitly: - ``` - Summary -- Total: 5 -- Passed: 4 -- Failed: 1 - ``` +```bash +arcade evals . --use-provider openai --openai-key sk-... +``` -- **Detailed Case Results**: For each evaluation case, the status (PASSED, FAILED, WARNED), the case name, and the score are displayed. +### `--capture` - Example: +Enable capture mode to record tool calls without scoring: - ``` - PASSED Add two large numbers -- Score: 1.00 - FAILED Send DM with ambiguous username -- Score: 0.75 - ``` +```bash +arcade evals . --capture +``` -- **Critic Feedback**: If the `--details` flag is used, detailed feedback from each critic is provided, highlighting matches, mismatches, and scores for each evaluated field. +### `--add-context` - Example: +Include system messages and conversation history in output: - ``` - Details: - user_name: - Match: False, Score: 0.00/0.50 - Expected: johndoe - Actual: john_doe - message: - Match: True, Score: 0.50/0.50 - ``` +```bash +arcade evals . --add-context --file out --format md +``` -### Interpreting the Results +### `--file` -- **Passed**: The evaluation case met or exceeded the fail threshold specified in the rubric. +Specify output file base name: -- **Failed**: The evaluation case did not meet the fail threshold. +```bash +arcade evals . --file results/evaluation +``` -- **Warnings**: If the score is between the warn threshold and the fail threshold, a warning is issued. +### `--format` -Use the detailed feedback to understand where the model's performance can be improved, particularly focusing on mismatches identified by critics. +Choose output format(s): -### Customizing Evaluations +```bash +arcade evals . --format md,html,json +``` -You can customize the evaluation process by adjusting: +Use `all` for all formats: -- **Rubrics**: Modify fail and warn thresholds, and adjust weights to emphasize different aspects of evaluation. +```bash +arcade evals . --format all +``` -- **Critics**: Add or modify critics in your evaluation cases to target specific arguments or behaviors. +### `--details`, `-d` -- **Concurrency**: Adjust the `--max-concurrent` option to optimize performance based on your environment. +Show detailed results including critic feedback: -### Handling Multiple Models +```bash +arcade evals . --details +``` + +### `--failed-only` + +Show only failed test cases: + +```bash +arcade evals . --failed-only +``` -You can evaluate multiple models in a single run by specifying them in the `--models` option as a comma-separated list. This allows you to compare the performance of different models across the same evaluation suites. +### `--max-concurrent`, `-c` -Example: +Set maximum concurrent evaluations: ```bash -arcade evals . --models gpt-4o,gpt-5 +arcade evals . --max-concurrent 10 ``` -### Considerations +Default is 5 concurrent evaluations. + +### `--debug` + +Show debug information for troubleshooting: + +```bash +arcade evals . 
--debug +``` -- **Evaluation Files**: Ensure your evaluation files are correctly named and contain the evaluation suites decorated with `@tool_eval`. +Displays detailed error traces and connection information. -- **Provider API Keys**: If you are using a different provider, you will need to set the appropriate API key in an environment variable, or use the `--provider-api-key` option. +## Understanding results -- **Tool Catalog**: Ensure your tool catalog is correctly defined and includes all the tools you want to evaluate. +Results are formatted based on evaluation type (regular, multi-model, or comparative) and selected flags. -- **Weight distribution**: Ensure your weight distribution reflects the importance of each critic and that the sum of the weights is `1.0`. +### Summary format + +Results show overall performance: + +``` +Summary -- Total: 5 -- Passed: 4 -- Failed: 1 +``` + +**How flags affect output:** + +- `--details`: Adds per-critic breakdown for each case +- `--failed-only`: Filters to show only failed cases (summary shows original totals) +- `--add-context`: Includes system messages and conversation history +- Multiple models: Switches to comparison table format +- Comparative tracks: Shows side-by-side track comparison + +### Case results + +Each case displays status and score: + +``` +PASSED Get weather for city -- Score: 1.00 +FAILED Weather with invalid city -- Score: 0.65 +``` + +### Detailed feedback + +Use `--details` to see critic-level analysis: + +``` +Details: + location: + Match: False, Score: 0.00/0.70 + Expected: Seattle + Actual: Seatle + units: + Match: True, Score: 0.30/0.30 +``` + +### Multi-model results + +When using multiple models, results show comparison tables: + +``` +Case: Get weather for city + Model: gpt-4o -- Score: 1.00 -- PASSED + Model: gpt-4o-mini -- Score: 0.95 -- WARNED +``` + +## Advanced usage + +### High concurrency for fast execution + +Increase concurrent evaluations: + +```bash +arcade evals . --max-concurrent 20 +``` + + + High concurrency may hit API rate limits. Start with default (5) and increase + gradually. + + +### Save comprehensive results + +Generate all formats with full details: + +```bash +arcade evals . \ + --details \ + --add-context \ + --file results/full-report \ + --format all +``` + +## Troubleshooting + +### Missing dependencies + +If you see `ImportError: MCP SDK is required`, install the full package: + +```bash +pip install 'arcade-mcp[evals]' +``` + +For Anthropic support: + +```bash +pip install anthropic +``` + +### Tool name mismatches + +Tool names are normalized (dots become underscores). If you see unexpected tool names, check your tool definitions and your expected tool calls. + +### API rate limits + +Reduce `--max-concurrent` value: + +```bash +arcade evals . --max-concurrent 2 +``` -## Conclusion +### No evaluation files found -Running evaluations using the Arcade CLI provides a powerful and convenient way to assess the tool-calling capabilities of your language models. By leveraging the `arcade evals` command, you can efficiently execute your evaluation suites, analyze results, and iterate on your models and tools. +Ensure your evaluation files: -Integrating this evaluation process into your development workflow helps ensure that your models interact with tools as expected, enhances reliability, and builds confidence in deploying actionable language models in production environments. 
+- Start with `eval_` +- End with `.py` +- Contain functions decorated with `@tool_eval()` ## Next steps -- **See an example MCP server with evaluations**: [Source code of a server with evaluations](https://github.com/ArcadeAI/arcade-mcp/tree/139cc2e54db0e5815f1c79dbe9e3285b4fe2bd66/examples/mcp_servers/server_with_evaluations) +- Explore [capture mode](/home/evaluate-tools/capture-mode) for recording tool calls +- Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) for comparing tool sources diff --git a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx index 4f2af3a55..b348b8103 100644 --- a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx +++ b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx @@ -3,16 +3,19 @@ title: "Why evaluate tools?" description: "Learn why evaluating your tools is important" --- +import { Callout } from "nextra/components"; + # Why evaluate tools?
- When deploying language models with tool-calling capabilities in production environments, it's essential to ensure their effectiveness and reliability. This evaluation process goes beyond traditional testing and focuses on two key aspects: + Tool evaluations ensure AI models use your tools correctly in production. Unlike traditional testing, evaluations measure two key aspects: + + 1. **Tool selection**: Does the model choose the right tools for the task? + 2. **Parameter accuracy**: Does the model provide correct arguments? - 1. **Tool Utilization**: Assessing how efficiently the language model uses the available tools. - 2. **Intent Understanding**: Evaluating the language model's ability to comprehend user intents and select the appropriate tools to fulfill those intents. + Arcade's evaluation framework helps you validate tool-calling capabilities before deployment, ensuring reliability in real-world applications. You can evaluate tools from MCP servers, Arcade Gateways, or custom implementations. - Arcade's Evaluation Framework provides a comprehensive approach to assess and validate the tool-calling capabilities of language models, ensuring they meet the high standards required for real-world applications.
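+
+The sketch below is a minimal illustration of how these two aspects map onto an evaluation case: the expected tool call covers tool selection, and the critics score parameter accuracy. The `Calendar_CreateEvent` tool, its arguments, and `calendar_server.py` are hypothetical placeholders; see [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) for the full API.
+
+```python
+from arcade_evals import BinaryCritic, EvalSuite, ExpectedMCPToolCall, tool_eval
+
+
+@tool_eval()
+async def calendar_eval_suite():
+    suite = EvalSuite(
+        name="Calendar Tools",
+        system_message="You are a calendar assistant.",
+    )
+
+    # Hypothetical MCP server that exposes a Calendar_CreateEvent tool
+    await suite.add_mcp_stdio_server(["python", "calendar_server.py"])
+
+    suite.add_case(
+        name="Schedule a meeting",
+        user_message="Schedule 'Design review' tomorrow at 3 PM",
+        # Tool selection: the model should call Calendar_CreateEvent
+        expected_tool_calls=[
+            ExpectedMCPToolCall(
+                "Calendar_CreateEvent",
+                {"title": "Design review", "start_time": "15:00"},
+            )
+        ],
+        # Parameter accuracy: critics score each argument
+        critics=[
+            BinaryCritic(critic_field="title", weight=0.5),
+            BinaryCritic(critic_field="start_time", weight=0.5),
+        ],
+    )
+
+    return suite
+```
+
+A wrong tool choice fails tool selection, while a wrong `start_time` only costs that critic's share of the weighted score.
+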
-## Why Evaluate Tool Calling by Task? - -Language models augmented with tool-use capabilities can perform complex tasks by invoking external tools or APIs. However, without proper evaluation, these models might: - -- **Misinterpret user intents**, leading to incorrect tool selection. -- **Provide incorrect arguments** to tools, causing failures or undesired outcomes. -- **Fail to execute the necessary sequence of tool calls**, especially in tasks requiring multiple steps. - -Evaluating tool calling by task ensures that the language model can handle specific scenarios reliably, providing confidence in its performance in production settings. - -## Evaluation Scoring - -Scoring in the evaluation framework is based on comparing the model's actual tool calls with the expected ones for each evaluation case. The total score for a case depends on: - -1. **Tool Selection**: Whether the model selected the correct tools for the task. -2. **Tool Call Arguments**: The correctness of the arguments provided to the tools, evaluated by critics. -3. **Evaluation Rubric**: Each aspect of the evaluation is weighted according to the rubric, affecting its impact on the final score. - -The evaluation result includes: - -- **Score**: A normalized value between 0.0 and 1.0. -- **Result**: - - _Passed_: Score is above the fail threshold. - - _Failed_: Score is below the fail threshold. - - _Warned_: Score is between the warning and fail thresholds. +## What can go wrong? -## Critics: Types and Usage +Without proper evaluation, AI models might: -Critics are essential for evaluating the correctness of tool call arguments. Different types of critics serve various evaluation needs: +- **Misinterpret user intents**, selecting the wrong tools +- **Provide incorrect arguments**, causing failures or unexpected behavior +- **Skip necessary tool calls**, missing steps in multi-step tasks +- **Make incorrect assumptions** about parameter defaults or formats -### BinaryCritic +## How evaluation works -`BinaryCritic`s check for exact matches between expected and actual values after casting. +Evaluations compare the model's actual tool calls with expected tool calls for each test case. -- **Use Case**: When exact values are required (e.g., specific numeric parameters). -- **Example**: Ensuring the model provides the exact user ID in a function call. +### Scoring components -### NumericCritic +1. **Tool selection**: Did the model choose the correct tool? +2. **Parameter evaluation**: Are the arguments correct? (evaluated by critics) +3. **Weighted scoring**: Each aspect has a weight that affects the final score -`NumericCritic` evaluates numeric values within a specified range, allowing for acceptable deviations. +### Evaluation results -- **Use Case**: When values can be approximate but should be within a certain threshold. -- **Example**: Accepting approximate results in mathematical computations due to floating-point precision. +Each test case receives: -### SimilarityCritic +- **Score**: Calculated from weighted critic scores, normalized proportionally (weights can be any positive value) +- **Status**: + - **Passed**: Score meets or exceeds fail threshold (default: 0.8) + - **Failed**: Score falls below fail threshold + - **Warned**: Score is between warn and fail thresholds (default: 0.9) -`SimilarityCritic` measures the similarity between expected and actual string values using metrics like cosine similarity. +Example output: -- **Use Case**: When the exact wording isn't critical, but the content should be similar. 
-- **Example**: Evaluating if the message content in a communication tool is similar to the expected message. - -### DatetimeCritic - -`DatetimeCritic` evaluates the closeness of datetime values within a specified tolerance. - -- **Use Case**: When datetime values should be within a certain range of the expected time. -- **Example**: Verifying if a scheduled event time is close enough to the intended time. - -### Choosing the Right Critic - -- **Exact Matches Needed**: Use **BinaryCritic** for strict equality. -- **Numeric Ranges**: Use **NumericCritic** when a tolerance is acceptable. -- **Textual Similarity**: Use **SimilarityCritic** for comparing messages or descriptions. -- **Datetime Tolerance**: Use **DatetimeCritic** when a tolerance is acceptable for datetime comparisons. - -Critics are defined with fields such as `critic_field`, `weight`, and parameters specific to their types (e.g., `similarity_threshold` for `SimilarityCritic`). +``` +PASSED Get weather for city -- Score: 1.00 +WARNED Send message with typo -- Score: 0.85 +FAILED Wrong tool selected -- Score: 0.50 +``` -## Rubrics and Setting Thresholds +## Next steps -An **EvalRubric** defines the evaluation criteria and thresholds for determining pass/fail outcomes. Key components include: +- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) to start testing your tools +- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple providers +- Explore [capture mode](/home/evaluate-tools/capture-mode) to bootstrap test expectations +- Compare tool sources with [comparative evaluations](/home/evaluate-tools/comparative-evaluations) -- **Fail Threshold**: The minimum score required to pass the evaluation. -- **Warn Threshold**: The score threshold for issuing a warning. -- **Weights**: Assigns importance to different aspects of the evaluation (e.g., tool selection, argument correctness). +## Advanced features -### Setting Up a Rubric +Once you're comfortable with basic evaluations, explore these advanced capabilities: -- **Define Fail and Warn Thresholds**: Choose values between 0.0 and 1.0 to represent acceptable performance levels. -- **Assign Weights**: Allocate weights to tool selection and critics to reflect their importance in the overall evaluation. -- **Configure Failure Conditions**: Set flags like `fail_on_tool_selection` to enforce strict criteria. +### Capture mode -### Example Rubric Configuration: +Record tool calls without scoring to discover what models actually call. Useful for bootstrapping test expectations and debugging. [Learn more →](/home/evaluate-tools/capture-mode) -A rubric that requires a score of at least 0.85 to pass and issues a warning if the score is between 0.85 and 0.95: +### Comparative evaluations -- Fail Threshold: 0.85 -- Warn Threshold: 0.95 -- Fail on Tool Selection: True -- Tool Selection Weight: 1.0 +Test the same cases against different tool sources (tracks) with isolated registries. Compare how models perform with different tool implementations. [Learn more →](/home/evaluate-tools/comparative-evaluations) -```python -rubric = EvalRubric( - fail_threshold=0.85, - warn_threshold=0.95, - fail_on_tool_selection=True, - tool_selection_weight=1.0, -) -``` +### Output formats -## Building an Evaluation Suite - -An **EvalSuite** orchestrates the running of multiple evaluation cases. Here's how to build one: - -1. **Initialize EvalSuite**: Provide a name, system message, tool catalog, and rubric. -2. 
**Add Evaluation Cases**: Use `add_case` or `extend_case` to include various scenarios. -3. **Specify Expected Tool Calls**: Define the tools and arguments expected for each case. -4. **Assign Critics**: Attach critics relevant to each case to evaluate specific arguments. -5. **Run the Suite**: Execute the suite using the Arcade CLI to collect results. - -### Example: Math Tools Evaluation Suite - -An evaluation suite for math tools might include cases such as: - -- **Adding Two Large Numbers**: - - **User Message**: "Add 12345 and 987654321" - - **Expected Tool Call**: `add(a=12345, b=987654321)` - - **Critics**: - - `BinaryCritic` for arguments `a` and `b` -- **Calculating Square Roots**: - - **User Message**: "What is the square root of 3224990521?" - - **Expected Tool Call**: `sqrt(a=3224990521)` - - **Critics**: - - `BinaryCritic` for argument `a` - -### Example: Slack Messaging Tools Evaluation Suite - -An evaluation suite for Slack messaging tools might include cases such as: - -- **Sending a Direct Message**: - - **User Message**: "Send a direct message to johndoe saying 'Hello, can we meet at 3 PM?'" - - **Expected Tool Call**: `send_dm_to_user(user_name='johndoe', message='Hello, can we meet at 3 PM?')` - - **Critics**: - - `BinaryCritic` for `user_name` - - `SimilarityCritic` for `message` -- **Posting a Message to a Channel**: - - **User Message**: "Post 'The new feature is now live!' in the #announcements channel" - - **Expected Tool Call**: `send_message_to_channel(channel_name='announcements', message='The new feature is now live!')` - - **Critics**: - - `BinaryCritic` for `channel_name` - - `SimilarityCritic` for `message` +Save results in multiple formats (txt, md, html, json) for reporting and analysis. Mix formats with `--format md,html,json` or use `--format all`. [Learn more →](/home/evaluate-tools/run-evaluations#output-formats) diff --git a/public/llms.txt b/public/llms.txt index 286da23b9..689592313 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -1,4 +1,4 @@ - + # Arcade @@ -126,6 +126,8 @@ Arcade delivers three core capabilities: Deploy agents even your security team w ## Evaluate Tools +- [Capture mode](https://docs.arcade.dev/en/home/evaluate-tools/capture-mode.md): The "Capture mode" documentation page guides users on how to record tool calls without scoring, enabling them to bootstrap test expectations, debug model behavior, and explore new tools. It outlines typical workflows, basic usage steps, and best practices for capturing and converting +- [Comparative evaluations](https://docs.arcade.dev/en/home/evaluate-tools/comparative-evaluations.md): This documentation page provides guidance on conducting comparative evaluations by running the same test cases against different tool implementations, allowing users to compare tool sources side-by-side. It explains the concept of tracks, outlines the steps for setting up and executing comparative evaluations, and offers - [Evaluate tools](https://docs.arcade.dev/en/home/evaluate-tools/create-an-evaluation-suite.md): This documentation page provides a comprehensive guide on how to create and run an evaluation suite for assessing tools using the Arcade framework. 
Users will learn to define evaluation cases, utilize various critics to measure performance, and execute evaluations to ensure their tools are effectively integrated with - [Run evaluations with the Arcade CLI](https://docs.arcade.dev/en/home/evaluate-tools/run-evaluations.md): This documentation page provides guidance on using the Arcade CLI to run evaluations of tool-enabled language models. It outlines the steps to execute evaluation suites, customize the evaluation process with various command options, and analyze the results efficiently. Users will learn how to utilize the - [Why evaluate tools?](https://docs.arcade.dev/en/home/evaluate-tools/why-evaluate-tools.md): This documentation page explains the importance of evaluating tools used in language models with tool-calling capabilities, focusing on their effectiveness and reliability in production environments. It outlines the evaluation framework, which assesses tool utilization and intent understanding, and details the scoring system based on