From b6366fc9c79ed58d52a7eadddc6a4c6142329558 Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 29 Dec 2025 13:10:39 -0300 Subject: [PATCH 1/5] adding eval updted docs= --- app/_components/scope-picker.tsx | 2 + app/en/home/evaluate-tools/_meta.tsx | 3 + .../home/evaluate-tools/capture-mode/page.mdx | 612 +++++++++++++++ .../comparative-evaluations/page.mdx | 704 ++++++++++++++++++ .../create-an-evaluation-suite/page.mdx | 450 ++++++----- .../provider-compatibility/page.mdx | 496 ++++++++++++ .../evaluate-tools/run-evaluations/page.mdx | 405 +++++++--- .../why-evaluate-tools/page.mdx | 373 +++++++--- 8 files changed, 2652 insertions(+), 393 deletions(-) create mode 100644 app/en/home/evaluate-tools/capture-mode/page.mdx create mode 100644 app/en/home/evaluate-tools/comparative-evaluations/page.mdx create mode 100644 app/en/home/evaluate-tools/provider-compatibility/page.mdx diff --git a/app/_components/scope-picker.tsx b/app/_components/scope-picker.tsx index 782e1f9ea..6fa129a3d 100644 --- a/app/_components/scope-picker.tsx +++ b/app/_components/scope-picker.tsx @@ -120,3 +120,5 @@ export default function ScopePicker({ tools }: ScopePickerProps) { ); } + + diff --git a/app/en/home/evaluate-tools/_meta.tsx b/app/en/home/evaluate-tools/_meta.tsx index bc4929294..71434ccac 100644 --- a/app/en/home/evaluate-tools/_meta.tsx +++ b/app/en/home/evaluate-tools/_meta.tsx @@ -2,4 +2,7 @@ export default { "why-evaluate-tools": "Why evaluate tools?", "create-an-evaluation-suite": "Create an evaluation suite", "run-evaluations": "Run evaluations", + "capture-mode": "Capture mode", + "comparative-evaluations": "Comparative evaluations", + "provider-compatibility": "Provider compatibility", }; diff --git a/app/en/home/evaluate-tools/capture-mode/page.mdx b/app/en/home/evaluate-tools/capture-mode/page.mdx new file mode 100644 index 000000000..6324dc079 --- /dev/null +++ b/app/en/home/evaluate-tools/capture-mode/page.mdx @@ -0,0 +1,612 @@ +--- +title: "Capture mode" +description: "Record tool calls without scoring to bootstrap test expectations" +--- + +# Capture mode + +Capture mode records tool calls without evaluating them. Use it to bootstrap test expectations or debug model behavior. + +import { Callout, Steps } from "nextra/components"; + +## When to use capture mode + +**Bootstrapping test expectations**: When you don't know what tool calls to expect, run capture mode to see what the model actually calls. + +**Debugging model behavior**: When evaluations fail unexpectedly, capture mode shows exactly what the model is doing. + +**Exploring new tools**: When adding new tools, capture mode helps you understand how models interpret them. + +**Documenting tool usage**: Create examples of how models use your tools in different scenarios. + +### Typical workflow + +``` +1. Create suite with empty expected_tool_calls + ↓ +2. Run: arcade evals . --capture --format json + ↓ +3. Review captured tool calls in output file + ↓ +4. Copy tool calls into expected_tool_calls + ↓ +5. Add critics for validation + ↓ +6. Run: arcade evals . 
--details +``` + +## Basic usage + + + +### Create an evaluation suite without expectations + +Create a suite with test cases but empty `expected_tool_calls`: + +```python +from arcade_evals import EvalSuite, tool_eval + +@tool_eval() +async def capture_weather_suite(): + suite = EvalSuite( + name="Weather Capture", + system_message="You are a weather assistant.", + ) + + await suite.add_mcp_stdio_server(["python", "weather_server.py"]) + + # Add cases without expected tool calls + suite.add_case( + name="Simple weather query", + user_message="What's the weather in Seattle?", + expected_tool_calls=[], # Empty for capture + ) + + suite.add_case( + name="Multi-city comparison", + user_message="Compare the weather in Seattle and Portland", + expected_tool_calls=[], + ) + + return suite +``` + +### Run in capture mode + +Run evaluations with the `--capture` flag: + +```bash +arcade evals . --capture --file captures/weather --format json +``` + +This creates `captures/weather.json` with all tool calls. + +### Review captured output + +Open the JSON file to see what the model called: + +```json +{ + "suite_name": "Weather Capture", + "model": "gpt-4o", + "provider": "openai", + "captured_cases": [ + { + "case_name": "Simple weather query", + "user_message": "What's the weather in Seattle?", + "tool_calls": [ + { + "name": "Weather_GetCurrent", + "args": { + "location": "Seattle", + "units": "fahrenheit" + } + } + ] + } + ] +} +``` + +### Convert to test expectations + +Copy the captured calls into your evaluation suite: + +```python +from arcade_evals import ExpectedMCPToolCall, BinaryCritic + +suite.add_case( + name="Simple weather query", + user_message="What's the weather in Seattle?", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "fahrenheit"} + ) + ], + critics=[ + BinaryCritic(critic_field="location", weight=0.7), + BinaryCritic(critic_field="units", weight=0.3), + ], +) +``` + + + +## CLI options + +### Basic capture + +Record tool calls to JSON: + +```bash +arcade evals . --capture --file captures/baseline --format json +``` + +### Include conversation context + +Capture system messages and conversation history: + +```bash +arcade evals . --capture --add-context --file captures/detailed --format json +``` + +Output includes: + +```json +{ + "case_name": "Weather with context", + "user_message": "What about the weather there?", + "system_message": "You are a weather assistant.", + "additional_messages": [ + {"role": "user", "content": "I'm traveling to Tokyo"}, + {"role": "assistant", "content": "Tokyo is a great city!"} + ], + "tool_calls": [...] +} +``` + +### Multiple formats + +Save captures in multiple formats: + +```bash +arcade evals . --capture --file captures/out --format json,md +``` + +Markdown format is more readable for quick review: + +```markdown +## Weather Capture + +### Model: gpt-4o + +#### Case: Simple weather query + +**Input:** What's the weather in Seattle? + +**Tool Calls:** +- `Weather_GetCurrent` + - location: Seattle + - units: fahrenheit +``` + +### Multiple providers + +Capture from multiple providers to compare behavior: + +```bash +arcade evals . 
--capture \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --file captures/comparison --format json +``` + +## Programmatic capture + +Use capture mode from Python code: + +```python +import asyncio +from openai import AsyncOpenAI +from arcade_evals import EvalSuite + +async def capture_tool_calls(): + suite = EvalSuite(name="Weather", system_message="You are helpful.") + await suite.add_mcp_stdio_server(["python", "server.py"]) + + suite.add_case( + name="weather_query", + user_message="What's the weather in Seattle?", + expected_tool_calls=[], + ) + + client = AsyncOpenAI(api_key="sk-...") + + result = await suite.capture( + client=client, + model="gpt-4o", + provider="openai", + include_context=True, + ) + + # Access captured data + for case in result.captured_cases: + print(f"Case: {case.case_name}") + for tool_call in case.tool_calls: + print(f" Tool: {tool_call.name}") + print(f" Args: {tool_call.args}") + + # Save to file + result.write_to_file("captures/output.json", include_context=True) + + return result + +asyncio.run(capture_tool_calls()) +``` + +## Capture result structure + +### CaptureResult + +Top-level capture result: + +```python +@dataclass +class CaptureResult: + suite_name: str + model: str + provider: str + captured_cases: list[CapturedCase] +``` + +Methods: +- `to_dict(include_context=False)` → dict +- `to_json(include_context=False, indent=2)` → JSON string +- `write_to_file(file_path, include_context=False, indent=2)` → None + +### CapturedCase + +Individual test case result: + +```python +@dataclass +class CapturedCase: + case_name: str + user_message: str + tool_calls: list[CapturedToolCall] + system_message: str | None = None + additional_messages: list[dict] | None = None + track_name: str | None = None +``` + +### CapturedToolCall + +Individual tool call: + +```python +@dataclass +class CapturedToolCall: + name: str + args: dict[str, Any] +``` + +## Capture with comparative tracks + +Capture from multiple tool sources to see how different implementations behave: + +```python +@tool_eval() +async def capture_comparative(): + suite = EvalSuite( + name="Weather Comparison", + system_message="You are a weather assistant.", + ) + + # Register different tool sources + await suite.add_mcp_server( + "http://weather-api-1.example/mcp", + track="Weather API v1" + ) + + await suite.add_mcp_server( + "http://weather-api-2.example/mcp", + track="Weather API v2" + ) + + # Capture will run against each track + suite.add_case( + name="get_weather", + user_message="What's the weather in Seattle?", + expected_tool_calls=[], + ) + + return suite +``` + +Run capture: + +```bash +arcade evals . 
--capture --file captures/apis --format json +``` + +Output shows captures per track: + +```json +{ + "captured_cases": [ + { + "case_name": "get_weather", + "track_name": "Weather API v1", + "tool_calls": [ + {"name": "GetCurrentWeather", "args": {...}} + ] + }, + { + "case_name": "get_weather", + "track_name": "Weather API v2", + "tool_calls": [ + {"name": "Weather_Current", "args": {...}} + ] + } + ] +} +``` + +## Best practices + +### Start with broad queries + +Begin with open-ended prompts to see natural model behavior: + +```python +suite.add_case( + name="explore_weather_tools", + user_message="Show me everything you can do with weather", + expected_tool_calls=[], +) +``` + +### Capture edge cases + +Record model behavior on unusual inputs: + +```python +suite.add_case( + name="ambiguous_location", + user_message="What's the weather in Portland?", # OR or ME? + expected_tool_calls=[], +) +``` + +### Include context variations + +Capture with different conversation contexts: + +```python +suite.add_case( + name="weather_from_context", + user_message="How about the weather there?", + additional_messages=[ + {"role": "user", "content": "I'm going to Seattle"}, + ], + expected_tool_calls=[], +) +``` + +### Capture multiple providers + +Compare how different models interpret your tools: + +```bash +arcade evals . --capture \ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --file captures/models --format json,md +``` + +## Converting captures to tests + +### Step 1: Identify patterns + +Review captured tool calls to find patterns: + +```json +// Most queries use "fahrenheit" +{"location": "Seattle", "units": "fahrenheit"} +{"location": "Portland", "units": "fahrenheit"} + +// Some use "celsius" +{"location": "Tokyo", "units": "celsius"} +``` + +### Step 2: Create base expectations + +Create expected tool calls based on patterns: + +```python +# Default to fahrenheit for US cities +ExpectedMCPToolCall("GetWeather", {"location": "Seattle", "units": "fahrenheit"}) + +# Use celsius for international cities +ExpectedMCPToolCall("GetWeather", {"location": "Tokyo", "units": "celsius"}) +``` + +### Step 3: Add appropriate critics + +Choose critics based on parameter importance: + +```python +critics=[ + BinaryCritic(critic_field="location", weight=0.8), # Critical + BinaryCritic(critic_field="units", weight=0.2), # Less critical +] +``` + +### Step 4: Run evaluations + +Test with real evaluations: + +```bash +arcade evals . --details +``` + +### Step 5: Iterate + +Use failures to refine: +- Adjust expected values +- Change critic weights +- Modify tool descriptions +- Add more test cases + +## Troubleshooting + +### No tool calls captured + +**Symptom:** Empty `tool_calls` arrays + +**Possible causes:** +1. Model didn't call any tools +2. Tools not properly registered +3. System message doesn't encourage tool use + +**Solution:** + +```python +suite = EvalSuite( + name="Weather", + system_message="You are a weather assistant. Use the available weather tools to answer questions.", +) +``` + +### Unexpected tool names + +**Symptom:** Tool names have underscores instead of dots + +**Explanation:** Tool names are normalized for provider compatibility. `Weather.GetCurrent` becomes `Weather_GetCurrent`. + +**Solution:** Use normalized names in expectations: + +```python +ExpectedMCPToolCall("Weather_GetCurrent", {...}) +``` + +See [Provider compatibility](/home/evaluate-tools/provider-compatibility) for details. 
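If you build expectations from captured output programmatically, normalize names up front so they match what the model saw. A minimal sketch of the dot-to-underscore rule follows; the `normalize_tool_name` helper from `arcade_core.converters.utils` shown in [Provider compatibility](/home/evaluate-tools/provider-compatibility) applies the same transformation, so prefer it when available (`to_expected_name` below is a hypothetical stand-in):

```python
def to_expected_name(tool_name: str) -> str:
    """Normalize a dotted tool name the way captures report it: dots become underscores."""
    return tool_name.replace(".", "_")

# "Weather.GetCurrent" in your server appears as "Weather_GetCurrent" in captures and expectations
assert to_expected_name("Weather.GetCurrent") == "Weather_GetCurrent"
```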
+ +### Missing parameters + +**Symptom:** Some parameters are missing from captured calls + +**Explanation:** Models may omit optional parameters. + +**Solution:** Check if parameters have defaults in your schema. The evaluation framework applies defaults automatically. + +### Different results per provider + +**Symptom:** OpenAI and Anthropic capture different tool calls + +**Explanation:** Providers interpret tool descriptions differently. + +**Solution:** This is expected. Use captures to understand provider-specific behavior, then create provider-agnostic tests. + +## Example workflow + +Here's a complete workflow from capture to evaluation: + + + +### Create capture suite + +```python +@tool_eval() +async def initial_capture(): + suite = EvalSuite(name="Slack Tools", system_message="You are a Slack assistant.") + await suite.add_arcade_gateway(gateway_slug="slack") + + suite.add_case( + name="send_message", + user_message="Send a message to #general saying 'Hello team'", + expected_tool_calls=[], + ) + + suite.add_case( + name="send_dm", + user_message="Send a DM to alice saying 'Meeting at 3'", + expected_tool_calls=[], + ) + + return suite +``` + +### Capture with multiple models + +```bash +arcade evals . --capture \ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --file captures/slack --format json,md +``` + +### Review markdown output + +```markdown +## Slack Tools + +### Model: gpt-4o + +#### Case: send_message +**Tool Calls:** +- `send_message_to_channel` + - channel: general + - message: Hello team + +#### Case: send_dm +**Tool Calls:** +- `send_dm_to_user` + - user: alice + - message: Meeting at 3 +``` + +### Create evaluation suite + +```python +@tool_eval() +async def slack_eval(): + suite = EvalSuite(name="Slack Tools", system_message="You are a Slack assistant.") + await suite.add_arcade_gateway(gateway_slug="slack") + + suite.add_case( + name="send_message", + user_message="Send a message to #general saying 'Hello team'", + expected_tool_calls=[ + ExpectedMCPToolCall( + "send_message_to_channel", + {"channel": "general", "message": "Hello team"} + ) + ], + critics=[ + BinaryCritic(critic_field="channel", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + return suite +``` + +### Run evaluations + +```bash +arcade evals . --details +``` + +### Iterate based on results + +Refine expectations and critics based on evaluation results. + + + +## Next steps + +- Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) to compare tool sources +- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) for cross-provider testing +- [Create evaluation suites](/home/evaluate-tools/create-an-evaluation-suite) with expectations + diff --git a/app/en/home/evaluate-tools/comparative-evaluations/page.mdx b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx new file mode 100644 index 000000000..ada3036b2 --- /dev/null +++ b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx @@ -0,0 +1,704 @@ +--- +title: "Comparative evaluations" +description: "Compare different tool implementations with the same test cases" +--- + +# Comparative evaluations + +Comparative evaluations let you run the same test cases against different tool implementations. Use tracks to compare tool sources side-by-side. + +import { Callout, Steps } from "nextra/components"; + +## What are tracks? + +Tracks are isolated tool registries within a single evaluation suite. Each track represents a different source of tools. 
+ +**Common use cases:** + +- **Compare tool providers**: Test Google Weather vs OpenWeather API +- **Version testing**: Compare API v1 vs API v2 +- **Implementation comparison**: Test different MCP servers for the same functionality +- **A/B testing**: Evaluate alternative tool designs + +### When to use comparative evaluations + +Use **comparative evaluations** when: +- ✅ Testing multiple implementations of the same functionality +- ✅ Comparing different API versions +- ✅ Evaluating tool providers side-by-side +- ✅ A/B testing tool designs + +Use **regular evaluations** when: +- ✅ Testing a single tool implementation +- ✅ Validating tool behavior +- ✅ Regression testing + +## Basic comparative evaluation + + + +### Register tools per track + +Create a suite and register tools for each track: + +```python +from arcade_evals import EvalSuite, tool_eval, ExpectedMCPToolCall, BinaryCritic + +@tool_eval() +async def weather_comparison(): + suite = EvalSuite( + name="Weather API Comparison", + system_message="You are a weather assistant.", + ) + + # Track A: Weather API v1 + await suite.add_mcp_server( + "http://weather-v1.example/mcp", + track="Weather v1" + ) + + # Track B: Weather API v2 + await suite.add_mcp_server( + "http://weather-v2.example/mcp", + track="Weather v2" + ) + + return suite +``` + +### Create comparative test case + +Add a test case with track-specific expectations: + +```python +suite.add_comparative_case( + name="get_current_weather", + user_message="What's the weather in Seattle?", +).for_track( + "Weather v1", + expected_tool_calls=[ + ExpectedMCPToolCall( + "GetWeather", + {"city": "Seattle", "type": "current"} + ) + ], + critics=[ + BinaryCritic(critic_field="city", weight=0.7), + BinaryCritic(critic_field="type", weight=0.3), + ], +).for_track( + "Weather v2", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle"} + ) + ], + critics=[ + BinaryCritic(critic_field="location", weight=1.0), + ], +) +``` + +### Run comparative evaluation + +```bash +arcade evals . +``` + +Results show per-track scores: + +``` +Suite: Weather API Comparison + Case: get_current_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 1.00 -- PASSED +``` + + + +## Track registration + +### From MCP HTTP server + +```python +await suite.add_mcp_server( + url="http://localhost:8000", + headers={"Authorization": "Bearer token"}, + track="Production API", +) +``` + +### From MCP stdio server + +```python +await suite.add_mcp_stdio_server( + command=["python", "server_v2.py"], + env={"API_KEY": "secret"}, + track="Version 2", +) +``` + +### From Arcade Gateway + +```python +await suite.add_arcade_gateway( + gateway_slug="weather-gateway", + track="Arcade Gateway", +) +``` + +### Manual tool definitions + +```python +suite.add_tool_definitions( + tools=[ + { + "name": "GetWeather", + "description": "Get weather for a location", + "inputSchema": {...}, + } + ], + track="Custom Tools", +) +``` + + +Tools must be registered before creating comparative cases that reference their tracks. + + +## Comparative case builder + +The `add_comparative_case()` method returns a builder for defining track-specific expectations. 
+ +### Basic structure + +```python +suite.add_comparative_case( + name="test_case", + user_message="Do something", +).for_track( + "Track A", + expected_tool_calls=[...], + critics=[...], +).for_track( + "Track B", + expected_tool_calls=[...], + critics=[...], +) +``` + +### Optional parameters + +Add conversation context to comparative cases: + +```python +suite.add_comparative_case( + name="weather_with_context", + user_message="What about the weather there?", + system_message="You are helpful.", # Optional override + additional_messages=[ + {"role": "user", "content": "I'm going to Seattle"}, + ], +).for_track("Weather v1", ...).for_track("Weather v2", ...) +``` + +### Different expectations per track + +Tracks often have different tool names and parameters: + +```python +suite.add_comparative_case( + name="search_query", + user_message="Search for Python tutorials", +).for_track( + "Google Search", + expected_tool_calls=[ + ExpectedMCPToolCall("Google_Search", {"query": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="query", weight=1.0)], +).for_track( + "Bing Search", + expected_tool_calls=[ + ExpectedMCPToolCall("Bing_WebSearch", {"q": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="q", weight=1.0)], +) +``` + +## Complete example + +Here's a full comparative evaluation: + +```python +from arcade_evals import ( + EvalSuite, + tool_eval, + ExpectedMCPToolCall, + BinaryCritic, + SimilarityCritic, +) + +@tool_eval() +async def search_comparison(): + """Compare different search APIs.""" + suite = EvalSuite( + name="Search API Comparison", + system_message="You are a search assistant. Use the available tools to search for information.", + ) + + # Register search providers + await suite.add_mcp_server( + "http://google-search.example/mcp", + track="Google", + ) + + await suite.add_mcp_server( + "http://bing-search.example/mcp", + track="Bing", + ) + + await suite.add_mcp_server( + "http://duckduckgo.example/mcp", + track="DuckDuckGo", + ) + + # Simple query + suite.add_comparative_case( + name="basic_search", + user_message="Search for Python tutorials", + ).for_track( + "Google", + expected_tool_calls=[ + ExpectedMCPToolCall("Search", {"query": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="query", weight=1.0)], + ).for_track( + "Bing", + expected_tool_calls=[ + ExpectedMCPToolCall("WebSearch", {"q": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="q", weight=1.0)], + ).for_track( + "DuckDuckGo", + expected_tool_calls=[ + ExpectedMCPToolCall("DDG_Search", {"search_term": "Python tutorials"}) + ], + critics=[BinaryCritic(critic_field="search_term", weight=1.0)], + ) + + # Query with filters + suite.add_comparative_case( + name="search_with_filters", + user_message="Search for Python tutorials from the last month", + ).for_track( + "Google", + expected_tool_calls=[ + ExpectedMCPToolCall( + "Search", + {"query": "Python tutorials", "time_range": "month"} + ) + ], + critics=[ + SimilarityCritic(critic_field="query", weight=0.7), + BinaryCritic(critic_field="time_range", weight=0.3), + ], + ).for_track( + "Bing", + expected_tool_calls=[ + ExpectedMCPToolCall( + "WebSearch", + {"q": "Python tutorials", "freshness": "Month"} + ) + ], + critics=[ + SimilarityCritic(critic_field="q", weight=0.7), + BinaryCritic(critic_field="freshness", weight=0.3), + ], + ).for_track( + "DuckDuckGo", + expected_tool_calls=[ + ExpectedMCPToolCall( + "DDG_Search", + {"search_term": "Python tutorials", "time": "m"} + ) + ], + critics=[ + 
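            # Hypothetical DuckDuckGo-style fields from the expected call above:
            # "search_term" carries the query and "time": "m" stands for "last month".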
SimilarityCritic(critic_field="search_term", weight=0.7), + BinaryCritic(critic_field="time", weight=0.3), + ], + ) + + return suite +``` + +Run the comparison: + +```bash +arcade evals . --details +``` + +Output shows side-by-side results: + +``` +Suite: Search API Comparison + +Case: basic_search + Track: Google -- Score: 1.00 -- PASSED + Track: Bing -- Score: 1.00 -- PASSED + Track: DuckDuckGo -- Score: 1.00 -- PASSED + +Case: search_with_filters + Track: Google -- Score: 1.00 -- PASSED + Track: Bing -- Score: 0.85 -- WARNED + Track: DuckDuckGo -- Score: 0.90 -- WARNED +``` + +## Result structure + +Comparative results are organized by track: + +```python +{ + "Google": { + "model": "gpt-4o", + "suite_name": "Search API Comparison", + "track_name": "Google", + "rubric": {...}, + "cases": [ + { + "name": "basic_search", + "track": "Google", + "input": "Search for Python tutorials", + "expected_tool_calls": [...], + "predicted_tool_calls": [...], + "evaluation": { + "score": 1.0, + "result": "passed", + ... + } + } + ] + }, + "Bing": {...}, + "DuckDuckGo": {...} +} +``` + +## Mixing regular and comparative cases + +A suite can have both regular and comparative cases: + +```python +@tool_eval() +async def mixed_suite(): + suite = EvalSuite( + name="Mixed Evaluation", + system_message="You are helpful.", + ) + + # Register default tools + await suite.add_mcp_stdio_server(["python", "server.py"]) + + # Regular case (uses default tools) + suite.add_case( + name="regular_test", + user_message="Do something", + expected_tool_calls=[...], + ) + + # Register track-specific tools + await suite.add_mcp_server("http://api-v2.example", track="v2") + + # Comparative case + suite.add_comparative_case( + name="compare_versions", + user_message="Do something else", + ).for_track( + "default", # Uses default tools + expected_tool_calls=[...], + ).for_track( + "v2", # Uses v2 tools + expected_tool_calls=[...], + ) + + return suite +``` + + +Use track name `"default"` to reference tools registered without a track. + + +## Capture mode with tracks + +Capture tool calls from each track separately: + +```bash +arcade evals . --capture --file captures/comparison --format json +``` + +Output includes track names: + +```json +{ + "captured_cases": [ + { + "case_name": "get_weather", + "track_name": "Weather v1", + "tool_calls": [ + {"name": "GetWeather", "args": {...}} + ] + }, + { + "case_name": "get_weather", + "track_name": "Weather v2", + "tool_calls": [ + {"name": "Weather_GetCurrent", "args": {...}} + ] + } + ] +} +``` + +## Multi-model comparative evaluations + +Combine comparative tracks with multiple models: + +```bash +arcade evals . 
\ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` + +Results show: +- Per-track scores for each model +- Cross-track comparisons for each model +- Cross-model comparisons for each track + +Example output: + +``` +Suite: Weather API Comparison + +Model: gpt-4o + Case: get_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 1.00 -- PASSED + +Model: gpt-4o-mini + Case: get_weather + Track: Weather v1 -- Score: 0.90 -- WARNED + Track: Weather v2 -- Score: 0.95 -- PASSED + +Model: claude-sonnet-4-5-20250929 + Case: get_weather + Track: Weather v1 -- Score: 1.00 -- PASSED + Track: Weather v2 -- Score: 0.85 -- WARNED +``` + +## Best practices + +### Use descriptive track names + +Choose clear names that indicate what's being compared: + +```python +# ✅ Good +track="Weather API v1" +track="OpenWeather Production" +track="Google Weather (Staging)" + +# ❌ Avoid +track="A" +track="Test1" +track="Track2" +``` + +### Keep test cases consistent + +Use the same user message and context across tracks: + +```python +suite.add_comparative_case( + name="get_weather", + user_message="What's the weather in Seattle?", # Same for all tracks +).for_track("v1", ...).for_track("v2", ...) +``` + +### Adjust critics to track differences + +Different tools may have different parameter names or types: + +```python +.for_track( + "Weather v1", + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"city": "Seattle"}) + ], + critics=[ + BinaryCritic(critic_field="city", weight=1.0), # v1 uses "city" + ], +).for_track( + "Weather v2", + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"location": "Seattle"}) + ], + critics=[ + BinaryCritic(critic_field="location", weight=1.0), # v2 uses "location" + ], +) +``` + +### Start with capture mode + +Use capture mode to discover track-specific tool signatures: + +```bash +arcade evals . --capture +``` + +Then create expectations based on captured calls. + +### Test edge cases per track + +Different implementations may handle edge cases differently: + +```python +suite.add_comparative_case( + name="ambiguous_location", + user_message="What's the weather in Portland?", # OR or ME? +).for_track( + "Weather v1", + # v1 defaults to most populous + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"city": "Portland", "state": "OR"}) + ], +).for_track( + "Weather v2", + # v2 requires disambiguation + expected_tool_calls=[ + ExpectedMCPToolCall("DisambiguateLocation", {"city": "Portland"}), + ExpectedMCPToolCall("GetWeather", {"city": "Portland", "state": "OR"}), + ], +) +``` + +## Troubleshooting + +### Track not found + +**Symptom:** `ValueError: Track 'TrackName' not registered` + +**Solution:** Register the track before adding comparative cases: + +```python +# ✅ Correct order +await suite.add_mcp_server(url, track="TrackName") +suite.add_comparative_case(...).for_track("TrackName", ...) + +# ❌ Wrong order - will fail +suite.add_comparative_case(...).for_track("TrackName", ...) +await suite.add_mcp_server(url, track="TrackName") +``` + +### Missing track expectations + +**Symptom:** Case runs against some tracks but not others + +**Explanation:** Comparative cases only run against tracks with `.for_track()` defined. + +**Solution:** Add expectations for all registered tracks: + +```python +suite.add_comparative_case( + name="test", + user_message="...", +).for_track("Track A", ...).for_track("Track B", ...) 
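# Tracks registered on the suite but without a .for_track() entry are skipped for this case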
+``` + +### Tool name mismatches + +**Symptom:** "Tool not found" errors in specific tracks + +**Solution:** Check tool names in each track: + +```python +# List tools per track +print(suite.list_tool_names(track="Track A")) +print(suite.list_tool_names(track="Track B")) +``` + +Use the exact tool names from the output. + +### Inconsistent results across tracks + +**Symptom:** Same user message produces different scores across tracks + +**Explanation:** This is expected. Different tool implementations may work differently. + +**Solution:** Adjust expectations and critics per track to account for implementation differences. + +## Advanced patterns + +### Baseline comparison + +Compare new implementations against a baseline: + +```python +await suite.add_mcp_server( + "http://production.example/mcp", + track="Production (Baseline)" +) + +await suite.add_mcp_server( + "http://staging.example/mcp", + track="Staging (New)" +) +``` + +Results show deviations from baseline. + +### Progressive feature testing + +Test feature support across versions: + +```python +suite.add_comparative_case( + name="advanced_filters", + user_message="Search with advanced filters", +).for_track( + "v1", + expected_tool_calls=[], # Not supported +).for_track( + "v2", + expected_tool_calls=[ + ExpectedMCPToolCall("SearchWithFilters", {...}) + ], +) +``` + +### Tool catalog comparison + +Compare Arcade tool catalogs: + +```python +from arcade_core import ToolCatalog +from my_tools import weather_v1, weather_v2 + +catalog_v1 = ToolCatalog() +catalog_v1.add_tool(weather_v1, "Weather") + +catalog_v2 = ToolCatalog() +catalog_v2.add_tool(weather_v2, "Weather") + +suite.add_tool_catalog(catalog_v1, track="Python v1") +suite.add_tool_catalog(catalog_v2, track="Python v2") +``` + +## Next steps + +- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) with tracks +- Use [capture mode](/home/evaluate-tools/capture-mode) to discover track-specific tool calls +- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) when comparing across providers +- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple models and tracks + diff --git a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx index c82f8e626..51434bfe1 100644 --- a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx +++ b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx @@ -3,317 +3,405 @@ title: "Create an evaluation suite" description: "Learn how to evaluate your tools using Arcade" --- -# Evaluate tools +# Create an evaluation suite -In this guide, you'll learn how to evaluate your tools to ensure they are selected and used correctly by an AI model. You'll define evaluation cases and use different critics to assess the outcome of your evaluations. - -We'll create evaluation cases to test the `greet` tool and measure its performance. +Evaluation suites help you test whether AI models use your tools correctly. This guide shows you how to create test cases that measure tool selection and parameter accuracy. 
import { Steps, Tabs, Callout } from "nextra/components"; -### Prerequisites +### Install dependencies -- [Create an MCP Server](/home/build-tools/create-a-mcp-server) -- Install the evaluation dependencies: +Install Arcade with evaluation support: - ```bash - uv tool install 'arcade-mcp[evals]' - ``` +```bash +uv tool install 'arcade-mcp[evals]' +``` - ```bash - pip install 'arcade-mcp[evals]' - ``` +```bash +pip install 'arcade-mcp[evals]' +``` -### Create an evaluation suite +### Create an evaluation file -Navigate to your MCP Server's directory +Navigate to your MCP server directory and create a file starting with `eval_`: ```bash cd my_server +touch eval_server.py ``` -Create a new Python file for your evaluations, e.g., `eval_server.py`. - - For evals, the file name should start with `eval_` and be a Python script - (using the `.py` extension). +Evaluation files must start with `eval_` and use the `.py` extension. The CLI automatically discovers these files. -### Define your evaluation cases +### Define your evaluation suite -Open `eval_server.py` and add the following code: +Create an evaluation suite that loads tools from your MCP server and defines test cases: ```python from arcade_evals import ( - EvalSuite, tool_eval, EvalRubric, - ExpectedToolCall, BinaryCritic -) -from arcade_core import ToolCatalog - -from server import greet - -# Create a catalog of tools to include in the evaluation -catalog = ToolCatalog() -catalog.add_tool(greet, "Greet") - -# Create rubric with tool calls -rubric = EvalRubric( - fail_threshold=0.8, - warn_threshold=0.9, + EvalSuite, + tool_eval, + ExpectedMCPToolCall, + BinaryCritic, ) @tool_eval() -def hello_eval_suite() -> EvalSuite: - """Create an evaluation suite for the hello tool.""" +async def weather_eval_suite() -> EvalSuite: + """Evaluate weather tool usage.""" suite = EvalSuite( - name="MCP Server Evaluation", - catalog=catalog, - system_message="You are a helpful assistant.", - rubric=rubric, + name="Weather Tools", + system_message="You are a helpful weather assistant.", ) - + + # Load tools from your MCP server + await suite.add_mcp_stdio_server( + command=["python", "server.py"], + ) + + # Add a test case suite.add_case( - name="Simple Greeting", - user_message="Greet Alice", + name="Get weather for city", + user_message="What's the weather in Seattle?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Alice", - }, + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "celsius"} ) ], critics=[ - BinaryCritic(critic_field="name", weight=1.0), + BinaryCritic(critic_field="location", weight=0.7), + BinaryCritic(critic_field="units", weight=0.3), ], ) - + return suite ``` ### Run the evaluation -From the server directory, ensure you have an OpenAI API key set in the `OPENAI_API_KEY` environment variable. Then run: +Set your OpenAI API key and run the evaluation: ```bash -export OPENAI_API_KEY= +export OPENAI_API_KEY= arcade evals . ``` -This command executes your evaluation suite and provides a report. +The command discovers all `eval_*.py` files and executes them. - By default, the evaluation suite will use the `gpt-4o` model. You can specify - a different model and provider using the `--models` and `--provider` options. - If you are using a different provider, you will need to set the appropriate - API key in an environment variable, or use the `--provider-api-key` option. - For more information, see the [Run - evaluations](/home/evaluate-tools/run-evaluations) guide. 
+By default, evaluations use OpenAI's `gpt-4o` model. To use Anthropic or different models, see [Run evaluations](/home/evaluate-tools/run-evaluations). -### How it works +### Understand the results + +Evaluation results show: -The evaluation framework in Arcade allows you to define test cases (`EvalCase`) with expected tool calls and use critics to assess an AI model's performance. +- **Passed**: Score meets or exceeds the fail threshold (default: 0.8) +- **Failed**: Score falls below the fail threshold +- **Warned**: Score is between warn and fail thresholds (default: 0.9) -Similar to how a unit test suite measures the validity and performance of a function, an eval suite measures how well an AI model understands and uses your tools. +Example output: -### Next steps +``` +Suite: Weather Tools + Model: gpt-4o + PASSED Get weather for city -- Score: 1.00 + +Summary -- Total: 1 -- Passed: 1 -- Failed: 0 +``` -- Explore [different types of critics](#critic-classes) and [more complex evaluation cases](#advanced-evaluation-cases) to thoroughly test your tools. -- Understand [how to specify options for your evaluation runs](/home/evaluate-tools/run-evaluations). +Use `--details` to see critic feedback: + +```bash +arcade evals . --details +``` + +Detailed output includes per-critic scores: + +``` +PASSED Get weather for city -- Score: 1.00 + Details: + location: + Match: True, Score: 0.70/0.70 + units: + Match: True, Score: 0.30/0.30 +``` -## Critic classes +## Loading tools -Critics are used to evaluate the correctness of tool calls. For simple tools, "correct" might be binary: is it exactly what we expected? For more complex tools, we might need to evaluate the similarity between expected and actual values, or measure numeric values within an acceptable range. +You can load tools from different sources depending on your setup. -Arcade's evaluation framework provides several critic classes to help you evaluate both exact and "fuzzy" matches between expected and actual values when a model predicts the parameters of a tool call. + +All tool loading methods are async and must be awaited. Ensure your evaluation function is decorated with `@tool_eval()` and defined as `async`. + + +### From MCP HTTP server + +Load tools from an HTTP or SSE MCP server: + +```python +await suite.add_mcp_server( + url="http://localhost:8000", + headers={"Authorization": "Bearer token"}, +) +``` + +The loader automatically appends `/mcp` to the URL if not present. + +### From MCP stdio server + +Load tools from a stdio MCP server: + +```python +await suite.add_mcp_stdio_server( + command=["python", "server.py"], + env={"API_KEY": "secret"}, +) +``` + +### From Arcade Gateway + +Load tools from an Arcade MCP Gateway: + +```python +await suite.add_arcade_gateway( + gateway_slug="my-gateway", + arcade_api_key="your-api-key", + arcade_user_id="user-id", +) +``` + + +Tool loading results are cached automatically to avoid redundant connections. 
If you update your MCP server, use `clear_tools_cache()` to reload: + +```python +from arcade_evals import clear_tools_cache + +clear_tools_cache() +``` + + +### Manual tool definitions + +Define tools manually using MCP format: + +```python +suite.add_tool_definitions([ + { + "name": "Weather.GetCurrent", + "description": "Get current weather for a location", + "inputSchema": { + "type": "object", + "properties": { + "location": {"type": "string"}, + "units": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius" + }, + }, + "required": ["location"], + }, + } +]) +``` + +## Expected tool calls + +Expected tool calls define what the model should predict. Use `ExpectedMCPToolCall` with MCP-style tool names: + +```python +ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Seattle", "units": "celsius"} +) +``` + + +Tool names are normalized for provider compatibility. Dots (`.`) become underscores (`_`). For example, `Weather.GetCurrent` becomes `Weather_GetCurrent`. See [Provider compatibility](/home/evaluate-tools/provider-compatibility) for details. + + +## Critics + +Critics evaluate specific parameters of tool calls. Choose the right critic for your validation needs. + +| Critic Type | Use When | Example Field | +|-------------|----------|---------------| +| BinaryCritic | Need exact match | user_id, city, status | +| SimilarityCritic | Semantic match OK | message, description | +| NumericCritic | Range acceptable | temperature, price | +| DatetimeCritic | Time window OK | deadline, start_time | ### BinaryCritic -Checks if a parameter value matches exactly. +Checks for exact matches after type casting: ```python -BinaryCritic(critic_field="name", weight=1.0) +from arcade_evals import BinaryCritic + +# Perfect for IDs, locations, and enum values +BinaryCritic(critic_field="location", weight=0.7) ``` ### SimilarityCritic -Evaluates the similarity between expected and actual values. +Evaluates textual similarity using cosine similarity: ```python from arcade_evals import SimilarityCritic -SimilarityCritic(critic_field="message", weight=1.0) +SimilarityCritic( + critic_field="message", + weight=0.5, + similarity_threshold=0.8 +) ``` ### NumericCritic -Assesses numeric values within a specified tolerance. +Assesses numeric values within tolerance: ```python from arcade_evals import NumericCritic -NumericCritic(critic_field="score", tolerance=0.1, weight=1.0) +NumericCritic( + critic_field="temperature", + tolerance=2.0, + weight=0.3 +) ``` ### DatetimeCritic -Evaluates the closeness of datetime values within a specified tolerance. +Evaluates datetime values within a time window: ```python from datetime import timedelta from arcade_evals import DatetimeCritic -DatetimeCritic(critic_field="start_time", tolerance=timedelta(seconds=10), weight=1.0) +DatetimeCritic( + critic_field="scheduled_time", + tolerance=timedelta(minutes=5), + weight=0.4 +) ``` -## Advanced evaluation cases - -You can add more evaluation cases to test different scenarios. - - - Ensure that your `greet` tool and evaluation cases are updated accordingly and - that you rerun `arcade evals .` to test your changes. +## Fuzzy weights - If your evals fail, use `--details` to see the detailed feedback from each critic. See [Run evaluations](/home/evaluate-tools/run-evaluations) to understand the options available in `arcade evals`. 
- +Use fuzzy weights when you want qualitative importance levels instead of precise numbers: +```python +from arcade_evals import BinaryCritic, SimilarityCritic +from arcade_evals.weights import FuzzyWeight + +critics = [ + BinaryCritic( + critic_field="user_id", + weight=FuzzyWeight.CRITICAL + ), + SimilarityCritic( + critic_field="message", + weight=FuzzyWeight.MEDIUM + ), + BinaryCritic( + critic_field="priority", + weight=FuzzyWeight.LOW + ), +] +``` -### Example: Greeting with emotion +Fuzzy weights are automatically normalized: -Modify your `hello` tool to accept an `emotion` parameter: +| Weight | Value | Normalized (example above) | +|--------|-------|----------------------------| +| MINIMAL | 1 | - | +| VERY_LOW | 2 | - | +| LOW | 3 | 21.4% | +| MEDIUM | 4 | 28.6% | +| HIGH | 5 | - | +| VERY_HIGH | 6 | - | +| CRITICAL | 7 | 50.0% | -```python -from enum import Enum - -class Emotion(str, Enum): - HAPPY = "happy" - SLIGHTLY_HAPPY = "slightly happy" - SAD = "sad" - SLIGHTLY_SAD = "slightly sad" - -@app.tool -def greet( - name: Annotated[str, "The name of the person to greet"], - emotion: Annotated[ - Emotion, "The emotion to convey. Defaults to happy if omitted." - ] = Emotion.HAPPY, -) -> Annotated[str, "A greeting to the user"]: - """ - Greet a person by name, optionally with a specific emotion. - """ - return f"Hello {name}! I'm feeling {emotion.value} today." -``` +## Multiple tool calls -Add an evaluation case for this new parameter: +Test cases with multiple expected tool calls: ```python -# At the top of the file: -from server import Emotion -from arcade_evals import SimilarityCritic - -# Inside hello_eval_suite(): suite.add_case( - name="Greeting with Emotion", - user_message="Say hello to Bob sadly", + name="Check weather in multiple cities", + user_message="What's the weather in Seattle and Portland?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.SAD, - }, - ) + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Seattle"}), + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Portland"}), ], critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), + BinaryCritic(critic_field="location", weight=1.0), ], ) ``` -Add an evaluation case with additional conversation context: +## Conversation context + +Add conversation history to test cases that require context: ```python suite.add_case( - name="Greeting with Emotion from Context", - user_message="Say hello to Bob based on my current mood.", + name="Weather based on previous location", + user_message="What about the weather there?", expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.HAPPY, - }, - ) + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Tokyo"}), ], critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), + BinaryCritic(critic_field="location", weight=1.0), + ], + additional_messages=[ + {"role": "user", "content": "I'm planning to visit Tokyo next week."}, + {"role": "assistant", "content": "That sounds exciting! What would you like to know about Tokyo?"}, ], - # Add some context to the evaluation case - additional_messages= [ - {"role": "user", "content": "Hi, I'm so happy!"}, - { - "role": "assistant", - "content": "That's awesome! 
What's got you feeling so happy today?", - }, - ] ) ``` -Add an evaluation case with multiple expected tool calls: +## Rubrics and thresholds + +Customize evaluation thresholds using an `EvalRubric`: ```python -suite.add_case( - name="Multiple Greetings with Emotion from Context", - user_message="Say hello to Bob based on my current mood. And then say hello to Alice with slightly less of that emotion.", - expected_tool_calls=[ - ExpectedToolCall( - func=greet, - args={ - "name": "Bob", - "emotion": Emotion.HAPPY, - }, - ), - ExpectedToolCall( - func=greet, - args={ - "name": "Alice", - "emotion": Emotion.SLIGHTLY_HAPPY, - }, - ) - ], - critics=[ - BinaryCritic(critic_field="name", weight=0.5), - SimilarityCritic(critic_field="emotion", weight=0.5), - ], - # Add some context to the evaluation case - additional_messages= [ - {"role": "user", "content": "Hi, I'm so happy!"}, - { - "role": "assistant", - "content": "That's awesome! What's got you feeling so happy today?", - }, - ] +from arcade_evals import EvalRubric + +rubric = EvalRubric( + fail_threshold=0.85, + warn_threshold=0.95, +) + +suite = EvalSuite( + name="Strict Weather Evaluation", + system_message="You are a weather assistant.", + rubric=rubric, ) ``` +Default thresholds: +- **Fail threshold**: 0.8 +- **Warn threshold**: 0.9 + ## Next steps -- **See an example MCP server with evaluations**: [Source code of a server with evaluations](https://github.com/ArcadeAI/arcade-mcp/tree/139cc2e54db0e5815f1c79dbe9e3285b4fe2bd66/examples/mcp_servers/server_with_evaluations) -- **Learn how to run evaluations**: [Run evaluations](/home/evaluate-tools/run-evaluations) +- Learn how to [run evaluations with different providers](/home/evaluate-tools/run-evaluations) +- Explore [capture mode](/home/evaluate-tools/capture-mode) to record tool calls +- Compare tool sources with [comparative evaluations](/home/evaluate-tools/comparative-evaluations) +- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) diff --git a/app/en/home/evaluate-tools/provider-compatibility/page.mdx b/app/en/home/evaluate-tools/provider-compatibility/page.mdx new file mode 100644 index 000000000..b14c49478 --- /dev/null +++ b/app/en/home/evaluate-tools/provider-compatibility/page.mdx @@ -0,0 +1,496 @@ +--- +title: "Provider compatibility" +description: "Understand how tool evaluations work across different providers" +--- + +# Provider compatibility + +Arcade evaluations support both OpenAI and Anthropic. Each provider has different requirements for tool schemas and message formats. + +import { Callout } from "nextra/components"; + +## Provider comparison + +| Feature | OpenAI | Anthropic | +|---------|--------|-----------| +| **Tool name rules** | Alphanumeric, `-`, `_` (max 64 chars) | Alphanumeric, `_` only | +| **Schema format** | `function.parameters` (JSON Schema) | `input_schema` (JSON Schema) | +| **Strict mode** | Yes (opt-in via `strict: true`) | No (standard JSON Schema) | +| **Optional params** | Required list + null unions | Only required params in `required` | +| **Message roles** | system, user, assistant, tool, function | user, assistant (system separate) | +| **Tool calling format** | `tool_calls` array | `tool_use` content blocks | + +## Tool name normalization + +Arcade uses dotted notation for tool names (e.g., `Weather.GetCurrent`), but providers don't allow dots in function names. 
+ +### How normalization works + +Tool names are automatically normalized: + +```python +from arcade_core.converters.utils import normalize_tool_name + +normalize_tool_name("Weather.GetCurrent") # Returns: "Weather_GetCurrent" +normalize_tool_name("Google.Search") # Returns: "Google_Search" +``` + +When models make tool calls, normalized names are resolved back to original names. + +### Denormalization is lossy + +Reversing normalization can't distinguish between original dots and underscores: + +| Original Name | Normalized | Denormalized | Correct? | +|---------------|------------|--------------|----------| +| `Google.Search` | `Google_Search` | `Google.Search` | ✅ | +| `My_Tool.Name` | `My_Tool_Name` | `My.Tool.Name` | ❌ | +| `Tool_Name` | `Tool_Name` | `Tool.Name` | ❌ | + + +**Best practice**: Use only dots OR only underscores in tool names, never both. + + +### Name collision + +Don't register both dotted and underscore versions of the same name: + +```python +# ❌ Avoid this - creates collision +suite.add_tool_definitions([ + {"name": "Weather.GetCurrent", ...}, + {"name": "Weather_GetCurrent", ...}, # Collision! +]) +``` + +The registry accepts both formats for lookups but they resolve to the same internal name. + +## OpenAI strict mode + +OpenAI's strict mode enforces structured outputs by transforming JSON Schema. This happens automatically in evaluations. + +### Schema transformations + +**1. Unsupported keywords are stripped:** + +```python +# Input schema +{ + "type": "integer", + "minimum": 0, + "maximum": 100, + "default": 50 +} + +# Transformed for OpenAI +{ + "type": ["integer", "null"] +} +``` + +Stripped keywords: +- Validation: `minimum`, `maximum`, `minLength`, `maxLength`, `pattern`, `format` +- Metadata: `default`, `nullable`, `minItems`, `maxItems` + +**2. Optional parameters become required with null unions:** + +```python +# Input schema +{ + "type": "object", + "properties": { + "city": {"type": "string"}, + "units": {"type": "string", "default": "celsius"} + }, + "required": ["city"] +} + +# Transformed for OpenAI +{ + "type": "object", + "properties": { + "city": {"type": "string"}, + "units": {"type": ["string", "null"]} # Now in union with null + }, + "required": ["city", "units"], # units added to required + "additionalProperties": false +} +``` + +**3. Enums are stringified:** + +```python +# Input schema +{ + "type": "integer", + "enum": [0, 1, 2] +} + +# Transformed for OpenAI +{ + "type": "string", + "enum": ["0", "1", "2"] +} +``` + +**4. Additional properties are forbidden:** + +All objects get `"additionalProperties": false` to enforce strict validation. + +### Why defaults still work + +Even though `default` is stripped from schemas, defaults are still applied during evaluation. Here's why: + +1. The evaluation framework stores the original schema with defaults +2. OpenAI sends `null` for optional parameters in strict mode +3. The framework applies defaults when args are missing OR null + +```python +# Model sends: +{"city": "Seattle", "units": null} + +# Framework applies default: +{"city": "Seattle", "units": "celsius"} +``` + + +This behavior ensures consistent evaluation results regardless of provider. + + +## Anthropic schema format + +Anthropic uses standard JSON Schema with minimal transformation. + +### Key differences from OpenAI + +**1. 
Schema field name:** + +```python +# OpenAI format +{ + "type": "function", + "function": { + "name": "get_weather", + "parameters": {...} # ← Note: "parameters" + } +} + +# Anthropic format +{ + "name": "get_weather", + "input_schema": {...} # ← Note: "input_schema" +} +``` + +**2. No strict mode transformations:** + +Anthropic accepts the schema as-is: +- Validation keywords are preserved +- Optional params stay optional +- Enums keep original types +- Defaults are kept (but not sent to model) + +**3. Only required params in required list:** + +```python +{ + "type": "object", + "properties": { + "city": {"type": "string"}, + "units": {"type": "string", "default": "celsius"} + }, + "required": ["city"] # Only city is required +} +``` + +## Message format conversion + +Arcade evaluations use OpenAI message format internally. When using Anthropic, messages are converted automatically. + +### System messages + +**OpenAI:** + +```python +[ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, +] +``` + +**Anthropic:** + +```python +# system → separate parameter +system = "You are helpful." + +messages = [ + {"role": "user", "content": "Hello"}, +] +``` + +### Tool calls + +**OpenAI:** + +```python +{ + "role": "assistant", + "content": "", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": '{"city": "Seattle"}' + } + } + ] +} +``` + +**Anthropic:** + +```python +{ + "role": "assistant", + "content": [ + { + "type": "tool_use", + "id": "call_123", + "name": "get_weather", + "input": {"city": "Seattle"} + } + ] +} +``` + +### Tool results + +**OpenAI:** + +```python +{ + "role": "tool", + "tool_call_id": "call_123", + "content": "Sunny, 72°F" +} +``` + +**Anthropic:** + +```python +{ + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_123", + "content": "Sunny, 72°F" + } + ] +} +``` + + +Message conversion happens automatically. You don't need to handle it manually. + + +## Writing provider-agnostic evaluations + +Follow these guidelines to ensure evaluations work with both providers: + +### 1. Use simple tool names + +Prefer names without dots or underscores: + +```python +# ✅ Good +"GetWeather" +"SearchGoogle" +"SendMessage" + +# ⚠️ Acceptable (use only one separator) +"Weather.GetCurrent" +"Google.Search" + +# ❌ Avoid (mixing separators) +"My_Tool.GetData" +"Tool_Name.With_Mixed" +``` + +### 2. Use MCP-style tool definitions + +Define tools using MCP format: + +```python +{ + "name": "GetWeather", + "description": "Get current weather for a city", + "inputSchema": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "units": {"type": "string", "enum": ["celsius", "fahrenheit"]} + }, + "required": ["city"] + } +} +``` + +### 3. Don't rely on strict mode behavior + +Don't assume specific schema transformations: + +```python +# ❌ Don't rely on null unions +{ + "type": ["string", "null"] # Only in OpenAI strict mode +} + +# ✅ Use optional parameters +{ + "type": "string" +} +# In required list: OpenAI adds null union, Anthropic keeps as-is +# Not in required list: Both treat as optional +``` + +### 4. Handle optional parameters consistently + +Use defaults for optional parameters: + +```python +{ + "type": "object", + "properties": { + "units": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius" + } + }, + "required": [] +} +``` + +Both providers will apply the default when the parameter is missing. 
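As a mental model, the default handling described above amounts to a merge step like the sketch below. The function is illustrative only — the evaluation framework performs an equivalent merge internally — but it shows why a `null` from OpenAI strict mode and an omitted key from Anthropic evaluate the same way:

```python
from typing import Any


def apply_schema_defaults(args: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
    """Fill parameters that are missing or null using the schema's declared defaults."""
    merged = dict(args)
    for name, prop in schema.get("properties", {}).items():
        if "default" in prop and merged.get(name) is None:
            merged[name] = prop["default"]
    return merged


schema = {
    "type": "object",
    "properties": {
        "city": {"type": "string"},
        "units": {"type": "string", "enum": ["celsius", "fahrenheit"], "default": "celsius"},
    },
    "required": [],
}

# OpenAI strict mode sends an explicit null; Anthropic may omit the key entirely.
apply_schema_defaults({"city": "Seattle", "units": None}, schema)
# -> {"city": "Seattle", "units": "celsius"}
apply_schema_defaults({"city": "Seattle"}, schema)
# -> {"city": "Seattle", "units": "celsius"}
```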
+ +## Testing with multiple providers + +Run evaluations with both providers to verify compatibility: + +```bash +arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` + +### Comparing results + +Results show provider-specific behavior: + +``` +Suite: Weather Tools + Case: Get weather for city + Model: gpt-4o -- Score: 1.00 -- PASSED + Model: claude-sonnet-4-5-20250929 -- Score: 1.00 -- PASSED +``` + +### Common differences + +**Parameter handling:** + +OpenAI might send: +```json +{"city": "Seattle", "units": null} +``` + +Anthropic might send: +```json +{"city": "Seattle"} +``` + +Both are evaluated identically because defaults are applied. + +**Tool name format:** + +Both providers see normalized names (`Weather_GetCurrent`), but your test expectations use original names (`Weather.GetCurrent`). + +## Common pitfalls + +Avoid these common mistakes when working with multiple providers: + +1. **Using dots and underscores together** + ```python + # ❌ Don't mix separators + "My_Tool.GetData" + + # ✅ Use one consistently + "MyTool.GetData" or "MyTool_GetData" + ``` + +2. **Relying on specific schema transformations** + ```python + # ❌ OpenAI-specific null unions + {"type": ["string", "null"]} + + # ✅ Use optional parameters + {"type": "string"} # Not in required list + ``` + +3. **Forgetting to test with both providers** + ```bash + # ✅ Always test both + arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 + ``` + +## Troubleshooting + +### Tool name mismatch + +**Symptom:** Evaluation reports "tool not found" + +**Solution:** Check if tool name uses dots. The normalized name (with underscores) should match: + +```python +# Original: "Weather.GetCurrent" +# Normalized: "Weather_GetCurrent" +# Expected: ExpectedMCPToolCall("Weather_GetCurrent", {...}) +``` + +### Schema validation errors + +**Symptom:** OpenAI returns validation errors + +**Solution:** Check if your schema uses unsupported strict mode keywords. These are automatically stripped, but might affect expected behavior. + +### Missing optional parameters + +**Symptom:** Anthropic doesn't provide optional parameters + +**Solution:** This is expected. Optional parameters may be omitted. Ensure defaults are defined in your schema. + +### Enum type mismatches + +**Symptom:** OpenAI converts numeric enums to strings + +**Solution:** Use string enums in your schema: + +```python +# ✅ Use string enums +{"type": "string", "enum": ["low", "medium", "high"]} + +# ❌ Avoid numeric enums +{"type": "integer", "enum": [0, 1, 2]} # Converted to ["0", "1", "2"] +``` + +## Next steps + +- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) with provider-agnostic tests +- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple providers +- Explore [capture mode](/home/evaluate-tools/capture-mode) to see actual tool calls + diff --git a/app/en/home/evaluate-tools/run-evaluations/page.mdx b/app/en/home/evaluate-tools/run-evaluations/page.mdx index 0920ed24c..1fdf2bed9 100644 --- a/app/en/home/evaluate-tools/run-evaluations/page.mdx +++ b/app/en/home/evaluate-tools/run-evaluations/page.mdx @@ -3,215 +3,386 @@ title: "Run evaluations" description: "Learn how to run evaluations using Arcade" --- -# Run evaluations with the Arcade CLI +# Run evaluations -The Arcade Evaluation Framework allows you to run evaluations of your tool-enabled language models conveniently using the Arcade CLI. 
This enables you to execute your evaluation suites, gather results, and analyze the performance of your models in an efficient and streamlined manner. +The `arcade evals` command discovers and executes evaluation suites with support for multiple providers, models, and output formats. - - +import { Callout } from "nextra/components"; -Run evaluations of your tool-enabled language models using the Arcade CLI. +## Basic usage - +Run all evaluations in the current directory: - +```bash +arcade evals . +``` -- [Arcade CLI](/home/arcade-cli) -- [An MCP Server](/home/build-tools/create-a-mcp-server) -- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) +The command searches for files starting with `eval_` and ending with `.py`. - +Show detailed results with critic feedback: - +```bash +arcade evals . --details +``` -- How to use the `arcade evals` CLI command to run evaluations. +Filter to show only failures: - - +```bash +arcade evals . --failed-only +``` -### Using the `arcade evals` Command +## Multi-provider support -To run evaluations, use the `arcade evals` command provided by the Arcade CLI. This command searches for evaluation files in the specified directory, executes any functions decorated with `@tool_eval`, and displays the results. +### Single provider with default model -#### Basic Usage +Use OpenAI with default model (`gpt-4o`): ```bash -arcade evals +export OPENAI_API_KEY=sk-... +arcade evals . ``` -- ``: The directory containing your evaluation files. By default, it searches the current directory (`.`). +Use Anthropic with default model (`claude-sonnet-4-5-20250929`): + +```bash +export ANTHROPIC_API_KEY=sk-ant-... +arcade evals . --use-provider anthropic +``` + +### Specific models + +Specify one or more models for a provider: + +```bash +arcade evals . --use-provider openai:gpt-4o,gpt-4o-mini +``` + +### Multiple providers + +Compare performance across providers: + +```bash +arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 \ + --openai-key sk-... \ + --anthropic-key sk-ant-... +``` + +When you specify multiple models, results show side-by-side comparisons. + +## API keys + +API keys are resolved in the following order: + +| Priority | OpenAI | Anthropic | +|----------|--------|-----------| +| 1. Explicit flag | `--openai-key` | `--anthropic-key` | +| 2. Environment | `OPENAI_API_KEY` | `ANTHROPIC_API_KEY` | +| 3. `.env` file | `OPENAI_API_KEY=...` | `ANTHROPIC_API_KEY=...` | + + +Create a `.env` file in your project directory to avoid setting keys in every terminal session. + + +## Capture mode + +Record tool calls without scoring to bootstrap test expectations: + +```bash +arcade evals . --capture --file captures/baseline --format json +``` + +Include conversation context in captured output: + +```bash +arcade evals . --capture --add-context --file captures/detailed +``` + +Capture mode is useful for: +- Creating initial test expectations +- Debugging model behavior +- Understanding tool call patterns + +See [Capture mode](/home/evaluate-tools/capture-mode) for details. + +## Output formats + +### Save results to files + +Save results in one or more formats: + +```bash +arcade evals . --file results/out --format md,html +``` + +Save in all formats: + +```bash +arcade evals . 
--file results/out --format all +``` + +### Available formats + +| Format | Extension | Description | +|--------|-----------|-------------| +| `txt` | `.txt` | Plain text, pytest-style output | +| `md` | `.md` | Markdown with tables and collapsible sections | +| `html` | `.html` | Interactive HTML report | +| `json` | `.json` | Structured JSON for programmatic use | + +Multiple formats generate separate files: +- `results/out.txt` +- `results/out.md` +- `results/out.html` +- `results/out.json` + +## Command options -For example, to run evaluations in the current directory: +### Quick reference + +| Flag | Purpose | Example | +|------|---------|---------| +| `--use-provider` | Select provider/model | `--use-provider openai:gpt-4o` | +| `--capture` | Record without scoring | `--capture --file out` | +| `--details` | Show critic feedback | `--details` | +| `--failed-only` | Filter failures | `--failed-only` | +| `--format` | Output format(s) | `--format md,html,json` | +| `--max-concurrent` | Parallel limit | `--max-concurrent 10` | + +### `--use-provider` + +Specify which provider(s) and model(s) to use: ```bash -arcade evals +--use-provider [:,,...] ``` -#### Evaluation File Naming Convention +**Supported providers:** +- `openai` (default: `gpt-4o`) +- `anthropic` (default: `claude-sonnet-4-5-20250929`) -The `arcade evals` command looks for Python files that start with `eval_` and end with `.py` (e.g., `eval_math_tools.py`, `eval_slack_messaging.py`). These files should contain your evaluation suites. + +Anthropic model names include date stamps. Check [Anthropic's model documentation](https://docs.anthropic.com/en/docs/about-claude/models) for the latest model versions. + -#### Command Options +**Examples:** -The `arcade evals` command supports several options to customize the evaluation process: +```bash +# Default model for provider +arcade evals . --use-provider anthropic -- `--details`, `-d`: Show detailed results for each evaluation case, including critic feedback. +# Specific model +arcade evals . --use-provider openai:gpt-4o-mini - Example: +# Multiple models from same provider +arcade evals . --use-provider openai:gpt-4o,gpt-4o-mini - ```bash - arcade evals --details . - ``` +# Multiple providers +arcade evals . \ + --use-provider openai:gpt-4o \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` -- `--models`, `-m`: Specify the models to use for evaluation. Provide a comma-separated list of model names. +### `--openai-key`, `--anthropic-key` - Example: +Provide API keys explicitly: - ```bash - arcade evals --models gpt-4o,gpt-5 . - ``` +```bash +arcade evals . --use-provider openai --openai-key sk-... +``` -- `--max-concurrent`, `-c`: Set the maximum number of concurrent evaluations to run in parallel. +### `--capture` - Example: +Enable capture mode to record tool calls without scoring: - ```bash - arcade evals --max-concurrent 4 . - ``` +```bash +arcade evals . --capture +``` -- `--provider`, `-p`: The provider of the models to use for evaluation. Uses OpenAI by default. +### `--add-context` - Example: +Include system messages and conversation history in output: - ```bash - arcade evals --provider openai . - ``` +```bash +arcade evals . --add-context --file out --format md +``` -- `--provider-api-key`, `-k`: The model provider API key. If not provided, will look for the appropriate environment variable based on the provider (e.g., OPENAI_API_KEY for openai provider), first in the current environment, then in the current working directory's .env file. 
+### `--file` - Example: +Specify output file base name: - ```bash - arcade evals --provider-api-key my-api-key . - ``` +```bash +arcade evals . --file results/evaluation +``` -- `--debug`: Show debug information in the CLI. +### `--format` - Example: +Choose output format(s): - ```bash - arcade evals --debug . - ``` +```bash +arcade evals . --format md,html,json +``` + +Use `all` for all formats: + +```bash +arcade evals . --format all +``` -- `--help`: Show help information and exit. +### `--details`, `-d` - Example: +Show detailed results including critic feedback: - ```bash - arcade evals --help - ``` +```bash +arcade evals . --details +``` -#### Example Command +### `--failed-only` -Running evaluations in the `arcade_my_tools/evals` directory, showing detailed results, using the `gpt-5` model: +Show only failed test cases: ```bash -arcade evals arcade_my_tools/evals --details --models gpt-5 -k my-openai-api-key +arcade evals . --failed-only ``` -### Execution Process +### `--max-concurrent`, `-c` -When you run the `arcade evals` command, the following steps occur: +Set maximum concurrent evaluations: -1. **Preparation**: The CLI loads the evaluation suites from the specified directory, looking for files that match the naming convention. +```bash +arcade evals . --max-concurrent 10 +``` -2. **Execution**: The evaluation suites are executed asynchronously. Each suite's evaluation function, decorated with `@tool_eval`, is called with the appropriate configuration, including the model and concurrency settings. +Default is 5 concurrent evaluations. -3. **Concurrency**: Evaluations can run concurrently based on the `--max-concurrent` setting, improving efficiency. +### `--arcade-url` -4. **Result Aggregation**: Results from all evaluation cases and models are collected and aggregated. +Override Arcade gateway URL for testing: -### Displaying Results +```bash +arcade evals . --arcade-url https://staging.arcade.dev +``` -After the evaluations are complete, the results are displayed in a concise and informative format, similar to testing frameworks like `pytest`. The output includes: +## Understanding results -- **Summary**: Shows the total number of cases, how many passed, failed, or issued warnings. +### Summary format - Example: +Results show overall performance: - ``` - Summary -- Total: 5 -- Passed: 4 -- Failed: 1 - ``` +``` +Summary -- Total: 5 -- Passed: 4 -- Failed: 1 +``` + +### Case results + +Each case displays status and score: + +``` +PASSED Get weather for city -- Score: 1.00 +FAILED Weather with invalid city -- Score: 0.65 +``` + +### Detailed feedback + +Use `--details` to see critic-level analysis: -- **Detailed Case Results**: For each evaluation case, the status (PASSED, FAILED, WARNED), the case name, and the score are displayed. +``` +Details: + location: + Match: False, Score: 0.00/0.70 + Expected: Seattle + Actual: Seatle + units: + Match: True, Score: 0.30/0.30 +``` - Example: +### Multi-model results - ``` - PASSED Add two large numbers -- Score: 1.00 - FAILED Send DM with ambiguous username -- Score: 0.75 - ``` +When using multiple models, results show comparison tables: -- **Critic Feedback**: If the `--details` flag is used, detailed feedback from each critic is provided, highlighting matches, mismatches, and scores for each evaluated field. 
+``` +Case: Get weather for city + Model: gpt-4o -- Score: 1.00 -- PASSED + Model: gpt-4o-mini -- Score: 0.95 -- WARNED +``` - Example: +## Advanced usage - ``` - Details: - user_name: - Match: False, Score: 0.00/0.50 - Expected: johndoe - Actual: john_doe - message: - Match: True, Score: 0.50/0.50 - ``` +### Test against staging gateway -### Interpreting the Results +Point to a staging Arcade gateway: -- **Passed**: The evaluation case met or exceeded the fail threshold specified in the rubric. +```bash +export ARCADE_API_KEY=... +export ARCADE_USER_ID=... -- **Failed**: The evaluation case did not meet the fail threshold. +arcade evals . \ + --arcade-url https://staging.arcade.dev \ + --use-provider openai +``` -- **Warnings**: If the score is between the warn threshold and the fail threshold, a warning is issued. +### High concurrency for fast execution -Use the detailed feedback to understand where the model's performance can be improved, particularly focusing on mismatches identified by critics. +Increase concurrent evaluations: -### Customizing Evaluations +```bash +arcade evals . --max-concurrent 20 +``` -You can customize the evaluation process by adjusting: + +High concurrency may hit API rate limits. Start with default (5) and increase gradually. + -- **Rubrics**: Modify fail and warn thresholds, and adjust weights to emphasize different aspects of evaluation. +### Save comprehensive results -- **Critics**: Add or modify critics in your evaluation cases to target specific arguments or behaviors. +Generate all formats with full details: -- **Concurrency**: Adjust the `--max-concurrent` option to optimize performance based on your environment. +```bash +arcade evals . \ + --details \ + --add-context \ + --file results/full-report \ + --format all +``` -### Handling Multiple Models +## Troubleshooting -You can evaluate multiple models in a single run by specifying them in the `--models` option as a comma-separated list. This allows you to compare the performance of different models across the same evaluation suites. +### Missing dependencies -Example: +If you see `ImportError: MCP SDK is required`, install the full package: ```bash -arcade evals . --models gpt-4o,gpt-5 +pip install 'arcade-mcp[evals]' ``` -### Considerations +For Anthropic support: + +```bash +pip install anthropic +``` -- **Evaluation Files**: Ensure your evaluation files are correctly named and contain the evaluation suites decorated with `@tool_eval`. +### Tool name mismatches -- **Provider API Keys**: If you are using a different provider, you will need to set the appropriate API key in an environment variable, or use the `--provider-api-key` option. +Tool names are normalized (dots become underscores). If you see unexpected tool names, check [Provider compatibility](/home/evaluate-tools/provider-compatibility). -- **Tool Catalog**: Ensure your tool catalog is correctly defined and includes all the tools you want to evaluate. +### API rate limits -- **Weight distribution**: Ensure your weight distribution reflects the importance of each critic and that the sum of the weights is `1.0`. +Reduce `--max-concurrent` value: -## Conclusion +```bash +arcade evals . --max-concurrent 2 +``` -Running evaluations using the Arcade CLI provides a powerful and convenient way to assess the tool-calling capabilities of your language models. By leveraging the `arcade evals` command, you can efficiently execute your evaluation suites, analyze results, and iterate on your models and tools. 
+### No evaluation files found -Integrating this evaluation process into your development workflow helps ensure that your models interact with tools as expected, enhances reliability, and builds confidence in deploying actionable language models in production environments. +Ensure your evaluation files: +- Start with `eval_` +- End with `.py` +- Contain functions decorated with `@tool_eval()` ## Next steps -- **See an example MCP server with evaluations**: [Source code of a server with evaluations](https://github.com/ArcadeAI/arcade-mcp/tree/139cc2e54db0e5815f1c79dbe9e3285b4fe2bd66/examples/mcp_servers/server_with_evaluations) +- Explore [capture mode](/home/evaluate-tools/capture-mode) for recording tool calls +- Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) for comparing tool sources +- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) and schema differences diff --git a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx index 4f2af3a55..75d5c1e48 100644 --- a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx +++ b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx @@ -7,12 +7,12 @@ description: "Learn why evaluating your tools is important"
- When deploying language models with tool-calling capabilities in production environments, it's essential to ensure their effectiveness and reliability. This evaluation process goes beyond traditional testing and focuses on two key aspects: + Tool evaluations ensure AI models use your tools correctly in production. Unlike traditional testing, evaluations measure two key aspects: - 1. **Tool Utilization**: Assessing how efficiently the language model uses the available tools. - 2. **Intent Understanding**: Evaluating the language model's ability to comprehend user intents and select the appropriate tools to fulfill those intents. + 1. **Tool selection**: Does the model choose the right tools for the task? + 2. **Parameter accuracy**: Does the model provide correct arguments? - Arcade's Evaluation Framework provides a comprehensive approach to assess and validate the tool-calling capabilities of language models, ensuring they meet the high standards required for real-world applications. + Arcade's evaluation framework helps you validate tool-calling capabilities before deployment, ensuring reliability in real-world applications.
-## Why Evaluate Tool Calling by Task? +## What can go wrong? -Language models augmented with tool-use capabilities can perform complex tasks by invoking external tools or APIs. However, without proper evaluation, these models might: +Without proper evaluation, AI models might: -- **Misinterpret user intents**, leading to incorrect tool selection. -- **Provide incorrect arguments** to tools, causing failures or undesired outcomes. -- **Fail to execute the necessary sequence of tool calls**, especially in tasks requiring multiple steps. +- **Misinterpret user intents**, selecting the wrong tools +- **Provide incorrect arguments**, causing failures or unexpected behavior +- **Skip necessary tool calls**, missing steps in multi-step tasks +- **Make incorrect assumptions** about parameter defaults or formats -Evaluating tool calling by task ensures that the language model can handle specific scenarios reliably, providing confidence in its performance in production settings. +## How evaluation works -## Evaluation Scoring +Evaluations compare the model's actual tool calls with expected tool calls for each test case. -Scoring in the evaluation framework is based on comparing the model's actual tool calls with the expected ones for each evaluation case. The total score for a case depends on: +### Scoring components -1. **Tool Selection**: Whether the model selected the correct tools for the task. -2. **Tool Call Arguments**: The correctness of the arguments provided to the tools, evaluated by critics. -3. **Evaluation Rubric**: Each aspect of the evaluation is weighted according to the rubric, affecting its impact on the final score. +1. **Tool selection**: Did the model choose the correct tool? +2. **Parameter evaluation**: Are the arguments correct? (evaluated by critics) +3. **Weighted scoring**: Each aspect has a weight that affects the final score -The evaluation result includes: +### Evaluation results -- **Score**: A normalized value between 0.0 and 1.0. -- **Result**: - - _Passed_: Score is above the fail threshold. - - _Failed_: Score is below the fail threshold. - - _Warned_: Score is between the warning and fail thresholds. +Each test case receives: -## Critics: Types and Usage +- **Score**: A value between 0.0 and 1.0 +- **Status**: + - **Passed**: Score meets or exceeds fail threshold (default: 0.8) + - **Failed**: Score falls below fail threshold + - **Warned**: Score is between warn and fail thresholds (default: 0.9) -Critics are essential for evaluating the correctness of tool call arguments. Different types of critics serve various evaluation needs: +Example output: -### BinaryCritic +``` +PASSED Get weather for city -- Score: 1.00 +WARNED Send message with typo -- Score: 0.85 +FAILED Wrong tool selected -- Score: 0.50 +``` -`BinaryCritic`s check for exact matches between expected and actual values after casting. +## Critics: Validating parameters -- **Use Case**: When exact values are required (e.g., specific numeric parameters). -- **Example**: Ensuring the model provides the exact user ID in a function call. +Critics evaluate the correctness of tool call arguments. Choose the right critic for your validation needs. -### NumericCritic +### BinaryCritic -`NumericCritic` evaluates numeric values within a specified range, allowing for acceptable deviations. +Checks for exact matches after type casting. -- **Use Case**: When values can be approximate but should be within a certain threshold. 
-- **Example**: Accepting approximate results in mathematical computations due to floating-point precision. +**Use case**: Exact values required (user IDs, commands, enum values) + +```python +from arcade_evals import BinaryCritic + +BinaryCritic(critic_field="user_id", weight=1.0) +``` ### SimilarityCritic -`SimilarityCritic` measures the similarity between expected and actual string values using metrics like cosine similarity. +Measures textual similarity using cosine similarity. -- **Use Case**: When the exact wording isn't critical, but the content should be similar. -- **Example**: Evaluating if the message content in a communication tool is similar to the expected message. +**Use case**: Content should be similar but exact wording isn't critical (messages, descriptions) + +```python +from arcade_evals import SimilarityCritic + +SimilarityCritic( + critic_field="message", + weight=0.8, + similarity_threshold=0.85 +) +``` + +### NumericCritic + +Evaluates numeric values within a tolerance range. + +**Use case**: Approximate values acceptable (temperatures, measurements) + +```python +from arcade_evals import NumericCritic + +NumericCritic( + critic_field="temperature", + tolerance=2.0, + weight=0.5 +) +``` ### DatetimeCritic -`DatetimeCritic` evaluates the closeness of datetime values within a specified tolerance. +Checks datetime values within a time window. -- **Use Case**: When datetime values should be within a certain range of the expected time. -- **Example**: Verifying if a scheduled event time is close enough to the intended time. +**Use case**: Times should be close to expected (scheduled events, deadlines) -### Choosing the Right Critic +```python +from datetime import timedelta +from arcade_evals import DatetimeCritic -- **Exact Matches Needed**: Use **BinaryCritic** for strict equality. -- **Numeric Ranges**: Use **NumericCritic** when a tolerance is acceptable. -- **Textual Similarity**: Use **SimilarityCritic** for comparing messages or descriptions. -- **Datetime Tolerance**: Use **DatetimeCritic** when a tolerance is acceptable for datetime comparisons. +DatetimeCritic( + critic_field="due_date", + tolerance=timedelta(hours=1), + weight=0.6 +) +``` -Critics are defined with fields such as `critic_field`, `weight`, and parameters specific to their types (e.g., `similarity_threshold` for `SimilarityCritic`). +## Setting thresholds with rubrics -## Rubrics and Setting Thresholds +An `EvalRubric` defines pass/fail criteria: -An **EvalRubric** defines the evaluation criteria and thresholds for determining pass/fail outcomes. Key components include: +```python +from arcade_evals import EvalRubric -- **Fail Threshold**: The minimum score required to pass the evaluation. -- **Warn Threshold**: The score threshold for issuing a warning. -- **Weights**: Assigns importance to different aspects of the evaluation (e.g., tool selection, argument correctness). +rubric = EvalRubric( + fail_threshold=0.85, # Minimum score to pass + warn_threshold=0.95, # Score for warnings +) +``` -### Setting Up a Rubric +**Default thresholds:** +- Fail threshold: 0.8 +- Warn threshold: 0.9 -- **Define Fail and Warn Thresholds**: Choose values between 0.0 and 1.0 to represent acceptable performance levels. -- **Assign Weights**: Allocate weights to tool selection and critics to reflect their importance in the overall evaluation. -- **Configure Failure Conditions**: Set flags like `fail_on_tool_selection` to enforce strict criteria. 
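To make the thresholds concrete, here is a small, illustrative sketch of how a score maps to a status. The `status_for` helper is hypothetical rather than part of the framework's API, and it assumes (as in the example output above) that scores at or above the warn threshold pass, while scores between the fail and warn thresholds produce a warning:

```python
def status_for(score: float, fail_threshold: float = 0.8, warn_threshold: float = 0.9) -> str:
    """Map a normalized score to a PASSED / WARNED / FAILED status."""
    if score < fail_threshold:
        return "FAILED"
    if score < warn_threshold:
        return "WARNED"
    return "PASSED"


print(status_for(1.00))  # PASSED
print(status_for(0.85))  # WARNED -- between the fail and warn thresholds
print(status_for(0.50))  # FAILED
```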
+### Example scenarios -### Example Rubric Configuration: +**Strict evaluation** (critical production systems): -A rubric that requires a score of at least 0.85 to pass and issues a warning if the score is between 0.85 and 0.95: +```python +rubric = EvalRubric( + fail_threshold=0.95, + warn_threshold=0.98, +) +``` -- Fail Threshold: 0.85 -- Warn Threshold: 0.95 -- Fail on Tool Selection: True -- Tool Selection Weight: 1.0 +**Lenient evaluation** (exploratory testing): ```python rubric = EvalRubric( - fail_threshold=0.85, - warn_threshold=0.95, - fail_on_tool_selection=True, - tool_selection_weight=1.0, + fail_threshold=0.6, + warn_threshold=0.8, +) +``` + +## Building effective evaluation suites + +A comprehensive evaluation suite includes: + +### 1. Common cases + +Test typical user requests: + +```python +suite.add_case( + name="Get weather for city", + user_message="What's the weather in Seattle?", + expected_tool_calls=[ + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Seattle"}) + ], ) ``` -## Building an Evaluation Suite +### 2. Edge cases + +Test unusual or boundary conditions: + +```python +suite.add_case( + name="Weather with ambiguous location", + user_message="What's the weather in Portland?", # Portland, OR or ME? + expected_tool_calls=[ + ExpectedMCPToolCall( + "Weather_GetCurrent", + {"location": "Portland", "state": "OR"} + ) + ], +) +``` -An **EvalSuite** orchestrates the running of multiple evaluation cases. Here's how to build one: +### 3. Multi-step cases + +Test sequences requiring multiple tool calls: + +```python +suite.add_case( + name="Compare weather in two cities", + user_message="Compare the weather in Seattle and Portland", + expected_tool_calls=[ + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Seattle"}), + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Portland"}), + ], +) +``` -1. **Initialize EvalSuite**: Provide a name, system message, tool catalog, and rubric. -2. **Add Evaluation Cases**: Use `add_case` or `extend_case` to include various scenarios. -3. **Specify Expected Tool Calls**: Define the tools and arguments expected for each case. -4. **Assign Critics**: Attach critics relevant to each case to evaluate specific arguments. -5. **Run the Suite**: Execute the suite using the Arcade CLI to collect results. +### 4. 
Context-dependent cases + +Test with conversation history: + +```python +suite.add_case( + name="Weather from previous context", + user_message="What about the weather there?", + expected_tool_calls=[ + ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Tokyo"}) + ], + additional_messages=[ + {"role": "user", "content": "I'm traveling to Tokyo next week."}, + {"role": "assistant", "content": "Tokyo is a great destination!"}, + ], +) +``` + +## Example evaluation suites + +### Weather tools + +```python +@tool_eval() +async def weather_eval_suite(): + suite = EvalSuite( + name="Weather Tools", + system_message="You are a weather assistant.", + ) + + await suite.add_mcp_stdio_server(["python", "weather_server.py"]) + + suite.add_case( + name="Current weather", + user_message="What's the weather in Seattle?", + expected_tool_calls=[ + ExpectedMCPToolCall("GetWeather", {"city": "Seattle", "type": "current"}) + ], + critics=[ + BinaryCritic(critic_field="city", weight=0.7), + BinaryCritic(critic_field="type", weight=0.3), + ], + ) + + return suite +``` -### Example: Math Tools Evaluation Suite +### Communication tools -An evaluation suite for math tools might include cases such as: +```python +@tool_eval() +async def slack_eval_suite(): + suite = EvalSuite( + name="Slack Messaging", + system_message="You are a Slack assistant.", + ) + + await suite.add_arcade_gateway(gateway_slug="slack-gateway") + + suite.add_case( + name="Send direct message", + user_message="Send a DM to @alice saying 'Meeting at 3 PM'", + expected_tool_calls=[ + ExpectedMCPToolCall( + "send_dm", + {"username": "alice", "message": "Meeting at 3 PM"} + ) + ], + critics=[ + BinaryCritic(critic_field="username", weight=0.4), + SimilarityCritic(critic_field="message", weight=0.6), + ], + ) + + return suite +``` + +## Best practices + +### Start simple + +Begin with straightforward cases and add complexity gradually: + +1. Single tool call with exact parameters +2. Single tool call with flexible parameters +3. Multiple tool calls +4. Context-dependent calls + +### Weight critics appropriately + +Assign weights based on importance: + +```python +critics=[ + BinaryCritic(critic_field="user_id", weight=0.7), # Critical + SimilarityCritic(critic_field="message", weight=0.3), # Less critical +] +``` + +Or use fuzzy weights: + +```python +from arcade_evals.weights import FuzzyWeight + +critics=[ + BinaryCritic(critic_field="user_id", weight=FuzzyWeight.CRITICAL), + SimilarityCritic(critic_field="message", weight=FuzzyWeight.MEDIUM), +] +``` + +### Test with multiple models + +Compare performance across models: + +```bash +arcade evals . \ + --use-provider openai:gpt-4o,gpt-4o-mini \ + --use-provider anthropic:claude-sonnet-4-5-20250929 +``` -- **Adding Two Large Numbers**: - - **User Message**: "Add 12345 and 987654321" - - **Expected Tool Call**: `add(a=12345, b=987654321)` - - **Critics**: - - `BinaryCritic` for arguments `a` and `b` -- **Calculating Square Roots**: - - **User Message**: "What is the square root of 3224990521?" - - **Expected Tool Call**: `sqrt(a=3224990521)` - - **Critics**: - - `BinaryCritic` for argument `a` +### Iterate based on results -### Example: Slack Messaging Tools Evaluation Suite +Use evaluation results to: +1. Identify common failure patterns +2. Improve tool descriptions +3. Refine parameter validation +4. 
Add missing test cases -An evaluation suite for Slack messaging tools might include cases such as: +## Next steps -- **Sending a Direct Message**: - - **User Message**: "Send a direct message to johndoe saying 'Hello, can we meet at 3 PM?'" - - **Expected Tool Call**: `send_dm_to_user(user_name='johndoe', message='Hello, can we meet at 3 PM?')` - - **Critics**: - - `BinaryCritic` for `user_name` - - `SimilarityCritic` for `message` -- **Posting a Message to a Channel**: - - **User Message**: "Post 'The new feature is now live!' in the #announcements channel" - - **Expected Tool Call**: `send_message_to_channel(channel_name='announcements', message='The new feature is now live!')` - - **Critics**: - - `BinaryCritic` for `channel_name` - - `SimilarityCritic` for `message` +- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) to start testing your tools +- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple providers +- Explore [capture mode](/home/evaluate-tools/capture-mode) to bootstrap test expectations +- Compare tool sources with [comparative evaluations](/home/evaluate-tools/comparative-evaluations) From 13becd24830552ab56d57804cf2a32725cf35ce0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 29 Dec 2025 16:17:32 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=A4=96=20Regenerate=20LLMs.txt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- public/llms.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/llms.txt b/public/llms.txt index 286da23b9..0182ee0e4 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -1,4 +1,4 @@ - + # Arcade @@ -126,7 +126,10 @@ Arcade delivers three core capabilities: Deploy agents even your security team w ## Evaluate Tools +- [Capture mode](https://docs.arcade.dev/en/home/evaluate-tools/capture-mode.md): The "Capture mode" documentation page guides users on how to record tool calls without scoring, enabling them to bootstrap test expectations, debug model behavior, and explore new tools. It outlines typical workflows, basic usage steps, and best practices for capturing and converting +- [Comparative evaluations](https://docs.arcade.dev/en/home/evaluate-tools/comparative-evaluations.md): This documentation page provides guidance on conducting comparative evaluations by running the same test cases against different tool implementations, allowing users to compare tool sources side-by-side. It explains the concept of tracks, outlines the steps for setting up and executing comparative evaluations, and offers - [Evaluate tools](https://docs.arcade.dev/en/home/evaluate-tools/create-an-evaluation-suite.md): This documentation page provides a comprehensive guide on how to create and run an evaluation suite for assessing tools using the Arcade framework. Users will learn to define evaluation cases, utilize various critics to measure performance, and execute evaluations to ensure their tools are effectively integrated with +- [Provider compatibility](https://docs.arcade.dev/en/home/evaluate-tools/provider-compatibility.md): This documentation page provides guidance on how to navigate tool evaluations across different providers, specifically OpenAI and Anthropic, highlighting their unique requirements for tool schemas and message formats. 
It details the normalization of tool names, the implications of OpenAI's strict mode, - [Run evaluations with the Arcade CLI](https://docs.arcade.dev/en/home/evaluate-tools/run-evaluations.md): This documentation page provides guidance on using the Arcade CLI to run evaluations of tool-enabled language models. It outlines the steps to execute evaluation suites, customize the evaluation process with various command options, and analyze the results efficiently. Users will learn how to utilize the - [Why evaluate tools?](https://docs.arcade.dev/en/home/evaluate-tools/why-evaluate-tools.md): This documentation page explains the importance of evaluating tools used in language models with tool-calling capabilities, focusing on their effectiveness and reliability in production environments. It outlines the evaluation framework, which assesses tool utilization and intent understanding, and details the scoring system based on From ae62af64890d3e7c42b5c4433600d851f530e0df Mon Sep 17 00:00:00 2001 From: jottakka Date: Mon, 29 Dec 2025 17:30:58 -0300 Subject: [PATCH 3/5] linting fix --- app/en/home/evaluate-tools/_meta.tsx | 34 +- .../home/evaluate-tools/capture-mode/page.mdx | 238 +-------- .../comparative-evaluations/page.mdx | 123 +++-- .../create-an-evaluation-suite/page.mdx | 221 +++----- .../provider-compatibility/page.mdx | 496 ------------------ .../evaluate-tools/run-evaluations/page.mdx | 92 ++-- .../why-evaluate-tools/page.mdx | 297 +---------- 7 files changed, 280 insertions(+), 1221 deletions(-) delete mode 100644 app/en/home/evaluate-tools/provider-compatibility/page.mdx diff --git a/app/en/home/evaluate-tools/_meta.tsx b/app/en/home/evaluate-tools/_meta.tsx index 71434ccac..eafc38c4c 100644 --- a/app/en/home/evaluate-tools/_meta.tsx +++ b/app/en/home/evaluate-tools/_meta.tsx @@ -1,8 +1,28 @@ -export default { - "why-evaluate-tools": "Why evaluate tools?", - "create-an-evaluation-suite": "Create an evaluation suite", - "run-evaluations": "Run evaluations", - "capture-mode": "Capture mode", - "comparative-evaluations": "Comparative evaluations", - "provider-compatibility": "Provider compatibility", +import type { MetaRecord } from "nextra"; + +const meta: MetaRecord = { + "*": { + theme: { + breadcrumb: true, + toc: true, + copyPage: true, + }, + }, + "why-evaluate-tools": { + title: "Why evaluate tools?", + }, + "create-an-evaluation-suite": { + title: "Create an evaluation suite", + }, + "run-evaluations": { + title: "Run evaluations", + }, + "capture-mode": { + title: "Capture mode", + }, + "comparative-evaluations": { + title: "Comparative evaluations", + }, }; + +export default meta; diff --git a/app/en/home/evaluate-tools/capture-mode/page.mdx b/app/en/home/evaluate-tools/capture-mode/page.mdx index 6324dc079..16bf5f7a5 100644 --- a/app/en/home/evaluate-tools/capture-mode/page.mdx +++ b/app/en/home/evaluate-tools/capture-mode/page.mdx @@ -3,11 +3,17 @@ title: "Capture mode" description: "Record tool calls without scoring to bootstrap test expectations" --- +import { Callout, Steps } from "nextra/components"; + # Capture mode Capture mode records tool calls without evaluating them. Use it to bootstrap test expectations or debug model behavior. -import { Callout, Steps } from "nextra/components"; + + **Backward compatibility**: Capture mode works with existing evaluation + suites. Simply add the `--capture` flag to any `arcade evals` command. No code + changes needed. 
+ ## When to use capture mode @@ -52,22 +58,22 @@ async def capture_weather_suite(): name="Weather Capture", system_message="You are a weather assistant.", ) - + await suite.add_mcp_stdio_server(["python", "weather_server.py"]) - + # Add cases without expected tool calls suite.add_case( name="Simple weather query", user_message="What's the weather in Seattle?", expected_tool_calls=[], # Empty for capture ) - + suite.add_case( name="Multi-city comparison", user_message="Compare the weather in Seattle and Portland", expected_tool_calls=[], ) - + return suite ``` @@ -186,6 +192,7 @@ Markdown format is more readable for quick review: **Input:** What's the weather in Seattle? **Tool Calls:** + - `Weather_GetCurrent` - location: Seattle - units: fahrenheit @@ -202,95 +209,6 @@ arcade evals . --capture \ --file captures/comparison --format json ``` -## Programmatic capture - -Use capture mode from Python code: - -```python -import asyncio -from openai import AsyncOpenAI -from arcade_evals import EvalSuite - -async def capture_tool_calls(): - suite = EvalSuite(name="Weather", system_message="You are helpful.") - await suite.add_mcp_stdio_server(["python", "server.py"]) - - suite.add_case( - name="weather_query", - user_message="What's the weather in Seattle?", - expected_tool_calls=[], - ) - - client = AsyncOpenAI(api_key="sk-...") - - result = await suite.capture( - client=client, - model="gpt-4o", - provider="openai", - include_context=True, - ) - - # Access captured data - for case in result.captured_cases: - print(f"Case: {case.case_name}") - for tool_call in case.tool_calls: - print(f" Tool: {tool_call.name}") - print(f" Args: {tool_call.args}") - - # Save to file - result.write_to_file("captures/output.json", include_context=True) - - return result - -asyncio.run(capture_tool_calls()) -``` - -## Capture result structure - -### CaptureResult - -Top-level capture result: - -```python -@dataclass -class CaptureResult: - suite_name: str - model: str - provider: str - captured_cases: list[CapturedCase] -``` - -Methods: -- `to_dict(include_context=False)` → dict -- `to_json(include_context=False, indent=2)` → JSON string -- `write_to_file(file_path, include_context=False, indent=2)` → None - -### CapturedCase - -Individual test case result: - -```python -@dataclass -class CapturedCase: - case_name: str - user_message: str - tool_calls: list[CapturedToolCall] - system_message: str | None = None - additional_messages: list[dict] | None = None - track_name: str | None = None -``` - -### CapturedToolCall - -Individual tool call: - -```python -@dataclass -class CapturedToolCall: - name: str - args: dict[str, Any] -``` - ## Capture with comparative tracks Capture from multiple tool sources to see how different implementations behave: @@ -302,25 +220,25 @@ async def capture_comparative(): name="Weather Comparison", system_message="You are a weather assistant.", ) - + # Register different tool sources await suite.add_mcp_server( "http://weather-api-1.example/mcp", track="Weather API v1" ) - + await suite.add_mcp_server( "http://weather-api-2.example/mcp", track="Weather API v2" ) - + # Capture will run against each track suite.add_case( name="get_weather", user_message="What's the weather in Seattle?", expected_tool_calls=[], ) - + return suite ``` @@ -432,16 +350,9 @@ ExpectedMCPToolCall("GetWeather", {"location": "Seattle", "units": "fahrenheit"} ExpectedMCPToolCall("GetWeather", {"location": "Tokyo", "units": "celsius"}) ``` -### Step 3: Add appropriate critics +### Step 3: Add critics -Choose critics 
based on parameter importance: - -```python -critics=[ - BinaryCritic(critic_field="location", weight=0.8), # Critical - BinaryCritic(critic_field="units", weight=0.2), # Less critical -] -``` +Add critics to validate parameters. See [Critics](/home/evaluate-tools/create-an-evaluation-suite#critics) for options. ### Step 4: Run evaluations @@ -454,6 +365,7 @@ arcade evals . --details ### Step 5: Iterate Use failures to refine: + - Adjust expected values - Change critic weights - Modify tool descriptions @@ -466,6 +378,7 @@ Use failures to refine: **Symptom:** Empty `tool_calls` arrays **Possible causes:** + 1. Model didn't call any tools 2. Tools not properly registered 3. System message doesn't encourage tool use @@ -479,20 +392,6 @@ suite = EvalSuite( ) ``` -### Unexpected tool names - -**Symptom:** Tool names have underscores instead of dots - -**Explanation:** Tool names are normalized for provider compatibility. `Weather.GetCurrent` becomes `Weather_GetCurrent`. - -**Solution:** Use normalized names in expectations: - -```python -ExpectedMCPToolCall("Weather_GetCurrent", {...}) -``` - -See [Provider compatibility](/home/evaluate-tools/provider-compatibility) for details. - ### Missing parameters **Symptom:** Some parameters are missing from captured calls @@ -509,104 +408,7 @@ See [Provider compatibility](/home/evaluate-tools/provider-compatibility) for de **Solution:** This is expected. Use captures to understand provider-specific behavior, then create provider-agnostic tests. -## Example workflow - -Here's a complete workflow from capture to evaluation: - - - -### Create capture suite - -```python -@tool_eval() -async def initial_capture(): - suite = EvalSuite(name="Slack Tools", system_message="You are a Slack assistant.") - await suite.add_arcade_gateway(gateway_slug="slack") - - suite.add_case( - name="send_message", - user_message="Send a message to #general saying 'Hello team'", - expected_tool_calls=[], - ) - - suite.add_case( - name="send_dm", - user_message="Send a DM to alice saying 'Meeting at 3'", - expected_tool_calls=[], - ) - - return suite -``` - -### Capture with multiple models - -```bash -arcade evals . --capture \ - --use-provider openai:gpt-4o,gpt-4o-mini \ - --file captures/slack --format json,md -``` - -### Review markdown output - -```markdown -## Slack Tools - -### Model: gpt-4o - -#### Case: send_message -**Tool Calls:** -- `send_message_to_channel` - - channel: general - - message: Hello team - -#### Case: send_dm -**Tool Calls:** -- `send_dm_to_user` - - user: alice - - message: Meeting at 3 -``` - -### Create evaluation suite - -```python -@tool_eval() -async def slack_eval(): - suite = EvalSuite(name="Slack Tools", system_message="You are a Slack assistant.") - await suite.add_arcade_gateway(gateway_slug="slack") - - suite.add_case( - name="send_message", - user_message="Send a message to #general saying 'Hello team'", - expected_tool_calls=[ - ExpectedMCPToolCall( - "send_message_to_channel", - {"channel": "general", "message": "Hello team"} - ) - ], - critics=[ - BinaryCritic(critic_field="channel", weight=0.4), - SimilarityCritic(critic_field="message", weight=0.6), - ], - ) - - return suite -``` - -### Run evaluations - -```bash -arcade evals . --details -``` - -### Iterate based on results - -Refine expectations and critics based on evaluation results. 
- - - ## Next steps - Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) to compare tool sources -- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) for cross-provider testing - [Create evaluation suites](/home/evaluate-tools/create-an-evaluation-suite) with expectations - diff --git a/app/en/home/evaluate-tools/comparative-evaluations/page.mdx b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx index ada3036b2..4358d0884 100644 --- a/app/en/home/evaluate-tools/comparative-evaluations/page.mdx +++ b/app/en/home/evaluate-tools/comparative-evaluations/page.mdx @@ -5,34 +5,57 @@ description: "Compare different tool implementations with the same test cases" # Comparative evaluations -Comparative evaluations let you run the same test cases against different tool implementations. Use tracks to compare tool sources side-by-side. +Comparative evaluations let you test how well AI models select and use tools from different, isolated tool sources. Each "track" represents a separate tool registry, allowing you to compare implementations side-by-side. import { Callout, Steps } from "nextra/components"; ## What are tracks? -Tracks are isolated tool registries within a single evaluation suite. Each track represents a different source of tools. +**Tracks are isolated tool registries** within a single evaluation suite. Each track has its own set of tools that are **not shared** with other tracks. This isolation lets you test how models perform when given different tool options for the same task. + +**Key concept**: Comparative evaluations test tool **selection** across different tool sets. Each track provides a different context (set of tools) to the model. **Common use cases:** - **Compare tool providers**: Test Google Weather vs OpenWeather API -- **Version testing**: Compare API v1 vs API v2 -- **Implementation comparison**: Test different MCP servers for the same functionality +- **Implementation comparison**: Test different MCP servers offering similar functionality - **A/B testing**: Evaluate alternative tool designs ### When to use comparative evaluations Use **comparative evaluations** when: + - ✅ Testing multiple implementations of the same functionality -- ✅ Comparing different API versions -- ✅ Evaluating tool providers side-by-side -- ✅ A/B testing tool designs +- ✅ Comparing different tool providers +- ✅ Evaluating how models choose between different tool sets Use **regular evaluations** when: + - ✅ Testing a single tool implementation -- ✅ Validating tool behavior +- ✅ Testing mixed tools from multiple sources in the same context - ✅ Regression testing +### Testing mixed tool sources + +To test how multiple MCP servers work **together** in the same context (not isolated), use a regular evaluation and load multiple sources: + +```python +@tool_eval() +async def mixed_tools_eval(): + suite = EvalSuite(name="Mixed Tools", system_message="You are helpful.") + + # All tools available to the model in the same context + await suite.add_mcp_server("http://server1.example") + await suite.add_mcp_server("http://server2.example") + suite.add_tool_definitions([{"name": "CustomTool", ...}]) + + # Model can use any tool from any source + suite.add_case(...) + return suite +``` + +Alternatively, use an Arcade Gateway which aggregates tools from multiple sources. 
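As a sketch of that alternative (the gateway slug `my-gateway` is a placeholder), a single gateway registration exposes every aggregated tool to the model in one shared context:

```python
@tool_eval()
async def gateway_mixed_eval():
    suite = EvalSuite(name="Gateway Tools", system_message="You are helpful.")

    # One registration exposes all tools aggregated behind the gateway.
    await suite.add_arcade_gateway(gateway_slug="my-gateway")

    # The model can use any tool the gateway provides.
    suite.add_case(...)
    return suite
```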
+ ## Basic comparative evaluation @@ -50,19 +73,19 @@ async def weather_comparison(): name="Weather API Comparison", system_message="You are a weather assistant.", ) - + # Track A: Weather API v1 await suite.add_mcp_server( "http://weather-v1.example/mcp", track="Weather v1" ) - + # Track B: Weather API v2 await suite.add_mcp_server( "http://weather-v2.example/mcp", track="Weather v2" ) - + return suite ``` @@ -164,7 +187,8 @@ suite.add_tool_definitions( ``` -Tools must be registered before creating comparative cases that reference their tracks. + Tools must be registered before creating comparative cases that reference + their tracks. ## Comparative case builder @@ -203,9 +227,28 @@ suite.add_comparative_case( ).for_track("Weather v1", ...).for_track("Weather v2", ...) ``` +**Bias-aware message design:** + +Design `additional_messages` to avoid leading the model. Keep them neutral so you measure tool behavior, not prompt hints: + +```python +# ✅ Good - Neutral +additional_messages=[ + {"role": "user", "content": "I need weather information"}, + {"role": "assistant", "content": "I can help with that. Which location?"}, +] + +# ❌ Avoid - Tells the model which tool to call +additional_messages=[ + {"role": "user", "content": "Use the GetWeather tool for Seattle"}, +] +``` + +Keep messages generic so the model chooses tools naturally based on what is available in the track. + ### Different expectations per track -Tracks often have different tool names and parameters: +Tracks can expose different tools and schemas. Because of that, you may need different critics per track: ```python suite.add_comparative_case( @@ -222,6 +265,7 @@ suite.add_comparative_case( expected_tool_calls=[ ExpectedMCPToolCall("Bing_WebSearch", {"q": "Python tutorials"}) ], + # Different schema, so validate the matching field for this track critics=[BinaryCritic(critic_field="q", weight=1.0)], ) ``` @@ -246,23 +290,34 @@ async def search_comparison(): name="Search API Comparison", system_message="You are a search assistant. 
Use the available tools to search for information.", ) - - # Register search providers + + # Register search providers (MCP servers) await suite.add_mcp_server( "http://google-search.example/mcp", track="Google", ) - + await suite.add_mcp_server( "http://bing-search.example/mcp", track="Bing", ) - - await suite.add_mcp_server( - "http://duckduckgo.example/mcp", + + # Mix with manual tool definitions + suite.add_tool_definitions( + tools=[{ + "name": "DDG_Search", + "description": "Search using DuckDuckGo", + "inputSchema": { + "type": "object", + "properties": { + "query": {"type": "string"} + }, + "required": ["query"] + } + }], track="DuckDuckGo", ) - + # Simple query suite.add_comparative_case( name="basic_search", @@ -282,11 +337,11 @@ async def search_comparison(): ).for_track( "DuckDuckGo", expected_tool_calls=[ - ExpectedMCPToolCall("DDG_Search", {"search_term": "Python tutorials"}) + ExpectedMCPToolCall("DDG_Search", {"query": "Python tutorials"}) ], - critics=[BinaryCritic(critic_field="search_term", weight=1.0)], + critics=[BinaryCritic(critic_field="query", weight=1.0)], ) - + # Query with filters suite.add_comparative_case( name="search_with_filters", @@ -320,15 +375,14 @@ async def search_comparison(): expected_tool_calls=[ ExpectedMCPToolCall( "DDG_Search", - {"search_term": "Python tutorials", "time": "m"} + {"query": "Python tutorials"} ) ], critics=[ - SimilarityCritic(critic_field="search_term", weight=0.7), - BinaryCritic(critic_field="time", weight=0.3), + SimilarityCritic(critic_field="query", weight=1.0), ], ) - + return suite ``` @@ -396,20 +450,20 @@ async def mixed_suite(): name="Mixed Evaluation", system_message="You are helpful.", ) - + # Register default tools await suite.add_mcp_stdio_server(["python", "server.py"]) - + # Regular case (uses default tools) suite.add_case( name="regular_test", user_message="Do something", expected_tool_calls=[...], ) - + # Register track-specific tools await suite.add_mcp_server("http://api-v2.example", track="v2") - + # Comparative case suite.add_comparative_case( name="compare_versions", @@ -421,12 +475,12 @@ async def mixed_suite(): "v2", # Uses v2 tools expected_tool_calls=[...], ) - + return suite ``` -Use track name `"default"` to reference tools registered without a track. + Use track name `"default"` to reference tools registered without a track. ## Capture mode with tracks @@ -471,6 +525,7 @@ arcade evals . 
\ ``` Results show: + - Per-track scores for each model - Cross-track comparisons for each model - Cross-model comparisons for each track @@ -699,6 +754,4 @@ suite.add_tool_catalog(catalog_v2, track="Python v2") - [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) with tracks - Use [capture mode](/home/evaluate-tools/capture-mode) to discover track-specific tool calls -- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) when comparing across providers - [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple models and tracks - diff --git a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx index 51434bfe1..838cb86bd 100644 --- a/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx +++ b/app/en/home/evaluate-tools/create-an-evaluation-suite/page.mdx @@ -15,7 +15,7 @@ import { Steps, Tabs, Callout } from "nextra/components"; Install Arcade with evaluation support: - + ```bash @@ -43,7 +43,8 @@ touch eval_server.py ``` -Evaluation files must start with `eval_` and use the `.py` extension. The CLI automatically discovers these files. + Evaluation files must start with `eval_` and use the `.py` extension. The CLI + automatically discovers these files. ### Define your evaluation suite @@ -65,12 +66,12 @@ async def weather_eval_suite() -> EvalSuite: name="Weather Tools", system_message="You are a helpful weather assistant.", ) - + # Load tools from your MCP server await suite.add_mcp_stdio_server( command=["python", "server.py"], ) - + # Add a test case suite.add_case( name="Get weather for city", @@ -86,7 +87,7 @@ async def weather_eval_suite() -> EvalSuite: BinaryCritic(critic_field="units", weight=0.3), ], ) - + return suite ``` @@ -99,11 +100,23 @@ export OPENAI_API_KEY= arcade evals . ``` -The command discovers all `eval_*.py` files and executes them. +The command discovers all `eval_*.py` files and executes them using OpenAI's `gpt-4o` model by default. - -By default, evaluations use OpenAI's `gpt-4o` model. To use Anthropic or different models, see [Run evaluations](/home/evaluate-tools/run-evaluations). - +**Using different providers:** + +```bash +# Anthropic +export ANTHROPIC_API_KEY= +arcade evals . --use-provider anthropic + +# Or specify API key directly +arcade evals . --use-provider anthropic --anthropic-key + +# Multiple models +arcade evals . --use-provider openai:gpt-4o,gpt-4o-mini +``` + +See [Run evaluations](/home/evaluate-tools/run-evaluations) for all available options. ### Understand the results @@ -119,7 +132,7 @@ Example output: Suite: Weather Tools Model: gpt-4o PASSED Get weather for city -- Score: 1.00 - + Summary -- Total: 1 -- Passed: 1 -- Failed: 0 ``` @@ -144,11 +157,7 @@ PASSED Get weather for city -- Score: 1.00 ## Loading tools -You can load tools from different sources depending on your setup. - - -All tool loading methods are async and must be awaited. Ensure your evaluation function is decorated with `@tool_eval()` and defined as `async`. - +You can load tools from different sources. All methods are async and must be awaited in your `@tool_eval()` decorated function. 
### From MCP HTTP server @@ -157,20 +166,23 @@ Load tools from an HTTP or SSE MCP server: ```python await suite.add_mcp_server( url="http://localhost:8000", - headers={"Authorization": "Bearer token"}, + headers={"Authorization": "Bearer token"}, # Optional + timeout=10, # Optional: Connection timeout (default: 10) + use_sse=False, # Optional: Use SSE transport (default: False) ) ``` -The loader automatically appends `/mcp` to the URL if not present. +The URL is automatically normalized (appends `/mcp` if not present). ### From MCP stdio server -Load tools from a stdio MCP server: +Load tools from a stdio MCP server process: ```python await suite.add_mcp_stdio_server( command=["python", "server.py"], - env={"API_KEY": "secret"}, + env={"API_KEY": "secret"}, # Optional: Environment variables + timeout=10, # Optional: Connection timeout (default: 10) ) ``` @@ -181,26 +193,18 @@ Load tools from an Arcade MCP Gateway: ```python await suite.add_arcade_gateway( gateway_slug="my-gateway", - arcade_api_key="your-api-key", - arcade_user_id="user-id", + arcade_api_key="your-api-key", # Optional: Defaults to ARCADE_API_KEY env var + arcade_user_id="user-id", # Optional: Defaults to ARCADE_USER_ID env var + base_url=None, # Optional: Override gateway URL + timeout=10, # Optional: Connection timeout (default: 10) ) ``` - -Tool loading results are cached automatically to avoid redundant connections. If you update your MCP server, use `clear_tools_cache()` to reload: - -```python -from arcade_evals import clear_tools_cache - -clear_tools_cache() -``` - - ### Manual tool definitions Define tools manually using MCP format: -```python +````python suite.add_tool_definitions([ { "name": "Weather.GetCurrent", @@ -219,7 +223,21 @@ suite.add_tool_definitions([ }, } ]) -``` + +### Mixing tool sources + +You can load tools from multiple sources into the same suite: + +```python +# Load from multiple MCP servers +await suite.add_mcp_server("http://server1.example") +await suite.add_mcp_server("http://server2.example") + +# Mix with manual definitions +suite.add_tool_definitions([{"name": "CustomTool", ...}]) +```` + +All tools are accumulated in the suite's registry and available to the model. ## Expected tool calls @@ -233,113 +251,36 @@ ExpectedMCPToolCall( ``` -Tool names are normalized for provider compatibility. Dots (`.`) become underscores (`_`). For example, `Weather.GetCurrent` becomes `Weather_GetCurrent`. See [Provider compatibility](/home/evaluate-tools/provider-compatibility) for details. + Tool names are normalized for compatibility with model tool calling. Dots + (`.`) become underscores (`_`). For example, `Weather.GetCurrent` becomes + `Weather_GetCurrent`. ## Critics -Critics evaluate specific parameters of tool calls. Choose the right critic for your validation needs. - -| Critic Type | Use When | Example Field | -|-------------|----------|---------------| -| BinaryCritic | Need exact match | user_id, city, status | -| SimilarityCritic | Semantic match OK | message, description | -| NumericCritic | Range acceptable | temperature, price | -| DatetimeCritic | Time window OK | deadline, start_time | - -### BinaryCritic - -Checks for exact matches after type casting: +Critics validate tool call parameters. 
Each critic type handles different validation needs: -```python -from arcade_evals import BinaryCritic - -# Perfect for IDs, locations, and enum values -BinaryCritic(critic_field="location", weight=0.7) -``` - -### SimilarityCritic - -Evaluates textual similarity using cosine similarity: - -```python -from arcade_evals import SimilarityCritic - -SimilarityCritic( - critic_field="message", - weight=0.5, - similarity_threshold=0.8 -) -``` - -### NumericCritic - -Assesses numeric values within tolerance: - -```python -from arcade_evals import NumericCritic - -NumericCritic( - critic_field="temperature", - tolerance=2.0, - weight=0.3 -) -``` - -### DatetimeCritic - -Evaluates datetime values within a time window: - -```python -from datetime import timedelta -from arcade_evals import DatetimeCritic - -DatetimeCritic( - critic_field="scheduled_time", - tolerance=timedelta(minutes=5), - weight=0.4 -) -``` - -## Fuzzy weights - -Use fuzzy weights when you want qualitative importance levels instead of precise numbers: +| Critic | Use case | Example | +| ------------------ | --------------- | ------------------------------------------------------------------ | +| `BinaryCritic` | Exact match | `BinaryCritic(critic_field="user_id", weight=1.0)` | +| `SimilarityCritic` | Text similarity | `SimilarityCritic(critic_field="message", weight=0.8)` | +| `NumericCritic` | Numeric range | `NumericCritic(critic_field="temp", tolerance=2.0)` | +| `DatetimeCritic` | Time window | `DatetimeCritic(critic_field="due", tolerance=timedelta(hours=1))` | ```python from arcade_evals import BinaryCritic, SimilarityCritic -from arcade_evals.weights import FuzzyWeight - -critics = [ - BinaryCritic( - critic_field="user_id", - weight=FuzzyWeight.CRITICAL - ), - SimilarityCritic( - critic_field="message", - weight=FuzzyWeight.MEDIUM - ), - BinaryCritic( - critic_field="priority", - weight=FuzzyWeight.LOW - ), + +critics=[ + BinaryCritic(critic_field="location", weight=0.7), + SimilarityCritic(critic_field="message", weight=0.3), ] ``` -Fuzzy weights are automatically normalized: - -| Weight | Value | Normalized (example above) | -|--------|-------|----------------------------| -| MINIMAL | 1 | - | -| VERY_LOW | 2 | - | -| LOW | 3 | 21.4% | -| MEDIUM | 4 | 28.6% | -| HIGH | 5 | - | -| VERY_HIGH | 6 | - | -| CRITICAL | 7 | 50.0% | +All weights are normalized proportionally to sum to 1.0. Use numeric values or `FuzzyWeight` (`CRITICAL`, `HIGH`, `MEDIUM`, `LOW`). ## Multiple tool calls -Test cases with multiple expected tool calls: +Test cases can include multiple expected tool calls: ```python suite.add_case( @@ -349,9 +290,6 @@ suite.add_case( ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Seattle"}), ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Portland"}), ], - critics=[ - BinaryCritic(critic_field="location", weight=1.0), - ], ) ``` @@ -366,9 +304,6 @@ suite.add_case( expected_tool_calls=[ ExpectedMCPToolCall("Weather_GetCurrent", {"location": "Tokyo"}), ], - critics=[ - BinaryCritic(critic_field="location", weight=1.0), - ], additional_messages=[ {"role": "user", "content": "I'm planning to visit Tokyo next week."}, {"role": "assistant", "content": "That sounds exciting! What would you like to know about Tokyo?"}, @@ -376,32 +311,26 @@ suite.add_case( ) ``` -## Rubrics and thresholds +Use OpenAI message format for `additional_messages`. Arcade converts it automatically for Anthropic. + +## Rubrics -Customize evaluation thresholds using an `EvalRubric`: +Customize pass/fail thresholds with `EvalRubric`. 
Default: fail at 0.8, warn at 0.9. ```python from arcade_evals import EvalRubric -rubric = EvalRubric( - fail_threshold=0.85, - warn_threshold=0.95, -) - suite = EvalSuite( - name="Strict Weather Evaluation", - system_message="You are a weather assistant.", - rubric=rubric, + name="Strict Evaluation", + system_message="You are helpful.", + rubric=EvalRubric(fail_threshold=0.85, warn_threshold=0.95), ) ``` -Default thresholds: -- **Fail threshold**: 0.8 -- **Warn threshold**: 0.9 +If you want stricter suites, increase thresholds (for example `fail_threshold=0.95`). For exploratory testing, lower them (for example `fail_threshold=0.6`). ## Next steps - Learn how to [run evaluations with different providers](/home/evaluate-tools/run-evaluations) - Explore [capture mode](/home/evaluate-tools/capture-mode) to record tool calls - Compare tool sources with [comparative evaluations](/home/evaluate-tools/comparative-evaluations) -- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) diff --git a/app/en/home/evaluate-tools/provider-compatibility/page.mdx b/app/en/home/evaluate-tools/provider-compatibility/page.mdx deleted file mode 100644 index b14c49478..000000000 --- a/app/en/home/evaluate-tools/provider-compatibility/page.mdx +++ /dev/null @@ -1,496 +0,0 @@ ---- -title: "Provider compatibility" -description: "Understand how tool evaluations work across different providers" ---- - -# Provider compatibility - -Arcade evaluations support both OpenAI and Anthropic. Each provider has different requirements for tool schemas and message formats. - -import { Callout } from "nextra/components"; - -## Provider comparison - -| Feature | OpenAI | Anthropic | -|---------|--------|-----------| -| **Tool name rules** | Alphanumeric, `-`, `_` (max 64 chars) | Alphanumeric, `_` only | -| **Schema format** | `function.parameters` (JSON Schema) | `input_schema` (JSON Schema) | -| **Strict mode** | Yes (opt-in via `strict: true`) | No (standard JSON Schema) | -| **Optional params** | Required list + null unions | Only required params in `required` | -| **Message roles** | system, user, assistant, tool, function | user, assistant (system separate) | -| **Tool calling format** | `tool_calls` array | `tool_use` content blocks | - -## Tool name normalization - -Arcade uses dotted notation for tool names (e.g., `Weather.GetCurrent`), but providers don't allow dots in function names. - -### How normalization works - -Tool names are automatically normalized: - -```python -from arcade_core.converters.utils import normalize_tool_name - -normalize_tool_name("Weather.GetCurrent") # Returns: "Weather_GetCurrent" -normalize_tool_name("Google.Search") # Returns: "Google_Search" -``` - -When models make tool calls, normalized names are resolved back to original names. - -### Denormalization is lossy - -Reversing normalization can't distinguish between original dots and underscores: - -| Original Name | Normalized | Denormalized | Correct? | -|---------------|------------|--------------|----------| -| `Google.Search` | `Google_Search` | `Google.Search` | ✅ | -| `My_Tool.Name` | `My_Tool_Name` | `My.Tool.Name` | ❌ | -| `Tool_Name` | `Tool_Name` | `Tool.Name` | ❌ | - - -**Best practice**: Use only dots OR only underscores in tool names, never both. 
- - -### Name collision - -Don't register both dotted and underscore versions of the same name: - -```python -# ❌ Avoid this - creates collision -suite.add_tool_definitions([ - {"name": "Weather.GetCurrent", ...}, - {"name": "Weather_GetCurrent", ...}, # Collision! -]) -``` - -The registry accepts both formats for lookups but they resolve to the same internal name. - -## OpenAI strict mode - -OpenAI's strict mode enforces structured outputs by transforming JSON Schema. This happens automatically in evaluations. - -### Schema transformations - -**1. Unsupported keywords are stripped:** - -```python -# Input schema -{ - "type": "integer", - "minimum": 0, - "maximum": 100, - "default": 50 -} - -# Transformed for OpenAI -{ - "type": ["integer", "null"] -} -``` - -Stripped keywords: -- Validation: `minimum`, `maximum`, `minLength`, `maxLength`, `pattern`, `format` -- Metadata: `default`, `nullable`, `minItems`, `maxItems` - -**2. Optional parameters become required with null unions:** - -```python -# Input schema -{ - "type": "object", - "properties": { - "city": {"type": "string"}, - "units": {"type": "string", "default": "celsius"} - }, - "required": ["city"] -} - -# Transformed for OpenAI -{ - "type": "object", - "properties": { - "city": {"type": "string"}, - "units": {"type": ["string", "null"]} # Now in union with null - }, - "required": ["city", "units"], # units added to required - "additionalProperties": false -} -``` - -**3. Enums are stringified:** - -```python -# Input schema -{ - "type": "integer", - "enum": [0, 1, 2] -} - -# Transformed for OpenAI -{ - "type": "string", - "enum": ["0", "1", "2"] -} -``` - -**4. Additional properties are forbidden:** - -All objects get `"additionalProperties": false` to enforce strict validation. - -### Why defaults still work - -Even though `default` is stripped from schemas, defaults are still applied during evaluation. Here's why: - -1. The evaluation framework stores the original schema with defaults -2. OpenAI sends `null` for optional parameters in strict mode -3. The framework applies defaults when args are missing OR null - -```python -# Model sends: -{"city": "Seattle", "units": null} - -# Framework applies default: -{"city": "Seattle", "units": "celsius"} -``` - - -This behavior ensures consistent evaluation results regardless of provider. - - -## Anthropic schema format - -Anthropic uses standard JSON Schema with minimal transformation. - -### Key differences from OpenAI - -**1. Schema field name:** - -```python -# OpenAI format -{ - "type": "function", - "function": { - "name": "get_weather", - "parameters": {...} # ← Note: "parameters" - } -} - -# Anthropic format -{ - "name": "get_weather", - "input_schema": {...} # ← Note: "input_schema" -} -``` - -**2. No strict mode transformations:** - -Anthropic accepts the schema as-is: -- Validation keywords are preserved -- Optional params stay optional -- Enums keep original types -- Defaults are kept (but not sent to model) - -**3. Only required params in required list:** - -```python -{ - "type": "object", - "properties": { - "city": {"type": "string"}, - "units": {"type": "string", "default": "celsius"} - }, - "required": ["city"] # Only city is required -} -``` - -## Message format conversion - -Arcade evaluations use OpenAI message format internally. When using Anthropic, messages are converted automatically. 
- -### System messages - -**OpenAI:** - -```python -[ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": "Hello"}, -] -``` - -**Anthropic:** - -```python -# system → separate parameter -system = "You are helpful." - -messages = [ - {"role": "user", "content": "Hello"}, -] -``` - -### Tool calls - -**OpenAI:** - -```python -{ - "role": "assistant", - "content": "", - "tool_calls": [ - { - "id": "call_123", - "type": "function", - "function": { - "name": "get_weather", - "arguments": '{"city": "Seattle"}' - } - } - ] -} -``` - -**Anthropic:** - -```python -{ - "role": "assistant", - "content": [ - { - "type": "tool_use", - "id": "call_123", - "name": "get_weather", - "input": {"city": "Seattle"} - } - ] -} -``` - -### Tool results - -**OpenAI:** - -```python -{ - "role": "tool", - "tool_call_id": "call_123", - "content": "Sunny, 72°F" -} -``` - -**Anthropic:** - -```python -{ - "role": "user", - "content": [ - { - "type": "tool_result", - "tool_use_id": "call_123", - "content": "Sunny, 72°F" - } - ] -} -``` - - -Message conversion happens automatically. You don't need to handle it manually. - - -## Writing provider-agnostic evaluations - -Follow these guidelines to ensure evaluations work with both providers: - -### 1. Use simple tool names - -Prefer names without dots or underscores: - -```python -# ✅ Good -"GetWeather" -"SearchGoogle" -"SendMessage" - -# ⚠️ Acceptable (use only one separator) -"Weather.GetCurrent" -"Google.Search" - -# ❌ Avoid (mixing separators) -"My_Tool.GetData" -"Tool_Name.With_Mixed" -``` - -### 2. Use MCP-style tool definitions - -Define tools using MCP format: - -```python -{ - "name": "GetWeather", - "description": "Get current weather for a city", - "inputSchema": { - "type": "object", - "properties": { - "city": {"type": "string"}, - "units": {"type": "string", "enum": ["celsius", "fahrenheit"]} - }, - "required": ["city"] - } -} -``` - -### 3. Don't rely on strict mode behavior - -Don't assume specific schema transformations: - -```python -# ❌ Don't rely on null unions -{ - "type": ["string", "null"] # Only in OpenAI strict mode -} - -# ✅ Use optional parameters -{ - "type": "string" -} -# In required list: OpenAI adds null union, Anthropic keeps as-is -# Not in required list: Both treat as optional -``` - -### 4. Handle optional parameters consistently - -Use defaults for optional parameters: - -```python -{ - "type": "object", - "properties": { - "units": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "default": "celsius" - } - }, - "required": [] -} -``` - -Both providers will apply the default when the parameter is missing. - -## Testing with multiple providers - -Run evaluations with both providers to verify compatibility: - -```bash -arcade evals . \ - --use-provider openai:gpt-4o \ - --use-provider anthropic:claude-sonnet-4-5-20250929 -``` - -### Comparing results - -Results show provider-specific behavior: - -``` -Suite: Weather Tools - Case: Get weather for city - Model: gpt-4o -- Score: 1.00 -- PASSED - Model: claude-sonnet-4-5-20250929 -- Score: 1.00 -- PASSED -``` - -### Common differences - -**Parameter handling:** - -OpenAI might send: -```json -{"city": "Seattle", "units": null} -``` - -Anthropic might send: -```json -{"city": "Seattle"} -``` - -Both are evaluated identically because defaults are applied. - -**Tool name format:** - -Both providers see normalized names (`Weather_GetCurrent`), but your test expectations use original names (`Weather.GetCurrent`). 
- -## Common pitfalls - -Avoid these common mistakes when working with multiple providers: - -1. **Using dots and underscores together** - ```python - # ❌ Don't mix separators - "My_Tool.GetData" - - # ✅ Use one consistently - "MyTool.GetData" or "MyTool_GetData" - ``` - -2. **Relying on specific schema transformations** - ```python - # ❌ OpenAI-specific null unions - {"type": ["string", "null"]} - - # ✅ Use optional parameters - {"type": "string"} # Not in required list - ``` - -3. **Forgetting to test with both providers** - ```bash - # ✅ Always test both - arcade evals . \ - --use-provider openai:gpt-4o \ - --use-provider anthropic:claude-sonnet-4-5-20250929 - ``` - -## Troubleshooting - -### Tool name mismatch - -**Symptom:** Evaluation reports "tool not found" - -**Solution:** Check if tool name uses dots. The normalized name (with underscores) should match: - -```python -# Original: "Weather.GetCurrent" -# Normalized: "Weather_GetCurrent" -# Expected: ExpectedMCPToolCall("Weather_GetCurrent", {...}) -``` - -### Schema validation errors - -**Symptom:** OpenAI returns validation errors - -**Solution:** Check if your schema uses unsupported strict mode keywords. These are automatically stripped, but might affect expected behavior. - -### Missing optional parameters - -**Symptom:** Anthropic doesn't provide optional parameters - -**Solution:** This is expected. Optional parameters may be omitted. Ensure defaults are defined in your schema. - -### Enum type mismatches - -**Symptom:** OpenAI converts numeric enums to strings - -**Solution:** Use string enums in your schema: - -```python -# ✅ Use string enums -{"type": "string", "enum": ["low", "medium", "high"]} - -# ❌ Avoid numeric enums -{"type": "integer", "enum": [0, 1, 2]} # Converted to ["0", "1", "2"] -``` - -## Next steps - -- [Create an evaluation suite](/home/evaluate-tools/create-an-evaluation-suite) with provider-agnostic tests -- [Run evaluations](/home/evaluate-tools/run-evaluations) with multiple providers -- Explore [capture mode](/home/evaluate-tools/capture-mode) to see actual tool calls - diff --git a/app/en/home/evaluate-tools/run-evaluations/page.mdx b/app/en/home/evaluate-tools/run-evaluations/page.mdx index 1fdf2bed9..abd373494 100644 --- a/app/en/home/evaluate-tools/run-evaluations/page.mdx +++ b/app/en/home/evaluate-tools/run-evaluations/page.mdx @@ -9,6 +9,12 @@ The `arcade evals` command discovers and executes evaluation suites with support import { Callout } from "nextra/components"; + + **Backward compatibility**: All new features (multi-provider support, capture + mode, output formats) work with existing evaluation suites. No code changes + required. + + ## Basic usage Run all evaluations in the current directory: @@ -75,14 +81,15 @@ When you specify multiple models, results show side-by-side comparisons. API keys are resolved in the following order: -| Priority | OpenAI | Anthropic | -|----------|--------|-----------| -| 1. Explicit flag | `--openai-key` | `--anthropic-key` | -| 2. Environment | `OPENAI_API_KEY` | `ANTHROPIC_API_KEY` | -| 3. `.env` file | `OPENAI_API_KEY=...` | `ANTHROPIC_API_KEY=...` | +| Priority | OpenAI | Anthropic | +| ---------------- | -------------------- | ----------------------- | +| 1. Explicit flag | `--openai-key` | `--anthropic-key` | +| 2. Environment | `OPENAI_API_KEY` | `ANTHROPIC_API_KEY` | +| 3. `.env` file | `OPENAI_API_KEY=...` | `ANTHROPIC_API_KEY=...` | -Create a `.env` file in your project directory to avoid setting keys in every terminal session. 
+ Create a `.env` file in your project directory to avoid setting keys in every + terminal session. ## Capture mode @@ -100,6 +107,7 @@ arcade evals . --capture --add-context --file captures/detailed ``` Capture mode is useful for: + - Creating initial test expectations - Debugging model behavior - Understanding tool call patterns @@ -124,14 +132,15 @@ arcade evals . --file results/out --format all ### Available formats -| Format | Extension | Description | -|--------|-----------|-------------| -| `txt` | `.txt` | Plain text, pytest-style output | -| `md` | `.md` | Markdown with tables and collapsible sections | -| `html` | `.html` | Interactive HTML report | -| `json` | `.json` | Structured JSON for programmatic use | +| Format | Extension | Description | +| ------ | --------- | --------------------------------------------- | +| `txt` | `.txt` | Plain text, pytest-style output | +| `md` | `.md` | Markdown with tables and collapsible sections | +| `html` | `.html` | Interactive HTML report | +| `json` | `.json` | Structured JSON for programmatic use | Multiple formats generate separate files: + - `results/out.txt` - `results/out.md` - `results/out.html` @@ -141,14 +150,14 @@ Multiple formats generate separate files: ### Quick reference -| Flag | Purpose | Example | -|------|---------|---------| -| `--use-provider` | Select provider/model | `--use-provider openai:gpt-4o` | -| `--capture` | Record without scoring | `--capture --file out` | -| `--details` | Show critic feedback | `--details` | -| `--failed-only` | Filter failures | `--failed-only` | -| `--format` | Output format(s) | `--format md,html,json` | -| `--max-concurrent` | Parallel limit | `--max-concurrent 10` | +| Flag | Purpose | Example | +| ------------------ | ---------------------- | ------------------------------ | +| `--use-provider` | Select provider/model | `--use-provider openai:gpt-4o` | +| `--capture` | Record without scoring | `--capture --file out` | +| `--details` | Show critic feedback | `--details` | +| `--failed-only` | Filter failures | `--failed-only` | +| `--format` | Output format(s) | `--format md,html,json` | +| `--max-concurrent` | Parallel limit | `--max-concurrent 10` | ### `--use-provider` @@ -159,11 +168,14 @@ Specify which provider(s) and model(s) to use: ``` **Supported providers:** + - `openai` (default: `gpt-4o`) - `anthropic` (default: `claude-sonnet-4-5-20250929`) -Anthropic model names include date stamps. Check [Anthropic's model documentation](https://docs.anthropic.com/en/docs/about-claude/models) for the latest model versions. + Anthropic model names include date stamps. Check [Anthropic's model + documentation](https://docs.anthropic.com/en/docs/about-claude/models) for the + latest model versions. **Examples:** @@ -256,16 +268,20 @@ arcade evals . --max-concurrent 10 Default is 5 concurrent evaluations. -### `--arcade-url` +### `--debug` -Override Arcade gateway URL for testing: +Show debug information for troubleshooting: ```bash -arcade evals . --arcade-url https://staging.arcade.dev +arcade evals . --debug ``` +Displays detailed error traces and connection information. + ## Understanding results +Results are formatted based on evaluation type (regular, multi-model, or comparative) and selected flags. 
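+
+For example, assuming the flags from the quick reference above can be combined in a single run, the following command shows per-critic feedback for failing cases only and also saves a Markdown report (`results/weather` is an arbitrary path; with `--format md` this becomes `results/weather.md`, following the naming pattern shown earlier):
+
+```bash
+# Hypothetical combination of documented flags: detailed, failures-only console
+# output plus a saved Markdown report for a single OpenAI model.
+arcade evals . \
+  --use-provider openai:gpt-4o \
+  --details \
+  --failed-only \
+  --format md \
+  --file results/weather
+```
+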
+ ### Summary format Results show overall performance: @@ -274,6 +290,14 @@ Results show overall performance: Summary -- Total: 5 -- Passed: 4 -- Failed: 1 ``` +**How flags affect output:** + +- `--details`: Adds per-critic breakdown for each case +- `--failed-only`: Filters to show only failed cases (summary shows original totals) +- `--add-context`: Includes system messages and conversation history +- Multiple models: Switches to comparison table format +- Comparative tracks: Shows side-by-side track comparison + ### Case results Each case displays status and score: @@ -309,19 +333,6 @@ Case: Get weather for city ## Advanced usage -### Test against staging gateway - -Point to a staging Arcade gateway: - -```bash -export ARCADE_API_KEY=... -export ARCADE_USER_ID=... - -arcade evals . \ - --arcade-url https://staging.arcade.dev \ - --use-provider openai -``` - ### High concurrency for fast execution Increase concurrent evaluations: @@ -331,7 +342,8 @@ arcade evals . --max-concurrent 20 ``` -High concurrency may hit API rate limits. Start with default (5) and increase gradually. + High concurrency may hit API rate limits. Start with default (5) and increase + gradually. ### Save comprehensive results @@ -364,7 +376,7 @@ pip install anthropic ### Tool name mismatches -Tool names are normalized (dots become underscores). If you see unexpected tool names, check [Provider compatibility](/home/evaluate-tools/provider-compatibility). +Tool names are normalized (dots become underscores). If you see unexpected tool names, check your tool definitions and your expected tool calls. ### API rate limits @@ -377,6 +389,7 @@ arcade evals . --max-concurrent 2 ### No evaluation files found Ensure your evaluation files: + - Start with `eval_` - End with `.py` - Contain functions decorated with `@tool_eval()` @@ -385,4 +398,3 @@ Ensure your evaluation files: - Explore [capture mode](/home/evaluate-tools/capture-mode) for recording tool calls - Learn about [comparative evaluations](/home/evaluate-tools/comparative-evaluations) for comparing tool sources -- Understand [provider compatibility](/home/evaluate-tools/provider-compatibility) and schema differences diff --git a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx index 75d5c1e48..b348b8103 100644 --- a/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx +++ b/app/en/home/evaluate-tools/why-evaluate-tools/page.mdx @@ -3,6 +3,8 @@ title: "Why evaluate tools?" description: "Learn why evaluating your tools is important" --- +import { Callout } from "nextra/components"; + # Why evaluate tools?
@@ -12,7 +14,8 @@ description: "Learn why evaluating your tools is important" 1. **Tool selection**: Does the model choose the right tools for the task? 2. **Parameter accuracy**: Does the model provide correct arguments? - Arcade's evaluation framework helps you validate tool-calling capabilities before deployment, ensuring reliability in real-world applications. + Arcade's evaluation framework helps you validate tool-calling capabilities before deployment, ensuring reliability in real-world applications. You can evaluate tools from MCP servers, Arcade Gateways, or custom implementations. +