# Source: agent-framework/python/packages/lab/gaia/samples/gaia_sample.py (larohra/agent-framework, main branch)

# Copyright (c) Microsoft. All rights reserved.

"""GAIA Benchmark Sample.

Run the GAIA (General AI Assistant) benchmark with configurable agent providers,
telemetry options, and benchmark parameters.

Agent Providers:
    - Azure AI (default): See azure_ai_agent.py for required environment variables
    - OpenAI: See openai_agent.py for required environment variables

Prerequisites:
    1. Set HF_TOKEN environment variable with your Hugging Face token:
       - Get token: https://huggingface.co/settings/tokens
       - Request dataset access: https://huggingface.co/datasets/gaia-benchmark/GAIA
       - Set: export HF_TOKEN="your-huggingface-token"

    2. Configure your chosen agent provider (see agent module files for details)

Telemetry:
    When using --otlp-endpoint or --trace-file, OpenTelemetry will export trace data
    in JSON format to the console in addition to the configured endpoints. This is
    expected behavior from the OpenTelemetry SDK and provides visibility into the
    telemetry being captured. The traces are also exported to:
    - OTLP endpoint (e.g., Aspire Dashboard) if --otlp-endpoint is specified
    - Local file if --trace-file is specified

    To suppress console output, redirect stderr: `python gaia_sample.py 2>/dev/null`

Usage:
    # Run with default settings (Azure AI agent)
    uv run python gaia_sample.py

    # Run with OpenAI agent
    uv run python gaia_sample.py --agent-provider openai

    # Run with telemetry export to Aspire Dashboard
    uv run python gaia_sample.py --otlp-endpoint http://localhost:4318

    # See all options
    uv run python gaia_sample.py --help
"""

import argparse

from agent_framework.lab.gaia import GAIA, Evaluation, GAIATelemetryConfig, Prediction, Task


async def evaluate_task(task: Task, prediction: Prediction) -> Evaluation:
    """Evaluate the prediction for a given task.

    The check is a simple case-insensitive substring match: the prediction is
    considered correct when it contains the task's reference answer.

    Args:
        task: The GAIA task; only ``task.answer`` is read.
        prediction: The agent output; only ``prediction.prediction`` is read.

    Returns:
        An ``Evaluation`` with ``is_correct`` set and ``score`` of 1 (correct) or 0.
    """
    expected = (task.answer or "").strip().lower()
    if not expected:
        # Without a reference answer nothing can be verified. An empty string is a
        # substring of every string, so a plain `in` test would wrongly mark every
        # unlabeled task as correct and inflate the reported accuracy.
        return Evaluation(is_correct=False, score=0)

    # Guard against a missing prediction text so evaluation never raises.
    actual = (prediction.prediction or "").lower()
    is_correct = expected in actual
    return Evaluation(is_correct=is_correct, score=1 if is_correct else 0)


async def main(
    otlp_endpoint: str | None = None,
    trace_file: str | None = None,
    result_file: str | None = None,
    data_dir: str | None = None,
    agent_provider: str = "azure-ai",
    level: int | list[int] = 1,
    max_n: int = 2,
    parallel: int = 1,
    timeout: int = 120,
) -> None:
    """Run GAIA benchmark with telemetry configuration.

    Prints the effective configuration, creates one agent for the selected
    provider, runs the benchmark with it, and prints a short summary.

    Args:
        otlp_endpoint: Optional OTLP endpoint URL for exporting traces (e.g., http://localhost:4318)
        trace_file: Optional file path to export traces to. If None or empty, traces won't be saved to file.
        result_file: Optional file path to save benchmark results. If None or empty, results won't be saved to file.
        data_dir: Directory to cache GAIA dataset. If None or empty, uses temp directory.
        agent_provider: Agent provider to use: 'azure-ai' or 'openai' (default: 'azure-ai')
        level: GAIA level(s) to run (1, 2, or 3)
        max_n: Maximum number of tasks to run per level
        parallel: Number of parallel tasks to run
        timeout: Timeout per task in seconds

    Raises:
        ValueError: If ``agent_provider`` is neither 'azure-ai' nor 'openai'.
    """
    import logging
    import os
    import tempfile
    from pathlib import Path

    # Treat empty strings like "not provided" so that the configuration printed
    # below always matches what is actually handed to the telemetry config and runner.
    otlp_endpoint = otlp_endpoint or None
    trace_file = trace_file or None
    result_file = result_file or None
    data_dir = data_dir or None

    # Suppress console logging for traces and verbose SDK output
    logging.getLogger("opentelemetry").setLevel(logging.ERROR)
    logging.getLogger("azure").setLevel(logging.WARNING)
    logging.getLogger("agent_framework").setLevel(logging.WARNING)
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)

    # Suppress OpenTelemetry exporters console output (setdefault keeps a user-provided value)
    os.environ.setdefault("OTEL_PYTHON_LOG_LEVEL", "error")

    # Check for required Hugging Face token. This is only a warning, not an error:
    # the run may still work if the dataset is already cached locally.
    if not os.environ.get("HF_TOKEN"):
        print("⚠️  HF_TOKEN is not set; downloading the GAIA dataset from Hugging Face may fail.")

    # Print trace export configuration
    print("\n=== Telemetry Configuration ===")
    if trace_file:
        print(f"📁 Trace file: {os.path.abspath(trace_file)}")
    else:
        print("📁 Trace file: disabled")

    if otlp_endpoint:
        print(f"🌐 OTLP endpoint: {otlp_endpoint}")
    else:
        print("🌐 OTLP endpoint: disabled")

    if result_file:
        print(f"📊 Results file: {os.path.abspath(result_file)}")
    else:
        print("📊 Results file: disabled")

    print("\n=== Run Configuration ===")
    print(f"🤖 Agent provider: {agent_provider}")
    if data_dir:
        print(f"📂 Data directory: {os.path.abspath(data_dir)}")
    else:
        # Informational only: this is the location shown to the user as the default.
        default_data_dir = Path(tempfile.gettempdir()) / "data_gaia_hub"
        print(f"📂 Data directory: {default_data_dir} (default)")
    print(f"🎯 Level: {level}")
    print(f"🔢 Max tasks: {max_n}")
    print(f"⚡ Parallel: {parallel}")
    print(f"⏱️  Timeout: {timeout}s")
    print()

    # Import the appropriate agent factory based on provider. The import is done
    # lazily so only the selected provider's module has to be importable.
    if agent_provider == "azure-ai":
        from azure_ai_agent import create_gaia_agent
    elif agent_provider == "openai":
        from openai_agent import create_gaia_agent
    else:
        raise ValueError(f"Unknown agent provider: {agent_provider}. Use 'azure-ai' or 'openai'.")

    # Configure telemetry for tracing
    telemetry_config = GAIATelemetryConfig(
        enable_tracing=True,  # Enable OpenTelemetry tracing
        trace_to_file=trace_file is not None,  # Export traces to local file only if path provided
        file_path=trace_file,  # Custom file path for traces (can be None)
        otlp_endpoint=otlp_endpoint,  # Optional OTLP endpoint for Aspire Dashboard or other collectors
    )

    # Create a single agent once and reuse it for all tasks
    async with create_gaia_agent() as agent:

        async def run_task(task: Task) -> Prediction:
            """Run a single GAIA task and return the prediction using the shared agent."""
            input_message = f"Task: {task.question}"
            if task.file_name:
                input_message += f"\nFile: {task.file_name}"
            result = await agent.run(input_message)
            return Prediction(prediction=result.text, messages=result.messages)

        # Create the GAIA benchmark runner with telemetry configuration
        runner = GAIA(
            evaluator=evaluate_task,
            telemetry_config=telemetry_config,
            data_dir=data_dir,
        )

        # Run the benchmark with the task runner.
        # By default, this will check for locally cached benchmark data and checkout
        # the latest version from HuggingFace if not found.
        # Note: The GAIA dataset has been updated to use Parquet format.
        # If you encounter issues, try using validation split which has labeled data.
        results = await runner.run(
            run_task,
            level=level,
            max_n=max_n,
            parallel=parallel,
            timeout=timeout,
            out=result_file,  # Output file to save results including detailed traces (optional, None = no file output)
        )

    # Print summary similar to the viewer in gaia.py
    total = len(results)
    correct = sum(1 for r in results if r.evaluation.is_correct)
    # Guard the divisions so an empty result set reports zeros instead of raising.
    accuracy = correct / total if total > 0 else 0.0
    avg_runtime = sum(r.runtime_seconds or 0 for r in results) / total if total > 0 else 0.0

    print("\n=== GAIA Benchmark Summary ===")
    print(f"📝 Total: {total}, ✅ Correct: {correct}, 🎯 Accuracy: {accuracy:.3f}")
    print(f"⏱️  Average runtime: {avg_runtime:.2f}s")
    if result_file:
        print(f"💾 Detailed results saved to: {result_file}")


if __name__ == "__main__":
    import asyncio

    # Parse command line arguments.
    # RawDescriptionHelpFormatter keeps the epilog's line breaks as written. The
    # shell line continuations in the last example are written as "\\" because a
    # single backslash before a newline in a regular string literal is consumed
    # as a line continuation and would never reach the help output.
    parser = argparse.ArgumentParser(
        description="Run GAIA benchmark with optional telemetry export to OTLP endpoint and/or file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with default settings
  python gaia_sample.py

  # Run with custom data directory
  python gaia_sample.py --data-dir ./gaia_data

  # Run with OpenAI agent provider
  python gaia_sample.py --agent-provider openai

  # Run with trace file export
  python gaia_sample.py --trace-file gaia_benchmark_traces.jsonl

  # Run level 2 tasks with 5 maximum tasks
  python gaia_sample.py --level 2 --max-n 5

  # Run with OTLP export to Aspire Dashboard and custom settings
  python gaia_sample.py --otlp-endpoint http://localhost:4318 --level 1 --max-n 10 --parallel 2

  # Run with all options configured
  python gaia_sample.py --agent-provider openai \\
    --trace-file traces.jsonl \\
    --result-file results.jsonl \\
    --otlp-endpoint http://localhost:4318 --level 1 --max-n 5 --parallel 2 --timeout 180
        """,
    )
    parser.add_argument(
        "--otlp-endpoint",
        type=str,
        default=None,
        help="OTLP endpoint URL for exporting traces (e.g., http://localhost:4318 for Aspire Dashboard)",
    )
    parser.add_argument(
        "--trace-file",
        type=str,
        default=None,
        help="File path to export traces to (e.g., gaia_benchmark_traces.jsonl). "
        "If not set, traces won't be saved to file.",
    )
    parser.add_argument(
        "--result-file",
        type=str,
        default=None,
        help="File path to save benchmark results "
        "(default: gaia_results_level<LEVEL>.jsonl, where <LEVEL> is the value of --level)",
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default=None,
        help="Directory to cache GAIA dataset. If not set, uses system temp directory.",
    )
    parser.add_argument(
        "--agent-provider",
        type=str,
        default="azure-ai",
        choices=["azure-ai", "openai"],
        help="Agent provider to use: 'azure-ai' or 'openai' (default: 'azure-ai')",
    )
    parser.add_argument(
        "--level",
        type=int,
        default=1,
        choices=[1, 2, 3],
        help="GAIA benchmark level to run: 1, 2, or 3 (default: 1)",
    )
    parser.add_argument(
        "--max-n",
        type=int,
        default=2,
        help="Maximum number of tasks to run per level (default: 2)",
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=1,
        help="Number of parallel tasks to run (default: 1)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Timeout per task in seconds (default: 120)",
    )
    args = parser.parse_args()

    # Derive the default results file from the selected level so that, for example,
    # level 2 results are not written to a file named "...level1.jsonl". With the
    # default level (1) this yields the same file name as before.
    result_file = args.result_file
    if result_file is None:
        result_file = f"gaia_results_level{args.level}.jsonl"

    asyncio.run(
        main(
            otlp_endpoint=args.otlp_endpoint,
            trace_file=args.trace_file,
            result_file=result_file,
            data_dir=args.data_dir,
            agent_provider=args.agent_provider,
            level=args.level,
            max_n=args.max_n,
            parallel=args.parallel,
            timeout=args.timeout,
        )
    )