Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
107 commits
Select commit Hold shift + click to select a range
b8cdbf0
decentralization + tests
cvt8 Jul 11, 2025
7553b26
adding notes from meeting
cvt8 Jul 16, 2025
6486d90
decentralization actualization
cvt8 Jul 16, 2025
3da5424
update tests
cvt8 Jul 16, 2025
5cd6c24
Fix function renaming in Tool.to_dict
cvt8 Jul 17, 2025
b9aad79
Merge pull request #1 from cvt8/codex/find-and-fix-a-bug-in-codebase
cvt8 Jul 17, 2025
04a15f7
docs: fix Hugging Face capitalization
cvt8 Jul 17, 2025
011e6c6
Replace HF_API_KEY env var with HF_TOKEN
cvt8 Jul 17, 2025
1e68969
docs: fix Hugging Face capitalization
cvt8 Jul 17, 2025
d6fb349
Merge pull request #3 from cvt8/codex/replace-hf_api_key-with-hf_token
cvt8 Jul 17, 2025
725f757
Merge pull request #4 from cvt8/w9azij-codex/update-hugging-face-key-…
cvt8 Jul 17, 2025
5fe9bb0
Add execution timeout to LocalPythonExecutor
cvt8 Jul 17, 2025
cadb6f4
Merge pull request #5 from cvt8/codex/verify-iteration-limit-in-codea…
cvt8 Jul 17, 2025
31dc574
Merge pull request #2 from cvt8/codex/update-hugging-face-key-capital…
cvt8 Jul 17, 2025
975a5bf
Revert decentralization logic from agents
cvt8 Jul 17, 2025
0e31135
Update README for communication tools
cvt8 Jul 17, 2025
7aec585
some corrections
cvt8 Jul 17, 2025
10d939f
Add message queue tools
cvt8 Jul 17, 2025
a3ca939
add decentralization tests folder
cvt8 Jul 17, 2025
789cf70
Add messaging tools and integrate
cvt8 Jul 17, 2025
29c08ac
identation correction
cvt8 Jul 17, 2025
a34aa56
Merge pull request #7 from cvt8/codex/implement-decentralization-in-t…
cvt8 Jul 17, 2025
b70784d
adding langfuse
cvt8 Jul 17, 2025
4b939a0
update scores
cvt8 Jul 17, 2025
43f15b1
adding test tools
cvt8 Jul 17, 2025
76d9dc3
update gitignore
cvt8 Jul 17, 2025
13ea63c
Fix function renaming in Tool.to_dict
cvt8 Jul 17, 2025
7f23c52
Replace HF_API_KEY env var with HF_TOKEN
cvt8 Jul 17, 2025
8589fd2
docs: fix Hugging Face capitalization
cvt8 Jul 17, 2025
a077101
Add execution timeout to LocalPythonExecutor
cvt8 Jul 17, 2025
e476ed5
add decentralization tests folder
cvt8 Jul 17, 2025
ed5d57a
identation correction
cvt8 Jul 17, 2025
7e41bf8
Add messaging tools and integrate
cvt8 Jul 17, 2025
ed71a66
adding langfuse
cvt8 Jul 17, 2025
35cb15c
update scores
cvt8 Jul 17, 2025
772ad80
adding test tools
cvt8 Jul 17, 2025
4c0a7a6
update agents
cvt8 Jul 17, 2025
b3033c8
Merge branch 'main' into codex/implement-decentralization-feature-in-…
cvt8 Jul 17, 2025
15870a7
Merge pull request #9 from cvt8/codex/implement-decentralization-feat…
cvt8 Jul 17, 2025
4d76a5b
adding langfuse
cvt8 Jul 17, 2025
f3a7fb8
test_logging
cvt8 Jul 17, 2025
c316b8f
update installation.md
cvt8 Jul 17, 2025
8c0b91c
deleted tests-decentralized
cvt8 Jul 17, 2025
538ebd2
actualize gitignore
cvt8 Jul 17, 2025
4cb8457
Merge branch 'main' into codex/implement-decentralization-feature-in-…
cvt8 Jul 17, 2025
43c3675
update repo organization
cvt8 Jul 18, 2025
e4b8b4a
adding span.end()
cvt8 Jul 18, 2025
0b466e0
Update tests to match new agent API
cvt8 Jul 18, 2025
cc9dafe
Merge branch 'main' into codex/update-tests-for-compatibility-with-cu…
cvt8 Jul 18, 2025
dd780fa
Merge pull request #10 from cvt8/codex/update-tests-for-compatibility…
cvt8 Jul 18, 2025
4b2f80f
Add _finalize_step callback invocation and defaults
cvt8 Jul 18, 2025
4f7c44f
Add _finalize_step callback invocation and defaults
cvt8 Jul 18, 2025
d636a6d
Merge pull request #11 from cvt8/codex/implement-finalize_step-method…
cvt8 Jul 18, 2025
a133724
Improve Langfuse tracing
cvt8 Jul 18, 2025
c3405fd
Add final answer validation
cvt8 Jul 18, 2025
eac8a40
Handle final answer interrupt
cvt8 Jul 18, 2025
e9b6b71
Merge branch 'main' into codex/review-lanfuse-logging-implementation
cvt8 Jul 18, 2025
c49eab9
Merge pull request #13 from cvt8/codex/review-lanfuse-logging-impleme…
cvt8 Jul 18, 2025
3a12718
Merge branch 'main' into codex/add-final-answer-validation-checks
cvt8 Jul 18, 2025
2756ac9
Merge pull request #14 from cvt8/codex/add-final-answer-validation-ch…
cvt8 Jul 18, 2025
cbda555
Merge branch 'main' into codex/update-_process_tool_call-and-run-loop
cvt8 Jul 18, 2025
cb3bbb1
Merge pull request #15 from cvt8/codex/update-_process_tool_call-and-…
cvt8 Jul 18, 2025
bb1d08b
Merge branch 'main' into syft7m-codex/implement-finalize_step-method-…
cvt8 Jul 18, 2025
07fcd77
Merge pull request #12 from cvt8/syft7m-codex/implement-finalize_step…
cvt8 Jul 18, 2025
b10f392
update documentation
cvt8 Jul 18, 2025
8d3582c
add test logging and some debugging
cvt8 Jul 18, 2025
233c117
Code debugging to run gaia benchmark on open deep research, debuging …
cvt8 Jul 24, 2025
4034cc3
Merge branch 'main' of https://github.com/huggingface/smolagents
cvt8 Jul 24, 2025
28bf657
improving errors handling
cvt8 Jul 24, 2025
f079ff7
# GAIA Benchmark Tracing Improvements
cvt8 Jul 25, 2025
a78f9ad
# Smolagents Benchmark Debugging Summary
cvt8 Jul 25, 2025
9caac98
make style corrections
cvt8 Jul 25, 2025
e381dab
Merge branch 'huggingface:main' into cvt8/Benchmarking_corrections
cvt8 Jul 25, 2025
ff5f96d
logs availibility
cvt8 Jul 25, 2025
dd49490
correction prompts.
cvt8 Jul 28, 2025
404da42
Merge branch 'huggingface:main' into cvt8/Benchmarking_corrections
cvt8 Aug 2, 2025
9451a5b
Merge branch 'cvt8/Benchmarking_corrections' of https://github.com/cv…
cvt8 Aug 5, 2025
0797059
Merge branch 'huggingface:main' into cvt8/Benchmarking_corrections
cvt8 Aug 12, 2025
b6bf1f4
Merge branch 'main' into cvt8/Benchmarking_corrections
cvt8 Aug 12, 2025
a33564f
Merge pull request #18 from cvt8/cvt8/Benchmarking_corrections
cvt8 Aug 12, 2025
80bcda3
update codebase
cvt8 Aug 18, 2025
bd09658
Merge branch 'huggingface:main' into main
cvt8 Aug 18, 2025
4bc2012
update tests and ids
cvt8 Aug 18, 2025
294ab7a
adding scripts
cvt8 Aug 18, 2025
6818b69
Merge branch 'huggingface:main' into main
cvt8 Aug 20, 2025
3caa80d
decentralized agents 0 shot.
cvt8 Aug 20, 2025
84a8128
tests update
cvt8 Aug 20, 2025
6d0f04b
running zero_shot !
cvt8 Aug 20, 2025
cf085eb
updating prompts
cvt8 Aug 21, 2025
d9d3b34
updating prompts + alternative concensus
cvt8 Aug 21, 2025
9644863
well working decentralized agent !
cvt8 Aug 22, 2025
d37585e
running tests
cvt8 Aug 22, 2025
1cfb019
runner corrections + langfuse logging.
cvt8 Aug 22, 2025
74677a4
update codebase and centralized agents.
cvt8 Aug 25, 2025
ad1d2c2
Merge branch 'huggingface:main' into main
cvt8 Aug 25, 2025
dcd4c9b
Dentraluzed tools, simpler and better code
cvt8 Aug 27, 2025
f5b2ad3
Answer format correction
cvt8 Aug 29, 2025
07d3652
Update communication and avoiding some errors.
cvt8 Sep 1, 2025
62f17cf
Code style corrections
cvt8 Sep 1, 2025
1af042e
Merge branch 'main' into main
cvt8 Sep 1, 2025
7b3044e
adding decentraliozed_agents
cvt8 Sep 1, 2025
b031ef9
centralized_agent comparison.
cvt8 Sep 1, 2025
65ab324
style corrections
cvt8 Sep 1, 2025
28162a2
improving prompt
cvt8 Sep 1, 2025
40ec034
Transform chat messages into a nice HTML
cvt8 Sep 12, 2025
8a3687c
Uploading outputs
cvt8 Sep 13, 2025
beadd38
Merge pull request #19 from huggingface/main
cvt8 Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
20 changes: 17 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,21 @@
logs
tmp
wandb
make_test_log.xml
#runs/
runs_old/
runs_v0/
#output/

#Test gaia
wb/
pdb5wb7.ent
downloads_folder/
model_performance_comparison.png
langfuse_test.py

# Data
data
outputs
data/

# Apple
Expand Down Expand Up @@ -148,8 +159,11 @@ interpreter_workspace/
# Archive
archive/
savedir/
output/
#output/
tool_output/

# Gradio runtime
.gradio/
.gradio/

#Other cache
.ruff_cache/
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ style:

# Run smolagents tests
test:
pytest ./tests/
pytest ./tests/ --junitxml=make_test_log.xml
199 changes: 102 additions & 97 deletions README.md

Large diffs are not rendered by default.

167 changes: 167 additions & 0 deletions examples/decentralized_smolagents_benchmark/decentralized_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python
# Example run: python examples/decentralized_smolagents_benchmark/decentralized_agent.py --model-type LiteLLMModel --model-id gpt-4o --provider openai "What is the half of the speed of a Leopard?"
"""Entry point for decentralized agent team execution."""

import argparse
import json
import logging
import sys
import uuid
from pathlib import Path

from scripts.agents import DecentralizedAgents
from scripts.message_store import MessageStore


# Reminder sentence about respecting the requested answer format.
# NOTE(review): presumably meant to be appended to the question given to the
# agents, but nothing in this script references it - confirm whether it should
# be used when building the task or removed.
QUESTION_ADDON = """It is critical to respect the format of the answer when it is asked. """


# Optional Langfuse tracing.
# Everything below is best-effort: when the optional packages are missing, the
# credentials are rejected, or anything else goes wrong, a warning is printed
# and ``langfuse_client`` is left as ``None``.
try:
    from dotenv import load_dotenv

    # Called before the Langfuse imports, exactly as in the original ordering.
    load_dotenv()

    from langfuse import Langfuse
    from openinference.instrumentation.smolagents import SmolagentsInstrumentor

    langfuse_client = Langfuse()
    if not langfuse_client.auth_check():
        # Client could not authenticate: run without tracing.
        print("⚠️ Langfuse authentication failed - tracing disabled")
        langfuse_client = None
    else:
        print("✅ Langfuse client authenticated successfully")
        SmolagentsInstrumentor().instrument()
        print("✅ SmolagentsInstrumentor enabled")
except ImportError as import_error:
    # One of the optional tracing packages is not installed.
    print(f"⚠️ Langfuse not available: {import_error}")
    langfuse_client = None
except Exception as setup_error:
    # Any other failure during setup must not prevent the script from running.
    print(f"⚠️ Langfuse setup error: {setup_error}")
    langfuse_client = None


class _JsonSafeFormatter(logging.Formatter):
    """Formatter that keeps the message part of every log line valid JSON.

    The format string used by ``setup_logging`` embeds ``%(message)s`` without
    quotes so that callers can log pre-serialized JSON objects. Because the
    handler is attached to the root logger, plain-text records (for example
    from third-party libraries) can reach it too; those are encoded as a JSON
    string so the line stays parseable. Tracebacks appended by the base class
    for records carrying ``exc_info`` are not re-encoded.
    """

    def formatMessage(self, record: logging.LogRecord) -> str:
        raw_message = record.message
        try:
            json.loads(raw_message)
        except ValueError:
            # Not JSON already: quote/escape it so it can be embedded as-is.
            record.message = json.dumps(raw_message)
        try:
            return super().formatMessage(record)
        finally:
            # Leave the record untouched for any other handler/formatter.
            record.message = raw_message


def setup_logging(run_dir: Path) -> None:
    """Route root-logger output to ``<run_dir>/run.log`` as JSON lines.

    Existing root handlers are removed *and closed* so repeated calls do not
    duplicate output or leak open log files.

    Args:
        run_dir: Directory that receives ``run.log``; created if missing.
    """
    run_dir.mkdir(parents=True, exist_ok=True)
    log_file = run_dir / "run.log"

    # Replace whatever handlers are installed, releasing their resources.
    logger = logging.getLogger()
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    json_formatter = _JsonSafeFormatter(
        '{"timestamp":"%(asctime)s", "level":"%(levelname)s", "message":%(message)s}'
    )

    # File handler with JSON format
    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(json_formatter)

    # For console output while debugging, additionally attach a
    # logging.StreamHandler(sys.stdout) with its own human-readable formatter.

    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)


def main(args: argparse.Namespace) -> int:
    """Run the decentralized agent team on ``args.question``.

    Creates a run directory under ``runs/<run_id>`` next to this script, logs
    progress events there as JSON, runs the team and prints the answer as a
    JSON object on success.

    Args:
        args: Parsed CLI arguments; reads ``question``, ``model_type``,
            ``model_id`` and ``provider``.

    Returns:
        ``0`` when the team reports a success status together with an answer,
        ``1`` on a reported failure or on any unexpected exception.
    """
    print(f"🚀 Starting decentralized agent team for: {args.question}")

    # Create message store with proper agent names for correct voting thresholds
    run_id = str(uuid.uuid4())[:8]  # Short run ID
    agent_names = ["CodeAgent", "WebSearchAgent", "DeepResearchAgent", "DocumentReaderAgent"]
    message_store = MessageStore(run_id, agent_names=agent_names)

    # __file__ may be undefined (e.g. interactive execution); fall back to argv/cwd.
    try:
        script_dir = Path(__file__).parent
    except NameError:
        script_dir = Path(sys.argv[0]).parent.absolute() if sys.argv[0] else Path.cwd()

    run_dir = script_dir / "runs" / run_id
    run_dir.mkdir(parents=True, exist_ok=True)

    setup_logging(run_dir)
    # default=str keeps logging from raising on values json cannot serialize.
    logging.info(json.dumps({"event": "run_started", "run_id": run_id, "args": vars(args)}, default=str))

    success_statuses = ("success", "success_early", "success_fallback")

    try:
        logging.info(json.dumps({"event": "creating_team", "run_id": run_id}))
        decentralized_team = DecentralizedAgents(
            message_store=message_store,
            model_type=args.model_type,
            model_id=args.model_id,
            provider=args.provider,
            run_id=run_id,
        )

        # The question is currently forwarded unchanged (no extra instructions added).
        enhanced_task = f"{args.question}"
        logging.info(
            json.dumps({"event": "starting_execution", "run_id": run_id, "question": args.question}, default=str)
        )
        result = decentralized_team.run(enhanced_task)

        # Read status/answer defensively so a malformed result is reported as a
        # team failure rather than surfacing as a KeyError.
        status = result.get("status", "unknown")
        has_answer = "answer" in result
        logging.info(
            json.dumps(
                {
                    "event": "execution_completed",
                    "run_id": run_id,
                    "status": status,
                    "has_answer": has_answer,
                },
                default=str,
            )
        )

        if status in success_statuses and has_answer:
            print(json.dumps({"answer": result["answer"]}, default=str))
            return 0

        error_msg = result.get("error", "No valid results")
        logging.error(
            json.dumps(
                {"event": "execution_failed", "run_id": run_id, "error": error_msg, "result": result},
                default=str,
            )
        )
        print(f"\n❌ Team execution failed: {error_msg}")
        return 1

    except Exception as e:
        # Catch any unexpected errors and log them with full context
        logging.error(
            json.dumps(
                {
                    "event": "unexpected_error",
                    "run_id": run_id,
                    "error_type": type(e).__name__,
                    "error_message": str(e),
                    "question": args.question,
                },
                default=str,
            )
        )

        # Also log the full stack trace for debugging
        import traceback

        logging.error(json.dumps({"event": "error_traceback", "run_id": run_id, "traceback": traceback.format_exc()}))

        print(f"\n❌ Unexpected error: {e}")
        return 1


if __name__ == "__main__":
    # Command-line interface: two required model options, an optional
    # provider, and the question as the single positional argument.
    parser = argparse.ArgumentParser(description="Run decentralized agent team")
    cli_arguments = (
        ("--model-type", {"required": True, "help": "Model type to use"}),
        ("--model-id", {"required": True, "help": "Model ID to use"}),
        ("--provider", {"help": "Model provider"}),
        ("question", {"help": "Question to answer"}),
    )
    for argument_name, argument_options in cli_arguments:
        parser.add_argument(argument_name, **argument_options)

    args = parser.parse_args()
    # main() returns the process exit code.
    exit_code = main(args)
    sys.exit(exit_code)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"model_id": "centralized-LiteLLMModel-gpt-4o",
"action_type": "centralized-agents",
"date": "2025-09-01",
"timestamp": "2025-09-01T21:27:36.244600",
"benchmarks": {
"gaia": {
"total_questions": 34,
"exact_match_score": 0.2647058823529412,
"contains_score": 0.3235294117647059,
"exact_matches": 9.0,
"contains_matches": 11.0
},
"math": {
"total_questions": 52,
"exact_match_score": 0.6538461538461539,
"contains_score": 0.7884615384615384,
"exact_matches": 34.0,
"contains_matches": 41.0
},
"simpleqa": {
"total_questions": 52,
"exact_match_score": 0.6730769230769231,
"contains_score": 0.8846153846153846,
"exact_matches": 35.0,
"contains_matches": 46.0
}
},
"overall": {
"total_questions": 138,
"exact_match_score": 0.5652173913043478,
"contains_score": 0.7101449275362319,
"exact_matches": 78.0,
"contains_matches": 98.0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"model_id": "centralized-LiteLLMModel-gpt-4o",
"action_type": "centralized-agents",
"date": "2025-09-08",
"timestamp": "2025-09-08T20:42:15.809662",
"benchmarks": {
"gaia": {
"total_questions": 32,
"exact_match_score": 0.0,
"contains_score": 0.0,
"exact_matches": 0.0,
"contains_matches": 0.0
},
"math": {
"total_questions": 50,
"exact_match_score": 0.0,
"contains_score": 0.0,
"exact_matches": 0.0,
"contains_matches": 0.0
},
"simpleqa": {
"total_questions": 50,
"exact_match_score": 0.0,
"contains_score": 0.0,
"exact_matches": 0.0,
"contains_matches": 0.0
}
},
"overall": {
"total_questions": 132,
"exact_match_score": 0.0,
"contains_score": 0.0,
"exact_matches": 0.0,
"contains_matches": 0.0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"model_id": "decentralized-LiteLLMModel-gpt-4o",
"action_type": "decentralized-consensus",
"date": "2025-08-27",
"timestamp": "2025-08-27T22:05:53.212587",
"benchmarks": {
"gaia": {
"total_questions": 63,
"exact_match_score": 0.06349206349206349,
"contains_score": 0.2698412698412698,
"exact_matches": 4.0,
"contains_matches": 17.0
},
"math": {
"total_questions": 100,
"exact_match_score": 0.16,
"contains_score": 0.53,
"exact_matches": 16.0,
"contains_matches": 53.0
},
"simpleqa": {
"total_questions": 100,
"exact_match_score": 0.31,
"contains_score": 0.72,
"exact_matches": 31.0,
"contains_matches": 72.0
}
},
"overall": {
"total_questions": 263,
"exact_match_score": 0.19391634980988592,
"contains_score": 0.5399239543726235,
"exact_matches": 51.0,
"contains_matches": 142.0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"model_id": "decentralized-LiteLLMModel-gpt-4o",
"action_type": "decentralized-consensus",
"date": "2025-08-28",
"timestamp": "2025-08-28T20:37:17.560750",
"benchmarks": {
"gaia": {
"total_questions": 71,
"exact_match_score": 0.16901408450704225,
"contains_score": 0.23943661971830985,
"exact_matches": 12.0,
"contains_matches": 17.0
},
"math": {
"total_questions": 100,
"exact_match_score": 0.25,
"contains_score": 0.39,
"exact_matches": 25.0,
"contains_matches": 39.0
},
"simpleqa": {
"total_questions": 97,
"exact_match_score": 0.28865979381443296,
"contains_score": 0.3711340206185567,
"exact_matches": 28.0,
"contains_matches": 36.0
}
},
"overall": {
"total_questions": 268,
"exact_match_score": 0.24253731343283583,
"contains_score": 0.34328358208955223,
"exact_matches": 65.0,
"contains_matches": 92.0
}
}
Loading