huggingface · cvt8 · Jul 11, 2025 · Jul 16, 2025 · Jul 16, 2025 · Jul 16, 2025
diff --git a/.gitignore b/.gitignore
@@ -2,10 +2,21 @@
 logs
 tmp
 wandb
+make_test_log.xml
+#runs/
+runs_old/
+runs_v0/
+#output/
+
+# Test gaia
+wb/
+pdb5wb7.ent
+downloads_folder/
+model_performance_comparison.png
+langfuse_test.py
 
 # Data
 data
-outputs
 data/
 
 # Apple
@@ -148,8 +159,11 @@ interpreter_workspace/
 # Archive
 archive/
 savedir/
-output/
+#output/
 tool_output/
 
 # Gradio runtime
-.gradio/
+.gradio/
+
+# Other cache
+.ruff_cache/
diff --git a/Makefile b/Makefile
@@ -14,4 +14,4 @@ style:
 
 # Run smolagents tests
 test:
-	pytest ./tests/
+	pytest ./tests/ --junitxml=make_test_log.xml
diff --git a/README.md b/README.md
diff --git a/examples/decentralized_smolagents_benchmark/decentralized_agent.py b/examples/decentralized_smolagents_benchmark/decentralized_agent.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+# Example run: python examples/decentralized_smolagents_benchmark/decentralized_agent.py --model-type LiteLLMModel --model-id gpt-4o --provider openai "What is the half of the speed of a Leopard?"
+"""Entry point for decentralized agent team execution."""
+
+import argparse
+import json
+import logging
+import sys
+import uuid
+from pathlib import Path
+
+from scripts.agents import DecentralizedAgents
+from scripts.message_store import MessageStore
+
+
+QUESTION_ADDON = """It is critical to respect the format of the answer when it is asked. """
+# NOTE(review): QUESTION_ADDON is not referenced elsewhere in this file — confirm it is still needed.
+
+# Optional Langfuse tracing: every failure below only disables tracing and leaves langfuse_client as None
+try:
+    from dotenv import load_dotenv
+
+    load_dotenv()  # presumably makes Langfuse credentials from a .env file visible — confirm
+
+    from langfuse import Langfuse
+    from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+
+    # Initialize Langfuse client; smolagents is instrumented only when auth_check() succeeds
+    langfuse_client = Langfuse()
+    if langfuse_client.auth_check():
+        print("✅ Langfuse client authenticated successfully")
+        SmolagentsInstrumentor().instrument()
+        print("✅ SmolagentsInstrumentor enabled")
+    else:
+        print("⚠️ Langfuse authentication failed - tracing disabled")
+        langfuse_client = None
+except ImportError as e:  # dotenv, langfuse or the openinference instrumentor could not be imported
+    print(f"⚠️ Langfuse not available: {e}")
+    langfuse_client = None
+except Exception as e:  # any other error raised while setting up tracing
+    print(f"⚠️ Langfuse setup error: {e}")
+    langfuse_client = None
+
+
+def setup_logging(run_dir: Path) -> None:
+    """Send root-logger records to ``run_dir/run.log``, one JSON object per line.
+
+    Handlers already attached to the root logger are removed and closed first, so
+    repeated calls neither duplicate output nor leak open log files.
+    """
+    class JsonSafeFormatter(logging.Formatter):
+        """JSON-encode any message that is not already valid JSON (e.g. plain library text)."""
+
+        def formatMessage(self, record: logging.LogRecord) -> str:
+            try:
+                json.loads(record.message)
+            except ValueError:
+                record.message = json.dumps(record.message)
+            return super().formatMessage(record)
+
+    logger = logging.getLogger()
+    for handler in list(logger.handlers):
+        logger.removeHandler(handler)
+        handler.close()
+
+    json_formatter = JsonSafeFormatter('{"timestamp":"%(asctime)s", "level":"%(levelname)s", "message":%(message)s}')
+    file_handler = logging.FileHandler(run_dir / "run.log", encoding="utf-8")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(json_formatter)
+    logger.addHandler(file_handler)
+    logger.setLevel(logging.INFO)
+
+
+def main(args: argparse.Namespace) -> int:
+    """Run the decentralized agent team on ``args.question``.
+
+    Logs JSON events to ``runs/<run_id>/run.log`` beside this script and prints
+    ``{"answer": ...}`` on success. Returns 0 for a successful status, 1 when the
+    team fails or an exception is raised while the team is built or run.
+    """
+    print(f"🚀 Starting decentralized agent team for: {args.question}")
+
+    # Create message store with proper agent names for correct voting thresholds
+    run_id = str(uuid.uuid4())[:8]  # Short run ID
+    agent_names = ["CodeAgent", "WebSearchAgent", "DeepResearchAgent", "DocumentReaderAgent"]
+    message_store = MessageStore(run_id, agent_names=agent_names)
+
+    # __file__ may be undefined; fall back to the script path in argv[0], then the current directory
+    try:
+        script_dir = Path(__file__).parent
+    except NameError:
+        script_dir = Path(sys.argv[0]).parent.absolute() if sys.argv[0] else Path.cwd()
+
+    run_dir = script_dir / "runs" / run_id
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    setup_logging(run_dir)
+    logging.info(json.dumps({"event": "run_started", "run_id": run_id, "args": vars(args)}))
+
+    try:
+        logging.info(json.dumps({"event": "creating_team", "run_id": run_id}))
+        decentralized_team = DecentralizedAgents(
+            message_store=message_store,
+            model_type=args.model_type,
+            model_id=args.model_id,
+            provider=args.provider,
+            run_id=run_id,
+        )
+
+        # The question is passed to the team unchanged (QUESTION_ADDON is not appended here)
+        enhanced_task = f"{args.question}"
+        logging.info(json.dumps({"event": "starting_execution", "run_id": run_id, "question": args.question}))
+        result = decentralized_team.run(enhanced_task)
+        status = result.get("status", "unknown")
+
+        logging.info(
+            json.dumps(
+                {
+                    "event": "execution_completed",
+                    "run_id": run_id,
+                    "status": status,
+                    "has_answer": "answer" in result,
+                }
+            )
+        )
+
+        # Output the result; a missing "status" key counts as a failure instead of raising KeyError
+        if status in ["success", "success_early", "success_fallback"]:
+            print(json.dumps({"answer": result["answer"]}))
+            return 0
+
+        error_msg = result.get("error", "No valid results")
+        failure = {"event": "execution_failed", "run_id": run_id, "error": error_msg, "result": result}
+        # default=str: a non-JSON-serializable result must not mask the real failure with a TypeError
+        logging.error(json.dumps(failure, default=str))
+        print(f"\n❌ Team execution failed: {error_msg}")
+        return 1
+
+    except Exception as e:
+        # Top-level boundary: log the error and its traceback as JSON events, then exit non-zero
+        logging.error(
+            json.dumps(
+                {
+                    "event": "unexpected_error",
+                    "run_id": run_id,
+                    "error_type": type(e).__name__,
+                    "error_message": str(e),
+                    "question": args.question,
+                }
+            )
+        )
+        import traceback
+
+        logging.error(json.dumps({"event": "error_traceback", "run_id": run_id, "traceback": traceback.format_exc()}))
+        print(f"\n❌ Unexpected error: {e}")
+        return 1
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser(description="Run decentralized agent team")
+    cli.add_argument("--model-type", required=True, help="Model type to use")
+    cli.add_argument("--model-id", required=True, help="Model ID to use")
+    cli.add_argument("--provider", help="Model provider")
+    cli.add_argument("question", help="Question to answer")
+
+    # main() returns the process exit status: 0 on success, 1 on failure
+    sys.exit(main(cli.parse_args()))
diff --git a/examples/decentralized_smolagents_benchmark/model_performance_comparison.png b/examples/decentralized_smolagents_benchmark/model_performance_comparison.png
diff --git a/...put/benchmark_scores_centralized-LiteLLMModel-gpt-4o__centralized-agents__2025-09-01.json b/...put/benchmark_scores_centralized-LiteLLMModel-gpt-4o__centralized-agents__2025-09-01.json
@@ -0,0 +1,36 @@
+{
+  "model_id": "centralized-LiteLLMModel-gpt-4o",
+  "action_type": "centralized-agents",
+  "date": "2025-09-01",
+  "timestamp": "2025-09-01T21:27:36.244600",
+  "benchmarks": {
+    "gaia": {
+      "total_questions": 34,
+      "exact_match_score": 0.2647058823529412,
+      "contains_score": 0.3235294117647059,
+      "exact_matches": 9.0,
+      "contains_matches": 11.0
+    },
+    "math": {
+      "total_questions": 52,
+      "exact_match_score": 0.6538461538461539,
+      "contains_score": 0.7884615384615384,
+      "exact_matches": 34.0,
+      "contains_matches": 41.0
+    },
+    "simpleqa": {
+      "total_questions": 52,
+      "exact_match_score": 0.6730769230769231,
+      "contains_score": 0.8846153846153846,
+      "exact_matches": 35.0,
+      "contains_matches": 46.0
+    }
+  },
+  "overall": {
+    "total_questions": 138,
+    "exact_match_score": 0.5652173913043478,
+    "contains_score": 0.7101449275362319,
+    "exact_matches": 78.0,
+    "contains_matches": 98.0
+  }
+}
diff --git a/...put/benchmark_scores_centralized-LiteLLMModel-gpt-4o__centralized-agents__2025-09-08.json b/...put/benchmark_scores_centralized-LiteLLMModel-gpt-4o__centralized-agents__2025-09-08.json
@@ -0,0 +1,36 @@
+{
+  "model_id": "centralized-LiteLLMModel-gpt-4o",
+  "action_type": "centralized-agents",
+  "date": "2025-09-08",
+  "timestamp": "2025-09-08T20:42:15.809662",
+  "benchmarks": {
+    "gaia": {
+      "total_questions": 32,
+      "exact_match_score": 0.0,
+      "contains_score": 0.0,
+      "exact_matches": 0.0,
+      "contains_matches": 0.0
+    },
+    "math": {
+      "total_questions": 50,
+      "exact_match_score": 0.0,
+      "contains_score": 0.0,
+      "exact_matches": 0.0,
+      "contains_matches": 0.0
+    },
+    "simpleqa": {
+      "total_questions": 50,
+      "exact_match_score": 0.0,
+      "contains_score": 0.0,
+      "exact_matches": 0.0,
+      "contains_matches": 0.0
+    }
+  },
+  "overall": {
+    "total_questions": 132,
+    "exact_match_score": 0.0,
+    "contains_score": 0.0,
+    "exact_matches": 0.0,
+    "contains_matches": 0.0
+  }
+}
diff --git a/...chmark_scores_decentralized-LiteLLMModel-gpt-4o__decentralized-consensus__2025-08-27.json b/...chmark_scores_decentralized-LiteLLMModel-gpt-4o__decentralized-consensus__2025-08-27.json
@@ -0,0 +1,36 @@
+{
+  "model_id": "decentralized-LiteLLMModel-gpt-4o",
+  "action_type": "decentralized-consensus",
+  "date": "2025-08-27",
+  "timestamp": "2025-08-27T22:05:53.212587",
+  "benchmarks": {
+    "gaia": {
+      "total_questions": 63,
+      "exact_match_score": 0.06349206349206349,
+      "contains_score": 0.2698412698412698,
+      "exact_matches": 4.0,
+      "contains_matches": 17.0
+    },
+    "math": {
+      "total_questions": 100,
+      "exact_match_score": 0.16,
+      "contains_score": 0.53,
+      "exact_matches": 16.0,
+      "contains_matches": 53.0
+    },
+    "simpleqa": {
+      "total_questions": 100,
+      "exact_match_score": 0.31,
+      "contains_score": 0.72,
+      "exact_matches": 31.0,
+      "contains_matches": 72.0
+    }
+  },
+  "overall": {
+    "total_questions": 263,
+    "exact_match_score": 0.19391634980988592,
+    "contains_score": 0.5399239543726235,
+    "exact_matches": 51.0,
+    "contains_matches": 142.0
+  }
+}
diff --git a/...chmark_scores_decentralized-LiteLLMModel-gpt-4o__decentralized-consensus__2025-08-28.json b/...chmark_scores_decentralized-LiteLLMModel-gpt-4o__decentralized-consensus__2025-08-28.json
@@ -0,0 +1,36 @@
+{
+  "model_id": "decentralized-LiteLLMModel-gpt-4o",
+  "action_type": "decentralized-consensus",
+  "date": "2025-08-28",
+  "timestamp": "2025-08-28T20:37:17.560750",
+  "benchmarks": {
+    "gaia": {
+      "total_questions": 71,
+      "exact_match_score": 0.16901408450704225,
+      "contains_score": 0.23943661971830985,
+      "exact_matches": 12.0,
+      "contains_matches": 17.0
+    },
+    "math": {
+      "total_questions": 100,
+      "exact_match_score": 0.25,
+      "contains_score": 0.39,
+      "exact_matches": 25.0,
+      "contains_matches": 39.0
+    },
+    "simpleqa": {
+      "total_questions": 97,
+      "exact_match_score": 0.28865979381443296,
+      "contains_score": 0.3711340206185567,
+      "exact_matches": 28.0,
+      "contains_matches": 36.0
+    }
+  },
+  "overall": {
+    "total_questions": 268,
+    "exact_match_score": 0.24253731343283583,
+    "contains_score": 0.34328358208955223,
+    "exact_matches": 65.0,
+    "contains_matches": 92.0
+  }
+}