Merge pull request #10 from MiroMindAI/blacklist-tool

fangda-ye · web-flow · commit e75d4874db4b · 2026-01-28T13:48:40.000+08:00
feat(tool-blacklist): add tool blacklist function
diff --git a/config/binwang_fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml b/config/binwang_fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml
@@ -0,0 +1,49 @@
+defaults:
+  - benchmark: gaia-validation-text-only
+  - override hydra/job_logging: none
+  - _self_
+
+entrypoint: main_agent
+main_agent:
+  name: main_agent
+  type: IterativeAgentWithToolAndRollback
+  max_consecutive_rollbacks: 3
+  max_turns: 200
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+  prompt: config/prompts/fangda_prompt_main_agent.yaml
+  tools:
+    - config/tool/tool-search-and-scrape-webpage.yaml
+    - config/tool/tool-jina-scrape-llm-summary.yaml
+    - config/tool/tool-python.yaml
+  tool_blacklist:
+    - server: "tool-search-and-scrape-webpage"
+      tool: "sogou_search"
+    - server: "tool-python"
+      tool: "download_file_from_sandbox_to_local"
+  input_processor:
+    - ${input-message-generator}
+  output_processor:
+    - ${output-summary}
+    - ${output-final-answer-extraction}
+    - ${output-failure-experience}
+
+input-message-generator:
+  type: InputMessageGenerator
+output-summary:
+  type: SummaryGenerator
+output-failure-experience:
+  type: FailureExperienceSummaryGenerator
+  prompt: config/prompts/fangda_prompt_main_agent.yaml
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+output-final-answer-extraction:
+  type: FinalAnswerExtractor
+  prompt: config/prompts/fangda_prompt_main_agent.yaml
+  llm:
+    _base_: config/llm/base_mirothinker.yaml
+
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"
+
+
diff --git a/config/tool/tool-python.yaml b/config/tool/tool-python.yaml
@@ -0,0 +1,8 @@
+name: "tool-python"
+tool_command: "python"
+args:
+  - "-m"
+  - "src.tool.mcp_servers.python_mcp_server"
+env:
+  E2B_API_KEY: "${oc.env:E2B_API_KEY}"
+  LOGS_DIR: "./logs"
diff --git a/scripts/binwang_run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh b/scripts/binwang_run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh
@@ -7,13 +7,40 @@
 # Configuration parameters
 NUM_RUNS=3
 BENCHMARK_NAME="gaia-validation-text-only"
-AGENT_SET="binwang_agent_gaia-validation-text-only_mirothinker_single_agent"
-MAX_CONCURRENT=30
+AGENT_SET="binwang_fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools_toolblacklist"
+MAX_CONCURRENT=10
 
 # Set results directory with timestamp
 TIMESTAMP=$(date +%Y%m%d_%H%M)
 RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
 
+# Array to track child PIDs
+declare -a CHILD_PIDS=()
+
+cleanup() {
+    # Signal handler: stop every launched run and everything it spawned,
+    # then exit with status 130.
+    #
+    # CHILD_PIDS holds the PIDs of the background subshells. Job control is
+    # enabled only *inside* each subshell, so the subshell itself stays in
+    # this script's process group while the command it runs becomes the
+    # leader of its own group. Signalling "-<subshell pid>" alone would hit
+    # a non-existent group, so the subshell's direct children are targeted
+    # as well.
+    echo ""
+    echo "Received interrupt signal, terminating all processes..."
+    local pid child
+    local -a targets=()
+    # Snapshot the targets first: once a subshell dies its children are
+    # re-parented and can no longer be discovered through pgrep -P.
+    for pid in "${CHILD_PIDS[@]}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            targets+=("$pid")
+            for child in $(pgrep -P "$pid" 2>/dev/null); do
+                targets+=("$child")
+            done
+        fi
+    done
+    for pid in "${targets[@]}"; do
+        echo "Killing process group $pid"
+        # Prefer the whole group; fall back to the single process when the
+        # PID is not a process-group leader.
+        kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null
+    done
+    # Wait a moment for graceful shutdown
+    sleep 2
+    # Force kill any remaining processes
+    for pid in "${targets[@]}"; do
+        if kill -0 -- "-$pid" 2>/dev/null || kill -0 "$pid" 2>/dev/null; then
+            echo "Force killing process group $pid"
+            kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null
+        fi
+    done
+    echo "All processes terminated."
+    exit 130
+}
+
+trap cleanup SIGINT SIGTERM
+
 echo "Starting $NUM_RUNS runs of the evaluation..."
 echo "Results will be saved in: $RESULTS_DIR"
 
@@ -27,14 +54,17 @@ for i in $(seq 1 $NUM_RUNS); do
     
     RUN_ID="run_$i"
     
+    # Run in a subshell with job control enabled (set -m) so the benchmark command gets its own process group
     (
+        set -m
         uv run test_benchmark.py \
             --config-path config/${AGENT_SET}.yaml \
             benchmark.execution.max_concurrent=$MAX_CONCURRENT \
             output_dir="$RESULTS_DIR/$RUN_ID" \
             > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
         
-        if [ $? -eq 0 ]; then
+        EXIT_CODE=$?
+        if [ $EXIT_CODE -eq 0 ]; then
             echo "Run $i completed successfully"
             RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
             if [ -f "$RESULT_FILE" ]; then
@@ -43,15 +73,26 @@ for i in $(seq 1 $NUM_RUNS); do
                 echo "Warning: Result file not found for run $i"
             fi
         else
-            echo "Run $i failed!"
+            # Check if we have JSON result files (task completed but evaluator had issues)
+            JSON_COUNT=$(find "${RESULTS_DIR}/$RUN_ID" -name "task_*.json" 2>/dev/null | wc -l)
+            if [ "$JSON_COUNT" -gt 0 ]; then
+                echo "Run $i finished with exit code $EXIT_CODE but generated $JSON_COUNT task logs"
+            else
+                echo "Run $i failed with exit code $EXIT_CODE"
+            fi
         fi
     ) &
     
+    # Get the PID and store it
+    CHILD_PIDS+=($!)
+    
     sleep 2
 done
 
 echo "All $NUM_RUNS runs have been launched in parallel"
+echo "Child PIDs: ${CHILD_PIDS[*]}"
 echo "Waiting for all runs to complete..."
+echo "Press Ctrl+C to terminate all processes"
 
 wait
 
@@ -66,5 +107,4 @@ echo "=========================================="
 echo "Multiple runs evaluation completed!"
 echo "Check results in: $RESULTS_DIR"
 echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
-echo "==========================================" 
-
+echo "=========================================="
diff --git a/src/agents/base.py b/src/agents/base.py
@@ -71,9 +71,38 @@ def __init__(self, cfg: Optional[DictConfig | dict] = None, parent=None):
         self.llm_client = build_llm_client(cfg=self.cfg.get("llm"))
         self.prompt_manager = PromptManager(config_path=self.cfg.get("prompt"))
         self.sub_agents = self.cfg.get("sub_agents")
-        self.tool_manager = ToolManager(cfg=self.cfg.get("tools"))
+
+        # Parse tool_blacklist from config
+        tool_blacklist = self._parse_tool_blacklist(self.cfg.get("tool_blacklist"))
+        self.tool_manager = ToolManager(
+            cfg=self.cfg.get("tools"), tool_blacklist=tool_blacklist
+        )
         self.skill_manager = SkillManager(skill_dirs=self.cfg.get("skills"))
 
+    def _parse_tool_blacklist(self, blacklist_cfg) -> set:
+        """
+        Parse tool_blacklist config into a set of (server_name, tool_name) tuples.
+
+        Config format:
+            tool_blacklist:
+              - server: "tool-code"
+                tool: "create_sandbox"
+              - server: "tool-search-and-scrape-webpage"
+                tool: "sogou_search"
+
+        Args:
+            blacklist_cfg: Iterable of mapping-like entries (anything with a
+                ``get`` method, e.g. a plain dict or an OmegaConf DictConfig),
+                or a falsy value when no blacklist is configured.
+
+        Returns:
+            Set of (server_name, tool_name) tuples. Entries that are not
+            mapping-like, or that lack a truthy "server" or "tool" value,
+            are skipped with a warning instead of being dropped silently.
+        """
+        if not blacklist_cfg:
+            return set()
+
+        # Local import keeps this method self-contained.
+        import logging
+
+        blacklist = set()
+        for index, item in enumerate(blacklist_cfg):
+            # Handles both regular dict and OmegaConf DictConfig
+            has_get = hasattr(item, "get")
+            server = item.get("server") if has_get else None
+            tool = item.get("tool") if has_get else None
+            if server and tool:
+                blacklist.add((str(server), str(tool)))
+            else:
+                # A misspelled key would otherwise leave the tool enabled
+                # without any indication that the entry was ignored.
+                logging.getLogger(__name__).warning(
+                    "Ignoring malformed tool_blacklist entry #%d: %r "
+                    "(expected 'server' and 'tool' keys)",
+                    index,
+                    item,
+                )
+        return blacklist
+
     @abstractmethod
     async def run_internal(self, ctx: AgentContext) -> AgentContext:
         pass
diff --git a/src/tool/manager.py b/src/tool/manager.py
@@ -88,6 +88,8 @@ def __init__(
         logger.info(
             f"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}"
         )
+        if self.tool_blacklist:
+            logger.info(f"Tool blacklist configured: {self.tool_blacklist}")
 
     def _is_huggingface_dataset_or_space_url(self, url):
         """
diff --git a/src/tool/mcp_servers/python_mcp_server.py b/src/tool/mcp_servers/python_mcp_server.py
diff --git a/utils/check_progress_gaia_validation_text_103.py b/utils/check_progress_gaia_validation_text_103.py

Original file line number	Diff line number	Diff line change
`@@ -88,6 +88,8 @@ def __init__(`
`88`	`88`	`logger.info(`
`89`	`89`	`f"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}"`
`90`	`90`	`)`
	`91`	`+ if self.tool_blacklist:`
	`92`	`+ logger.info(f"Tool blacklist configured: {self.tool_blacklist}")`
`91`	`93`
`92`	`94`	`def _is_huggingface_dataset_or_space_url(self, url):`
`93`	`95`	`"""`