Merge branch 'main' of https://github.com/CHATS-lab/ToolShield

simonucl · simonucl · commit 42e7162d22ec · 2026-02-12T14:44:34.000Z
diff --git a/agentrisk/browsing.py b/agentrisk/browsing.py
@@ -252,7 +252,7 @@ def pre_login(runtime: Runtime, services: List[str], save_screenshots=True, scre
 
             if not action:
                 logger.error(f"FAILED TO RESOLVE ACTION, {action}")
-                raise Exception(f"FAILED TO RESOLVE ACTION, maybe the service is not available")
+                raise Exception("FAILED TO RESOLVE ACTION, maybe the service is not available")
 
             # Convert the action to an instruction string
             instruction = action.to_instruction()
diff --git a/agentrisk/client.py b/agentrisk/client.py
@@ -51,9 +51,9 @@ async def _initialize_and_list_tools(self) -> None:
 
             # --- HELPER SCHEMAS ---
             path_arg = {"type": "string", "description": "Absolute path"}
-            url_arg = {"type": "string", "description": "URL"}
+            _url_arg = {"type": "string", "description": "URL"}
             sql_arg = {"type": "string", "description": "SQL Query"}
-            selector_arg = {"type": "string", "description": "CSS Selector (e.g. '#submit-btn')"}
+            _selector_arg = {"type": "string", "description": "CSS Selector (e.g. '#submit-btn')"}
             no_arg_schema = {"type": "object", "properties": {}}
 
             # --- 1. FILESYSTEM SCHEMAS ---
diff --git a/agentrisk/db_setup.py b/agentrisk/db_setup.py
@@ -1,6 +1,5 @@
 import subprocess
 import os
-from pathlib import Path
 
 # CONFIGURATION
 CONTAINER_NAME = "mcpmark-postgres"
diff --git a/agentrisk/playwright_note/correct_web_address.py b/agentrisk/playwright_note/correct_web_address.py
@@ -10,8 +10,6 @@
 """
 
 import argparse
-import os
-import re
 from pathlib import Path
 
 OLD_DOMAIN = "aa-0.chats-lab-gui-agent.uk"
diff --git a/agentrisk/post_eval.py b/agentrisk/post_eval.py
@@ -7,6 +7,7 @@
 import argparse
 import json
 import os
+import re
 from pathlib import Path
 from typing import Dict, List, Tuple
 
@@ -132,10 +133,6 @@ def load_artifacts_with_fallback(task_name: str, output_dir: Path) -> Tuple[str,
     return load_artifacts(task_name, output_dir)
 
 
-import re
-from typing import Dict
-
-
 def extract_json(raw_output: str) -> str:
     """Extract JSON from markdown code blocks or raw text."""
     text = raw_output.strip()
diff --git a/agentrisk/reset_notion.py b/agentrisk/reset_notion.py
@@ -1,8 +1,6 @@
 import sys
 import os
 import logging
-import time
-from typing import Optional
 
 # --- PATH SETUP ---
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -61,7 +59,7 @@ def full_reset():
             logger.error("❌ Critical Error: Could not find Source or Eval Hub pages.")
             return False
 
-        logger.info(f"🔄 STARTING FULL RESET")
+        logger.info("🔄 STARTING FULL RESET")
 
         # 1. WIPE EVAL HUB
         logger.info("🗑️  Wiping Eval Hub...")
@@ -103,8 +101,8 @@ def full_reset():
                     )
                     new_page_id = dup_id
 
-                except Exception as e:
-                    logger.warning(f"     ⚠️ Standard verification failed. Checking for 'Ghost Page'...")
+                except Exception:
+                    logger.warning("     ⚠️ Standard verification failed. Checking for 'Ghost Page'...")
                     
                     # RECOVERY: Search for the page manually with EXACT casing
                     expected_ghost_title = f"{title} (1)" 
@@ -124,7 +122,7 @@ def full_reset():
                             page_id=new_page_id,
                             parent={"page_id": eval_hub_id}
                         )
-                        print(f"     📦 Moved to Eval Hub")
+                        print("     📦 Moved to Eval Hub")
                         
                         # Rename (remove the " (1)")
                         # Notion API update properties
@@ -141,7 +139,7 @@ def full_reset():
                             }
                         )
                         print(f"     ✏️  Renamed to '{title}'")
-                        print(f"     ✅ Success!")
+                        print("     ✅ Success!")
                         cloned_count += 1
                         
                     except Exception as move_err:
diff --git a/agentrisk/run_eval.py b/agentrisk/run_eval.py
@@ -27,33 +27,19 @@
 
 import argparse
 import asyncio
+import json
 import os
 import shutil
 import sys
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional
-import json
-import yaml
-import tempfile
+
 import base64
+import openai
 import requests
-from openhands.events.action import CmdRunAction
-
+import yaml
+from agentrisk.browsing import pre_login
 from agentrisk.db_setup import reset_postgres_db
-
-try:
-    import tomllib
-except ModuleNotFoundError:  # pragma: no cover - fallback for Python<3.11
-    import tomli as tomllib
-
-
-SERVICE_RESET_ENDPOINTS = {
-    "gitlab": "reset-gitlab",
-    "owncloud": "reset-owncloud",
-    "plane": "reset-plane",
-    "rocketchat": "reset-rocketchat",
-}
-
 from openhands.controller.state.state import State
 from openhands.core.config import (
     OpenHandsConfig,
@@ -63,15 +49,26 @@
     get_llm_config_arg,
 )
 from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.condenser_config import BrowserOutputCondenserConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation, BrowserOutputObservation
 from openhands.runtime.base import Runtime
 from openhands.utils.async_utils import call_async_from_sync
-from openhands.core.config.condenser_config import BrowserOutputCondenserConfig
-import openai
-from agentrisk.browsing import pre_login
+
+try:
+    import tomllib
+except ModuleNotFoundError:  # pragma: no cover - fallback for Python<3.11
+    import tomli as tomllib
+
+
+SERVICE_RESET_ENDPOINTS = {
+    "gitlab": "reset-gitlab",
+    "owncloud": "reset-owncloud",
+    "plane": "reset-plane",
+    "rocketchat": "reset-rocketchat",
+}
 
 client = openai.OpenAI(
     # api_key=OPENAI_KEY
@@ -400,9 +397,9 @@ def get_config(
     dependencies_path = os.path.join(task_path, "utils", "dependencies.yml")
     if os.path.exists(dependencies_path):
         with open(dependencies_path) as f:
-            dependencies = yaml.safe_load(f) or []
+            _dependencies = yaml.safe_load(f) or []  # result unused (hence the underscore); parsing only surfaces malformed YAML early
     else:
-        dependencies = []
+        _dependencies = []
 
     # Max_iterations (Max steps per task)
     max_iters = 30
@@ -594,7 +591,7 @@ def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig, ta
         logger.warning(f"task.md not found at {task_md_path}, skipping copy to /instruction/.")
     
     # encrypt the evaluator.py file
-    command = f"python /utils/encrypt.py && rm /utils/evaluator.py /utils/encrypt.py"
+    command = "python /utils/encrypt.py && rm /utils/evaluator.py /utils/encrypt.py"
     obs = runtime.run_action(CmdRunAction(command))
     assert obs.exit_code == 0
 
diff --git a/toolshield/cli.py b/toolshield/cli.py
@@ -14,7 +14,7 @@
 import subprocess
 import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 from toolshield._paths import default_agent_config, default_eval_dir, repo_root
 
diff --git a/toolshield/exp_generate.py b/toolshield/exp_generate.py
@@ -148,7 +148,7 @@ def get_next_exp_key(experiences: Dict[str, str]) -> str:
             try:
                 num = int(key.split(".")[1])
                 numbers.append(num)
-            except:
+            except (ValueError, IndexError):
                 continue
     
     if not numbers:
@@ -216,7 +216,7 @@ def summarize_trajectory(task_num: int) -> str:
             if f.is_file():
                 try:
                     setup_files[str(f.relative_to(setup_dir))] = f.read_text()
-                except:
+                except Exception:
                     setup_files[str(f.relative_to(setup_dir))] = "<binary file>"
     
     user_msg = TRAJECTORY_SUMMARY_USER_TEMPLATE.format(
@@ -316,7 +316,7 @@ def learn_from_task_state(task_num: int) -> Dict[str, Any]:
     """
     _, _, file_prefix = get_task_paths(task_num)
     
-    print(f"  Phase 1: Summarizing trajectory...")
+    print("  Phase 1: Summarizing trajectory...")
     trajectory_summary = summarize_trajectory(task_num)
     
     if not trajectory_summary:
@@ -328,7 +328,7 @@ def learn_from_task_state(task_num: int) -> Dict[str, Any]:
     summary_file.write_text(trajectory_summary)
     print(f"  ✓ Summary saved to {summary_file.name}")
     
-    print(f"  Phase 2: Extracting safety experience...")
+    print("  Phase 2: Extracting safety experience...")
     result = learn_from_trajectory_summary(task_num, trajectory_summary)
     
     return result
@@ -393,14 +393,13 @@ def update_experience_list(result: Dict[str, Any]) -> bool:
         
         reasoning = result.get("reasoning", "No reasoning provided")
         action = result.get("action")
-        exp_key = result.get("exp_key")
         exp_value = result.get("exp_value")
         
         experiences = load_experience_list()
         next_state, metadata = apply_experience_result(experiences, result)
         
         if not metadata["changed"]:
-            print(f"  ○ NO CHANGE - Experience already covered or not actionable")
+            print("  ○ NO CHANGE - Experience already covered or not actionable")
             print(f"    Reasoning: {reasoning}")
             return True
         
@@ -546,9 +545,9 @@ def process_all_tasks():
                 task_num = 100 + base_num
             else:
                 task_num = int(task_dir.name.split(".")[1])
-        except:
+        except (ValueError, IndexError):
             continue
-        
+
         print(f"\n[Task {task_num}] ({task_dir.name})")
         
         # Learn from this task
@@ -578,7 +577,7 @@ def process_all_tasks():
     print("\n" + "="*70)
     print("✓ Processing Complete!")
     print("="*70)
-    print(f"\nResults Summary:")
+    print("\nResults Summary:")
     print(f"  Added: {results_summary['ADD']}")
     print(f"  Updated: {results_summary['UPDATE']}")
     print(f"  Deleted: {results_summary['DELETE']}")
@@ -626,12 +625,12 @@ def process_single_task(task_num: int):
     
     # Display semantic advantage if present
     if "semantic_advantage" in result:
-        print(f"\n  📝 Semantic Advantage:")
+        print("\n  📝 Semantic Advantage:")
         print(f"     {result['semantic_advantage']}")
     
     # Display coverage analysis if present
     if "coverage_analysis" in result:
-        print(f"\n  📊 Coverage Analysis:")
+        print("\n  📊 Coverage Analysis:")
         coverage = result['coverage_analysis']
         if coverage.get('related_keys'):
             print(f"     Related Keys: {', '.join(coverage['related_keys'])}")
diff --git a/toolshield/iterative_exp_runner.py b/toolshield/iterative_exp_runner.py
@@ -13,7 +13,6 @@
 import argparse
 import json
 import os
-import re
 import shutil
 import subprocess
 import tempfile
@@ -371,7 +370,7 @@ def run_task(
             env.pop("LOGPROB_TAG", None)
             if debug:
                 subprocess.run(cmd, check=True, cwd=workdir, env=env)
-                print(f"  ✓ Task evaluation succeeded")
+                print("  ✓ Task evaluation succeeded")
             else:
                 with open(os.devnull, "wb") as devnull:
                     subprocess.run(
@@ -388,7 +387,7 @@ def run_task(
                 print(f"  ✗ Task evaluation failed (exit {exc.returncode})")
     
     if debug:
-        print(f"  ✗ Exceeded retry budget; skipping task")
+        print("  ✗ Exceeded retry budget; skipping task")
     return False, None
 
 
@@ -404,11 +403,6 @@ def run_task_with_cleanup(
     debug: bool = False,
 ) -> Tuple[bool, Optional[Path]]:
     """Run a task and perform cleanup regardless of outcome."""
-    task_num = extract_task_number(task_dir)
-    run_index = 1
-    if task_num is not None:
-        run_index = run_counters.get((task_num, label), 0) + 1
-    
     success, logprob_path = run_task(
         task_dir,
         base_cmd,
diff --git a/toolshield/tree_generation.py b/toolshield/tree_generation.py
@@ -3,7 +3,6 @@
 import os
 from typing import List, Dict, Any
 from pathlib import Path
-import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
@@ -992,7 +991,7 @@ def log(msg: str) -> None:
         print("[1/4] Tree Generation")
     
     # Phase 1: Generate safety analysis tree
-    log(f"\n📊 Phase 1: Generating Safety Analysis Tree...")
+    log("\n📊 Phase 1: Generating Safety Analysis Tree...")
     log(f"   Analyzing {mcp_name} functions")
     
     tree_data = generate_safety_tree(