Remove the '*action_set' pattern from index_black_list in load_result_df, and make OSWorldActionSet a dataclass (replacing OSWorldActionSetArgs) so that it has a proper repr.

amanjaiswal73892 · amanjaiswal73892 · commit bb38053a4291 · 2025-07-10T14:56:30.000-04:00
diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py
@@ -109,7 +109,7 @@ def load_result_df(
     set_index=True,
     result_df=None,
     index_white_list=("agent.*",),
-    index_black_list=("*model_url*", "*extra*", "*._*", "*action_set"),
+    index_black_list=("*model_url*", "*extra*", "*._*"),
     remove_args_suffix=True,
 ):
     """Load the result dataframe.
diff --git a/src/agentlab/benchmarks/osworld.py b/src/agentlab/benchmarks/osworld.py
@@ -547,14 +547,15 @@ def close(self):
         return self.env.close()
 
 
-class OSWorldActionSet(AbstractActionSet):
+@dataclass
+class OSWorldActionSet(AbstractActionSet, DataClassJsonMixin):
     # TODO: Define and use agentlab AbstractActionSet
     # AbstractActionSet should define some standard format to represent actions.(list of dict with keys that are MCP compatible)
     # Should we have 'abstract function' here for action conversion for backend LLM with fixed action set like UI-Tars or Semi-fixed action set LLMs like OpenAI CUA?
     # TODO: We need to support both 'action space as tools' and 'action space as prompt' for agentlab agents
     # and have conversion functions to convert them to format acceptable by environment.
-    def __init__(self, action_space: Literal["computer_13", "pyautogui"]):
-        self.action_space = action_space
+    action_space: Literal["computer_13", "pyautogui"] = "computer_13"
+    multiaction: bool = False
 
     def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
         """Describe the OSWorld action set for desktop interactions."""
@@ -598,22 +599,22 @@ def format_response_api_tools_to_anthropic(tools: list[dict]) -> list[dict]:
     return formatted_tools
 
 
-@dataclass
-class OSWorldActionSetArgs(DataClassJsonMixin):
-    action_space: Literal["computer_13", "pyautogui"] = "computer_13"
+# @dataclass
+# class OSWorldActionSetArgs(DataClassJsonMixin):
+#     action_space: Literal["computer_13", "pyautogui"] = "computer_13"
 
-    def make_action_set(self):
-        logger.info(f"Creating OSWorld Action Set with action space: {self.action_space}")
-        return OSWorldActionSet(action_space=self.action_space)
+#     def make_action_set(self):
+#         logger.info(f"Creating OSWorld Action Set with action space: {self.action_space}")
+#         return OSWorldActionSet(action_space=self.action_space)
 
 
 @dataclass
 class OsworldEnvArgs(AbstractEnvArgs):
     task: dict[str, Any]
     task_seed: int = 0
     task_name: str | None = None
-    path_to_vm: str | None = None  # path to .vmx file
-    provider_name: str = "docker"
+    path_to_vm: str | None = "OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx"  # path to .vmx file
+    provider_name: str = "vmware"  # VM provider backend, e.g. "vmware" or "docker"
     region: str = "us-east-1"  # AWS specific, does not apply to all providers
     snapshot_name: str = "init_state"  # snapshot name to revert to
     action_space: Literal["computer_13", "pyautogui"] = "computer_13"
@@ -653,7 +654,7 @@ def make_env(
 class OsworldBenchmark(AbstractBenchmark):
     name: str = "osworld"
     is_multi_tab: bool = False
-    high_level_action_set_args: OSWorldActionSetArgs = None  # type: ignore
+    high_level_action_set_args: OSWorldActionSet = None  # type: ignore
     test_set_path: str = "OSWorld/evaluation_examples"
     test_set_name: str = "test_all.json"
     domain: str = "all"
@@ -664,7 +665,7 @@ def model_post_init(self, __context: Any) -> None:
         self.env_args_list = []
         if not self.env_args:
             self.env_args = OsworldEnvArgs(task={})
-        self.high_level_action_set_args = OSWorldActionSetArgs(
+        self.high_level_action_set_args = OSWorldActionSet(
             action_space=self.env_args.action_space
         )
         with open(os.path.join(self.test_set_path, self.test_set_name)) as f: