Merge branch 'tlsdc/log_prob' of github.com:ServiceNow/AgentLab into tlsdc/log_prob

TLSDC · TLSDC · commit 24ec7a78af6d · 2025-02-20T14:19:56.000-05:00
diff --git a/add_study_to_repro_journal.py b/add_study_to_repro_journal.py
@@ -0,0 +1,18 @@
+"""Load each listed study directory and append it to the reproducibility journal."""
+
+from pathlib import Path
+
+from agentlab.experiments.study import Study
+
+# Parent directory of the study folders named in exp_paths.
+base_dir = "/home/toolkit/ui_copilot_results"
+exp_paths = [
+    "2025-01-31_22-08-34_genericagent-o3-mini-2025-01-31-on-workarena-l1",
+    #  '2025-02-02_01-53-45_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1',
+    "2025-02-02_01-55-04_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1",
+]
+# pathlib's "/" operator yields the Path objects that Study.load receives.
+full_paths = [Path(base_dir) / exp_path for exp_path in exp_paths]
+
+for full_path in full_paths:
+    Study.load(full_path).append_to_journal(strict_reproducibility=False)
diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv
@@ -64,4 +64,12 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini_vision,visualwebarena,0.13.3,2024-12-02_02
 ThibaultLSDC,GenericAgent-gpt-4o_vision,visualwebarena,0.13.3,2024-12-02_07-17-28,7fb7eac8-4bbd-4ebe-be32-15901a7678f2,0.267,0.015,65,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
 ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta_vision,visualwebarena,0.13.3,2024-12-02_09-11-35,22f0611d-aeea-4ee9-a533-b45442b5e080,0.21,0.013,178,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
 ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,webarena,0.13.3,2024-12-02_23-18-38,fc5747bc-d998-4942-a0eb-e55a3ccc1cb3,0.184,0.014,213,811/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
-
+Leo Boisvert,GenericAgent-o3-mini-2025-01-31,workarena_l1,0.4.1,2025-01-31_22-08-33,a74cc00f-f743-43a1-9cab-59af8bffa3a2,0.482,0.028,3,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4,"  M: src/agentlab/agents/generic_agent/__init__.py
+  M: src/agentlab/agents/generic_agent/agent_configs.py
+  M: src/agentlab/analyze/agent_xray.py
+  M: src/agentlab/llm/chat_api.py
+  M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,"
+Leo Boisvert,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2025-02-02_01-55-04,f3e1fcb8-5fc5-4115-9e00-27251508e2c7,0.518,0.028,5,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4,"  M: src/agentlab/agents/generic_agent/__init__.py
+  M: src/agentlab/agents/generic_agent/agent_configs.py
+  M: src/agentlab/analyze/agent_xray.py
+  M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,"
diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py
@@ -17,15 +17,17 @@
     AGENT_4o_MINI,
     AGENT_CLAUDE_SONNET_35,
     AGENT_4o_VISION,
-    AGENT_4o_MINI_VISION,
-    AGENT_CLAUDE_SONNET_35_VISION,
+    AGENT_o3_MINI,
+    AGENT_o1_MINI,
 )
 
 __all__ = [
     "AGENT_3_5",
     "AGENT_4o",
     "AGENT_4o_MINI",
     "AGENT_4o_VISION",
+    "AGENT_o3_MINI",
+    "AGENT_o1_MINI",
     "AGENT_LLAMA3_70B",
     "AGENT_LLAMA31_70B",
     "AGENT_8B",
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -265,6 +265,15 @@
     flags=FLAGS_GPT_4o,
 )
 
+AGENT_o3_MINI = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/o3-mini-2025-01-31"],
+    flags=FLAGS_GPT_4o,
+)
+
+AGENT_o1_MINI = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/o1-mini-2024-09-12"],
+    flags=FLAGS_GPT_4o,
+)
 # GPT-4o vision default config
 FLAGS_GPT_4o_VISION = FLAGS_GPT_4o.copy()
 FLAGS_GPT_4o_VISION.obs.use_screenshot = True
diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
@@ -145,6 +145,7 @@ def make_model(self):
                 temperature=self.temperature,
                 max_new_tokens=self.max_new_tokens,
                 n_retry_server=self.n_retry_server,
+                log_probs=self.log_probs
             )
         else:
             raise ValueError(f"Backend {self.backend} is not supported")
@@ -237,7 +238,7 @@ def __init__(
         self.max_tokens = max_tokens
         self.max_retry = max_retry
         self.min_retry_wait_time = min_retry_wait_time
-        self.logprobs = log_probs
+        self.log_probs = log_probs
 
         # Get the API key from the environment variable if not provided
         if api_key_env_var:
@@ -284,7 +285,7 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
                     n=n_samples,
                     temperature=temperature,
                     max_tokens=self.max_tokens,
-                    logprobs=self.logprobs,
+                    log_probs=self.log_probs,
                 )
 
                 if completion.usage is None:
@@ -315,8 +316,8 @@ def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float
 
         if n_samples == 1:
             res = AIMessage(completion.choices[0].message.content)
-            if self.logprobs:
-                res["logprobs"] = completion.choices[0].logprobs
+            if self.log_probs:
+                res["log_probs"] = completion.choices[0].log_probs
             return res
         else:
             return [AIMessage(c.message.content) for c in completion.choices]
@@ -429,7 +430,7 @@ def __init__(
         n_retry_server: Optional[int] = 4,
         log_probs: Optional[bool] = False,
     ):
-        super().__init__(model_name, base_model_name, n_retry_server)
+        super().__init__(model_name, base_model_name, n_retry_server, log_probs)
         if temperature < 1e-3:
             logging.warning("Models might behave weirdly when temperature is too low.")
         self.temperature = temperature
diff --git a/src/agentlab/llm/huggingface_utils.py b/src/agentlab/llm/huggingface_utils.py
@@ -2,12 +2,11 @@
 import time
 from typing import Any, List, Optional, Union
 
-from pydantic import Field
-from transformers import AutoTokenizer, GPT2TokenizerFast
-
 from agentlab.llm.base_api import AbstractChatModel
 from agentlab.llm.llm_utils import AIMessage, Discussion
 from agentlab.llm.prompt_templates import PromptTemplate, get_prompt_template
+from pydantic import Field
+from transformers import AutoTokenizer, GPT2TokenizerFast
 
 
 class HFBaseChatModel(AbstractChatModel):
@@ -40,9 +39,10 @@ class HFBaseChatModel(AbstractChatModel):
         description="The number of times to retry the server if it fails to respond",
     )
 
-    def __init__(self, model_name, base_model_name, n_retry_server):
+    def __init__(self, model_name, base_model_name, n_retry_server, log_probs):
         super().__init__()
         self.n_retry_server = n_retry_server
+        self.log_probs = log_probs
 
         if base_model_name is None:
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -102,8 +102,9 @@ def __call__(
                     temperature = temperature if temperature is not None else self.temperature
                     answer = self.llm(prompt, temperature=temperature)
                     response = AIMessage(answer)
-                    if hasattr(answer, "details"):
-                        response["log_prob"] = answer.details.log_prob
+                    if self.log_probs:
+                        response["content"] = answer.generated_text
+                        response["log_prob"] = answer.details
                     responses.append(response)
                     break
                 except Exception as e:
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
@@ -63,6 +63,13 @@
         max_input_tokens=16_384,
         max_new_tokens=4096,
     ),
+    "openai/o1-mini": OpenAIModelArgs(
+        model_name="openai/o1-mini",
+        max_total_tokens=128_000,
+        max_input_tokens=128_000,
+        max_new_tokens=64_000,
+        temperature=1e-1,
+    ),
     "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs(
         model_name="gpt-35-turbo",
         deployment_name="gpt-35-turbo",