Skip to content

Commit 139e55a

Browse files
committed
refactor: update chat model arguments and enable relaunch in experiment script; add rate limit testing functionality
1 parent 6aae3cf commit 139e55a

File tree

2 files changed

+97
-5
lines changed

2 files changed

+97
-5
lines changed

main_exp_new_models.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,21 @@
1717

1818
logging.getLogger().setLevel(logging.INFO)
1919

20+
# chat_model_args = CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-mini-2025-04-14"]
21+
# chat_model_args = CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-2025-04-14"]
22+
chat_model_args = CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.7-sonnet"]
2023
agent_args = [
2124
GenericAgentArgs(
22-
chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-mini-2025-04-14"],
25+
chat_model_args=chat_model_args,
2326
flags=FLAGS_GPT_4o,
2427
)
2528
]
2629

2730

2831
# ## select the benchmark to run on
29-
benchmark = "miniwob_tiny_test"
32+
# benchmark = "miniwob_tiny_test"
3033
# benchmark = "miniwob"
31-
# benchmark = "workarena_l1"
34+
benchmark = "workarena_l1"
3235
# benchmark = "workarena_l2"
3336
# benchmark = "workarena_l3"
3437
# benchmark = "webarena"
@@ -40,10 +43,10 @@
4043

4144
# Set relaunch = True to relaunch an existing study, this will continue incomplete
4245
# experiments and relaunch errored experiments
43-
relaunch = False
46+
relaunch = True
4447

4548
## Number of parallel jobs
46-
n_jobs = 4 # Make sure to use 1 job when debugging in VSCode
49+
n_jobs = 5 # Make sure to use 1 job when debugging in VSCode
4750
# n_jobs = -1 # to use all available cores
4851

4952

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import os
2+
import time
3+
from concurrent.futures import ThreadPoolExecutor, as_completed
4+
5+
import anthropic
6+
7+
# Module-level Anthropic client shared by all threads; requires the
# ANTHROPIC_API_KEY environment variable (raises KeyError at import if unset).
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
8+
9+
10+
def make_request(messages, model="claude-3-5-sonnet-20241022", max_tokens=10):
    """Send *messages* to the Anthropic Messages API and return the usage stats.

    Parameters
    ----------
    messages : list[dict]
        Conversation in Anthropic's messages format.
    model : str
        Model identifier; defaults to the previously hard-coded value so
        existing callers are unaffected.
    max_tokens : int
        Completion cap; kept tiny by default because only `usage` (token
        accounting / cache hits) matters for rate-limit probing, not the text.

    Returns
    -------
    The `usage` object from the API response.
    """
    response = client.messages.create(
        model=model, max_tokens=max_tokens, messages=messages
    )
    return response.usage
15+
16+
17+
def make_message(text):
    """Wrap *text* as a user-role message with a single text content block."""
    text_block = {"type": "text", "text": text}
    return {"role": "user", "content": [text_block]}
27+
28+
29+
def add_cache_control(message: dict, cache_type="ephemeral"):
    """Mark the first content block of *message* for prompt caching (in place)."""
    first_block = message["content"][0]
    first_block["cache_control"] = {"type": cache_type}
31+
32+
33+
def remove_cache_control(message: dict):
    """Remove the prompt-caching marker from the first content block, if present.

    No-op when the block carries no "cache_control" key.
    """
    # pop() with a default replaces the check-then-delete (LBYL) pattern:
    # one lookup instead of two, and safe when the key is absent.
    message["content"][0].pop("cache_control", None)
36+
37+
38+
def test_rate_limit_single(thread_id):
    """Probe API rate limits from one worker by replaying a growing conversation.

    Builds a conversation web-agent style: one large (~100k token) opening
    message followed by medium (~10k token) turns, re-sending the whole list
    each turn with a cache marker on the newest message. Prints elapsed time
    and usage per request; stops on the first API error.
    """
    # Repeated filler text; multipliers give roughly 100k and 10k tokens.
    big_text = "This is a large block of text for caching. " * 10000  # ~100k tokens
    medium_text = "This is a large block of text for caching. " * 2000  # ~10k tokens

    print(f"Thread {thread_id}: Starting rate limit test with cached content...")

    # Conversation grows across turns (simulating a web agent's history).
    messages = []

    for turn in range(5):
        if turn == 0:
            messages.append(make_message(big_text))
            # Clock starts with the first turn, so dt below is cumulative
            # elapsed time since the test began, not per-request latency.
            t0 = time.time()
        else:
            messages.append(make_message(medium_text))
        # Only the newest message carries the cache marker for this request.
        add_cache_control(messages[-1])
        try:
            usage = make_request(messages)
            dt = time.time() - t0
            print(f"{dt:.2f}: Thread {thread_id}: {usage}")
        except Exception as e:
            # Most likely a rate-limit error — that is the signal we probe for.
            print(f"Thread {thread_id}: Error - {e}")
            break
        # Drop the marker so the next turn caches only its own newest message.
        remove_cache_control(messages[-1])
64+
65+
66+
def test_rate_limit_parallel(num_threads=3):
    """Run the single-thread probe on *num_threads* workers concurrently.

    Each worker handles its own API errors; this only reports unexpected
    failures that escape a worker.
    """
    print(f"Starting parallel rate limit test with {num_threads} threads...")

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        pending = [
            pool.submit(test_rate_limit_single, worker_id)
            for worker_id in range(num_threads)
        ]

        for finished in as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                print(f"Thread completed with error: {e}")
77+
78+
79+
def test_rate_limit():
    """Run the rate-limit probe once on the calling thread (thread id 0)."""
    # Original single-threaded version, kept for manual debugging.
    test_rate_limit_single(0)
82+
83+
84+
if __name__ == "__main__":
    # Use parallel version to quickly exhaust rate limits
    # (3 concurrent workers, each replaying a large cached conversation).
    test_rate_limit_parallel(num_threads=3)

    # Or use original single-threaded version
    # test_rate_limit()

0 commit comments

Comments
 (0)