From 42230ef1d407f176be56204786e774fc5314b154 Mon Sep 17 00:00:00 2001 From: CCranney <11773171+CCranney@users.noreply.github.com> Date: Sat, 18 Jan 2025 16:33:51 -0800 Subject: [PATCH] refactored LLM calls for maintenance, future additions --- agents.py | 18 +++-- llm_client/__init__.py | 4 + llm_client/factory.py | 38 +++++++++ llm_client/models.py | 130 +++++++++++++++++++++++++++++ llm_client/query_manager.py | 52 ++++++++++++ llm_client/tests/__init__.py | 0 llm_client/tests/test_models.py | 139 ++++++++++++++++++++++++++++++++ mlesolver.py | 61 +++++++++----- papersolver.py | 30 ++++--- requirements.txt | 1 + 10 files changed, 431 insertions(+), 42 deletions(-) create mode 100644 llm_client/__init__.py create mode 100644 llm_client/factory.py create mode 100644 llm_client/models.py create mode 100644 llm_client/query_manager.py create mode 100644 llm_client/tests/__init__.py create mode 100644 llm_client/tests/test_models.py diff --git a/agents.py b/agents.py index c6fd4cd..5a0e195 100755 --- a/agents.py +++ b/agents.py @@ -1,7 +1,9 @@ from utils import * from tools import * -from inference import * +from llm_client import LLMQueryManager +import json +llm_query_manager = LLMQueryManager() def extract_json_between_markers(llm_output): # Regular expression pattern to find JSON content between ```json and ``` @@ -138,13 +140,13 @@ def get_score(outlined_plan, latex, reward_model_llm, reviewer_type=None, attemp "You are an AI researcher who is reviewing a paper that was submitted to a prestigious ML venue. " f"Be critical and cautious in your decision. 
{reviewer_type}\n" ) + neurips_form - scoring = query_model( - model_str=f"{reward_model_llm}", + scoring = llm_query_manager.query_model( + model_name=f"{reward_model_llm}", system_prompt=sys, - openai_api_key=openai_api_key, + api_key=openai_api_key, prompt=( f"Outlined in the following text is the research plan that the machine learning engineer was tasked with building: {outlined_plan}\n\n" - f"The following text is the research latex that the model produced: \n{latex}\n\n"), temp=0.0) + f"The following text is the research latex that the model produced: \n{latex}\n\n"), temperature=0.0) review_json = extract_json_between_markers(scoring) overall = int(review_json["Overall"]) / 10 @@ -251,7 +253,7 @@ def inference(self, research_topic, phase, step, feedback="", temp=None): f"Current Step #{step}, Phase: {phase}\n{complete_str}\n" f"[Objective] Your goal is to perform research on the following topic: {research_topic}\n" f"Feedback: {feedback}\nNotes: {notes_str}\nYour previous command was: {self.prev_comm}. 
Make sure your new output is very different.\nPlease produce a single command below:\n") - model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, temp=temp, openai_api_key=self.openai_api_key) + model_resp = llm_query_manager.query_model(model_name=self.model, system_prompt=sys_prompt, prompt=prompt, temperature=temp, api_key=self.openai_api_key) print("^"*50, phase, "^"*50) model_resp = self.clean_text(model_resp) self.prev_comm = model_resp @@ -301,7 +303,7 @@ def generate_readme(self): prompt = ( f"""History: {history_str}\n{'~' * 10}\n""" f"Please produce the readme below in markdown:\n") - model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key) + model_resp = llm_query_manager.query_model(model_name=self.model, system_prompt=sys_prompt, prompt=prompt, api_key=self.openai_api_key) return model_resp.replace("```markdown", "") def context(self, phase): @@ -618,7 +620,7 @@ def requirements_txt(self): prompt = ( f"""History: {history_str}\n{'~' * 10}\n""" f"Please produce the requirements.txt below in markdown:\n") - model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key) + model_resp = llm_query_manager.query_model(model_name=self.model, system_prompt=sys_prompt, prompt=prompt, api_key=self.openai_api_key) return model_resp def example_command(self, phase): diff --git a/llm_client/__init__.py b/llm_client/__init__.py new file mode 100644 index 0000000..57c9b4e --- /dev/null +++ b/llm_client/__init__.py @@ -0,0 +1,4 @@ +from .query_manager import LLMQueryManager +from .models import TokenCounter + +__all__ = ['LLMQueryManager', 'TokenCounter'] \ No newline at end of file diff --git a/llm_client/factory.py b/llm_client/factory.py new file mode 100644 index 0000000..2f983e2 --- /dev/null +++ b/llm_client/factory.py @@ -0,0 +1,38 @@ +from typing import Optional, Dict +from .models import ( + LLMStrategy, + ModelConfig, 
+ OpenAIStrategy, + AnthropicStrategy, + DeepseekStrategy +) + +class LLMStrategyFactory: + def __init__(self, api_keys: Optional[Dict[str, str]] = None): + self.api_keys = api_keys or {} + self.model_configs = { + "gpt-4o": ModelConfig("gpt-4o-2024-08-06", 2.50, 10.00, "openai"), + "gpt-4o-mini": ModelConfig("gpt-4o-mini-2024-07-18", 0.15, 0.60, "openai"), + "claude-3-5-sonnet": ModelConfig("claude-3-5-sonnet-latest", 3.00, 12.00, "anthropic"), + "deepseek-chat": ModelConfig("deepseek-chat", 1.00, 5.00, "deepseek"), + "o1-mini": ModelConfig("o1-mini-2024-09-12", 3.00, 12.00, "openai"), + "o1": ModelConfig("o1-2024-12-17", 15.00, 60.00, "openai"), + "o1-preview": ModelConfig("o1-preview", 15.00, 60.00, "openai"), + } + + def create_strategy(self, model_name: str) -> LLMStrategy: + if model_name not in self.model_configs: + raise ValueError(f"Unknown model: {model_name}") + + config = self.model_configs[model_name] + provider = config.provider + default_api_key = self.api_keys.get(provider) + + if provider == "openai": + return OpenAIStrategy(config, default_api_key) + elif provider == "anthropic": + return AnthropicStrategy(config, default_api_key) + elif provider == "deepseek": + return DeepseekStrategy(config, default_api_key) + else: + raise ValueError(f"No strategy implementation for provider: {provider}") diff --git a/llm_client/models.py b/llm_client/models.py new file mode 100644 index 0000000..78c17af --- /dev/null +++ b/llm_client/models.py @@ -0,0 +1,130 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional, List, Dict, Mapping +import tiktoken +from openai import OpenAI +import anthropic +import json +import os +import time + + +@dataclass +class Message: + role: str + content: str + + +@dataclass +class ModelConfig: + model_name: str + input_cost_per_1k: float + output_cost_per_1k: float + provider: str + encoding_name: str = "cl100k_base" + + +class TokenCounter: + def __init__(self): + self.tokens_in: 
Dict[str, int] = {} + self.tokens_out: Dict[str, int] = {} + + def update_counts(self, model_name: str, input_tokens: int, output_tokens: int): + if model_name not in self.tokens_in: + self.tokens_in[model_name] = 0 + self.tokens_out[model_name] = 0 + self.tokens_in[model_name] += input_tokens + self.tokens_out[model_name] += output_tokens + + def calculate_cost(self, model_configs: Dict[str, ModelConfig]) -> float: + total_cost = 0.0 + for model_name, tokens in self.tokens_in.items(): + if model_name in model_configs: + config = model_configs[model_name] + input_cost = (tokens * config.input_cost_per_1k) / 1_000_000 + output_cost = (self.tokens_out[model_name] * config.output_cost_per_1k) / 1_000_000 + total_cost += input_cost + output_cost + return total_cost + + +class LLMStrategy(ABC): + def __init__(self, config: ModelConfig): + self.config = config + self.encoding = tiktoken.get_encoding(config.encoding_name) + + @abstractmethod + def query(self, prompt: str, system_prompt: str, api_key: Optional[str] = None, temperature: Optional[float] = None) -> str: + pass + + def count_tokens(self, text: str) -> int: + return len(self.encoding.encode(text)) + + +class OpenAIStrategy(LLMStrategy): + def __init__(self, config: ModelConfig, default_api_key: Optional[str] = None): + super().__init__(config) + self.default_api_key = default_api_key + + def query(self, prompt: str, system_prompt: str, api_key: Optional[str] = None, temperature: Optional[float] = None) -> str: + used_key = api_key or self.default_api_key + if not used_key: + raise ValueError("No API key provided for OpenAI API") + + client = OpenAI(api_key=used_key) + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt} + ] + + completion = client.chat.completions.create( + model=self.config.model_name, + messages=messages, + temperature=temperature, + ) + return completion.choices[0].message.content + + +class AnthropicStrategy(LLMStrategy): + def __init__(self, 
config: ModelConfig, default_api_key: Optional[str] = None): + super().__init__(config) + self.default_api_key = default_api_key + + def query(self, prompt: str, system_prompt: str, api_key: Optional[str] = None, temperature: Optional[float] = None) -> str: + used_key = api_key or self.default_api_key + if not used_key: + raise ValueError("No API key provided for Anthropic API") + + client = anthropic.Anthropic(api_key=used_key) + message = client.messages.create( + model=self.config.model_name, + system=system_prompt, + messages=[{"role": "user", "content": prompt}] + ) + return json.loads(message.to_json())["content"][0]["text"] + + +class DeepseekStrategy(LLMStrategy): + def __init__(self, config: ModelConfig, default_api_key: Optional[str] = None): + super().__init__(config) + self.default_api_key = default_api_key + + def query(self, prompt: str, system_prompt: str, api_key: Optional[str] = None, temperature: Optional[float] = None) -> str: + used_key = api_key or self.default_api_key + if not used_key: + raise ValueError("No API key provided for Deepseek API") + + client = OpenAI( + api_key=used_key, + base_url="https://api.deepseek.com/v1" + ) + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt} + ] + + completion = client.chat.completions.create( + model=self.config.model_name, + messages=messages, + temperature=temperature, + ) + return completion.choices[0].message.content \ No newline at end of file diff --git a/llm_client/query_manager.py b/llm_client/query_manager.py new file mode 100644 index 0000000..53d333d --- /dev/null +++ b/llm_client/query_manager.py @@ -0,0 +1,52 @@ +from typing import Optional, Dict +from .models import TokenCounter +from .factory import LLMStrategyFactory +import time + +class LLMQueryManager: + def __init__( + self, + api_keys: Optional[Dict[str, str]] = None, + token_counter: Optional[TokenCounter] = None + ): + self.token_counter = token_counter or TokenCounter() + 
self.factory = LLMStrategyFactory(api_keys) + + def query_model( + self, + model_name: str, + prompt: str, + system_prompt: str, + api_key: Optional[str] = None, + max_retries: int = 5, + timeout: float = 5.0, + print_cost: bool = True, + temperature: Optional[float] = None, + ) -> str: + strategy = self.factory.create_strategy(model_name) + for attempt in range(max_retries): + try: + answer = strategy.query(prompt, system_prompt, api_key, temperature) + + input_tokens = strategy.count_tokens(system_prompt + prompt) + output_tokens = strategy.count_tokens(answer) + self.token_counter.update_counts(model_name, input_tokens, output_tokens) + if print_cost: + cost = self.token_counter.calculate_cost( + {model_name: strategy.config} + ) + print(f"Current experiment cost = ${cost:.6f}, ** Approximate values, may not reflect true cost") + + return answer + except ValueError as e: + if 'No API key provided for' in str(e): + raise ValueError("No API key provided. Please provide an API key.") + raise e + + except Exception as e: + if attempt == max_retries - 1: + raise Exception(f"Max retries reached: {str(e)}") + print(f"Attempt {attempt + 1} failed: {str(e)}") + time.sleep(timeout) + + raise Exception("Max retries: timeout") \ No newline at end of file diff --git a/llm_client/tests/__init__.py b/llm_client/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/llm_client/tests/test_models.py b/llm_client/tests/test_models.py new file mode 100644 index 0000000..5d159f4 --- /dev/null +++ b/llm_client/tests/test_models.py @@ -0,0 +1,139 @@ +import pytest +from unittest.mock import Mock, patch +from ..models import TokenCounter, ModelConfig, OpenAIStrategy, AnthropicStrategy +from ..factory import LLMStrategyFactory +from ..query_manager import LLMQueryManager + + +def test_token_counter(): + counter = TokenCounter() + counter.update_counts("gpt-4o", 100, 50) + counter.update_counts("gpt-4o", 150, 75) + + assert counter.tokens_in["gpt-4o"] == 250 + assert 
counter.tokens_out["gpt-4o"] == 125 + + model_configs = { + "gpt-4o": ModelConfig("gpt-4o", 2.50, 10.00, "openai") + } + + expected_cost = (250 * 2.50 + 125 * 10.00) / 1_000_000 + assert counter.calculate_cost(model_configs) == expected_cost + + +def test_openai_strategy_no_api_key(): + config = ModelConfig("gpt-4o", 2.50, 10.00, "openai") + strategy = OpenAIStrategy(config) + + with pytest.raises(ValueError, match="No API key provided for OpenAI API"): + strategy.query("test prompt", "test system prompt") + + +def test_openai_strategy_default_key(): + config = ModelConfig("gpt-4o", 2.50, 10.00, "openai") + mock_client = Mock() + mock_completion = Mock() + mock_completion.choices = [Mock(message=Mock(content="test response"))] + mock_client.chat.completions.create.return_value = mock_completion + + with patch('llm_client.models.OpenAI', return_value=mock_client) as mock_openai: + strategy = OpenAIStrategy(config, "default-key") + response = strategy.query("test prompt", "test system prompt") + + assert response == "test response" + mock_openai.assert_called_once_with(api_key="default-key") + + +def test_openai_strategy_override_key(): + config = ModelConfig("gpt-4o", 2.50, 10.00, "openai") + mock_client = Mock() + mock_completion = Mock() + mock_completion.choices = [Mock(message=Mock(content="test response"))] + mock_client.chat.completions.create.return_value = mock_completion + + with patch('llm_client.models.OpenAI', return_value=mock_client) as mock_openai: + strategy = OpenAIStrategy(config, "default-key") + response = strategy.query("test prompt", "test system prompt", "override-key") + + assert response == "test response" + mock_openai.assert_called_once_with(api_key="override-key") + + +def test_anthropic_strategy_no_api_key(): + config = ModelConfig("claude-3-5-sonnet", 3.00, 12.00, "anthropic") + strategy = AnthropicStrategy(config) + + with pytest.raises(ValueError, match="No API key provided for Anthropic API"): + strategy.query("test prompt", "test 
system prompt") + + +def test_strategy_factory_with_api_keys(): + api_keys = { + "openai": "test-openai-key", + "anthropic": "test-anthropic-key", + "deepseek": "test-deepseek-key" + } + factory = LLMStrategyFactory(api_keys) + + # Test OpenAI strategy creation with default key + strategy = factory.create_strategy("gpt-4o") + assert isinstance(strategy, OpenAIStrategy) + assert strategy.default_api_key == "test-openai-key" + + # Test Anthropic strategy creation with default key + strategy = factory.create_strategy("claude-3-5-sonnet") + assert isinstance(strategy, AnthropicStrategy) + assert strategy.default_api_key == "test-anthropic-key" + + +@patch('llm_client.factory.LLMStrategyFactory.create_strategy') +def test_query_manager_integration(mock_create_strategy): + api_keys = {"openai": "test-openai-key"} + mock_strategy = Mock() + mock_strategy.query.return_value = "test response" + mock_strategy.count_tokens.return_value = 10 + mock_strategy.config = ModelConfig("test-model", 1.0, 1.0, "openai") + mock_create_strategy.return_value = mock_strategy + + manager = LLMQueryManager(api_keys) + + # Test with default key + response = manager.query_model( + "gpt-4o", + "test prompt", + "test system prompt", + print_cost=False + ) + mock_strategy.query.assert_called_with( + "test prompt", + "test system prompt", + None, # Using default key + None, + ) + + # Test with override key + response = manager.query_model( + "gpt-4o", + "test prompt", + "test system prompt", + api_key="override-key", + print_cost=False + ) + mock_strategy.query.assert_called_with( + "test prompt", + "test system prompt", + "override-key", + None, + ) + + +def test_query_manager_no_api_key(): + manager = LLMQueryManager() # No API keys provided + + with pytest.raises(ValueError, match="No API key provided. 
Please provide an API key."): + manager.query_model( + "gpt-4o", + "test prompt", + "test system prompt", + print_cost=False + ) \ No newline at end of file diff --git a/mlesolver.py b/mlesolver.py index cfc4896..6ffde56 100755 --- a/mlesolver.py +++ b/mlesolver.py @@ -12,6 +12,10 @@ from contextlib import contextmanager import sys, os +from llm_client import LLMQueryManager + +llm_query_manager = LLMQueryManager() + @contextmanager def suppress_stdout(): @@ -157,14 +161,15 @@ def get_score(outlined_plan, code, code_return, REWARD_MODEL_LLM, attempts=3, op f"You are a professor agent who is serving as an expert reward model that can read a research plan, research code, and code output and are able to determine how well a model followed the plan, built the code, and got the proper output scored from 0 to 1 as a float.\n\n" f"You must structure your score exactly in the following way: ```SCORE\n\n``` where SCORE is just the word score, is a floating point number between 0 and 1 representing how well the model followed the plan, built the code, and got the proper output." 
) - scoring = query_model( - model_str=f"{REWARD_MODEL_LLM}", + scoring = llm_query_manager.query_model( + model_name=f"{REWARD_MODEL_LLM}", system_prompt=sys, - openai_api_key=openai_api_key, + api_key=openai_api_key, prompt=( f"Outlined in the following text is the research plan that the machine learning engineer was tasked with building: {outlined_plan}\n\n" f"The following text is the research code that the model produced: \n{code}\n\n" - f"The following is the output from the model: {code_return}\n\n"), temp=0.6) + f"The following is the output from the model: {code_return}\n\n"), + temperature=0.6) performance = extract_prompt(text=scoring, word="SCORE") performance = float(performance) return performance, f"The performance of your submission is: {performance}", True @@ -182,11 +187,12 @@ def code_repair(code, error, ctype, REPAIR_LLM, openai_api_key=None): "You must wrap the code in the following ```python\n\n```\n" "Do not forget the opening ```python and the closing ```." ) - model_resp = query_model( - openai_api_key=openai_api_key, - model_str=f"{REPAIR_LLM}", + model_resp = llm_query_manager.query_model( + api_key=openai_api_key, + model_name=f"{REPAIR_LLM}", system_prompt=repair_sys, - prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", temp=0.8) + prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", + temperature=0.8) return extract_prompt(model_resp, "python") elif ctype == "edit": repair_sys = ( @@ -202,11 +208,12 @@ def code_repair(code, error, ctype, REPAIR_LLM, openai_api_key=None): "Do not forget the opening ```EDIT N M and the closing ```." 
"Your output should look like the following\n\n```EDIT N M\n\n```" ) - model_resp = query_model( - openai_api_key=openai_api_key, - model_str=f"{REPAIR_LLM}", + model_resp = llm_query_manager.query_model( + api_key=openai_api_key, + model_name=f"{REPAIR_LLM}", system_prompt=repair_sys, - prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", temp=0.2) + prompt=f"Provided here is the error: {error}\n\nProvided below is the code:\n\n{code}", + temperature=0.2) return model_resp @@ -269,11 +276,12 @@ def gen_initial_code(self): if len(error_hist) == 5: _ = error_hist.pop(0) err = "\n".join(error_hist) err_hist = "The following is a history of your previous errors\n" + err + "\nDO NOT REPEAT THESE." - model_resp = query_model( - openai_api_key=self.openai_api_key, - model_str=self.model, + model_resp = llm_query_manager.query_model( + api_key=self.openai_api_key, + model_name=self.model, system_prompt=self.system_prompt(), - prompt=f"{err_hist}\nYou should now use ```REPLACE to create initial code to solve the challenge. Now please enter the ```REPLACE command below:\n ", temp=1.0) + prompt=f"{err_hist}\nYou should now use ```REPLACE to create initial code to solve the challenge. Now please enter the ```REPLACE command below:\n ", + temperature=1.0) model_resp = self.clean_text(model_resp) cmd_str, code_lines, prev_code_ret, should_execute_code, score = self.process_command(model_resp) print(f"@@@ INIT ATTEMPT: Command Exec // Attempt {num_attempts}: ", str(cmd_str).replace("\n", " | ")) @@ -291,11 +299,12 @@ def solve(self): while True: if len(self.commands) == 2: cmd_app_str = "You must output either the ```EDIT or ```REPLACE command immediately. 
" else: cmd_app_str = "" - model_resp = query_model( - openai_api_key=self.openai_api_key, - model_str=self.model, + model_resp = llm_query_manager.query_model( + api_key=self.openai_api_key, + model_name=self.model, system_prompt=self.system_prompt(), - prompt=f"The following is your history:{self.history_str()}\n\n{cmd_app_str}Now please enter a command: ", temp=1.0) + prompt=f"The following is your history:{self.history_str()}\n\n{cmd_app_str}Now please enter a command: ", + temperature=1.0) model_resp = self.clean_text(model_resp) self.code_lines = copy(random.choice(self.best_codes)[0]) cmd_str, code_lines, prev_code_ret, should_execute_code, score = self.process_command(model_resp) @@ -332,7 +341,11 @@ def reflect_code(self): code_strs = ("$"*40 + "\n\n").join([self.generate_code_lines(_code[0]) + f"\nCode Return {_code[1]}" for _code in self.best_codes]) code_strs = f"Please reflect on the following sets of code: {code_strs} and come up with generalizable insights that will help you improve your performance on this benchmark." syst = self.system_prompt(commands=False) + code_strs - return query_model(prompt="Please reflect on ideas for how to improve your current code. Examine the provided code and think very specifically (with precise ideas) on how to improve performance, which methods to use, how to improve generalization on the test set with line-by-line examples below:\n", system_prompt=syst, model_str=f"{self.llm_str}", openai_api_key=self.openai_api_key) + return llm_query_manager.query_model( + prompt="Please reflect on ideas for how to improve your current code. 
Examine the provided code and think very specifically (with precise ideas) on how to improve performance, which methods to use, how to improve generalization on the test set with line-by-line examples below:\n", + system_prompt=syst, + model_name=f"{self.llm_str}", + api_key=self.openai_api_key) def process_command(self, model_resp): """ @@ -507,7 +520,11 @@ def reflection(self, reflect_prompt, code_str, code_return): @param code_str: (str) code string @return: (str) reflection string """ - refl = query_model(prompt=reflect_prompt, system_prompt=self.system_prompt(commands=False), model_str=f"{self.llm_str}", openai_api_key=self.openai_api_key) + refl = llm_query_manager.query_model( + prompt=reflect_prompt, + system_prompt=self.system_prompt(commands=False), + model_name=f"{self.llm_str}", + api_key=self.openai_api_key) return f"During the previous execution, the following code was run: \n\n{code_str}\n\nThis code returned the following: \n{code_return}\nThe following is your reflection from this feedback {refl}\n" def generate_dataset_descr_prompt(self): diff --git a/papersolver.py b/papersolver.py index 39222f0..c5a2252 100755 --- a/papersolver.py +++ b/papersolver.py @@ -12,6 +12,9 @@ from contextlib import contextmanager import sys, os +from llm_client import LLMQueryManager + +llm_query_manager = LLMQueryManager() @contextmanager def suppress_stdout(): @@ -279,13 +282,12 @@ def solve(self): self.prev_paper_ret = None while True: self.paper_lines = copy(random.choice(self.best_report)[0]) - model_resp = query_model( - model_str=self.model, + model_resp = llm_query_manager.query_model( + model_name=self.model, system_prompt=self.system_prompt(), prompt=f"\nNow please enter a command: ", - temp=1.0, - openai_api_key=self.openai_api_key) - #print(model_resp) + temperature=1.0, + api_key=self.openai_api_key) model_resp = self.clean_text(model_resp) cmd_str, paper_lines, prev_paper_ret, score = self.process_command(model_resp) if score is not None: @@ -351,7 
+353,11 @@ def gen_initial_report(self): break if not first_attempt: att_str = "This is not your first attempt please try to come up with a simpler search query." - search_query = query_model(model_str=f"{self.llm_str}", prompt=f"Given the following research topic {self.topic} and research plan: \n\n{self.plan}\n\nPlease come up with a search query to find relevant papers on arXiv. Respond only with the search query and nothing else. This should be a a string that will be used to find papers with semantically similar content. {att_str}", system_prompt=f"You are a research paper finder. You must find papers for the section {_section}. Query must be text nothing else.", openai_api_key=self.openai_api_key) + search_query = llm_query_manager.query_model( + model_name=f"{self.llm_str}", + prompt=f"Given the following research topic {self.topic} and research plan: \n\n{self.plan}\n\nPlease come up with a search query to find relevant papers on arXiv. Respond only with the search query and nothing else. This should be a a string that will be used to find papers with semantically similar content. {att_str}", + system_prompt=f"You are a research paper finder. You must find papers for the section {_section}. Query must be text nothing else.", + api_key=self.openai_api_key) search_query.replace('"', '') papers = arx.find_papers_by_str(query=search_query, N=10) first_attempt = False @@ -369,12 +375,12 @@ def gen_initial_report(self): if _section in self.section_related_work: rp = f"Here are related papers you can cite: {self.section_related_work[_section]}. You can cite them just by putting the arxiv ID in parentheses, e.g. (arXiv 2308.11483v1)\n" prompt = f"{err}\n{rp}\nNow please enter the ```REPLACE command to create the designated section, make sure to only write the text for that section and nothing else. 
Do not include packages or section titles, just the section content:\n " - model_resp = query_model( - model_str=self.model, + model_resp = llm_query_manager.query_model( + model_name=self.model, system_prompt=self.system_prompt(section=_section), prompt=f"{prompt}", - temp=0.8, - openai_api_key=self.openai_api_key) + temperature=0.8, + api_key=self.openai_api_key) model_resp = self.clean_text(model_resp) if _section == "scaffold": # minimal scaffold (some other sections can be combined) @@ -437,7 +443,7 @@ def process_command(self, model_resp, scoring=True): else: paper_lines = copy(args[1]) # if scoring: - score, cmd_str, is_valid = get_score(self.plan, "\n".join(paper_lines), reward_model_llm=self.llm_str) + score, cmd_str, is_valid = get_score(self.plan, "\n".join(paper_lines), reward_model_llm=self.llm_str, openai_api_key=self.openai_api_key) else: score, cmd_str, is_valid = 0.0, "Paper scored successfully", True if is_valid: failed = False @@ -459,7 +465,7 @@ def process_command(self, model_resp, scoring=True): if success: paper_lines = copy(args[0]) # if scoring: - score, cmd_str, is_valid = get_score(self.plan, "\n".join(paper_lines), reward_model_llm=self.llm_str) + score, cmd_str, is_valid = get_score(self.plan, "\n".join(paper_lines), reward_model_llm=self.llm_str, openai_api_key=self.openai_api_key) else: score, cmd_str, is_valid = 0.0, "Paper scored successfully", True if is_valid: failed = False diff --git a/requirements.txt b/requirements.txt index e08992b..25aceb5 100755 --- a/requirements.txt +++ b/requirements.txt @@ -86,6 +86,7 @@ pydantic_core==2.27.1 Pygments==2.18.0 pyparsing==3.2.0 pypdf==5.1.0 +pytest==8.3.4 python-dateutil==2.9.0.post0 pytz==2024.2 PyYAML==6.0.2