Skip to content

Commit 6fc5e77

Browse files
authored
Merge pull request #76 from stratosphereips/harpo-saving-prompts
add functionality for storing prompts and responses in CSV
2 parents df10cf7 + 1d8dbd0 commit 6fc5e77

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

agents/attackers/llm_qa/llm_action_planner.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,27 @@ def __init__(self, model_name: str, goal: str, memory_len: int = 10, api_url=Non
7474
self.memory_len = memory_len
7575
self.logger = logging.getLogger("REACT-agent")
7676
self.update_instructions(goal.lower())
77-
77+
self.prompts = []
78+
self.states = []
79+
self.responses = []
80+
81+
def get_prompts(self) -> list:
    """Return every prompt that has been sent to the LLM.

    Returns:
        list: the recorded prompt message lists, in the order they were sent.
    """
    recorded_prompts = self.prompts
    return recorded_prompts
85+
86+
def get_responses(self) -> list:
    """Return the responses received from the LLM.

    Only Stage 2 responses are recorded, so only those are returned.

    Returns:
        list: the stored LLM responses, in arrival order.
    """
    recorded_responses = self.responses
    return recorded_responses
91+
92+
def get_states(self) -> list:
    """Return the observation states recorded for each LLM query.

    Each entry is a state serialized to JSON format.

    Returns:
        list: the stored JSON state strings, in recording order.
    """
    recorded_states = self.states
    return recorded_states
97+
7898
def update_instructions(self, new_goal: str) -> None:
7999
template = jinja2.Environment().from_string(self.config['prompts']['INSTRUCTIONS_TEMPLATE'])
80100
self.instructions = template.render(goal=new_goal)
@@ -141,6 +161,8 @@ def parse_response(self, llm_response: str, state: Observation.state):
141161

142162

143163
def get_action_from_obs_react(self, observation: Observation, memory_buf: list) -> tuple:
164+
self.states.append(observation.state.as_json())
165+
144166
status_prompt = create_status_from_state(observation.state)
145167
Q1 = self.config['questions'][0]['text']
146168
Q4 = self.config['questions'][3]['text']
@@ -168,8 +190,10 @@ def get_action_from_obs_react(self, observation: Observation, memory_buf: list)
168190
{"role": "user", "content": memory_prompt},
169191
{"role": "user", "content": Q4},
170192
]
171-
193+
self.prompts.append(messages)
194+
172195
response = self.openai_query(messages, max_tokens=80, fmt={"type": "json_object"})
196+
self.responses.append(response)
173197
self.logger.info(f"(Stage 2) Response from LLM: {response}")
174198
print(f"(Stage 2) Response from LLM: {response}")
175199
return self.parse_response(response, observation.state)

agents/attackers/llm_qa/llm_agent_qa.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,11 @@
113113
num_actions_repeated = []
114114
reward_memory = ""
115115

116-
states = []
117-
prompts = []
118-
responses = []
119-
evaluations = []
116+
117+
# Create an empty DataFrame for storing prompts and responses, and evaluations
118+
prompt_table = pd.DataFrame(columns=["state", "prompt", "response", "evaluation"])
119+
120+
120121
# We are still not using this, but we keep track
121122
is_detected = False
122123

@@ -126,7 +127,7 @@
126127
print("Done")
127128
for episode in range(1, args.test_episodes + 1):
128129
actions_took_in_episode = []
129-
130+
evaluations = [] # used for prompt table storage.
130131
logger.info(f"Running episode {episode}")
131132
print(f"Running episode {episode}")
132133

@@ -151,9 +152,7 @@
151152
for i in range(num_iterations):
152153
good_action = False
153154
#is_json_ok = True
154-
states.append(observation.state.as_json())
155155
is_valid, response_dict, action = llm_query.get_action_from_obs_react(observation, memories)
156-
157156
if is_valid:
158157
observation = agent.make_step(action)
159158
logger.info(f"Observation received: {observation}")
@@ -282,7 +281,16 @@
282281
)
283282
break
284283

285-
284+
episode_prompt_table = {
285+
"state": llm_query.get_states(),
286+
"prompt": llm_query.get_prompts(),
287+
"response": llm_query.get_responses(),
288+
"evaluation": evaluations,
289+
}
290+
episode_prompt_table = pd.DataFrame(episode_prompt_table)
291+
prompt_table = pd.concat([prompt_table,episode_prompt_table],axis=0,ignore_index=True)
292+
293+
prompt_table.to_csv("states_prompts_responses_new.csv", index=False)
286294

287295
# After all episodes are done. Compute statistics
288296
test_win_rate = (wins / (args.test_episodes)) * 100

0 commit comments

Comments
 (0)