
Commit c6c45e3

Commit message: removed duplicate code
1 parent 5d721b3 commit c6c45e3

File tree

4 files changed (+31, -138 lines)


README.md

Lines changed: 13 additions & 12 deletions
````diff
@@ -16,11 +16,12 @@ $ make setup
 $ source ./venv/bin/activate
 $ pip install -r requirements.txt
 ```
-You can then run the analysis on OpenAI or Anthropic models by running `main.py` with the command line arguments shown below. `LLMNeedleHaystackTester` parameters can also be passed as command line arguments, except `model_to_test` and `evaluator` of course.
+You can then run the analysis on OpenAI or Anthropic models by running `main.py` with the command line arguments shown below. `LLMNeedleHaystackTester` and `LLMMultiNeedleHaystackTester` parameters can also be passed as command line arguments, except `model_to_test` and `evaluator` of course.
 * `provider` - The provider of the model, available options are `openai` and `anthropic`. Defaults to `openai`
 * `evaluator` - The evaluator, which can either be a `model` or `LangSmith`. See more on `LangSmith` below. If using a `model`, only `openai` is currently supported. Defaults to `openai`.
 * `api_key` - API key for either OpenAI or Anthropic provider. Can either be passed as a command line argument or an environment variable named `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` depending on the provider. Defaults to `None`.
 * `evaluator_api_key` - API key for OpenAI provider. Can either be passed as a command line argument or an environment variable named `OPENAI_API_KEY`. Defaults to `None`
+* `multi_needle` - Whether to run multi-needle tester or not. Default to `False`
 
 ## The Test
 1. Place a random fact or statement (the 'needle') in the middle of a long context window (the 'haystack')
@@ -57,8 +58,8 @@ I've put the results from the original tests in `/original_results`. I've upgrad
 * `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete
 
 `LLMMultiNeedleHaystackTester` parameters:
-* `multi_needle` - True or False, whether to run multi-needle
 * `needles` - List of needles to insert in the context
+* `eval_set` - The evaluation set identifier.
 
 Other Parameters:
 * `api_key` - API key for either OpenAI or Anthropic provider. Can either be passed when creating the object or an environment variable
@@ -107,16 +108,16 @@ Needle 10: 40 + 9 * 6 = 94
 
 You can use LangSmith to orchestrate evals and store results.
 
-(1) Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
-(2) Set env variables for LangSmith as specified in the setup.
-(3) In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` to start.
-(4) Populate the dataset with a test question:
-```
-question: What are the 5 best things to do in San Franscisco?
-answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
-```
-![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
-(5) Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
+1. Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
+2. Set env variables for LangSmith as specified in the setup.
+3. In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` and set dataset type to `Key-Value`.
+4. Populate the dataset with a test question:
+```
+question: What are the 5 best things to do in San Franscisco?
+answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
+```
+![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
+5. Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
 
 Let's see all these working together on a new dataset, `multi-needle-eval-pizza`.
 
````
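The README flags above map onto tester keyword arguments. A minimal, hypothetical sketch of the parsing side, using only flag names that appear in the README (the real `main.py` may wire this differently):

```python
# Hypothetical CLI sketch: flag names taken from the README parameter list.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--provider", default="openai")
parser.add_argument("--evaluator", default="openai")
parser.add_argument("--multi_needle", action="store_true")  # defaults to False
parser.add_argument("--eval_set", default="multi-needle-eval-sf")

# Simulate: python main.py --multi_needle --eval_set multi-needle-eval-pizza
args = parser.parse_args(["--multi_needle", "--eval_set", "multi-needle-eval-pizza"])
print(args.multi_needle, args.eval_set)
```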

main.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -155,10 +155,8 @@ def main():
     args.evaluator = get_evaluator(args)
 
     if args.multi_needle == True:
-        print("Testing multi-needle")
         tester = LLMMultiNeedleHaystackTester(**args.__dict__)
     else:
-        print("Testing single-needle")
         tester = LLMNeedleHaystackTester(**args.__dict__)
     tester.start_test()
 
```
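The dispatch above picks a tester class and unpacks the parsed namespace into its constructor with `**args.__dict__`. A self-contained sketch of that pattern with stand-in classes (names here are illustrative, not the repository's):

```python
# Stand-in classes to illustrate class selection plus **namespace unpacking.
from types import SimpleNamespace

class SingleTester:
    def __init__(self, multi_needle=False, **kwargs):
        self.kind = "single"

class MultiTester:
    def __init__(self, multi_needle=False, **kwargs):
        self.kind = "multi"

args = SimpleNamespace(multi_needle=True, eval_set="multi-needle-eval-sf")
cls = MultiTester if args.multi_needle else SingleTester
tester = cls(**args.__dict__)  # every CLI argument becomes a keyword argument
print(tester.kind)  # prints "multi"
```

Note that each constructor must tolerate the full set of flags (hence `**kwargs`), since the whole namespace is forwarded regardless of which tester is chosen.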

src/evaluators/langsmith_evaluator.py

Lines changed: 6 additions & 7 deletions
```diff
@@ -1,4 +1,4 @@
-from typing import Union
+import os
 import uuid
 
 from langchain_openai import ChatOpenAI
@@ -12,7 +12,7 @@
 from langsmith.schemas import Example, Run
 
 @run_evaluator
-def score_relevance(run: Run, example: Union[Example, None] = None):
+def score_relevance(run: Run, example: Example | None = None):
     """
     A custom evaluator function that grades the language model's response based on its relevance
     to a reference answer.
@@ -24,10 +24,6 @@ def score_relevance(run: Run, example: Union[Example, None] = None):
     Returns:
         EvaluationResult: The result of the evaluation, containing the relevance score.
     """
-
-    print("--LANGSMITH EVAL--")
-    #print("--MODEL: ", model_name)
-    #print("--EVAL SET: ", eval_set)
     student_answer = run.outputs["output"]
     reference = example.outputs["answer"]
 
@@ -90,7 +86,10 @@ def __init__(self, api_key: str = None):
         Args:
             api_key (str, optional): The API key for authenticating evaluator model.
         """
-        self.api_key = api_key
+        if (api_key is None) and (not os.getenv('LANGCHAIN_API_KEY')):
+            raise ValueError("Either api_key must be supplied with init, or LANGCHAIN_API_KEY must be in env. Used for evaluation model")
+
+        self.api_key = api_key or os.getenv('LANGCHAIN_API_KEY')
 
     def evaluate_chain(self, chain, context_length, depth_percent, model_name, eval_set):
         """
```
Lines changed: 12 additions & 117 deletions
```diff
@@ -1,18 +1,7 @@
-import asyncio
-import glob
-import json
-import os
-import time
-from asyncio import Semaphore
-from datetime import datetime, timezone
-
-import numpy as np
-
 from .evaluators import Evaluator
 from .llm_needle_haystack_tester import LLMNeedleHaystackTester
 from .providers import ModelProvider
 
-
 class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
     """
     Extends LLMNeedleHaystackTester to support testing with multiple needles in the haystack.
@@ -24,21 +13,17 @@ class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
         print_ongoing_status (bool): Flag to print ongoing status messages.
         eval_set (str): The evaluation set identifier.
     """
-    def __init__(self, *args,
-                 needles=[],
+    def __init__(self,
                  model_to_test: ModelProvider = None,
-                 evaluator: Evaluator = None,
-                 print_ongoing_status = True,
+                 evaluator: Evaluator = None,
+                 needles=[],
                  eval_set = "multi-needle-eval-sf",
+                 *args,
                  **kwargs):
 
-        super().__init__(*args, model_to_test=model_to_test, **kwargs)
+        super().__init__(model_to_test, evaluator, *args, **kwargs)
         self.needles = needles
-        self.evaluator = evaluator
-        self.model_to_test = model_to_test
         self.eval_set = eval_set
-        self.model_name = self.model_to_test.model_name
-        self.print_ongoing_status = print_ongoing_status
 
     async def insert_needles(self, context, depth_percent, context_length):
         """
@@ -84,9 +69,6 @@ async def insert_needles(self, context, depth_percent, context_length):
             # For simplicity, evenly distribute needles throughout the context
             insertion_point = int(len(tokens_context) * (depth_percent / 100))
             tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
-            # Log
-            insertion_percentage = (insertion_point / len(tokens_context)) * 100
-            print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")
             # Adjust depth for next needle
             depth_percent += depth_percent_interval
 
@@ -104,10 +86,7 @@ def encode_and_trim(self, context, context_length):
         Returns:
             str: The encoded and trimmed context.
         """
-        tokens = self.model_to_test.encode_text_to_tokens(context)
-        if len(tokens) > context_length:
-            context = self.model_to_test.decode_tokens(tokens, context_length)
-        return context
+        return super().encode_and_trim(context, context_length)
 
     async def generate_context(self, context_length, depth_percent):
         """
@@ -140,103 +119,19 @@ async def evaluate_and_log(self, context_length, depth_percent):
         # Go generate the required length context and place your needle statement in
         context = await self.generate_context(context_length, depth_percent)
 
-        test_start_time = time.time()
-
         # LangSmith
         ## TODO: Support for many evaluators
-        if self.evaluator.__class__.__name__ == "LangSmithEvaluator":
-            print("EVALUATOR: LANGSMITH")
+        if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
             chain = self.model_to_test.get_langchain_runnable(context)
-            self.evaluator.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set)
-            test_end_time = time.time()
-            test_elapsed_time = test_end_time - test_start_time
-
+            self.evaluation_model.evaluate_chain(chain, context_length, depth_percent, self.model_name, self.eval_set)
         else:
-            print("EVALUATOR: OpenAI Model")
-            # Prepare your message to send to the model you're going to evaluate
-            prompt = self.model_to_test.generate_prompt(context, self.retrieval_question)
-            # Go see if the model can answer the question to pull out your random fact
-            response = await self.model_to_test.evaluate_model(prompt)
-            # Compare the reponse to the actual needle you placed
-            score = self.evaluation_model.evaluate_response(response)
-
-            test_end_time = time.time()
-            test_elapsed_time = test_end_time - test_start_time
-
-            results = {
-                # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
-                'model' : self.model_to_test.model_name,
-                'context_length' : int(context_length),
-                'depth_percent' : float(depth_percent),
-                'version' : self.results_version,
-                'needle' : self.needle,
-                'model_response' : response,
-                'score' : score,
-                'test_duration_seconds' : test_elapsed_time,
-                'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
-            }
-
-            self.testing_results.append(results)
-
-            if self.print_ongoing_status:
-                print (f"-- Test Summary -- ")
-                print (f"Duration: {test_elapsed_time:.1f} seconds")
-                print (f"Context: {context_length} tokens")
-                print (f"Depth: {depth_percent}%")
-                print (f"Score: {score}")
-                print (f"Response: {response}\n")
-
-            context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
-
-            if self.save_contexts:
-                results['file_name'] = context_file_location
-
-                # Save the context to file for retesting
-                if not os.path.exists('contexts'):
-                    os.makedirs('contexts')
-
-                with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
-                    f.write(context)
-
-            if self.save_results:
-                # Save the context to file for retesting
-                if not os.path.exists('results'):
-                    os.makedirs('results')
-
-                # Save the result to file for retesting
-                with open(f'results/{context_file_location}_results.json', 'w') as f:
-                    json.dump(results, f)
-
-            if self.seconds_to_sleep_between_completions:
-                await asyncio.sleep(self.seconds_to_sleep_between_completions)
-
-    async def bound_evaluate_and_log(self, sem, *args):
-        async with sem:
-            await self.evaluate_and_log(*args)
-
-    async def run_test(self):
-        sem = Semaphore(self.num_concurrent_requests)
-
-        # Run through each iteration of context_lengths and depths
-        tasks = []
-        for context_length in self.context_lengths:
-            for depth_percent in self.document_depth_percents:
-                task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
-                tasks.append(task)
-
-        # Wait for all tasks to complete
-        await asyncio.gather(*tasks)
+            await super().evaluate_and_log(context, context_length, depth_percent)
 
     def print_start_test_summary(self):
         print ("\n")
-        print ("Starting Needle In A Haystack Testing...")
+        print ("Starting Needles In A Haystack Testing...")
        print (f"- Model: {self.model_name}")
         print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
         print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
-        print (f"- Needle: {self.needle.strip()}")
-        print ("\n\n")
-
-    def start_test(self):
-        if self.print_ongoing_status:
-            self.print_start_test_summary()
-        asyncio.run(self.run_test())
+        print (f"- Needles: {[needle.strip() for needle in self.needles]}")
+        print ("\n\n")
```
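The bulk of this commit deletes subclass logic that duplicated the parent class (`run_test`, `start_test`, `encode_and_trim`, and most of `evaluate_and_log`) and delegates to `super()` instead. A minimal sketch of that deduplication pattern, with stand-in classes rather than the repository's real ones:

```python
# Stand-in classes illustrating "delete the copy, delegate to the parent".
class BaseTester:
    def encode_and_trim(self, context, limit):
        # Shared trimming logic lives once, in the parent.
        return context[:limit]

class MultiTester(BaseTester):
    # Before the commit: a duplicated re-implementation lived here.
    # After: the override simply delegates to the inherited behavior.
    def encode_and_trim(self, context, limit):
        return super().encode_and_trim(context, limit)

result = MultiTester().encode_and_trim("abcdef", 3)  # returns "abc"
```

An override whose body is only a `super()` call can usually be deleted outright, since the method is inherited anyway; keeping it is mainly useful as a hook for future subclass-specific behavior.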
