27 changes: 12 additions & 15 deletions README.md
@@ -107,9 +107,8 @@ The package `needlehaystack` is available for import in your test cases. Develop
- `print_ongoing_status` - Default: True, whether to print the status of tests as they complete

`LLMMultiNeedleHaystackTester` parameters (a usage sketch follows the list):

- `multi_needle` - True or False, whether to run multi-needle
- `needles` - List of needles to insert in the context
* `needles` - List of needles to insert in the context
* `eval_set` - The evaluation set identifier.
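
As a rough illustration of where these parameters plug in, the sketch below constructs the multi-needle tester directly from Python. Treat it as a minimal sketch, not documented API: the provider and evaluator class names, model names, and keyword arguments are assumptions about the package's `providers` and `evaluators` modules, so adjust them to whatever your installation exposes.

```python
# Hypothetical sketch: class names, model names, and keyword arguments are assumptions.
from needlehaystack.llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester
from needlehaystack.providers import OpenAI            # assumed provider class
from needlehaystack.evaluators import OpenAIEvaluator  # assumed evaluator class

question = "What secret ingredients were hidden in the context?"
needles = [
    "The first secret ingredient is figs.",
    "The second secret ingredient is prosciutto.",
]

tester = LLMMultiNeedleHaystackTester(
    needles=needles,                      # list of facts to insert into the haystack
    eval_set="multi-needle-eval-sf",      # LangSmith dataset identifier (see below)
    model_to_test=OpenAI(model_name="gpt-3.5-turbo-0125"),
    evaluator=OpenAIEvaluator(model_name="gpt-3.5-turbo-0125",
                              question_asked=question,
                              true_answer=" ".join(needles)),
    retrieval_question=question,
)
tester.start_test()
```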

Other Parameters:

@@ -162,18 +161,16 @@ Needle 10: 40 + 9 * 6 = 94

You can use LangSmith to orchestrate evals and store results.

(1) Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
(2) Set env variables for LangSmith as specified in the setup.
(3) In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` to start.
(4) Populate the dataset with a test question:

```
question: What are the 5 best things to do in San Francisco?
answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
```

![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
(5) Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
1. Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
2. Set the environment variables for LangSmith as specified in the setup.
3. In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset; name it `multi-needle-eval-sf` and set the dataset type to `Key-Value`.
4. Populate the dataset with a test question:
```
question: What are the 5 best things to do in San Francisco?
answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
```
![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
5. Run with `--evaluator langsmith` and `--eval_set multi-needle-eval-sf` to evaluate against the dataset you just created, as sketched below.
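
For reference, a rough programmatic equivalent of that command is sketched below; it is illustrative only. It assumes the LangSmith environment variables from step 2 are already set, and the provider class, model name, and `LangSmithEvaluator` constructor call are assumptions that may need adjusting to your installation.

```python
# Rough programmatic equivalent of `--evaluator langsmith --eval_set multi-needle-eval-sf`.
# Assumes the LangSmith env variables (e.g. LANGCHAIN_API_KEY) are already exported;
# the provider class and model name below are assumptions, adjust to your setup.
from needlehaystack.llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester
from needlehaystack.evaluators import LangSmithEvaluator
from needlehaystack.providers import OpenAI

tester = LLMMultiNeedleHaystackTester(
    needles=[
        "Go to Dolores Park.",
        "Eat at Tony's Pizza Napoletana.",
        "Visit Alcatraz.",
        "Hike up Twin Peaks.",
        "Bike across the Golden Gate Bridge.",
    ],
    eval_set="multi-needle-eval-sf",   # the LangSmith dataset created in step 3
    model_to_test=OpenAI(model_name="gpt-3.5-turbo-0125"),
    evaluator=LangSmithEvaluator(),
    retrieval_question="What are the 5 best things to do in San Francisco?",
)
tester.start_test()
```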

Let's see all these working together on a new dataset, `multi-needle-eval-pizza`.

8 changes: 2 additions & 6 deletions needlehaystack/evaluators/langsmith.py
@@ -1,4 +1,4 @@
from typing import Union
import os
import uuid

from langchain_openai import ChatOpenAI
@@ -12,7 +12,7 @@
from langsmith.schemas import Example, Run

@run_evaluator
def score_relevance(run: Run, example: Union[Example, None] = None):
def score_relevance(run: Run, example: Example | None = None):
"""
A custom evaluator function that grades the language model's response based on its relevance
to a reference answer.
@@ -24,10 +24,6 @@ def score_relevance(run: Run, example: Union[Example, None] = None):
Returns:
EvaluationResult: The result of the evaluation, containing the relevance score.
"""

print("--LANGSMITH EVAL--")
#print("--MODEL: ", model_name)
#print("--EVAL SET: ", eval_set)
student_answer = run.outputs["output"]
reference = example.outputs["answer"]

157 changes: 10 additions & 147 deletions needlehaystack/llm_multi_needle_haystack_tester.py
@@ -1,44 +1,22 @@
import asyncio
import glob
import json
import os
import time
from asyncio import Semaphore
from datetime import datetime, timezone

import numpy as np

from .evaluators import Evaluator
from .llm_needle_haystack_tester import LLMNeedleHaystackTester
from .providers import ModelProvider


class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
"""
Extends LLMNeedleHaystackTester to support testing with multiple needles in the haystack.

Attributes:
needles (list): A list of needles (facts) to insert into the haystack (context).
model_to_test (ModelProvider): The model being tested.
evaluator (Evaluator): The evaluator used to assess the model's performance.
print_ongoing_status (bool): Flag to print ongoing status messages.
eval_set (str): The evaluation set identifier.
"""
def __init__(self, *args,
needles=[],
model_to_test: ModelProvider = None,
evaluator: Evaluator = None,
print_ongoing_status = True,
def __init__(self,
needles=[],
eval_set = "multi-needle-eval-sf",
*args,
**kwargs):

super().__init__(*args, model_to_test=model_to_test, **kwargs)
super().__init__(*args, **kwargs)
self.needles = needles
self.evaluator = evaluator
self.model_to_test = model_to_test
self.eval_set = eval_set
self.model_name = self.model_to_test.model_name
self.print_ongoing_status = print_ongoing_status
self.insertion_percentages = []

async def insert_needles(self, context, depth_percent, context_length):
@@ -90,50 +68,19 @@ async def insert_needles(self, context, depth_percent, context_length):
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
tokens_context = tokens_context + tokens_needle
else:
# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))

# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]

# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')

# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]

# Insert the needle into the context at the found position
tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
tokens_context, insertion_point = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)

# Log
insertion_percentage = (insertion_point / len(tokens_context)) * 100
self.insertion_percentages.append(insertion_percentage)
print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")
# print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")

# Adjust depth for next needle
depth_percent += depth_percent_interval

new_context = self.model_to_test.decode_tokens(tokens_context)
return new_context

def encode_and_trim(self, context, context_length):
"""
Encodes the context to tokens and trims it to the specified length.

Args:
context (str): The context to encode and trim.
context_length (int): The desired length of the context in tokens.

Returns:
str: The encoded and trimmed context.
"""
tokens = self.model_to_test.encode_text_to_tokens(context)
if len(tokens) > context_length:
context = self.model_to_test.decode_tokens(tokens, context_length)
return context

async def generate_context(self, context_length, depth_percent):
"""
Generates a context of a specified length and inserts needles at given depth percentages.
@@ -165,103 +112,19 @@ async def evaluate_and_log(self, context_length, depth_percent):
# Go generate the required length context and place your needle statement in
context = await self.generate_context(context_length, depth_percent)

test_start_time = time.time()

# LangSmith
## TODO: Support for other evaluators
if self.evaluator.__class__.__name__ == "LangSmithEvaluator":
print("EVALUATOR: LANGSMITH")
if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
chain = self.model_to_test.get_langchain_runnable(context)
self.evaluator.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time

self.evaluation_model.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
else:
print("EVALUATOR: OpenAI Model")
# Prepare your message to send to the model you're going to evaluate
prompt = self.model_to_test.generate_prompt(context, self.retrieval_question)
# Go see if the model can answer the question to pull out your random fact
response = await self.model_to_test.evaluate_model(prompt)
# Compare the response to the actual needle you placed
score = self.evaluation_model.evaluate_response(response)

test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time

results = {
# 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
'model' : self.model_to_test.model_name,
'context_length' : int(context_length),
'depth_percent' : float(depth_percent),
'version' : self.results_version,
'needle' : self.needle,
'model_response' : response,
'score' : score,
'test_duration_seconds' : test_elapsed_time,
'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
}

self.testing_results.append(results)

if self.print_ongoing_status:
print (f"-- Test Summary -- ")
print (f"Duration: {test_elapsed_time:.1f} seconds")
print (f"Context: {context_length} tokens")
print (f"Depth: {depth_percent}%")
print (f"Score: {score}")
print (f"Response: {response}\n")

context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'

if self.save_contexts:
results['file_name'] = context_file_location

# Save the context to file for retesting
if not os.path.exists('contexts'):
os.makedirs('contexts')

with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
f.write(context)

if self.save_results:
# Save the context to file for retesting
if not os.path.exists('results'):
os.makedirs('results')

# Save the result to file for retesting
with open(f'results/{context_file_location}_results.json', 'w') as f:
json.dump(results, f)

if self.seconds_to_sleep_between_completions:
await asyncio.sleep(self.seconds_to_sleep_between_completions)

async def bound_evaluate_and_log(self, sem, *args):
async with sem:
await self.evaluate_and_log(*args)

async def run_test(self):
sem = Semaphore(self.num_concurrent_requests)

# Run through each iteration of context_lengths and depths
tasks = []
for context_length in self.context_lengths:
for depth_percent in self.document_depth_percents:
task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
tasks.append(task)

# Wait for all tasks to complete
await asyncio.gather(*tasks)
await super().evaluate_and_log(context, context_length, depth_percent)

def print_start_test_summary(self):
print ("\n")
print ("Starting Needle In A Haystack Testing...")
print ("Starting Needles In A Haystack Testing...")
print (f"- Model: {self.model_name}")
print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
print (f"- Needles: {self.needles}")
print ("\n\n")

def start_test(self):
if self.print_ongoing_status:
self.print_start_test_summary()
asyncio.run(self.run_test())
45 changes: 26 additions & 19 deletions needlehaystack/llm_needle_haystack_tester.py
@@ -65,6 +65,8 @@ def __init__(self,
"""
if not model_to_test:
raise ValueError("A language model must be provided to test.")
if not evaluator:
raise ValueError("An evaluator must be provided to evaluate the model's response.")
if not needle or not haystack_dir or not retrieval_question:
raise ValueError("Needle, haystack, and retrieval_question must be provided.")

@@ -252,31 +254,36 @@ def insert_needle(self, context, depth_percent, context_length):
if len(tokens_context) + len(tokens_needle) > context_length:
tokens_context = tokens_context[:context_length - len(tokens_needle)]

tokens_new_context, _ = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)

# Convert back to a string and return it
new_context = self.model_to_test.decode_tokens(tokens_new_context)
return new_context

def get_tokens_new_context(self, tokens_context, tokens_needle, depth_percent):
if depth_percent == 100:
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
tokens_new_context = tokens_context + tokens_needle
else:
# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))
return tokens_context + tokens_needle

# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))

# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]
# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]

# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')
# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]
# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')

# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]

# Once we get there, then add in your needle, and stick the rest of your context in on the other end.
# Now we have a needle in a haystack
tokens_new_context += tokens_needle + tokens_context[insertion_point:]
# Once we get there, then add in your needle, and stick the rest of your context in on the other end.
# Now we have a needle in a haystack
tokens_new_context += tokens_needle + tokens_context[insertion_point:]

# Convert back to a string and return it
new_context = self.model_to_test.decode_tokens(tokens_new_context)
return new_context
return tokens_new_context, insertion_point

def get_context_length_in_tokens(self, context):
return len(self.model_to_test.encode_text_to_tokens(context))
2 changes: 0 additions & 2 deletions needlehaystack/run.py
@@ -101,10 +101,8 @@ def main():
args.evaluator = get_evaluator(args)

if args.multi_needle == True:
print("Testing multi-needle")
tester = LLMMultiNeedleHaystackTester(**args.__dict__)
else:
print("Testing single-needle")
tester = LLMNeedleHaystackTester(**args.__dict__)
tester.start_test()
