diff --git a/README.md b/README.md
index c7c22262..3cf3c850 100644
--- a/README.md
+++ b/README.md
@@ -107,9 +107,8 @@ The package `needlehaystack` is available for import in your test cases. Develop
 - `print_ongoing_status` - Default: True, whether or not to print the status of test as they complete
 
 `LLMMultiNeedleHaystackTester` parameters:
-
-- `multi_needle` - True or False, whether to run multi-needle
-- `needles` - List of needles to insert in the context
+* `needles` - List of needles to insert in the context
+* `eval_set` - The evaluation set identifier.
 
 Other Parameters:
 
@@ -162,18 +161,16 @@ Needle 10: 40 + 9 * 6 = 94
 
 You can use LangSmith to orchestrate evals and store results.
 
-(1) Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
-(2) Set env variables for LangSmith as specified in the setup.
-(3) In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` to start.
-(4) Populate the dataset with a test question:
-
-```
-question: What are the 5 best things to do in San Franscisco?
-answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
-```
-
-![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
-(5) Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
+1. Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
+2. Set env variables for LangSmith as specified in the setup.
+3. In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` and set dataset type to `Key-Value`.
+4. Populate the dataset with a test question:
+   ```
+   question: What are the 5 best things to do in San Franscisco? 
+   answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
+   ```
+   ![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
+5. Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
 
 Let's see all these working together on a new dataset, `multi-needle-eval-pizza`. 
 
diff --git a/needlehaystack/evaluators/langsmith.py b/needlehaystack/evaluators/langsmith.py
index a53e74e3..6ee8e0f5 100644
--- a/needlehaystack/evaluators/langsmith.py
+++ b/needlehaystack/evaluators/langsmith.py
@@ -1,4 +1,4 @@
-from typing import Union
+import os
 import uuid
 
 from langchain_openai import ChatOpenAI
@@ -12,7 +12,7 @@ from langsmith.schemas import Example, Run
 
 
 @run_evaluator
-def score_relevance(run: Run, example: Union[Example, None] = None):
+def score_relevance(run: Run, example: Example | None = None):
     """
     A custom evaluator function that grades the language model's response based on its
     relevance to a reference answer.
@@ -24,10 +24,6 @@ def score_relevance(run: Run, example: Union[Example, None] = None):
 
     Returns:
         EvaluationResult: The result of the evaluation, containing the relevance score.
     """
-
-    print("--LANGSMITH EVAL--")
-    #print("--MODEL: ", model_name)
-    #print("--EVAL SET: ", eval_set)
     student_answer = run.outputs["output"]
     reference = example.outputs["answer"]
diff --git a/needlehaystack/llm_multi_needle_haystack_tester.py b/needlehaystack/llm_multi_needle_haystack_tester.py
index 3a8c432d..85b72949 100644
--- a/needlehaystack/llm_multi_needle_haystack_tester.py
+++ b/needlehaystack/llm_multi_needle_haystack_tester.py
@@ -1,17 +1,4 @@
-import asyncio
-import glob
-import json
-import os
-import time
-from asyncio import Semaphore
-from datetime import datetime, timezone
-
-import numpy as np
-
-from .evaluators import Evaluator
 from .llm_needle_haystack_tester import LLMNeedleHaystackTester
-from .providers import ModelProvider
-
 
 
 class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
@@ -19,26 +6,17 @@ class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
 
     Attributes:
         needles (list): A list of needles (facts) to insert into the haystack (context).
-        model_to_test (ModelProvider): The model being tested.
-        evaluator (Evaluator): The evaluator used to assess the model's performance.
-        print_ongoing_status (bool): Flag to print ongoing status messages.
         eval_set (str): The evaluation set identifier.
     """
 
-    def __init__(self, *args,
-                 needles=[],
-                 model_to_test: ModelProvider = None,
-                 evaluator: Evaluator = None,
-                 print_ongoing_status = True,
+    def __init__(self,
+                 needles=[],
                  eval_set = "multi-needle-eval-sf",
+                 *args,
                  **kwargs):
-        super().__init__(*args, model_to_test=model_to_test, **kwargs)
+        super().__init__(*args, **kwargs)
         self.needles = needles
-        self.evaluator = evaluator
-        self.model_to_test = model_to_test
         self.eval_set = eval_set
-        self.model_name = self.model_to_test.model_name
-        self.print_ongoing_status = print_ongoing_status
         self.insertion_percentages = []
 
     async def insert_needles(self, context, depth_percent, context_length):
@@ -90,27 +68,12 @@ async def insert_needles(self, context, depth_percent, context_length):
                 # If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
                 tokens_context = tokens_context + tokens_needle
             else:
-                # Go get the position (in terms of tokens) to insert your needle
-                insertion_point = int(len(tokens_context) * (depth_percent / 100))
-
-                # tokens_new_context represents the tokens before the needle
-                tokens_new_context = tokens_context[:insertion_point]
-
-                # We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
-                period_tokens = self.model_to_test.encode_text_to_tokens('.')
-
-                # Then we iteration backwards until we find the first period
-                while tokens_new_context and tokens_new_context[-1] not in period_tokens:
-                    insertion_point -= 1
-                    tokens_new_context = tokens_context[:insertion_point]
-
-                # Insert the needle into the context at the found position
-                tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
+                tokens_context, insertion_point = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)
 
             # Log
             insertion_percentage = (insertion_point / len(tokens_context)) * 100
             self.insertion_percentages.append(insertion_percentage)
-            print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")
+            # print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")
 
             # Adjust depth for next needle
             depth_percent += depth_percent_interval
@@ -118,22 +81,6 @@ async def insert_needles(self, context, depth_percent, context_length):
         new_context = self.model_to_test.decode_tokens(tokens_context)
         return new_context
 
-    def encode_and_trim(self, context, context_length):
-        """
-        Encodes the context to tokens and trims it to the specified length.
-
-        Args:
-            context (str): The context to encode and trim.
-            context_length (int): The desired length of the context in tokens.
-
-        Returns:
-            str: The encoded and trimmed context.
-        """
-        tokens = self.model_to_test.encode_text_to_tokens(context)
-        if len(tokens) > context_length:
-            context = self.model_to_test.decode_tokens(tokens, context_length)
-        return context
-
     async def generate_context(self, context_length, depth_percent):
         """
         Generates a context of a specified length and inserts needles at given depth percentages.
@@ -165,103 +112,19 @@ async def evaluate_and_log(self, context_length, depth_percent):
         # Go generate the required length context and place your needle statement in
         context = await self.generate_context(context_length, depth_percent)
 
-        test_start_time = time.time()
-
-        # LangSmith
-        ## TODO: Support for other evaluators 
-        if self.evaluator.__class__.__name__ == "LangSmithEvaluator":
-            print("EVALUATOR: LANGSMITH")
+        if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
             chain = self.model_to_test.get_langchain_runnable(context)
-            self.evaluator.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
-            test_end_time = time.time()
-            test_elapsed_time = test_end_time - test_start_time
-
+            self.evaluation_model.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
         else:
-            print("EVALUATOR: OpenAI Model")
-            # Prepare your message to send to the model you're going to evaluate
-            prompt = self.model_to_test.generate_prompt(context, self.retrieval_question)
-            # Go see if the model can answer the question to pull out your random fact
-            response = await self.model_to_test.evaluate_model(prompt)
-            # Compare the reponse to the actual needle you placed
-            score = self.evaluation_model.evaluate_response(response)
-
-            test_end_time = time.time()
-            test_elapsed_time = test_end_time - test_start_time
-
-            results = {
-                # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
-                'model' : self.model_to_test.model_name,
-                'context_length' : int(context_length),
-                'depth_percent' : float(depth_percent),
-                'version' : self.results_version,
-                'needle' : self.needle,
-                'model_response' : response,
-                'score' : score,
-                'test_duration_seconds' : test_elapsed_time,
-                'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
-            }
-
-            self.testing_results.append(results)
-
-            if self.print_ongoing_status:
-                print (f"-- Test Summary -- ")
-                print (f"Duration: {test_elapsed_time:.1f} seconds")
-                print (f"Context: {context_length} tokens")
-                print (f"Depth: {depth_percent}%")
-                print (f"Score: {score}")
-                print (f"Response: {response}\n")
-
-            context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
-
-            if self.save_contexts:
-                results['file_name'] = context_file_location
-
-                # Save the context to file for retesting
-                if not os.path.exists('contexts'):
-                    os.makedirs('contexts')
-
-                with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
-                    f.write(context)
-
-            if self.save_results:
-                # Save the context to file for retesting
-                if not os.path.exists('results'):
-                    os.makedirs('results')
-
-                # Save the result to file for retesting
-                with open(f'results/{context_file_location}_results.json', 'w') as f:
-                    json.dump(results, f)
-
-            if self.seconds_to_sleep_between_completions:
-                await asyncio.sleep(self.seconds_to_sleep_between_completions)
-
-    async def bound_evaluate_and_log(self, sem, *args):
-        async with sem:
-            await self.evaluate_and_log(*args)
-
-    async def run_test(self):
-        sem = Semaphore(self.num_concurrent_requests)
-
-        # Run through each iteration of context_lengths and depths
-        tasks = []
-        for context_length in self.context_lengths:
-            for depth_percent in self.document_depth_percents:
-                task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
-                tasks.append(task)
-
-        # Wait for all tasks to complete
-        await asyncio.gather(*tasks)
+            await super().evaluate_and_log(context, context_length, depth_percent)
 
     def print_start_test_summary(self):
         print ("\n")
-        print ("Starting Needle In A Haystack Testing...")
+        print ("Starting Needles In A Haystack Testing...")
         print (f"- Model: {self.model_name}")
         print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
         print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
         print (f"- Needles: {self.needles}")
         print ("\n\n")
-
-    def start_test(self):
-        if self.print_ongoing_status:
-            self.print_start_test_summary()
-        asyncio.run(self.run_test())
diff --git a/needlehaystack/llm_needle_haystack_tester.py b/needlehaystack/llm_needle_haystack_tester.py
index 1d658db3..01aaf7b4 100644
--- a/needlehaystack/llm_needle_haystack_tester.py
+++ b/needlehaystack/llm_needle_haystack_tester.py
@@ -65,6 +65,8 @@ def __init__(self,
         """
         if not model_to_test:
             raise ValueError("A language model must be provided to test.")
+        if not evaluator:
+            raise ValueError("An evaluator must be provided to evaluate the model's response.")
         if not needle or not haystack_dir or not retrieval_question:
             raise ValueError("Needle, haystack, and retrieval_question must be provided.")
 
@@ -252,31 +254,36 @@ def insert_needle(self, context, depth_percent, context_length):
         if len(tokens_context) + len(tokens_needle) > context_length:
             tokens_context = tokens_context[:context_length - len(tokens_needle)]
 
+        tokens_new_context, _ = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)
+
+        # Convert back to a string and return it
+        new_context = self.model_to_test.decode_tokens(tokens_new_context)
+        return new_context
+
+    def get_tokens_new_context(self, tokens_context, tokens_needle, depth_percent):
         if depth_percent == 100:
             # If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
-            tokens_new_context = tokens_context + tokens_needle
-        else:
-            # Go get the position (in terms of tokens) to insert your needle
-            insertion_point = int(len(tokens_context) * (depth_percent / 100))
+            return tokens_context + tokens_needle
+
+        # Go get the position (in terms of tokens) to insert your needle
+        insertion_point = int(len(tokens_context) * (depth_percent / 100))
 
-            # tokens_new_context represents the tokens before the needle
-            tokens_new_context = tokens_context[:insertion_point]
+        # tokens_new_context represents the tokens before the needle
+        tokens_new_context = tokens_context[:insertion_point]
 
-            # We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
-            period_tokens = self.model_to_test.encode_text_to_tokens('.')
-
-            # Then we iteration backwards until we find the first period
-            while tokens_new_context and tokens_new_context[-1] not in period_tokens:
-                insertion_point -= 1
-                tokens_new_context = tokens_context[:insertion_point]
+        # We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
+        period_tokens = self.model_to_test.encode_text_to_tokens('.')
+
+        # Then we iteration backwards until we find the first period
+        while tokens_new_context and tokens_new_context[-1] not in period_tokens:
+            insertion_point -= 1
+            tokens_new_context = tokens_context[:insertion_point]
 
-            # Once we get there, then add in your needle, and stick the rest of your context in on the other end.
-            # Now we have a needle in a haystack
-            tokens_new_context += tokens_needle + tokens_context[insertion_point:]
+        # Once we get there, then add in your needle, and stick the rest of your context in on the other end.
+        # Now we have a needle in a haystack
+        tokens_new_context += tokens_needle + tokens_context[insertion_point:]
 
-        # Convert back to a string and return it
-        new_context = self.model_to_test.decode_tokens(tokens_new_context)
-        return new_context
+        return tokens_new_context, insertion_point
 
     def get_context_length_in_tokens(self, context):
         return len(self.model_to_test.encode_text_to_tokens(context))
diff --git a/needlehaystack/run.py b/needlehaystack/run.py
index d38519bc..aabd418e 100644
--- a/needlehaystack/run.py
+++ b/needlehaystack/run.py
@@ -101,10 +101,8 @@ def main():
     args.evaluator = get_evaluator(args)
 
     if args.multi_needle == True:
-        print("Testing multi-needle")
         tester = LLMMultiNeedleHaystackTester(**args.__dict__)
     else:
-        print("Testing single-needle")
         tester = LLMNeedleHaystackTester(**args.__dict__)
 
     tester.start_test()