27 changes: 12 additions & 15 deletions README.md
@@ -107,9 +107,8 @@ The package `needlehaystack` is available for import in your test cases. Develop
- `print_ongoing_status` - Default: True, whether to print the status of tests as they complete

`LLMMultiNeedleHaystackTester` parameters (a usage sketch follows the list):

- `multi_needle` - True or False, whether to run multi-needle
- `needles` - List of needles to insert in the context
* `needles` - List of needles to insert in the context
* `eval_set` - The evaluation set identifier.
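
As a rough illustration of where these parameters plug in, the sketch below constructs the multi-needle tester directly from Python. Treat it as a minimal sketch, not documented API: the provider and evaluator class names, model names, and keyword arguments are assumptions about the package's `providers` and `evaluators` modules, so adjust them to whatever your installation exposes.

```python
# Hypothetical sketch: class names, model names, and keyword arguments are assumptions.
from needlehaystack.llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester
from needlehaystack.providers import OpenAI            # assumed provider class
from needlehaystack.evaluators import OpenAIEvaluator  # assumed evaluator class

question = "What secret ingredients were hidden in the context?"
needles = [
    "The first secret ingredient is figs.",
    "The second secret ingredient is prosciutto.",
]

tester = LLMMultiNeedleHaystackTester(
    needles=needles,                      # list of facts to insert into the haystack
    eval_set="multi-needle-eval-sf",      # LangSmith dataset identifier (see below)
    model_to_test=OpenAI(model_name="gpt-3.5-turbo-0125"),
    evaluator=OpenAIEvaluator(model_name="gpt-3.5-turbo-0125",
                              question_asked=question,
                              true_answer=" ".join(needles)),
    retrieval_question=question,
)
tester.start_test()
```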

Other Parameters:

@@ -162,18 +161,16 @@ Needle 10: 40 + 9 * 6 = 94

You can use LangSmith to orchestrate evals and store results.

(1) Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
(2) Set env variables for LangSmith as specified in the setup.
(3) In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset, call it `multi-needle-eval-sf` to start.
(4) Populate the dataset with a test question:

```
question: What are the 5 best things to do in San Francisco?
answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
```

![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
(5) Run with ` --evaluator langsmith` and `--eval_set multi-needle-eval-sf` to run against our recently created eval set.
1. Sign up for [LangSmith](https://docs.smith.langchain.com/setup)
2. Set the environment variables for LangSmith as specified in the setup.
3. In the `Datasets + Testing` tab, use `+ Dataset` to create a new dataset; name it `multi-needle-eval-sf` and set the dataset type to `Key-Value`.
4. Populate the dataset with a test question:
```
question: What are the 5 best things to do in San Francisco?
answer: "The 5 best things to do in San Francisco are: 1) Go to Dolores Park. 2) Eat at Tony's Pizza Napoletana. 3) Visit Alcatraz. 4) Hike up Twin Peaks. 5) Bike across the Golden Gate Bridge"
```
![Screenshot 2024-03-05 at 4 54 15 PM](https://github.com/rlancemartin/LLMTest_NeedleInAHaystack/assets/122662504/2f903955-ed1d-49cc-b995-ed0407d6212a)
5. Run with `--evaluator langsmith` and `--eval_set multi-needle-eval-sf` to evaluate against the dataset you just created, as sketched below.
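
For reference, a rough programmatic equivalent of that command is sketched below; it is illustrative only. It assumes the LangSmith environment variables from step 2 are already set, and the provider class, model name, and `LangSmithEvaluator` constructor call are assumptions that may need adjusting to your installation.

```python
# Rough programmatic equivalent of `--evaluator langsmith --eval_set multi-needle-eval-sf`.
# Assumes the LangSmith env variables (e.g. LANGCHAIN_API_KEY) are already exported;
# the provider class and model name below are assumptions, adjust to your setup.
from needlehaystack.llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester
from needlehaystack.evaluators import LangSmithEvaluator
from needlehaystack.providers import OpenAI

tester = LLMMultiNeedleHaystackTester(
    needles=[
        "Go to Dolores Park.",
        "Eat at Tony's Pizza Napoletana.",
        "Visit Alcatraz.",
        "Hike up Twin Peaks.",
        "Bike across the Golden Gate Bridge.",
    ],
    eval_set="multi-needle-eval-sf",   # the LangSmith dataset created in step 3
    model_to_test=OpenAI(model_name="gpt-3.5-turbo-0125"),
    evaluator=LangSmithEvaluator(),
    retrieval_question="What are the 5 best things to do in San Francisco?",
)
tester.start_test()
```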

Let's see all these working together on a new dataset, `multi-needle-eval-pizza`.

8 changes: 2 additions & 6 deletions needlehaystack/evaluators/langsmith.py
@@ -1,4 +1,4 @@
from typing import Union
import os
import uuid

from langchain_openai import ChatOpenAI
@@ -12,7 +12,7 @@
from langsmith.schemas import Example, Run

@run_evaluator
def score_relevance(run: Run, example: Union[Example, None] = None):
def score_relevance(run: Run, example: Example | None = None):
"""
A custom evaluator function that grades the language model's response based on its relevance
to a reference answer.
@@ -24,10 +24,6 @@ def score_relevance(run: Run, example: Union[Example, None] = None):
Returns:
EvaluationResult: The result of the evaluation, containing the relevance score.
"""

print("--LANGSMITH EVAL--")
#print("--MODEL: ", model_name)
#print("--EVAL SET: ", eval_set)
student_answer = run.outputs["output"]
reference = example.outputs["answer"]

157 changes: 10 additions & 147 deletions needlehaystack/llm_multi_needle_haystack_tester.py
@@ -1,44 +1,22 @@
import asyncio
import glob
import json
import os
import time
from asyncio import Semaphore
from datetime import datetime, timezone

import numpy as np

from .evaluators import Evaluator
from .llm_needle_haystack_tester import LLMNeedleHaystackTester
from .providers import ModelProvider


class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
"""
Extends LLMNeedleHaystackTester to support testing with multiple needles in the haystack.

Attributes:
needles (list): A list of needles (facts) to insert into the haystack (context).
model_to_test (ModelProvider): The model being tested.
evaluator (Evaluator): The evaluator used to assess the model's performance.
print_ongoing_status (bool): Flag to print ongoing status messages.
eval_set (str): The evaluation set identifier.
"""
def __init__(self, *args,
needles=[],
model_to_test: ModelProvider = None,
evaluator: Evaluator = None,
print_ongoing_status = True,
def __init__(self,
needles=[],
eval_set = "multi-needle-eval-sf",
*args,
**kwargs):

super().__init__(*args, model_to_test=model_to_test, **kwargs)
super().__init__(*args, **kwargs)
self.needles = needles
self.evaluator = evaluator
self.model_to_test = model_to_test
self.eval_set = eval_set
self.model_name = self.model_to_test.model_name
self.print_ongoing_status = print_ongoing_status
self.insertion_percentages = []

async def insert_needles(self, context, depth_percent, context_length):
@@ -90,50 +68,19 @@ async def insert_needles(self, context, depth_percent, context_length):
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
tokens_context = tokens_context + tokens_needle
else:
# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))

# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]

# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')

# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]

# Insert the needle into the context at the found position
tokens_context = tokens_context[:insertion_point] + tokens_needle + tokens_context[insertion_point:]
tokens_context, insertion_point = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)

# Log
insertion_percentage = (insertion_point / len(tokens_context)) * 100
self.insertion_percentages.append(insertion_percentage)
print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")
# print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")

# Adjust depth for next needle
depth_percent += depth_percent_interval

new_context = self.model_to_test.decode_tokens(tokens_context)
return new_context

def encode_and_trim(self, context, context_length):
"""
Encodes the context to tokens and trims it to the specified length.

Args:
context (str): The context to encode and trim.
context_length (int): The desired length of the context in tokens.

Returns:
str: The encoded and trimmed context.
"""
tokens = self.model_to_test.encode_text_to_tokens(context)
if len(tokens) > context_length:
context = self.model_to_test.decode_tokens(tokens, context_length)
return context

async def generate_context(self, context_length, depth_percent):
"""
Generates a context of a specified length and inserts needles at given depth percentages.
@@ -165,103 +112,19 @@ async def evaluate_and_log(self, context_length, depth_percent):
# Go generate the required length context and place your needle statement in
context = await self.generate_context(context_length, depth_percent)

test_start_time = time.time()

# LangSmith
## TODO: Support for other evaluators
if self.evaluator.__class__.__name__ == "LangSmithEvaluator":
print("EVALUATOR: LANGSMITH")
if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
chain = self.model_to_test.get_langchain_runnable(context)
self.evaluator.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time

self.evaluation_model.evaluate_chain(chain, context_length, depth_percent, self.model_to_test.model_name, self.eval_set, len(self.needles), self.needles, self.insertion_percentages)
else:
print("EVALUATOR: OpenAI Model")
# Prepare your message to send to the model you're going to evaluate
prompt = self.model_to_test.generate_prompt(context, self.retrieval_question)
# Go see if the model can answer the question to pull out your random fact
response = await self.model_to_test.evaluate_model(prompt)
# Compare the response to the actual needle you placed
score = self.evaluation_model.evaluate_response(response)

test_end_time = time.time()
test_elapsed_time = test_end_time - test_start_time

results = {
# 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
'model' : self.model_to_test.model_name,
'context_length' : int(context_length),
'depth_percent' : float(depth_percent),
'version' : self.results_version,
'needle' : self.needle,
'model_response' : response,
'score' : score,
'test_duration_seconds' : test_elapsed_time,
'test_timestamp_utc' : datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S%z')
}

self.testing_results.append(results)

if self.print_ongoing_status:
print (f"-- Test Summary -- ")
print (f"Duration: {test_elapsed_time:.1f} seconds")
print (f"Context: {context_length} tokens")
print (f"Depth: {depth_percent}%")
print (f"Score: {score}")
print (f"Response: {response}\n")

context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'

if self.save_contexts:
results['file_name'] = context_file_location

# Save the context to file for retesting
if not os.path.exists('contexts'):
os.makedirs('contexts')

with open(f'contexts/{context_file_location}_context.txt', 'w') as f:
f.write(context)

if self.save_results:
# Save the context to file for retesting
if not os.path.exists('results'):
os.makedirs('results')

# Save the result to file for retesting
with open(f'results/{context_file_location}_results.json', 'w') as f:
json.dump(results, f)

if self.seconds_to_sleep_between_completions:
await asyncio.sleep(self.seconds_to_sleep_between_completions)

async def bound_evaluate_and_log(self, sem, *args):
async with sem:
await self.evaluate_and_log(*args)

async def run_test(self):
sem = Semaphore(self.num_concurrent_requests)

# Run through each iteration of context_lengths and depths
tasks = []
for context_length in self.context_lengths:
for depth_percent in self.document_depth_percents:
task = self.bound_evaluate_and_log(sem, context_length, depth_percent)
tasks.append(task)

# Wait for all tasks to complete
await asyncio.gather(*tasks)
await super().evaluate_and_log(context, context_length, depth_percent)

def print_start_test_summary(self):
print ("\n")
print ("Starting Needle In A Haystack Testing...")
print ("Starting Needles In A Haystack Testing...")
print (f"- Model: {self.model_name}")
print (f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
print (f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
print (f"- Needles: {self.needles}")
print ("\n\n")

def start_test(self):
if self.print_ongoing_status:
self.print_start_test_summary()
asyncio.run(self.run_test())
45 changes: 26 additions & 19 deletions needlehaystack/llm_needle_haystack_tester.py
@@ -65,6 +65,8 @@ def __init__(self,
"""
if not model_to_test:
raise ValueError("A language model must be provided to test.")
if not evaluator:
raise ValueError("An evaluator must be provided to evaluate the model's response.")
if not needle or not haystack_dir or not retrieval_question:
raise ValueError("Needle, haystack, and retrieval_question must be provided.")

@@ -252,31 +254,36 @@ def insert_needle(self, context, depth_percent, context_length):
if len(tokens_context) + len(tokens_needle) > context_length:
tokens_context = tokens_context[:context_length - len(tokens_needle)]

tokens_new_context, _ = self.get_tokens_new_context(tokens_context, tokens_needle, depth_percent)

# Convert back to a string and return it
new_context = self.model_to_test.decode_tokens(tokens_new_context)
return new_context

def get_tokens_new_context(self, tokens_context, tokens_needle, depth_percent):
if depth_percent == 100:
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
tokens_new_context = tokens_context + tokens_needle
else:
# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))
return tokens_context + tokens_needle

# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))

# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]
# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]

# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')
# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]
# We want to make sure that we place our needle at a sentence break so we first see what token a '.' is
period_tokens = self.model_to_test.encode_text_to_tokens('.')

# Then we iterate backwards until we find the first period
while tokens_new_context and tokens_new_context[-1] not in period_tokens:
insertion_point -= 1
tokens_new_context = tokens_context[:insertion_point]

# Once we get there, then add in your needle, and stick the rest of your context in on the other end.
# Now we have a needle in a haystack
tokens_new_context += tokens_needle + tokens_context[insertion_point:]
# Once we get there, then add in your needle, and stick the rest of your context in on the other end.
# Now we have a needle in a haystack
tokens_new_context += tokens_needle + tokens_context[insertion_point:]

# Convert back to a string and return it
new_context = self.model_to_test.decode_tokens(tokens_new_context)
return new_context
return tokens_new_context, insertion_point

def get_context_length_in_tokens(self, context):
return len(self.model_to_test.encode_text_to_tokens(context))
2 changes: 0 additions & 2 deletions needlehaystack/run.py
@@ -101,10 +101,8 @@ def main():
args.evaluator = get_evaluator(args)

if args.multi_needle == True:
print("Testing multi-needle")
tester = LLMMultiNeedleHaystackTester(**args.__dict__)
else:
print("Testing single-needle")
tester = LLMNeedleHaystackTester(**args.__dict__)
tester.start_test()
