-
Notifications
You must be signed in to change notification settings - Fork 237
Expand file tree
/
Copy pathllm_multi_needle_haystack_tester.py
More file actions
130 lines (104 loc) · 6.37 KB
/
llm_multi_needle_haystack_tester.py
File metadata and controls
130 lines (104 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from .llm_needle_haystack_tester import LLMNeedleHaystackTester
class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
    """
    Extends LLMNeedleHaystackTester to support testing with multiple needles in the haystack.

    Attributes:
        needles (list): A list of needles (facts) to insert into the haystack (context).
        eval_set (str): The evaluation set identifier (used by the LangSmith evaluator).
        insertion_percentages (list): The actual depth percentages (0-100) at which each
            needle was inserted during the most recent call to ``insert_needles``.
    """
    def __init__(self,
                 needles=None,
                 eval_set="multi-needle-eval-sf",
                 *args,
                 **kwargs):
        """
        Args:
            needles (list, optional): Facts to insert into the context. Defaults to an
                empty list. A ``None`` sentinel is used instead of ``needles=[]`` so a
                single mutable list is not shared across every instance of the class.
            eval_set (str): Identifier of the evaluation set passed to the evaluator.
        """
        super().__init__(*args, **kwargs)
        # Each instance gets its own fresh list (mutable-default-argument fix).
        self.needles = [] if needles is None else needles
        self.eval_set = eval_set
        self.insertion_percentages = []

    async def insert_needles(self, context, depth_percent, context_length):
        """
        Inserts multiple needles (specific facts or pieces of information) into the original context string at
        designated depth percentages, effectively distributing these needles throughout the context. This method
        is designed to test a model's ability to retrieve specific information (needles) from a larger body of text
        (haystack) based on the placement depth of these needles.

        The method first encodes the context and each needle into tokens to calculate their lengths in tokens.
        It then adjusts the context length to accommodate the final buffer length. This is crucial for ensuring
        that the total token count (context plus needles) does not exceed the maximum allowable context length,
        which might otherwise lead to information being truncated.

        The first needle is placed at ``depth_percent``; the remaining needles are spaced
        evenly through the rest of the context so they are distributed as uniformly as
        possible after the first insertion.

        Args:
            context (str): The original context string.
            depth_percent (float): The depth percent at which to insert the first needle.
            context_length (int): The total length of the context in tokens, adjusted for final buffer.

        Returns:
            str: The new context with needles inserted. If ``self.needles`` is empty,
            the (possibly buffer-trimmed) context is returned unchanged.
        """
        tokens_context = self.model_to_test.encode_text_to_tokens(context)
        # Reserve room at the end of the window for the question/prompt scaffolding.
        context_length -= self.final_context_length_buffer

        # Reset the insertion log for the current context before any early return.
        self.insertion_percentages = []

        # Guard: with no needles there is nothing to insert, and the interval
        # computation below would divide by zero.
        if not self.needles:
            return self.model_to_test.decode_tokens(tokens_context)

        # Calculate the total length of all needles in tokens.
        total_needles_length = sum(
            len(self.model_to_test.encode_text_to_tokens(needle)) for needle in self.needles
        )

        # Trim the haystack so context plus needles fits inside the target length.
        if len(tokens_context) + total_needles_length > context_length:
            tokens_context = tokens_context[:context_length - total_needles_length]

        # Even spacing for the needles after the first insertion point.
        depth_percent_interval = (100 - depth_percent) / len(self.needles)

        # Insert needles at the calculated points.
        for needle in self.needles:
            tokens_needle = self.model_to_test.encode_text_to_tokens(needle)
            if depth_percent == 100:
                # Depth 100 means the needle is the very last thing in the document.
                tokens_context = tokens_context + tokens_needle
                # Record the depth so insertion_percentages stays one-per-needle,
                # matching what the evaluator receives in evaluate_and_log.
                self.insertion_percentages.append(100.0)
            else:
                tokens_context, insertion_point = self.get_tokens_new_context(
                    tokens_context, tokens_needle, depth_percent
                )
                # Log the realized insertion depth for this needle.
                insertion_percentage = (insertion_point / len(tokens_context)) * 100
                self.insertion_percentages.append(insertion_percentage)

            # Advance the target depth for the next needle.
            depth_percent += depth_percent_interval

        new_context = self.model_to_test.decode_tokens(tokens_context)
        return new_context

    async def generate_context(self, context_length, depth_percent):
        """
        Generates a context of a specified length and inserts needles at given depth percentages.

        Args:
            context_length (int): The total length of the context in tokens.
            depth_percent (float): The depth percent for the first needle insertion.

        Returns:
            str: The context with needles inserted.
        """
        context = self.read_context_files()
        context = self.encode_and_trim(context, context_length)
        context = await self.insert_needles(context, depth_percent, context_length)
        return context

    async def evaluate_and_log(self, context_length, depth_percent):
        """
        Evaluates the model's performance with the generated context and logs the results.

        Skips the run entirely when ``save_results`` is set and a result for this
        (context_length, depth_percent) cell already exists.

        Args:
            context_length (int): The length of the context in tokens.
            depth_percent (float): The depth percent for needle insertion.
        """
        if self.save_results:
            if self.result_exists(context_length, depth_percent):
                return

        # Generate the required-length context with the needles placed inside it.
        context = await self.generate_context(context_length, depth_percent)

        # LangSmith path: hand the runnable chain to the LangSmith evaluator.
        # TODO: Support for other evaluators
        if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
            chain = self.model_to_test.get_langchain_runnable(context)
            self.evaluation_model.evaluate_chain(
                chain,
                context_length,
                depth_percent,
                self.model_to_test.model_name,
                self.eval_set,
                len(self.needles),
                self.needles,
                self.insertion_percentages,
            )
        else:
            # Fall back to the single-needle evaluation flow from the parent class.
            await super().evaluate_and_log(context, context_length, depth_percent)

    def print_start_test_summary(self):
        """Print a human-readable summary of the test configuration before running."""
        print("\n")
        print("Starting Needles In A Haystack Testing...")
        print(f"- Model: {self.model_name}")
        print(f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
        print(f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
        print(f"- Needles: {self.needles}")
        print("\n\n")