-
Notifications
You must be signed in to change notification settings - Fork 237
Expand file tree
/
Copy pathllm_multi_needle_haystack_tester.py
More file actions
130 lines (104 loc) · 6.37 KB
/
llm_multi_needle_haystack_tester.py
File metadata and controls
130 lines (104 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from .llm_needle_haystack_tester import LLMNeedleHaystackTester
class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
    """
    Extends LLMNeedleHaystackTester to support testing with multiple needles in the haystack.

    Attributes:
        needles (list): A list of needles (facts) to insert into the haystack (context).
        eval_set (str): The evaluation set identifier (used by the LangSmith evaluator).
        insertion_percentages (list): The actual depth percentages (0-100) at which each
            needle was inserted during the most recent call to ``insert_needles``.
    """
    def __init__(self,
                 needles=None,
                 eval_set="multi-needle-eval-sf",
                 *args,
                 **kwargs):
        """
        Args:
            needles (list, optional): Facts to insert into the context. Defaults to an
                empty list. A ``None`` sentinel is used instead of ``needles=[]`` so a
                single mutable list is not shared across every instance of the class.
            eval_set (str): Identifier of the evaluation set passed to the evaluator.
        """
        super().__init__(*args, **kwargs)
        # Each instance gets its own fresh list (mutable-default-argument fix).
        self.needles = [] if needles is None else needles
        self.eval_set = eval_set
        self.insertion_percentages = []

    async def insert_needles(self, context, depth_percent, context_length):
        """
        Inserts multiple needles (specific facts or pieces of information) into the original context string at
        designated depth percentages, effectively distributing these needles throughout the context. This method
        is designed to test a model's ability to retrieve specific information (needles) from a larger body of text
        (haystack) based on the placement depth of these needles.

        The method first encodes the context and each needle into tokens to calculate their lengths in tokens.
        It then adjusts the context length to accommodate the final buffer length. This is crucial for ensuring
        that the total token count (context plus needles) does not exceed the maximum allowable context length,
        which might otherwise lead to information being truncated.

        The first needle is placed at ``depth_percent``; the remaining needles are spaced
        evenly through the rest of the context so they are distributed as uniformly as
        possible after the first insertion.

        Args:
            context (str): The original context string.
            depth_percent (float): The depth percent at which to insert the first needle.
            context_length (int): The total length of the context in tokens, adjusted for final buffer.

        Returns:
            str: The new context with needles inserted. If ``self.needles`` is empty,
            the (possibly buffer-trimmed) context is returned unchanged.
        """
        tokens_context = self.model_to_test.encode_text_to_tokens(context)
        # Reserve room at the end of the window for the question/prompt scaffolding.
        context_length -= self.final_context_length_buffer

        # Reset the insertion log for the current context before any early return.
        self.insertion_percentages = []

        # Guard: with no needles there is nothing to insert, and the interval
        # computation below would divide by zero.
        if not self.needles:
            return self.model_to_test.decode_tokens(tokens_context)

        # Calculate the total length of all needles in tokens.
        total_needles_length = sum(
            len(self.model_to_test.encode_text_to_tokens(needle)) for needle in self.needles
        )

        # Trim the haystack so context plus needles fits inside the target length.
        if len(tokens_context) + total_needles_length > context_length:
            tokens_context = tokens_context[:context_length - total_needles_length]

        # Even spacing for the needles after the first insertion point.
        depth_percent_interval = (100 - depth_percent) / len(self.needles)

        # Insert needles at the calculated points.
        for needle in self.needles:
            tokens_needle = self.model_to_test.encode_text_to_tokens(needle)
            if depth_percent == 100:
                # Depth 100 means the needle is the very last thing in the document.
                tokens_context = tokens_context + tokens_needle
                # Record the depth so insertion_percentages stays one-per-needle,
                # matching what the evaluator receives in evaluate_and_log.
                self.insertion_percentages.append(100.0)
            else:
                tokens_context, insertion_point = self.get_tokens_new_context(
                    tokens_context, tokens_needle, depth_percent
                )
                # Log the realized insertion depth for this needle.
                insertion_percentage = (insertion_point / len(tokens_context)) * 100
                self.insertion_percentages.append(insertion_percentage)

            # Advance the target depth for the next needle.
            depth_percent += depth_percent_interval

        new_context = self.model_to_test.decode_tokens(tokens_context)
        return new_context

    async def generate_context(self, context_length, depth_percent):
        """
        Generates a context of a specified length and inserts needles at given depth percentages.

        Args:
            context_length (int): The total length of the context in tokens.
            depth_percent (float): The depth percent for the first needle insertion.

        Returns:
            str: The context with needles inserted.
        """
        context = self.read_context_files()
        context = self.encode_and_trim(context, context_length)
        context = await self.insert_needles(context, depth_percent, context_length)
        return context

    async def evaluate_and_log(self, context_length, depth_percent):
        """
        Evaluates the model's performance with the generated context and logs the results.

        Skips the run entirely when ``save_results`` is set and a result for this
        (context_length, depth_percent) cell already exists.

        Args:
            context_length (int): The length of the context in tokens.
            depth_percent (float): The depth percent for needle insertion.
        """
        if self.save_results:
            if self.result_exists(context_length, depth_percent):
                return

        # Generate the required-length context with the needles placed inside it.
        context = await self.generate_context(context_length, depth_percent)

        # LangSmith path: hand the runnable chain to the LangSmith evaluator.
        # TODO: Support for other evaluators
        if self.evaluation_model.__class__.__name__ == "LangSmithEvaluator":
            chain = self.model_to_test.get_langchain_runnable(context)
            self.evaluation_model.evaluate_chain(
                chain,
                context_length,
                depth_percent,
                self.model_to_test.model_name,
                self.eval_set,
                len(self.needles),
                self.needles,
                self.insertion_percentages,
            )
        else:
            # Fall back to the single-needle evaluation flow from the parent class.
            await super().evaluate_and_log(context, context_length, depth_percent)

    def print_start_test_summary(self):
        """Print a human-readable summary of the test configuration before running."""
        print("\n")
        print("Starting Needles In A Haystack Testing...")
        print(f"- Model: {self.model_name}")
        print(f"- Context Lengths: {len(self.context_lengths)}, Min: {min(self.context_lengths)}, Max: {max(self.context_lengths)}")
        print(f"- Document Depths: {len(self.document_depth_percents)}, Min: {min(self.document_depth_percents)}%, Max: {max(self.document_depth_percents)}%")
        print(f"- Needles: {self.needles}")
        print("\n\n")