import uuid
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain.prompts import PromptTemplate
from langchain.smith import RunEvalConfig
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langsmith.client import Client
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
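
# Note: this module assumes API credentials are available in the environment,
# typically OPENAI_API_KEY for ChatOpenAI and LANGCHAIN_API_KEY (and, if
# needed, LANGCHAIN_ENDPOINT) for the LangSmith Client.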

@run_evaluator
def score_relevance(run: Run, example: Example | None = None):
    """
    A custom evaluator function that grades the language model's response based on its
    relevance to a reference answer.

    Args:
        run (Run): The execution run containing the model's response.
        example (Example | None): An optional example containing the reference answer.

    Returns:
        EvaluationResult: The result of the evaluation, containing the relevance score.
    """
    student_answer = run.outputs["output"]
    reference = example.outputs["answer"]

    # Grade prompt
    template = """You are an expert grader of student answers relative to a reference answer. \n
    The reference answer is a single ingredient or a list of ingredients related to pizza \n
    toppings. The grade is the number of correctly returned ingredients relative to the reference. \n
    For example, if the reference has 5 ingredients and the student returns 3, then the grade is 3. \n
    Here is the student answer: \n --- --- --- \n {answer}
    Here is the reference answer: \n --- --- --- \n {reference}"""

    # Prompt
    prompt = PromptTemplate(
        template=template,
        input_variables=["answer", "reference"],
    )

    # Data model for the structured grade
    class grade(BaseModel):
        """Grade output"""

        score: int = Field(description="Score from grader")

    # LLM: use the most performant model as the grader
    model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview")

    # Tool schema derived from the Pydantic model
    grade_tool_oai = convert_to_openai_tool(grade)

    # Bind the tool and force the model to invoke it
    llm_with_tool = model.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Parser that converts the tool call back into a `grade` instance
    parser_tool = PydanticToolsParser(tools=[grade])

    chain = prompt | llm_with_tool | parser_tool
    score = chain.invoke({"answer": student_answer, "reference": reference})
    return EvaluationResult(key="needles_retrieved", score=score[0].score)
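
# For intuition, a hypothetical grading case (the real references come from the
# LangSmith dataset): given the reference "pepperoni, mushrooms, olives" and the
# student answer "pepperoni and olives", the grader should emit score=2, since
# two of the three reference ingredients were recovered.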

class LangSmithEvaluator():
    """
    An evaluator class that leverages the LangSmith API for evaluating language models'
    performance on specific tasks. This class primarily focuses on evaluating the ability
    of a language model to retrieve and accurately present information from a provided
    context (the "needle" in a "haystack").
    """

    def evaluate_chain(self, chain, context_length, depth_percent, model_name, eval_set,
                       num_needles, needles, insertion_percentages):
        """
        Evaluates a language model's chain of operations, specifically focusing on the
        model's ability to retrieve information accurately from a given context. The chain
        is run against a LangSmith dataset, and each response is graded by the custom
        `score_relevance` evaluator against a reference answer.

        Args:
            chain: The LangChain runnable or chain of operations to be evaluated.
            context_length (int): The length of the context in tokens.
            depth_percent (float): The percentage depth in the context where the information (needle) is located.
            model_name (str): The name of the language model being evaluated.
            eval_set (str): The evaluation set identifier, used to categorize and reference the evaluation.
            num_needles (int): The number of needles in the haystack.
            needles (list[str]): The needles inserted into the haystack.
            insertion_percentages (list[float]): The location of each needle in the haystack.

        Details:
            The evaluation involves a grading prompt that asks a grader model to score
            student responses based on their relevance to a given reference answer. This
            approach quantifies the model's accuracy in retrieving and synthesizing
            information from the provided context.
        """
        # Config: attach the custom evaluator
        evaluation_config = RunEvalConfig(
            custom_evaluators=[score_relevance],
        )

        client = Client()
        run_id = uuid.uuid4().hex[:4]
        project_name = eval_set

        client.run_on_dataset(
            dataset_name=eval_set,
            llm_or_chain_factory=chain,
            project_metadata={
                "context_length": context_length,
                "depth_percent": depth_percent,
                "num_needles": num_needles,
                "needles": needles,
                "insertion_percentages": insertion_percentages,
                "model_name": model_name,
            },
            evaluation=evaluation_config,
            project_name=f"{context_length}-{depth_percent}--{model_name}--{project_name}--{run_id}",
        )