Commit 4804609

test: added end-to-end test for langsmith (#984)
1 parent 54e9f4d commit 4804609

File tree: 5 files changed (+134 −10 lines)

src/ragas/metrics/base.py

Lines changed: 6 additions & 5 deletions
@@ -61,11 +61,13 @@ def get_required_columns(
 class Metric(ABC):
     @property
     @abstractmethod
-    def name(self) -> str: ...
+    def name(self) -> str:
+        ...

     @property
     @abstractmethod
-    def evaluation_mode(self) -> EvaluationMode: ...
+    def evaluation_mode(self) -> EvaluationMode:
+        ...

     @abstractmethod
     def init(self, run_config: RunConfig):
@@ -127,9 +129,8 @@ async def ascore(
         return score

     @abstractmethod
-    async def _ascore(
-        self, row: t.Dict, callbacks: Callbacks, is_async: bool
-    ) -> float: ...
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float:
+        ...


 @dataclass
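
For orientation, the `Metric` ABC shown above asks subclasses for a `name`, an `evaluation_mode`, an `init(run_config)` hook, and an async `_ascore(row, callbacks, is_async)` scorer; the change here only moves the `...` placeholder bodies onto their own lines. A minimal, self-contained sketch of that interface shape (hypothetical names, deliberately not importing ragas) could look like this:

# Hypothetical sketch only -- mirrors the abstract interface above without
# importing ragas; MetricShape and DummyMetric are invented names.
import typing as t
from abc import ABC, abstractmethod


class MetricShape(ABC):
    @property
    @abstractmethod
    def name(self) -> str:
        ...

    @abstractmethod
    async def _ascore(self, row: t.Dict, callbacks: t.Any, is_async: bool) -> float:
        ...


class DummyMetric(MetricShape):
    @property
    def name(self) -> str:
        return "dummy_metric"

    async def _ascore(self, row: t.Dict, callbacks: t.Any, is_async: bool) -> float:
        # score 1.0 if the row has a non-empty answer, else 0.0
        return 1.0 if row.get("answer") else 0.0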

src/ragas/testset/generator.py

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ def generate(
         ]
         total_evolutions = 0
         for evolution, probability in distributions.items():
-            for i in sample(range(test_size),round(probability * test_size)):
+            for i in sample(range(test_size), round(probability * test_size)):
                 exec.submit(
                     evolution.evolve,
                     current_nodes[i],
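
The fix here only adds a space after the comma; the surrounding call picks, for each evolution type, `round(probability * test_size)` distinct row indices to evolve. A standalone illustration with the standard library (the numbers are invented for the example):

# Not code from this commit -- just shows what the sample(...) call returns.
from random import sample

test_size = 10     # hypothetical size of the test set
probability = 0.3  # hypothetical share assigned to one evolution type
indices = sample(range(test_size), round(probability * test_size))
print(indices)     # e.g. [7, 2, 9] -- three distinct indices in range(10)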

tests/conftest.py

Lines changed: 9 additions & 2 deletions
@@ -15,9 +15,16 @@ def pytest_configure(config):
     """
     configure pytest
     """
-    # adda
+    # Extra Pytest Markers
+    # add `ragas_ci`
     config.addinivalue_line(
-        "markers", "ragas_ci: Set of tests that will be run as part of Ragas CI"
+        "markers",
+        "ragas_ci: Set of tests that will be run as part of Ragas CI",
+    )
+    # add `e2e`
+    config.addinivalue_line(
+        "markers",
+        "e2e: End-to-End tests for Ragas",
     )
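
Registering the markers this way keeps pytest from warning about unknown marks on tests tagged `@pytest.mark.ragas_ci` or `@pytest.mark.e2e()` (as in the new test file below), and either group can then be selected or excluded from the command line, e.g. `pytest -m e2e` or `pytest -m "not e2e"`; how the CI workflow actually invokes these markers is not part of this diff.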

tests/e2e/test_amnesty_in_ci.py

Lines changed: 2 additions & 2 deletions
@@ -4,9 +4,9 @@
 from ragas import evaluate
 from ragas.metrics import (
     answer_relevancy,
-    faithfulness,
-    context_recall,
     context_precision,
+    context_recall,
+    faithfulness,
 )

 # loading the V2 dataset

tests/e2e/test_langsmith.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+from uuid import UUID
+
+import pandas as pd
+import pytest
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import (
+    Runnable,
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain_openai import ChatOpenAI
+from langsmith import Client
+
+client = Client()
+
+
+# datarows with queries and ground truth
+# sample questions
+questions = [
+    "What are the advantages of remote work? Why does gitlab do it?",
+    "what are the dis-advantages of remote work in gitlab? How do you mitigate that?",
+    "What does it mean to be 'all-remote'? Why is it important?",
+    "How does compensation work in an 'all-remote' setting?",
+    "How to run effective meetings in 'all-remote' setting",
+]
+
+answers = [
+    "Remote work offers numerous advantages including increased flexibility, the ability to hire top talent globally without geographical constraints, enhanced productivity due to fewer distractions, and significant cost savings on office space and related expenses. GitLab adopts an all-remote model to leverage these benefits, ensuring a more inclusive and diverse workforce, fostering a culture that values output over presence, and providing employees the freedom to work in environments that best suit their personal and professional needs. This approach not only supports individual employee well-being and work-life balance but also positions GitLab as a resilient and adaptive organization in a rapidly changing global work landscape.",
+    "GitLab's remote work challenges include onboarding difficulties, loneliness, communication breakdowns, work/life balance issues, time zone conflicts, and the need for strong time management skills. To mitigate these, GitLab employs strategies such as providing comprehensive onboarding resources, fostering community through virtual coffee chats and social calls, prioritizing asynchronous communication, reimbursing coworking spaces, empowering employees to manage their own schedules, focusing on results rather than hours, and screening for self-discipline during the hiring process. These measures aim to enhance productivity and employee satisfaction in a remote work setting.",
+    "Being 'all-remote' means that an organization empowers every individual to work from any location where they feel most fulfilled, without the need to report to a company-owned office, thereby treating all employees equally regardless of their physical location. This approach is important because it eliminates location hierarchy, allowing for a more inclusive work environment where team members have the autonomy to create their ideal workspace and can maintain their job regardless of life changes, such as relocations due to family commitments. It supports a diverse workforce, including caregivers, working parents, and military spouses, by providing them with the flexibility to work from anywhere, fostering equality among all employees and enabling a global talent pool without the constraints of geographical boundaries.",
+    "In an 'all-remote' setting, such as at GitLab, compensation is structured around local rates rather than a single global standard, which means employees are paid based on the cost of living and market rates in their respective locations. This approach allows the company to hire globally without being bound by the high salary standards of any particular region, like San Francisco. GitLab uses a compensation calculator to ensure transparency and fairness in pay, adjusting salaries based on a combination of factors including location, experience, and market data. Payments are typically made in the local currency of the employee, and for countries where direct employment isn't feasible, GitLab utilizes professional employment organizations or hires contractors. This model supports GitLab's global talent acquisition strategy while managing compensation costs effectively.",
+    "To run effective meetings in an 'all-remote' setting, it's crucial to be intentional about meeting necessity, provide clear agendas and supporting materials in advance, start and end on time, document discussions in real time, and make attendance optional to respect time zones and individual schedules. Recording meetings for asynchronous viewing, using reliable communication tools like Zoom, and ensuring active participation through video feedback are also key practices. This approach aligns with GitLab's guidelines for maximizing efficiency, inclusivity, and collaboration in a remote work environment.",
+]
+
+dataset = {"question": questions, "ground_truth": answers}
+
+
+def upload_to_langsmith(dataset_name: str) -> UUID:
+    # Creating a pandas DataFrame from the dataset dictionary
+    df = pd.DataFrame(dataset)
+
+    # upload to langsmith
+    langsmith_dataset = client.upload_dataframe(
+        name=dataset_name,
+        description="temporal dataset for testing langsmith",
+        df=df,
+        input_keys=["question"],
+        output_keys=["ground_truth"],
+    )
+
+    return langsmith_dataset.id
+
+
+def clean_langsmith(langsmith_dataset_id: UUID):
+    # clean langsmith
+    client.delete_dataset(dataset_id=langsmith_dataset_id)
+
+
+def llm_chain_factory() -> Runnable:
+    # just LLM
+    template = """Use the following pieces of context to answer the question at the end.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+Use three sentences maximum and keep the answer as concise as possible.
+Always say "thanks for asking!" at the end of the answer.
+
+Question: {question}
+
+Helpful Answer:"""
+    llm_prompt = PromptTemplate.from_template(template)
+
+    # LLM
+    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+
+    # just llm pipeline
+    just_llm = (
+        {"question": RunnablePassthrough()}
+        | llm_prompt
+        | llm
+        | StrOutputParser()
+        | RunnableParallel(
+            {
+                "answer": RunnablePassthrough(),
+                "contexts": RunnableLambda(lambda _: [""]),
+            }
+        )
+    )
+
+    return just_llm
+
+
+@pytest.fixture()
+def langsmith_dataset():
+    dataset_name = "temporal_dataset"
+    langsmith_dataset_id = upload_to_langsmith(dataset_name)
+    yield dataset_name
+    clean_langsmith(langsmith_dataset_id)
+
+
+@pytest.mark.e2e()
+def test_langsmith_evaluate(langsmith_dataset):
+    # setup
+    just_llm = llm_chain_factory()
+
+    from ragas.integrations.langsmith import evaluate
+    from ragas.metrics import answer_correctness
+
+    # evaluate just llms
+    _ = evaluate(
+        dataset_name=langsmith_dataset,
+        llm_or_chain_factory=just_llm,
+        # experiment_name="just_llm",
+        metrics=[answer_correctness],
+        verbose=True,
+    )
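
Taken together: the `langsmith_dataset` fixture uploads the small question/ground-truth set to LangSmith, hands the dataset name to the test, and deletes the dataset again on teardown, while the test itself calls `ragas.integrations.langsmith.evaluate` to run the bare-LLM chain over that dataset and score it with `answer_correctness`. Running this locally would presumably need valid OpenAI and LangSmith credentials in the environment plus marker selection along the lines of `pytest -m e2e tests/e2e/test_langsmith.py`; the exact environment variables and CI wiring are not shown in this commit.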
