
Commit caea0b1

marginal progress toward talking to LangSmith
1 parent 0de1540 commit caea0b1

File tree

6 files changed: +1032 -97 lines changed


backend/pyproject.toml

Lines changed: 6 additions & 3 deletions

@@ -19,11 +19,13 @@ dependencies = [
     "python-dotenv",
     "pandas>=2.3.0",
     "vertexai>=1.43.0",
-    "langchain>=1.1.0",
-    "langchain-google-vertexai>=3.1.0",
-    "langsmith>=0.4.47",
+    "langchain>=1.1.0,<2.0.0",
+    "langchain-google-vertexai>=3.1.0,<4.0.0",
+    "langsmith>=0.4.47,<0.5.0",
     "langchain-core>=1.1.0",
     "openevals>=0.1.2",
+    "langchain-google-community>=3.0.1",
+    "polars>=1.35.2",
 ]

 [tool.setuptools.packages.find]
@@ -49,6 +51,7 @@ dev = [
     "types-Flask>=1.1.6",
     "types-simplejson>=3.20.0.20250326",
     "httpx>=0.27.0",
+    "google-cloud-discoveryengine>=0.15.0",
 ]

 gen_convo = [

backend/scripts/create_langsmith_dataset.py

Lines changed: 93 additions & 16 deletions

@@ -4,39 +4,83 @@
 for automated evaluation.
 """

+import argparse
 import ast
+import os
 from pathlib import Path
+from typing import List, Dict

-import pandas as pd
+import polars as pd
 from langsmith import Client

+if Path("../.env").exists():
+    from dotenv import load_dotenv

-def create_langsmith_dataset():
+    load_dotenv(override=True)
+
+
+def create_langsmith_dataset(
+    input_csv: Path, limit_examples: int, dataset_name: str, overwrite_dataset=False
+):
     """Upload test scenarios to LangSmith for automated evaluation."""
-    client = Client()
+    client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

-    # Read existing test scenarios.
-    csv_path = (
-        Path(__file__).parent
-        / "generate_conversation"
-        / "tenant_questions_facts_full.csv"
-    )
-    df = pd.read_csv(csv_path, encoding="cp1252")
+    # print(client.info)
+
+    dataset_exists = client.has_dataset(dataset_name=dataset_name)
+    if dataset_exists:
+        if overwrite_dataset:
+            print(f"-INFO- Dataset '{dataset_name}' already exists. Deleting for overwrite.")
+            client.delete_dataset(dataset_name=dataset_name)
+        else:
+            raise RuntimeError(
+                f"-ERROR- Dataset '{dataset_name}' already exists. Aborting to avoid duplicates."
+            )

     # Create dataset in LangSmith.
     dataset = client.create_dataset(
-        dataset_name="tenant-legal-qa-scenarios",
+        dataset_name=dataset_name,
         description="Test scenarios for Oregon tenant legal advice chatbot",
     )

+    # Read existing test scenarios.
+    csv_path = input_csv
+
+    # Try UTF-8 first, fallback to cp1252 if needed
+    try:
+        df = pd.read_csv(csv_path, encoding="utf-8", n_rows=limit_examples)
+    except UnicodeDecodeError:
+        df = pd.read_csv(csv_path, encoding="cp1252", n_rows=limit_examples)
+
+    # replace all empty "city" values with "null" string
+    df["city"].fill_null("null")
+
     # Convert each row to LangSmith example.
-    for idx, row in df.iterrows():
+    for idx, row in enumerate(df.rows(named=True)):
+
         facts = (
             ast.literal_eval(row["facts"])
             if isinstance(row["facts"], str)
             else row["facts"]
         )
-        city = row["city"] if not pd.isna(row["city"]) else "null"
+        city = row["city"]  # if not pd.is_null(row["city"]) else "null"
+
+        reference_conversation: List[Dict[str, str]] = []
+        if row.get("Original conversation") is not None:
+            for line in row.get("Original conversation").splitlines():
+                if line.startswith("You:"):
+                    reference_conversation.append(
+                        {"role": "user", "content": line.replace("You:", "").strip()}
+                    )
+                elif line.startswith("Bot:"):
+                    reference_conversation.append(
+                        {"role": "assistant", "content": line.replace("Bot:", "").strip()}
+                    )
+                else:
+                    if line.strip() == "":
+                        continue
+                    reference_conversation[-1]["content"] += "\n" + line.strip()
+

         # Each example has inputs and expected metadata.
         client.create_example(
@@ -46,21 +90,54 @@ def create_langsmith_dataset():
                 "city": city,
                 "state": row["state"],
                 "facts": facts,
+                # "message": reference_conversation
             },
             metadata={
                 "scenario_id": idx,
                 "city": city,
                 "state": row["state"],
                 # Tag scenarios for filtering.
-                "tags": ["tenant-rights", f"city-{city}", f"state-{row['state']}"],
+                "tags": [f"city-{city}", f"state-{row['state']}"],
             },
             # Optionally include reference conversation for comparison.
-            outputs={"reference_conversation": row.get("Original conversation", None)},
+            outputs={"reference_conversation": reference_conversation},
         )

     print(f"Created dataset '{dataset.name}' with {len(df)} scenarios")
     return dataset


 if __name__ == "__main__":
-    create_langsmith_dataset()
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--input-csv",
+        type=Path,
+        default=Path(__file__).parent
+        / "generate_conversation/tenant_questions_facts_full.csv",
+        help="Path to input CSV file",
+    )
+    parser.add_argument(
+        "--limit-examples",
+        type=int,
+        default=None,
+        help="Limit number of examples to upload",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="tenant-legal-qa-scenarios",
+        help="LangSmith dataset name",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Overwrite existing dataset"
+    )
+    args = parser.parse_args()
+
+    create_langsmith_dataset(
+        input_csv=args.input_csv,
+        limit_examples=args.limit_examples,
+        dataset_name=args.dataset_name,
+        overwrite_dataset=args.overwrite,
+    )
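
For context on the new transcript handling: the "You:"/"Bot:" parsing turns each CSV "Original conversation" cell into a list of role/content messages, folding continuation lines into the previous turn. A minimal sketch of the expected shape, using a made-up transcript (the transcript text and the `messages` name are illustrative, not from this commit):

    # Hypothetical transcript in the CSV's "Original conversation" format.
    raw = "You: My landlord kept my deposit.\nCan they do that?\nBot: Usually not without an itemized statement."

    messages = []
    for line in raw.splitlines():
        if line.startswith("You:"):
            messages.append({"role": "user", "content": line.replace("You:", "").strip()})
        elif line.startswith("Bot:"):
            messages.append({"role": "assistant", "content": line.replace("Bot:", "").strip()})
        elif line.strip():
            # Continuation lines are appended to the previous turn, as in the script.
            messages[-1]["content"] += "\n" + line.strip()

    # messages ==
    # [{"role": "user", "content": "My landlord kept my deposit.\nCan they do that?"},
    #  {"role": "assistant", "content": "Usually not without an itemized statement."}]

Per the argparse defaults above, the script itself would presumably be run along the lines of `python backend/scripts/create_langsmith_dataset.py --limit-examples 5 --overwrite`.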

backend/scripts/langsmith_evaluators.py

Lines changed: 19 additions & 11 deletions

@@ -5,9 +5,11 @@
 """

 import re
+from typing import List, Dict

-from langsmith.evaluation import Evaluator
+# from langsmith import SimpleEvaluator as Evaluator
 from openevals import create_llm_as_judge
+from openevals.types import SimpleEvaluator, EvaluatorResult
 from openevals.prompts import CORRECTNESS_PROMPT

 EVALUATOR_MODEL_NAME = "gemini-2.5-pro"
@@ -52,7 +54,7 @@
 </Reminders>
 """

-citation_accuracy_evaluator: Evaluator = create_llm_as_judge(
+citation_accuracy_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=CITATION_PROMPT,
 )
@@ -97,17 +99,23 @@
 </Reminders>
 """

-legal_correctness_evaluator: Evaluator = create_llm_as_judge(
+legal_correctness_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=LEGAL_CORRECTNESS_PROMPT,
 )


 # Evaluator 3: Response Completeness (LLM-as-Judge).
-completeness_evaluator: Evaluator = create_llm_as_judge(
-    model=EVALUATOR_MODEL_NAME,
-    prompt=CORRECTNESS_PROMPT,
-)
+def completeness_evaluator(
+    inputs: dict, outputs: dict, reference_outputs: List[Dict[str, str]]
+) -> EvaluatorResult | List[EvaluatorResult]:
+    tmp = create_llm_as_judge(
+        model=EVALUATOR_MODEL_NAME,
+        prompt=CORRECTNESS_PROMPT,
+        feedback_key="completeness",
+    )
+    return tmp(inputs=inputs, outputs=outputs, reference_outputs=reference_outputs)
+

 # Evaluator 4: Tone & Professionalism (LLM-as-Judge).
 TONE_PROMPT = """
@@ -141,14 +149,14 @@
 </Reminders>
 """

-tone_evaluator: Evaluator = create_llm_as_judge(
+tone_evaluator: SimpleEvaluator = create_llm_as_judge(
     model=EVALUATOR_MODEL_NAME,
     prompt=TONE_PROMPT,
 )


 # Evaluator 5: Citation Format (Heuristic).
-def citation_format_evaluator(run, example) -> Evaluator:
+def citation_format_evaluator(run, example) -> SimpleEvaluator:
     """Check if citations use proper HTML anchor tag format.

     Args:
@@ -188,7 +196,7 @@ def citation_format_evaluator(run, example) -> Evaluator:


 # Evaluator 6: Tool Usage (Heuristic).
-def tool_usage_evaluator(run, example) -> Evaluator:
+def tool_usage_evaluator(run, example) -> SimpleEvaluator:
     """Check if agent used RAG tools appropriately.

     Args:
@@ -219,7 +227,7 @@ def tool_usage_evaluator(run, example) -> Evaluator:


 # Evaluator 7: Performance Metrics (Heuristic).
-def performance_evaluator(run, example) -> Evaluator:
+def performance_evaluator(run, example) -> SimpleEvaluator:
     """Track latency and token usage.

     Args:
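
These evaluators are presumably meant to be plugged into a LangSmith experiment against the dataset uploaded by the script above. A minimal sketch of how that wiring might look, assuming LangSmith's Client.evaluate entry point; the target function, import path, and experiment prefix here are illustrative, not part of this commit:

    from langsmith import Client

    from langsmith_evaluators import (
        citation_format_evaluator,
        completeness_evaluator,
        tool_usage_evaluator,
    )

    client = Client()

    def answer_question(inputs: dict) -> dict:
        # Placeholder: call the tenant-rights chatbot under test here.
        return {"answer": "..."}

    results = client.evaluate(
        answer_question,
        data="tenant-legal-qa-scenarios",
        evaluators=[completeness_evaluator, citation_format_evaluator, tool_usage_evaluator],
        experiment_prefix="tenant-legal-qa",
    )

The heuristic evaluators take (run, example) while the LLM-as-judge ones take inputs/outputs/reference_outputs; LangSmith generally dispatches on evaluator signatures, so mixing both styles in one evaluators list should work.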
