Skip to content

Commit e36dfed

Browse files
committed
Update to support LangSmith dataset splits
1 parent 2b4b97f commit e36dfed

File tree

11 files changed

+185
-29
lines changed

11 files changed

+185
-29
lines changed

examples/langsmith-zeph-hr-qa/README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,15 @@ cd examples/langsmith-zeph-hr-qa
3737
python setup_dataset.py
3838
```
3939

40-
This creates two datasets: `zephhr-qa-opt` and `zephhr-qa-holdout`.
40+
This creates a single dataset `zephhr-qa` with two splits: `opt` (optimization) and `holdout` (validation).
4141

4242
## Optimize
4343

4444
```bash
4545
weco run --source agent.py \
4646
--eval-backend langsmith \
47-
--langsmith-dataset zephhr-qa-opt \
47+
--langsmith-dataset zephhr-qa \
48+
--langsmith-splits opt \
4849
--langsmith-target agent:answer_hr_question \
4950
--langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \
5051
--langsmith-dashboard-evaluators helpfulness correctness \
@@ -58,7 +59,8 @@ weco run --source agent.py \
5859
```bash
5960
weco run --source agent.py \
6061
--eval-backend langsmith \
61-
--langsmith-dataset zephhr-qa-holdout \
62+
--langsmith-dataset zephhr-qa \
63+
--langsmith-splits holdout \
6264
--langsmith-target agent:answer_hr_question \
6365
--langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \
6466
--langsmith-dashboard-evaluators helpfulness correctness \

examples/langsmith-zeph-hr-qa/setup_dataset.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
"""Create ZephHR QA datasets in LangSmith (idempotent).
1+
"""Create ZephHR QA dataset with splits in LangSmith (idempotent).
22
3-
Reads the JSON data splits and creates/updates:
4-
- zephhr-qa-opt (optimization split)
5-
- zephhr-qa-holdout (held-out validation split)
3+
Reads the JSON data files and creates/updates a single dataset with two splits:
4+
- opt (optimization split — 15 questions)
5+
- holdout (held-out validation split — 10 questions)
66
"""
77

88
import json
@@ -13,17 +13,12 @@
1313

1414
DATA_DIR = Path(__file__).with_name("data")
1515

16-
DATASETS = {
17-
"opt": {
18-
"name": "zephhr-qa-opt",
19-
"description": "ZephHR QA optimization split",
20-
"file": "optimization_questions.json",
21-
},
22-
"holdout": {
23-
"name": "zephhr-qa-holdout",
24-
"description": "ZephHR QA held-out validation split",
25-
"file": "holdout_questions.json",
26-
},
16+
DATASET_NAME = "zephhr-qa"
17+
DATASET_DESCRIPTION = "ZephHR QA benchmark"
18+
19+
SPLITS = {
20+
"opt": "optimization_questions.json",
21+
"holdout": "holdout_questions.json",
2722
}
2823

2924

@@ -58,13 +53,12 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
5853
skipped += 1
5954
continue
6055

61-
outputs = {"expected_answer": record["expected_answer"]}
62-
6356
client.create_example(
6457
inputs={"question": record["question"]},
65-
outputs=outputs,
58+
outputs={"expected_answer": record["expected_answer"]},
6659
dataset_id=dataset.id,
6760
metadata={"case_id": case_id, "split": split},
61+
split=split,
6862
)
6963
added += 1
7064

@@ -73,17 +67,18 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
7367

7468
def main():
7569
client = Client()
70+
dataset = _get_or_create_dataset(client, DATASET_NAME, DATASET_DESCRIPTION)
7671

77-
for split, cfg in DATASETS.items():
78-
records = json.loads((DATA_DIR / cfg["file"]).read_text())
79-
dataset = _get_or_create_dataset(client, cfg["name"], cfg["description"])
72+
for split, filename in SPLITS.items():
73+
records = json.loads((DATA_DIR / filename).read_text())
8074
added, skipped = _populate(client, dataset, split, records)
81-
print(f" {cfg['name']}: added={added}, skipped_existing={skipped}, total_target={len(records)}")
75+
print(f" {DATASET_NAME} [{split}]: added={added}, skipped_existing={skipped}, total_target={len(records)}")
8276

8377
print("\n--- Run optimization ---")
8478
print("weco run --source agent.py \\")
8579
print(" --eval-backend langsmith \\")
86-
print(" --langsmith-dataset zephhr-qa-opt \\")
80+
print(f" --langsmith-dataset {DATASET_NAME} \\")
81+
print(" --langsmith-splits opt \\")
8782
print(" --langsmith-target agent:answer_hr_question \\")
8883
print(" --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\")
8984
print(" --langsmith-dashboard-evaluators helpfulness correctness \\")
@@ -94,7 +89,8 @@ def main():
9489
print("\n--- Run holdout validation ---")
9590
print("weco run --source agent.py \\")
9691
print(" --eval-backend langsmith \\")
97-
print(" --langsmith-dataset zephhr-qa-holdout \\")
92+
print(f" --langsmith-dataset {DATASET_NAME} \\")
93+
print(" --langsmith-splits holdout \\")
9894
print(" --langsmith-target agent:answer_hr_question \\")
9995
print(" --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\")
10096
print(" --langsmith-dashboard-evaluators helpfulness correctness \\")

tests/langsmith/test_backend.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def test_adds_langsmith_flags(self):
4343
assert hasattr(args, "langsmith_metric_function")
4444
assert hasattr(args, "langsmith_dashboard_evaluators")
4545
assert hasattr(args, "langsmith_dashboard_evaluator_timeout")
46+
assert hasattr(args, "langsmith_splits")
4647

4748
def test_defaults_are_none_or_expected(self):
4849
"""Default values are None for optional args, 'mean' for summary, 'raw' for adapter."""
@@ -57,6 +58,7 @@ def test_defaults_are_none_or_expected(self):
5758
assert args.langsmith_metric_function is None
5859
assert args.langsmith_dashboard_evaluators is None
5960
assert args.langsmith_dashboard_evaluator_timeout == 900
61+
assert args.langsmith_splits is None
6062

6163
def test_parses_all_flags(self):
6264
"""All flags can be parsed from command line."""
@@ -86,6 +88,9 @@ def test_parses_all_flags(self):
8688
"Conciseness",
8789
"--langsmith-dashboard-evaluator-timeout",
8890
"60",
91+
"--langsmith-splits",
92+
"train",
93+
"test",
8994
]
9095
)
9196

@@ -100,6 +105,7 @@ def test_parses_all_flags(self):
100105
assert args.langsmith_metric_function == "scoring:combine"
101106
assert args.langsmith_dashboard_evaluators == ["Conciseness"]
102107
assert args.langsmith_dashboard_evaluator_timeout == 60
108+
assert args.langsmith_splits == ["train", "test"]
103109

104110

105111
# ---------------------------------------------------------------------------
@@ -314,6 +320,28 @@ def test_no_evaluators_flag_when_empty(self):
314320
cmd = build_eval_command(args)
315321
assert "--evaluators" not in cmd
316322

323+
def test_splits_in_command(self):
324+
"""--langsmith-splits is forwarded as --splits in the eval command when set."""
325+
parser = _make_parser()
326+
args = parser.parse_args(
327+
["--metric", "acc", "--langsmith-dataset", "data", "--langsmith-target", "m:f",
328+
"--langsmith-evaluators", "acc", "--langsmith-splits", "train", "test"]
329+
)
330+
validate_args(args)
331+
cmd = build_eval_command(args)
332+
assert "--splits train test" in cmd
333+
334+
def test_splits_omitted_when_none(self):
335+
"""--splits is not in the command when no splits are specified."""
336+
parser = _make_parser()
337+
args = parser.parse_args(
338+
["--metric", "acc", "--langsmith-dataset", "data", "--langsmith-target", "m:f",
339+
"--langsmith-evaluators", "acc"]
340+
)
341+
validate_args(args)
342+
cmd = build_eval_command(args)
343+
assert "--splits" not in cmd
344+
317345

318346
# ---------------------------------------------------------------------------
319347
# Backend dispatch (_load_backend)

tests/langsmith/test_bridge.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,32 @@ def test_empty_results(self, mock_resolve):
301301

302302
assert metrics == {}
303303

304+
@patch("weco.integrations.langsmith.bridge.resolve_evaluators")
305+
def test_splits_filters_examples(self, mock_resolve):
306+
"""When splits are provided, list_examples is called with the splits parameter."""
307+
mock_resolve.return_value = [lambda r, e: {"score": 1}]
308+
309+
mock_results = self._make_mock_results([{"accuracy": 1.0}])
310+
311+
mock_client_cls = MagicMock()
312+
mock_client = mock_client_cls.return_value
313+
mock_client.list_examples.return_value = ["example1"]
314+
mock_client.evaluate.return_value = mock_results
315+
316+
with patch.dict("sys.modules", {"langsmith": MagicMock(Client=mock_client_cls)}):
317+
run_langsmith_eval(
318+
dataset_name="test-data",
319+
target=lambda x: x,
320+
evaluator_names=["accuracy"],
321+
metric_name="accuracy",
322+
splits=["train"],
323+
)
324+
325+
mock_client.list_examples.assert_called_once_with(dataset_name="test-data", splits=["train"])
326+
# data should be the filtered examples, not the dataset name
327+
call_kwargs = mock_client.evaluate.call_args
328+
assert call_kwargs[1]["data"] == ["example1"]
329+
304330

305331
# ---------------------------------------------------------------------------
306332
# main() — output format

tests/langsmith/wizard/test_args_mapping.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,12 @@ def test_required_langsmith_args_mapped(self):
144144
assert args.langsmith_target_adapter == "langchain"
145145
assert args.langsmith_evaluators == ["acc", "rel"]
146146
assert args.langsmith_dashboard_evaluators == ["Conciseness"]
147+
148+
def test_splits_mapped(self):
149+
"""Selected splits are mapped from wizard config to args."""
150+
args = self._make_args()
151+
self._run_wizard_with_config(args, {
152+
"dataset": "d", "target": "m:f",
153+
"splits": ["opt", "holdout"],
154+
})
155+
assert args.langsmith_splits == ["opt", "holdout"]

tests/langsmith/wizard/test_server.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,19 @@ def test_list_examples_returns_inputs_outputs(self, mock_client_prop, wizard_ser
250250
assert data["examples"] == [{"inputs": {"q": "hi"}, "outputs": {"a": "hello"}}, {"inputs": {}, "outputs": {}}]
251251
mock_client.list_examples.assert_called_once_with(dataset_name="my-dataset", limit=5)
252252

253+
@patch.object(WizardServer, "client", new_callable=PropertyMock)
254+
def test_list_splits(self, mock_client_prop, wizard_server):
255+
"""GET /api/datasets/<name>/splits returns available splits."""
256+
mock_client = MagicMock()
257+
mock_client.list_dataset_splits.return_value = ["train", "test"]
258+
mock_client_prop.return_value = mock_client
259+
260+
conn, _, _ = wizard_server
261+
resp, data = get_json(conn, "/api/datasets/my-dataset/splits")
262+
assert resp.status == 200
263+
assert data["splits"] == ["train", "test"]
264+
mock_client.list_dataset_splits.assert_called_once_with(dataset_name="my-dataset")
265+
253266
def test_file_tree_lists_cwd(self, wizard_server, tmp_path, monkeypatch):
254267
"""GET /api/file-tree returns directory entries."""
255268
(tmp_path / "agent.py").write_text("pass")

weco/integrations/langsmith/backend.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ def register_args(parser: argparse.ArgumentParser) -> None:
8282
"as module:function (e.g. 'scoring:combine'). The function receives a dict "
8383
"of {evaluator_name: aggregated_score} and returns a single float.",
8484
)
85+
parser.add_argument(
86+
"--langsmith-splits",
87+
nargs="+",
88+
type=str,
89+
default=None,
90+
help="Evaluate only examples in these dataset splits (e.g. 'opt', 'holdout').",
91+
)
8592
parser.add_argument(
8693
"--langsmith-dashboard-evaluator-timeout",
8794
type=int,
@@ -143,6 +150,9 @@ def build_eval_command(args: argparse.Namespace) -> str:
143150
parts.extend(["--max-examples", str(args.langsmith_max_examples)])
144151
if args.langsmith_target_adapter != "raw":
145152
parts.extend(["--target-adapter", args.langsmith_target_adapter])
153+
if args.langsmith_splits:
154+
parts.append("--splits")
155+
parts.extend(args.langsmith_splits)
146156
if args.langsmith_metric_function:
147157
parts.extend(["--metric-function", args.langsmith_metric_function])
148158
if args.langsmith_dashboard_evaluators:

weco/integrations/langsmith/bridge.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ def run_langsmith_eval(
200200
summary_mode: str = "mean",
201201
max_concurrency: int = None,
202202
max_examples: int = None,
203+
splits: list = None,
203204
dashboard_evaluators: list = None,
204205
dashboard_evaluator_timeout: int = 0,
205206
metric_function: Callable = None,
@@ -215,6 +216,7 @@ def run_langsmith_eval(
215216
summary_mode: How to aggregate per-example scores (mean/median/min/max).
216217
max_concurrency: Number of parallel evaluation threads.
217218
max_examples: Limit evaluation to N examples from the dataset.
219+
splits: Restrict evaluation to examples in these dataset splits (e.g. ['train', 'test']). When None or empty, the whole dataset is evaluated.
218220
dashboard_evaluators: Names of expected dashboard-bound evaluators.
219221
When set, enables polling with a default timeout of 900s.
220222
dashboard_evaluator_timeout: Seconds to poll for dashboard-bound evaluator scores
@@ -235,9 +237,15 @@ def run_langsmith_eval(
235237
client = Client()
236238
evaluators = resolve_evaluators(evaluator_names)
237239

240+
# When splits are specified, filter examples by split
241+
if splits:
242+
data = client.list_examples(dataset_name=dataset_name, splits=splits)
243+
else:
244+
data = dataset_name
245+
238246
# target is positional-only in Client.evaluate(), so pass it separately
239247
eval_kwargs = {
240-
"data": dataset_name,
248+
"data": data,
241249
"evaluators": evaluators,
242250
"experiment_prefix": experiment_prefix or f"weco-{dataset_name}",
243251
}
@@ -353,6 +361,12 @@ def main():
353361
help="Seconds to poll for dashboard evaluator scores (default: 900). "
354362
"Only used when --dashboard-evaluators is set. Polls every 10s.",
355363
)
364+
parser.add_argument(
365+
"--splits",
366+
nargs="+",
367+
default=None,
368+
help="Evaluate only examples in these dataset splits (e.g. 'train', 'test').",
369+
)
356370
parser.add_argument(
357371
"--metric-function",
358372
default=None,
@@ -400,6 +414,7 @@ def main():
400414
summary_mode=args.summary,
401415
max_concurrency=args.max_concurrency,
402416
max_examples=args.max_examples,
417+
splits=args.splits,
403418
dashboard_evaluators=args.dashboard_evaluators,
404419
dashboard_evaluator_timeout=args.dashboard_evaluator_timeout or 0,
405420
metric_function=metric_fn,

weco/integrations/langsmith/wizard/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def run_wizard(args: argparse.Namespace) -> None:
4747
"langsmith_max_examples": getattr(args, "langsmith_max_examples", None),
4848
"langsmith_max_concurrency": getattr(args, "langsmith_max_concurrency", None),
4949
"langsmith_dashboard_evaluator_timeout": getattr(args, "langsmith_dashboard_evaluator_timeout", 900),
50+
"langsmith_splits": getattr(args, "langsmith_splits", None),
5051
}
5152

5253
html_path = Path(__file__).parent / "page.html"
@@ -152,6 +153,8 @@ def run_wizard(args: argparse.Namespace) -> None:
152153
args.langsmith_max_concurrency = config_result["langsmith_max_concurrency"]
153154
if config_result.get("langsmith_dashboard_evaluator_timeout") is not None:
154155
args.langsmith_dashboard_evaluator_timeout = config_result["langsmith_dashboard_evaluator_timeout"]
156+
if config_result.get("splits"):
157+
args.langsmith_splits = config_result["splits"]
155158
if config_result.get("metric_function"):
156159
args.langsmith_metric_function = config_result["metric_function"]
157160

0 commit comments

Comments
 (0)