Add support for LangSmith dataset splits

aliroberts · aliroberts · commit e36dfed98558 · 2026-03-09T12:00:34.000Z
diff --git a/examples/langsmith-zeph-hr-qa/README.md b/examples/langsmith-zeph-hr-qa/README.md
@@ -37,14 +37,15 @@ cd examples/langsmith-zeph-hr-qa
 python setup_dataset.py
 ```
 
-This creates two datasets: `zephhr-qa-opt` and `zephhr-qa-holdout`.
+This creates a single dataset `zephhr-qa` with two splits: `opt` (optimization) and `holdout` (validation).
 
 ## Optimize
 
 ```bash
 weco run --source agent.py \
   --eval-backend langsmith \
-  --langsmith-dataset zephhr-qa-opt \
+  --langsmith-dataset zephhr-qa \
+  --langsmith-splits opt \
   --langsmith-target agent:answer_hr_question \
   --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \
   --langsmith-dashboard-evaluators helpfulness correctness \
@@ -58,7 +59,8 @@ weco run --source agent.py \
 ```bash
 weco run --source agent.py \
   --eval-backend langsmith \
-  --langsmith-dataset zephhr-qa-holdout \
+  --langsmith-dataset zephhr-qa \
+  --langsmith-splits holdout \
   --langsmith-target agent:answer_hr_question \
   --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \
   --langsmith-dashboard-evaluators helpfulness correctness \
diff --git a/examples/langsmith-zeph-hr-qa/setup_dataset.py b/examples/langsmith-zeph-hr-qa/setup_dataset.py
@@ -1,8 +1,8 @@
-"""Create ZephHR QA datasets in LangSmith (idempotent).
+"""Create ZephHR QA dataset with splits in LangSmith (idempotent).
 
-Reads the JSON data splits and creates/updates:
-- zephhr-qa-opt        (optimization split)
-- zephhr-qa-holdout    (held-out validation split)
+Reads the JSON data files and creates/updates a single dataset with two splits:
+- opt        (optimization split — 15 questions)
+- holdout    (held-out validation split — 10 questions)
 """
 
 import json
@@ -13,17 +13,12 @@
 
 DATA_DIR = Path(__file__).with_name("data")
 
-DATASETS = {
-    "opt": {
-        "name": "zephhr-qa-opt",
-        "description": "ZephHR QA optimization split",
-        "file": "optimization_questions.json",
-    },
-    "holdout": {
-        "name": "zephhr-qa-holdout",
-        "description": "ZephHR QA held-out validation split",
-        "file": "holdout_questions.json",
-    },
+DATASET_NAME = "zephhr-qa"
+DATASET_DESCRIPTION = "ZephHR QA benchmark"
+
+SPLITS = {
+    "opt": "optimization_questions.json",
+    "holdout": "holdout_questions.json",
 }
 
 
@@ -58,13 +53,12 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
             skipped += 1
             continue
 
-        outputs = {"expected_answer": record["expected_answer"]}
-
         client.create_example(
             inputs={"question": record["question"]},
-            outputs=outputs,
+            outputs={"expected_answer": record["expected_answer"]},
             dataset_id=dataset.id,
             metadata={"case_id": case_id, "split": split},
+            split=split,
         )
         added += 1
 
@@ -73,17 +67,18 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
 
 def main():
     client = Client()
+    dataset = _get_or_create_dataset(client, DATASET_NAME, DATASET_DESCRIPTION)
 
-    for split, cfg in DATASETS.items():
-        records = json.loads((DATA_DIR / cfg["file"]).read_text())
-        dataset = _get_or_create_dataset(client, cfg["name"], cfg["description"])
+    for split, filename in SPLITS.items():
+        records = json.loads((DATA_DIR / filename).read_text())
         added, skipped = _populate(client, dataset, split, records)
-        print(f"  {cfg['name']}: added={added}, skipped_existing={skipped}, total_target={len(records)}")
+        print(f"  {DATASET_NAME} [{split}]: added={added}, skipped_existing={skipped}, total_target={len(records)}")
 
     print("\n--- Run optimization ---")
     print("weco run --source agent.py \\")
     print("  --eval-backend langsmith \\")
-    print("  --langsmith-dataset zephhr-qa-opt \\")
+    print(f"  --langsmith-dataset {DATASET_NAME} \\")
+    print("  --langsmith-splits opt \\")
     print("  --langsmith-target agent:answer_hr_question \\")
     print("  --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\")
     print("  --langsmith-dashboard-evaluators helpfulness correctness \\")
@@ -94,7 +89,8 @@ def main():
     print("\n--- Run holdout validation ---")
     print("weco run --source agent.py \\")
     print("  --eval-backend langsmith \\")
-    print("  --langsmith-dataset zephhr-qa-holdout \\")
+    print(f"  --langsmith-dataset {DATASET_NAME} \\")
+    print("  --langsmith-splits holdout \\")
     print("  --langsmith-target agent:answer_hr_question \\")
     print("  --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\")
     print("  --langsmith-dashboard-evaluators helpfulness correctness \\")
diff --git a/tests/langsmith/test_backend.py b/tests/langsmith/test_backend.py
@@ -43,6 +43,7 @@ def test_adds_langsmith_flags(self):
         assert hasattr(args, "langsmith_metric_function")
         assert hasattr(args, "langsmith_dashboard_evaluators")
         assert hasattr(args, "langsmith_dashboard_evaluator_timeout")
+        assert hasattr(args, "langsmith_splits")
 
     def test_defaults_are_none_or_expected(self):
         """Default values are None for optional args, 'mean' for summary, 'raw' for adapter."""
@@ -57,6 +58,7 @@ def test_defaults_are_none_or_expected(self):
         assert args.langsmith_metric_function is None
         assert args.langsmith_dashboard_evaluators is None
         assert args.langsmith_dashboard_evaluator_timeout == 900
+        assert args.langsmith_splits is None
 
     def test_parses_all_flags(self):
         """All flags can be parsed from command line."""
@@ -86,6 +88,9 @@ def test_parses_all_flags(self):
                 "Conciseness",
                 "--langsmith-dashboard-evaluator-timeout",
                 "60",
+                "--langsmith-splits",
+                "train",
+                "test",
             ]
         )
 
@@ -100,6 +105,7 @@ def test_parses_all_flags(self):
         assert args.langsmith_metric_function == "scoring:combine"
         assert args.langsmith_dashboard_evaluators == ["Conciseness"]
         assert args.langsmith_dashboard_evaluator_timeout == 60
+        assert args.langsmith_splits == ["train", "test"]
 
 
 # ---------------------------------------------------------------------------
@@ -314,6 +320,28 @@ def test_no_evaluators_flag_when_empty(self):
         cmd = build_eval_command(args)
         assert "--evaluators" not in cmd
 
+    def test_splits_in_command(self):
+        """--langsmith-splits values are forwarded as --splits in the eval command."""
+        parser = _make_parser()
+        args = parser.parse_args(
+            ["--metric", "acc", "--langsmith-dataset", "data", "--langsmith-target", "m:f",
+             "--langsmith-evaluators", "acc", "--langsmith-splits", "train", "test"]
+        )
+        validate_args(args)
+        cmd = build_eval_command(args)
+        assert "--splits train test" in cmd
+
+    def test_splits_omitted_when_none(self):
+        """--splits is not in the command when no splits are specified."""
+        parser = _make_parser()
+        args = parser.parse_args(
+            ["--metric", "acc", "--langsmith-dataset", "data", "--langsmith-target", "m:f",
+             "--langsmith-evaluators", "acc"]
+        )
+        validate_args(args)
+        cmd = build_eval_command(args)
+        assert "--splits" not in cmd
+
 
 # ---------------------------------------------------------------------------
 # Backend dispatch (_load_backend)
diff --git a/tests/langsmith/test_bridge.py b/tests/langsmith/test_bridge.py
@@ -301,6 +301,32 @@ def test_empty_results(self, mock_resolve):
 
         assert metrics == {}
 
+    @patch("weco.integrations.langsmith.bridge.resolve_evaluators")
+    def test_splits_filters_examples(self, mock_resolve):
+        """When splits are provided, list_examples is called with them and its result is passed to evaluate as data."""
+        mock_resolve.return_value = [lambda r, e: {"score": 1}]
+
+        mock_results = self._make_mock_results([{"accuracy": 1.0}])
+
+        mock_client_cls = MagicMock()
+        mock_client = mock_client_cls.return_value
+        mock_client.list_examples.return_value = ["example1"]
+        mock_client.evaluate.return_value = mock_results
+
+        with patch.dict("sys.modules", {"langsmith": MagicMock(Client=mock_client_cls)}):
+            run_langsmith_eval(
+                dataset_name="test-data",
+                target=lambda x: x,
+                evaluator_names=["accuracy"],
+                metric_name="accuracy",
+                splits=["train"],
+            )
+
+        mock_client.list_examples.assert_called_once_with(dataset_name="test-data", splits=["train"])
+        # data should be the filtered examples, not the dataset name
+        call_kwargs = mock_client.evaluate.call_args
+        assert call_kwargs[1]["data"] == ["example1"]
+
 
 # ---------------------------------------------------------------------------
 # main() — output format
diff --git a/tests/langsmith/wizard/test_args_mapping.py b/tests/langsmith/wizard/test_args_mapping.py
@@ -144,3 +144,12 @@ def test_required_langsmith_args_mapped(self):
         assert args.langsmith_target_adapter == "langchain"
         assert args.langsmith_evaluators == ["acc", "rel"]
         assert args.langsmith_dashboard_evaluators == ["Conciseness"]
+
+    def test_splits_mapped(self):
+        """Selected splits are mapped from wizard config to args."""
+        args = self._make_args()
+        self._run_wizard_with_config(args, {
+            "dataset": "d", "target": "m:f",
+            "splits": ["opt", "holdout"],
+        })
+        assert args.langsmith_splits == ["opt", "holdout"]
diff --git a/tests/langsmith/wizard/test_server.py b/tests/langsmith/wizard/test_server.py
@@ -250,6 +250,19 @@ def test_list_examples_returns_inputs_outputs(self, mock_client_prop, wizard_ser
         assert data["examples"] == [{"inputs": {"q": "hi"}, "outputs": {"a": "hello"}}, {"inputs": {}, "outputs": {}}]
         mock_client.list_examples.assert_called_once_with(dataset_name="my-dataset", limit=5)
 
+    @patch.object(WizardServer, "client", new_callable=PropertyMock)
+    def test_list_splits(self, mock_client_prop, wizard_server):
+        """GET /api/datasets/<name>/splits returns available splits."""
+        mock_client = MagicMock()
+        mock_client.list_dataset_splits.return_value = ["train", "test"]
+        mock_client_prop.return_value = mock_client
+
+        conn, _, _ = wizard_server
+        resp, data = get_json(conn, "/api/datasets/my-dataset/splits")
+        assert resp.status == 200
+        assert data["splits"] == ["train", "test"]
+        mock_client.list_dataset_splits.assert_called_once_with(dataset_name="my-dataset")
+
     def test_file_tree_lists_cwd(self, wizard_server, tmp_path, monkeypatch):
         """GET /api/file-tree returns directory entries."""
         (tmp_path / "agent.py").write_text("pass")
diff --git a/weco/integrations/langsmith/backend.py b/weco/integrations/langsmith/backend.py
@@ -82,6 +82,13 @@ def register_args(parser: argparse.ArgumentParser) -> None:
         "as module:function (e.g. 'scoring:combine'). The function receives a dict "
         "of {evaluator_name: aggregated_score} and returns a single float.",
     )
+    parser.add_argument(
+        "--langsmith-splits",
+        nargs="+",
+        type=str,
+        default=None,
+        help="Evaluate only examples in these dataset splits (e.g. 'opt', 'holdout').",
+    )
     parser.add_argument(
         "--langsmith-dashboard-evaluator-timeout",
         type=int,
@@ -143,6 +150,9 @@ def build_eval_command(args: argparse.Namespace) -> str:
         parts.extend(["--max-examples", str(args.langsmith_max_examples)])
     if args.langsmith_target_adapter != "raw":
         parts.extend(["--target-adapter", args.langsmith_target_adapter])
+    if args.langsmith_splits:
+        parts.append("--splits")
+        parts.extend(args.langsmith_splits)
     if args.langsmith_metric_function:
         parts.extend(["--metric-function", args.langsmith_metric_function])
     if args.langsmith_dashboard_evaluators:
diff --git a/weco/integrations/langsmith/bridge.py b/weco/integrations/langsmith/bridge.py
@@ -200,6 +200,7 @@ def run_langsmith_eval(
     summary_mode: str = "mean",
     max_concurrency: int = None,
     max_examples: int = None,
+    splits: list = None,
     dashboard_evaluators: list = None,
     dashboard_evaluator_timeout: int = 0,
     metric_function: Callable = None,
@@ -215,6 +216,7 @@ def run_langsmith_eval(
         summary_mode: How to aggregate per-example scores (mean/median/min/max).
         max_concurrency: Number of parallel evaluation threads.
         max_examples: Limit evaluation to N examples from the dataset.
+        splits: Only evaluate examples in these dataset splits (e.g. ['train', 'test']); None or empty means no filter.
         dashboard_evaluators: Names of expected dashboard-bound evaluators.
             When set, enables polling with a default timeout of 900s.
         dashboard_evaluator_timeout: Seconds to poll for dashboard-bound evaluator scores
@@ -235,9 +237,15 @@ def run_langsmith_eval(
     client = Client()
     evaluators = resolve_evaluators(evaluator_names)
 
+    # If splits are given, evaluate only the examples in them; otherwise pass the dataset name through
+    if splits:
+        data = client.list_examples(dataset_name=dataset_name, splits=splits)
+    else:
+        data = dataset_name
+
     # target is positional-only in Client.evaluate(), so pass it separately
     eval_kwargs = {
-        "data": dataset_name,
+        "data": data,
         "evaluators": evaluators,
         "experiment_prefix": experiment_prefix or f"weco-{dataset_name}",
     }
@@ -353,6 +361,12 @@ def main():
         help="Seconds to poll for dashboard evaluator scores (default: 900). "
         "Only used when --dashboard-evaluators is set. Polls every 10s.",
     )
+    parser.add_argument(
+        "--splits",
+        nargs="+",
+        default=None,
+        help="Evaluate only examples in these dataset splits (e.g. 'train', 'test').",
+    )
     parser.add_argument(
         "--metric-function",
         default=None,
@@ -400,6 +414,7 @@ def main():
             summary_mode=args.summary,
             max_concurrency=args.max_concurrency,
             max_examples=args.max_examples,
+            splits=args.splits,
             dashboard_evaluators=args.dashboard_evaluators,
             dashboard_evaluator_timeout=args.dashboard_evaluator_timeout or 0,
             metric_function=metric_fn,
diff --git a/weco/integrations/langsmith/wizard/__init__.py b/weco/integrations/langsmith/wizard/__init__.py
@@ -47,6 +47,7 @@ def run_wizard(args: argparse.Namespace) -> None:
         "langsmith_max_examples": getattr(args, "langsmith_max_examples", None),
         "langsmith_max_concurrency": getattr(args, "langsmith_max_concurrency", None),
         "langsmith_dashboard_evaluator_timeout": getattr(args, "langsmith_dashboard_evaluator_timeout", 900),
+        "langsmith_splits": getattr(args, "langsmith_splits", None),
     }
 
     html_path = Path(__file__).parent / "page.html"
@@ -152,6 +153,8 @@ def run_wizard(args: argparse.Namespace) -> None:
         args.langsmith_max_concurrency = config_result["langsmith_max_concurrency"]
     if config_result.get("langsmith_dashboard_evaluator_timeout") is not None:
         args.langsmith_dashboard_evaluator_timeout = config_result["langsmith_dashboard_evaluator_timeout"]
+    if config_result.get("splits"):
+        args.langsmith_splits = config_result["splits"]
     if config_result.get("metric_function"):
         args.langsmith_metric_function = config_result["metric_function"]
 
diff --git a/weco/integrations/langsmith/wizard/page.html b/weco/integrations/langsmith/wizard/page.html
diff --git a/weco/integrations/langsmith/wizard/server.py b/weco/integrations/langsmith/wizard/server.py