1- """Create ZephHR QA datasets in LangSmith (idempotent).
1+ """Create ZephHR QA dataset with splits in LangSmith (idempotent).
22
3- Reads the JSON data splits and creates/updates:
4- - zephhr-qa-opt (optimization split)
5- - zephhr-qa-holdout (held-out validation split)
3+ Reads the JSON data files and creates/updates a single dataset with two splits:
4+ - opt (optimization split — 15 questions)
5+ - holdout (held-out validation split — 10 questions)
66"""
77
88import json
1313
1414DATA_DIR = Path (__file__ ).with_name ("data" )
1515
16- DATASETS = {
17- "opt" : {
18- "name" : "zephhr-qa-opt" ,
19- "description" : "ZephHR QA optimization split" ,
20- "file" : "optimization_questions.json" ,
21- },
22- "holdout" : {
23- "name" : "zephhr-qa-holdout" ,
24- "description" : "ZephHR QA held-out validation split" ,
25- "file" : "holdout_questions.json" ,
26- },
16+ DATASET_NAME = "zephhr-qa"
17+ DATASET_DESCRIPTION = "ZephHR QA benchmark"
18+
19+ SPLITS = {
20+ "opt" : "optimization_questions.json" ,
21+ "holdout" : "holdout_questions.json" ,
2722}
2823
2924
@@ -58,13 +53,12 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
5853 skipped += 1
5954 continue
6055
61- outputs = {"expected_answer" : record ["expected_answer" ]}
62-
6356 client .create_example (
6457 inputs = {"question" : record ["question" ]},
65- outputs = outputs ,
58+ outputs = { "expected_answer" : record [ "expected_answer" ]} ,
6659 dataset_id = dataset .id ,
6760 metadata = {"case_id" : case_id , "split" : split },
61+ split = split ,
6862 )
6963 added += 1
7064
@@ -73,17 +67,18 @@ def _populate(client: Client, dataset, split: str, records: list) -> Tuple[int,
7367
7468def main ():
7569 client = Client ()
70+ dataset = _get_or_create_dataset (client , DATASET_NAME , DATASET_DESCRIPTION )
7671
77- for split , cfg in DATASETS .items ():
78- records = json .loads ((DATA_DIR / cfg ["file" ]).read_text ())
79- dataset = _get_or_create_dataset (client , cfg ["name" ], cfg ["description" ])
72+ for split , filename in SPLITS .items ():
73+ records = json .loads ((DATA_DIR / filename ).read_text ())
8074 added , skipped = _populate (client , dataset , split , records )
81- print (f" { cfg [ 'name' ] } : added={ added } , skipped_existing={ skipped } , total_target={ len (records )} " )
75+ print (f" { DATASET_NAME } [ { split } ] : added={ added } , skipped_existing={ skipped } , total_target={ len (records )} " )
8276
8377 print ("\n --- Run optimization ---" )
8478 print ("weco run --source agent.py \\ " )
8579 print (" --eval-backend langsmith \\ " )
86- print (" --langsmith-dataset zephhr-qa-opt \\ " )
80+ print (f" --langsmith-dataset { DATASET_NAME } \\ " )
81+ print (" --langsmith-splits opt \\ " )
8782 print (" --langsmith-target agent:answer_hr_question \\ " )
8883 print (" --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\ " )
8984 print (" --langsmith-dashboard-evaluators helpfulness correctness \\ " )
@@ -94,7 +89,8 @@ def main():
9489 print ("\n --- Run holdout validation ---" )
9590 print ("weco run --source agent.py \\ " )
9691 print (" --eval-backend langsmith \\ " )
97- print (" --langsmith-dataset zephhr-qa-holdout \\ " )
92+ print (f" --langsmith-dataset { DATASET_NAME } \\ " )
93+ print (" --langsmith-splits holdout \\ " )
9894 print (" --langsmith-target agent:answer_hr_question \\ " )
9995 print (" --langsmith-evaluators evaluators:json_schema_validity evaluators:conciseness \\ " )
10096 print (" --langsmith-dashboard-evaluators helpfulness correctness \\ " )
0 commit comments