@@ -33,17 +33,18 @@ def tokenizer_mock():
3333 )
3434 return tokenizer
3535
36+
3637@pytest .mark .smoke
3738@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
3839@patch (f"{ process_dataset .__module__ } .check_load_processor" )
3940@patch (f"{ process_dataset .__module__ } .Dataset" )
4041@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
4142def test_strategy_handler_called (
42- mock_sampler ,
43- mock_dataset_class ,
44- mock_check_processor ,
45- mock_load_dataset ,
46- tokenizer_mock ,
43+ mock_sampler ,
44+ mock_dataset_class ,
45+ mock_check_processor ,
46+ mock_load_dataset ,
47+ tokenizer_mock ,
4748):
4849 mock_handler = MagicMock (return_value = "processed_prompt" )
4950 with patch .dict (STRATEGY_HANDLERS , {ShortPromptStrategy .IGNORE : mock_handler }):
@@ -68,18 +69,21 @@ def test_strategy_handler_called(
6869 mock_load_dataset .assert_called_once ()
6970 mock_check_processor .assert_called_once ()
7071
72+
@pytest.mark.sanity
def test_handle_ignore_strategy_too_short(tokenizer_mock):
    """Ignore strategy drops a prompt whose token count is below the minimum."""
    outcome = handle_ignore_strategy("short", 10, tokenizer_mock)
    tokenizer_mock.encode.assert_called_with("short")
    assert outcome is None
7678
79+
@pytest.mark.sanity
def test_handle_ignore_strategy_sufficient_length(tokenizer_mock):
    """A prompt meeting the minimum token count is returned unchanged."""
    prompt = "long prompt"
    assert handle_ignore_strategy(prompt, 5, tokenizer_mock) == prompt
    tokenizer_mock.encode.assert_called_with(prompt)
8285
86+
8387@pytest .mark .sanity
8488def test_handle_concatenate_strategy_enough_prompts (tokenizer_mock ):
8589 dataset_iter = iter ([{"prompt" : "longer" }])
@@ -88,6 +92,7 @@ def test_handle_concatenate_strategy_enough_prompts(tokenizer_mock):
8892 )
8993 assert result == "short\n longer"
9094
95+
9196@pytest .mark .sanity
9297def test_handle_concatenate_strategy_not_enough_prompts (tokenizer_mock ):
9398 dataset_iter : Iterator = iter ([])
@@ -96,35 +101,39 @@ def test_handle_concatenate_strategy_not_enough_prompts(tokenizer_mock):
96101 )
97102 assert result is None
98103
104+
@pytest.mark.sanity
def test_handle_pad_strategy(tokenizer_mock):
    """Pad strategy appends the pad character until the target length is reached."""
    padded = handle_pad_strategy("short", 10, tokenizer_mock, "p")
    assert padded == "shortppppp"
103109
110+
@pytest.mark.sanity
def test_handle_error_strategy_valid_prompt(tokenizer_mock):
    """Error strategy passes a sufficiently long prompt through untouched."""
    prompt = "valid prompt"
    assert handle_error_strategy(prompt, 5, tokenizer_mock) == prompt
    tokenizer_mock.encode.assert_called_with(prompt)
109116
117+
@pytest.mark.sanity
def test_handle_error_strategy_too_short_prompt(tokenizer_mock):
    """Error strategy raises PromptTooShortError when the prompt is below the minimum."""
    too_short = "short"
    with pytest.raises(PromptTooShortError):
        handle_error_strategy(too_short, 10, tokenizer_mock)
114122
123+
115124@pytest .mark .smoke
116125@patch ("guidellm.preprocess.dataset.save_dataset_to_file" )
117126@patch ("guidellm.preprocess.dataset.Dataset" )
118127@patch ("guidellm.preprocess.dataset.guidellm_load_dataset" )
119128@patch ("guidellm.preprocess.dataset.check_load_processor" )
120129@patch ("guidellm.preprocess.dataset.IntegerRangeSampler" )
121130def test_process_dataset_non_empty (
122- mock_sampler ,
123- mock_check_processor ,
124- mock_load_dataset ,
125- mock_dataset_class ,
126- mock_save_to_file ,
127- tokenizer_mock ,
131+ mock_sampler ,
132+ mock_check_processor ,
133+ mock_load_dataset ,
134+ mock_dataset_class ,
135+ mock_save_to_file ,
136+ tokenizer_mock ,
128137):
129138 from guidellm .preprocess .dataset import process_dataset
130139
@@ -159,17 +168,18 @@ def test_process_dataset_non_empty(
159168 assert "output_tokens_count" in item
160169 assert len (tokenizer_mock .encode (item ["prompt" ])) <= 3
161170
171+
162172@pytest .mark .sanity
163173@patch (f"{ process_dataset .__module__ } .Dataset" )
164174@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
165175@patch (f"{ process_dataset .__module__ } .check_load_processor" )
166176@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
167177def test_process_dataset_empty_after_processing (
168- mock_sampler ,
169- mock_check_processor ,
170- mock_load_dataset ,
171- mock_dataset_class ,
172- tokenizer_mock ,
178+ mock_sampler ,
179+ mock_check_processor ,
180+ mock_load_dataset ,
181+ mock_dataset_class ,
182+ tokenizer_mock ,
173183):
174184 mock_dataset = [{"prompt" : "" }]
175185 mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -188,19 +198,20 @@ def test_process_dataset_empty_after_processing(
188198 mock_check_processor .assert_called_once ()
189199 mock_dataset_class .from_list .assert_not_called ()
190200
201+
191202@pytest .mark .smoke
192203@patch (f"{ process_dataset .__module__ } .push_dataset_to_hub" )
193204@patch (f"{ process_dataset .__module__ } .Dataset" )
194205@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
195206@patch (f"{ process_dataset .__module__ } .check_load_processor" )
196207@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
197208def test_process_dataset_push_to_hub_called (
198- mock_sampler ,
199- mock_check_processor ,
200- mock_load_dataset ,
201- mock_dataset_class ,
202- mock_push ,
203- tokenizer_mock ,
209+ mock_sampler ,
210+ mock_check_processor ,
211+ mock_load_dataset ,
212+ mock_dataset_class ,
213+ mock_push ,
214+ tokenizer_mock ,
204215):
205216 mock_dataset = [{"prompt" : "abc" }]
206217 mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -221,19 +232,20 @@ def test_process_dataset_push_to_hub_called(
221232 )
222233 mock_push .assert_called_once_with ("id123" , mock_dataset_obj )
223234
235+
224236@pytest .mark .sanity
225237@patch (f"{ process_dataset .__module__ } .push_dataset_to_hub" )
226238@patch (f"{ process_dataset .__module__ } .Dataset" )
227239@patch (f"{ process_dataset .__module__ } .guidellm_load_dataset" )
228240@patch (f"{ process_dataset .__module__ } .check_load_processor" )
229241@patch (f"{ process_dataset .__module__ } .IntegerRangeSampler" )
230242def test_process_dataset_push_to_hub_not_called (
231- mock_sampler ,
232- mock_check_processor ,
233- mock_load_dataset ,
234- mock_dataset_class ,
235- mock_push ,
236- tokenizer_mock ,
243+ mock_sampler ,
244+ mock_check_processor ,
245+ mock_load_dataset ,
246+ mock_dataset_class ,
247+ mock_push ,
248+ tokenizer_mock ,
237249):
238250 mock_dataset = [{"prompt" : "abc" }]
239251 mock_load_dataset .return_value = (mock_dataset , {"prompt_column" : "prompt" })
@@ -253,13 +265,15 @@ def test_process_dataset_push_to_hub_not_called(
253265 )
254266 mock_push .assert_not_called ()
255267
268+
@pytest.mark.regression
def test_push_dataset_to_hub_success():
    """push_dataset_to_hub forwards the dataset id and HF_TOKEN to push_to_hub.

    Uses patch.dict so the HF_TOKEN mutation is scoped to this test; the
    previous bare ``os.environ[...] = ...`` assignment leaked the variable
    into subsequently-run tests (e.g. the no-env error case, which must
    first delete it).
    """
    mock_dataset = MagicMock(spec=Dataset)
    with patch.dict(os.environ, {"HF_TOKEN": "token"}):
        push_dataset_to_hub("dataset_id", mock_dataset)
    mock_dataset.push_to_hub.assert_called_once_with("dataset_id", token="token")
262275
276+
263277@pytest .mark .regression
264278def test_push_dataset_to_hub_error_no_env ():
265279 if "HF_TOKEN" in os .environ :
@@ -268,13 +282,15 @@ def test_push_dataset_to_hub_error_no_env():
268282 with pytest .raises (ValueError , match = "hub_dataset_id and HF_TOKEN" ):
269283 push_dataset_to_hub ("dataset_id" , mock_dataset )
270284
285+
@pytest.mark.regression
def test_push_dataset_to_hub_error_no_id():
    """A missing hub dataset id raises ValueError even when HF_TOKEN is set.

    patch.dict keeps the HF_TOKEN environment mutation scoped to this test
    instead of leaking it process-wide as the previous direct assignment did.
    """
    mock_dataset = MagicMock(spec=Dataset)
    with patch.dict(os.environ, {"HF_TOKEN": "token"}):
        with pytest.raises(ValueError, match="hub_dataset_id and HF_TOKEN"):
            push_dataset_to_hub(None, mock_dataset)
277292
293+
278294@pytest .mark .regression
279295@patch .object (Path , "mkdir" )
280296def test_save_dataset_to_file_csv (mock_mkdir ):
@@ -284,6 +300,7 @@ def test_save_dataset_to_file_csv(mock_mkdir):
284300 mock_dataset .to_csv .assert_called_once_with (output_path )
285301 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
286302
303+
287304@pytest .mark .regression
288305@patch .object (Path , "mkdir" )
289306def test_save_dataset_to_file_csv_capitalized (mock_mkdir ):
@@ -293,6 +310,7 @@ def test_save_dataset_to_file_csv_capitalized(mock_mkdir):
293310 mock_dataset .to_csv .assert_called_once_with (output_path )
294311 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
295312
313+
296314@pytest .mark .regression
297315@patch .object (Path , "mkdir" )
298316def test_save_dataset_to_file_json (mock_mkdir ):
@@ -302,6 +320,7 @@ def test_save_dataset_to_file_json(mock_mkdir):
302320 mock_dataset .to_json .assert_called_once_with (output_path )
303321 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
304322
323+
305324@pytest .mark .regression
306325@patch .object (Path , "mkdir" )
307326def test_save_dataset_to_file_json_capitalized (mock_mkdir ):
@@ -311,6 +330,7 @@ def test_save_dataset_to_file_json_capitalized(mock_mkdir):
311330 mock_dataset .to_json .assert_called_once_with (output_path )
312331 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
313332
333+
314334@pytest .mark .regression
315335@patch .object (Path , "mkdir" )
316336def test_save_dataset_to_file_jsonl (mock_mkdir ):
@@ -320,6 +340,7 @@ def test_save_dataset_to_file_jsonl(mock_mkdir):
320340 mock_dataset .to_json .assert_called_once_with (output_path )
321341 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
322342
343+
323344@pytest .mark .regression
324345@patch .object (Path , "mkdir" )
325346def test_save_dataset_to_file_jsonl_capitalized (mock_mkdir ):
@@ -329,6 +350,7 @@ def test_save_dataset_to_file_jsonl_capitalized(mock_mkdir):
329350 mock_dataset .to_json .assert_called_once_with (output_path )
330351 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
331352
353+
332354@pytest .mark .regression
333355@patch .object (Path , "mkdir" )
334356def test_save_dataset_to_file_parquet (mock_mkdir ):
@@ -338,6 +360,7 @@ def test_save_dataset_to_file_parquet(mock_mkdir):
338360 mock_dataset .to_parquet .assert_called_once_with (output_path )
339361 mock_mkdir .assert_called_once_with (parents = True , exist_ok = True )
340362
363+
341364@pytest .mark .regression
342365@patch .object (Path , "mkdir" )
343366def test_save_dataset_to_file_unsupported_type (mock_mkdir ):
0 commit comments