@@ -1151,6 +1151,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         help="Do not oversample if the dataset has " \
         "fewer samples than num-prompts.",
     )
+    parser.add_argument(
+        "--skip-chat-template",
+        action="store_true",
+        help=
+        "Skip applying chat template to prompt for datasets that support it.",
+    )
 
     # group for dataset specific arguments
     custom_group = parser.add_argument_group("custom dataset options")
@@ -1161,12 +1167,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         help=
         "Number of output tokens per request, used only for custom dataset.",
     )
-    custom_group.add_argument(
-        "--custom-skip-chat-template",
-        action="store_true",
-        help=
-        "Skip applying chat template to prompt, used only for custom dataset.",
-    )
 
     spec_bench_group = parser.add_argument_group("spec bench dataset options")
     spec_bench_group.add_argument(
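Net effect of the two hunks above: the dataset-specific `--custom-skip-chat-template` flag is replaced by a top-level `--skip-chat-template` that any dataset branch can consult. A minimal standalone sketch of the flag's behavior, using plain `argparse` as a stand-in for vLLM's `FlexibleArgumentParser`:

```python
import argparse

# Stand-in parser; the real FlexibleArgumentParser adds many more options.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--skip-chat-template",
    action="store_true",
    help="Skip applying chat template to prompt for datasets that support it.",
)

# store_true flags default to False and flip to True when present.
args = parser.parse_args(["--skip-chat-template"])
assert args.skip_chat_template is True
args = parser.parse_args([])
assert args.skip_chat_template is False
```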
@@ -1435,7 +1435,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.custom_output_len,
-            skip_chat_template=args.custom_skip_chat_template,
+            skip_chat_template=args.skip_chat_template,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
         )
@@ -1576,6 +1576,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             output_len=args.hf_output_len,
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
+            skip_chat_template=args.skip_chat_template,
             **hf_kwargs
         )
 
@@ -1815,7 +1816,6 @@ def load_data(self) -> None:
 
     def sample(self, **kwargs) -> list:
         # leverage CustomDataset sample
-        kwargs["skip_chat_template"] = False
         return super().sample(**kwargs)
 
 
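With the hard-coded override removed, this subclass's `sample()` now forwards the caller's `skip_chat_template` value to `CustomDataset.sample` instead of silently resetting it to `False`. A toy illustration of the forwarding pattern (class names here are hypothetical, not the benchmark's real hierarchy):

```python
class BaseDataset:
    def sample(self, **kwargs) -> list:
        # The base class is the one that actually honors the flag.
        return [kwargs.get("skip_chat_template", False)]

class DerivedDataset(BaseDataset):
    def sample(self, **kwargs) -> list:
        # Before the change: kwargs["skip_chat_template"] = False here
        # discarded the caller's choice. Now it passes straight through.
        return super().sample(**kwargs)

assert DerivedDataset().sample(skip_chat_template=True) == [True]
```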
@@ -2221,6 +2221,7 @@ def sample(self,
                num_requests: int,
                output_len: Optional[int] = None,
                enable_multimodal_chat: bool = False,
+               skip_chat_template: bool = False,
                request_id_prefix: str = "",
                no_oversample: bool = False,
                **kwargs) -> list:
@@ -2236,14 +2237,15 @@ def sample(self,
             )
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": prompt
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{
+                        "role": "user",
+                        "content": prompt
+                    }],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
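This guarded call is the heart of the change: with `skip_chat_template` set, the raw prompt is tokenized as-is; otherwise it is first wrapped in a single-turn user message. A runnable sketch of the same branch using Hugging Face `transformers` (the model name is only an example; any tokenizer that ships a chat template works):

```python
from transformers import AutoTokenizer

# Example model; downloading the tokenizer requires network access.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
prompt = "Summarize the plot of Hamlet."
skip_chat_template = False

if not skip_chat_template:
    # Wraps the prompt in the model's chat markup and appends the
    # assistant-turn prefix; returns a plain string (tokenize=False).
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        tokenize=False,
    )

prompt_len = len(tokenizer(prompt).input_ids)
print(prompt_len)  # larger than the raw prompt when the template is applied
```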
@@ -2284,6 +2286,7 @@ def sample(
         num_requests: int,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2298,14 +2301,15 @@ def sample(
             prompt = item["turns"][0]
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": prompt
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{
+                        "role": "user",
+                        "content": prompt
+                    }],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
@@ -2349,6 +2353,7 @@ def sample(
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         output_len: Optional[int] = None,
+        skip_chat_template: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         min_distance: float = 0.0,
@@ -2372,7 +2377,7 @@ def sample(
 
             # template copied from
             # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
-            instruction = f"""Given a code file, please apply the change requests and generate the new file.
+            prompt = f"""Given a code file, please apply the change requests and generate the new file.
 
 Original file:
 ```python
@@ -2385,14 +2390,15 @@ def sample(
 Please generate the new code file in the "New file" section below.""" # noqa: E501
 
             # apply template
-            prompt = tokenizer.apply_chat_template(
-                [{
-                    "role": "user",
-                    "content": instruction
-                }],
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{
+                        "role": "user",
+                        "content": prompt
+                    }],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
 
             prompt_len = len(tokenizer(prompt).input_ids)
 
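Taken together, every template-aware sampler now accepts the same keyword, so `get_samples` can thread `args.skip_chat_template` through without caring which dataset was selected. A minimal sketch of that uniform dispatch, with a hypothetical stand-in for the sampler classes:

```python
from argparse import Namespace

class ConversationLikeDataset:
    """Hypothetical stand-in for the benchmark's dataset samplers."""

    def sample(self, *, num_requests: int, skip_chat_template: bool = False,
               **kwargs) -> list:
        # Every sampler accepts the keyword, even if some choose to ignore it.
        mode = "raw" if skip_chat_template else "templated"
        return [f"{mode}-prompt-{i}" for i in range(num_requests)]

def get_samples(args: Namespace, dataset: ConversationLikeDataset) -> list:
    # One flag, forwarded uniformly to whichever sampler was chosen.
    return dataset.sample(num_requests=args.num_prompts,
                          skip_chat_template=args.skip_chat_template)

args = Namespace(num_prompts=2, skip_chat_template=True)
print(get_samples(args, ConversationLikeDataset()))
# ['raw-prompt-0', 'raw-prompt-1']
```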