|
9 | 9 | - BurstGPT |
10 | 10 | - HuggingFace |
11 | 11 | - VisionArena |
12 | | -
|
13 | | -TODO: Implement CustomDataset to parse a JSON file and convert its contents into |
14 | | -SampleRequest instances, similar to the approach used in ShareGPT. |
15 | 12 | """ |
16 | 13 |
|
17 | 14 | import base64 |
@@ -442,6 +439,97 @@ def sample( |
442 | 439 | return samples |
443 | 440 |
|
444 | 441 |
|
| 442 | +# ----------------------------------------------------------------------------- |
| 443 | +# Custom Dataset Implementation |
| 444 | +# ----------------------------------------------------------------------------- |
| 445 | + |
| 446 | + |
| 447 | +class CustomDataset(BenchmarkDataset): |
| 448 | + """ |
| 449 | +    Implements the Custom dataset. Loads prompts from a JSONL file, one JSON |
| 450 | +    object per line, and generates one sample request per prompt. E.g., |
| 451 | + ``` |
| 452 | + {"prompt": "What is the capital of India?"} |
| 453 | + {"prompt": "What is the capital of Iran?"} |
| 454 | + {"prompt": "What is the capital of China?"} |
| 455 | + ``` |
| 456 | + """ |
| 457 | + |
| 458 | + def __init__(self, **kwargs) -> None: |
| 459 | + super().__init__(**kwargs) |
| 460 | + self.load_data() |
| 461 | + |
| 462 | + def load_data(self) -> None: |
| 463 | + if self.dataset_path is None: |
| 464 | + raise ValueError("dataset_path must be provided for loading data.") |
| 465 | + |
| 466 | +        # self.data will be a list of dictionaries, |
| 467 | +        # e.g., [{"prompt": "What is the capital of India?"}, ...] |
| 468 | +        # This is the standardized format that load_data() must |
| 469 | +        # produce regardless of the file type of dataset_path; |
| 470 | +        # sample() assumes self.data is already in this format. |
| 471 | + self.data = [] |
| 472 | + |
| 473 | + # Load the JSONL file |
| 474 | + if self.dataset_path.endswith(".jsonl"): |
| 475 | + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) |
| 476 | + |
| 477 | + # check if the JSONL file has a 'prompt' column |
| 478 | + if "prompt" not in jsonl_data.columns: |
| 479 | + raise ValueError("JSONL file must contain a 'prompt' column.") |
| 480 | + |
| 481 | +            # Convert the DataFrame to a list of dictionaries, one per |
| 482 | +            # row, which is the standardized format for self.data. |
| 485 | + for _, row in jsonl_data.iterrows(): |
| 486 | + self.data.append(row.to_dict()) |
| 487 | + else: |
| 488 | + raise NotImplementedError( |
| 489 | + "Only JSONL format is supported for CustomDataset." |
| 490 | + ) |
| 491 | + |
| 492 | + random.seed(self.random_seed) |
| 493 | + random.shuffle(self.data) |
| 494 | + |
| 495 | + def sample( |
| 496 | + self, |
| 497 | + tokenizer: PreTrainedTokenizerBase, |
| 498 | + num_requests: int, |
| 499 | + lora_path: Optional[str] = None, |
| 500 | + max_loras: Optional[int] = None, |
| 501 | + output_len: Optional[int] = None, |
| 502 | + enable_multimodal_chat: bool = False, |
| 503 | + skip_chat_template: bool = False, |
| 504 | + **kwargs, |
| 505 | +    ) -> list[SampleRequest]: |
| 506 | + sampled_requests = [] |
| 507 | + for item in self.data: |
| 508 | + if len(sampled_requests) >= num_requests: |
| 509 | + break |
| 510 | + prompt = item["prompt"] |
| 511 | + |
| 512 | +            # Apply the tokenizer's chat template unless explicitly skipped. |
| 513 | + if not skip_chat_template: |
| 514 | + prompt = tokenizer.apply_chat_template( |
| 515 | + [{"role": "user", "content": prompt}], |
| 516 | + add_generation_prompt=True, |
| 517 | + tokenize=False, |
| 518 | + ) |
| 519 | + |
| 520 | + prompt_len = len(tokenizer(prompt).input_ids) |
| 521 | + sampled_requests.append( |
| 522 | + SampleRequest( |
| 523 | + prompt=prompt, |
| 524 | + prompt_len=prompt_len, |
| 525 | + expected_output_len=output_len, |
| 526 | + ) |
| 527 | + ) |
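| | +        # Pad by duplicating requests if fewer than num_requests were collected. |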
| 528 | + self.maybe_oversample_requests(sampled_requests, num_requests) |
| 529 | + |
| 530 | + return sampled_requests |
| 531 | + |
| 532 | + |
445 | 533 | # ----------------------------------------------------------------------------- |
446 | 534 | # Sonnet Dataset Implementation |
447 | 535 | # ----------------------------------------------------------------------------- |
|