Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c700d9d
Implemented preprocessing for datasets
TomerG711 May 15, 2025
91027e1
Implemented preprocessing for datasets
TomerG711 May 15, 2025
e02ec8a
Reverted irrelevant formatting
TomerG711 May 15, 2025
fc4bfb2
Removed redundant comments
TomerG711 May 15, 2025
2bf436c
Added support for saving in specific file format
TomerG711 May 18, 2025
64a91de
Fixed code styling
TomerG711 May 18, 2025
b5a4877
Improved error message
TomerG711 May 18, 2025
4d624a2
Fixed code styling
TomerG711 May 20, 2025
8e737b6
Merge branch 'main' into fead/add-preprocess-dataset
TomerG711 May 20, 2025
829366a
Merge branch 'main' into fead/add-preprocess-dataset
markurtz May 27, 2025
6f275d7
Merge branch 'main' into fead/add-preprocess-dataset
markurtz May 29, 2025
cf99526
Merge branch 'main' into fead/add-preprocess-dataset
TomerG711 Jun 1, 2025
e2ca919
Fixed CR comments
TomerG711 Jun 1, 2025
4af81d9
Merge remote-tracking branch 'origin/fead/add-preprocess-dataset' int…
TomerG711 Jun 1, 2025
a9f4fa6
Fixed UTs
TomerG711 Jun 1, 2025
40c1118
Added pytest mark to UTs
TomerG711 Jun 1, 2025
f61b7f0
Added docs
TomerG711 Jun 1, 2025
b6146c9
Ran tox -e style
TomerG711 Jun 1, 2025
f3a3cd7
Fixed help for preprocess dataset subcommand
TomerG711 Jun 1, 2025
448d609
Fixed help for preprocess dataset subcommand
TomerG711 Jun 1, 2025
c48472a
Fixed CR comments
TomerG711 Jun 5, 2025
0cc3ffe
Linters
TomerG711 Jun 5, 2025
c0cd1c9
Linters
TomerG711 Jun 5, 2025
06f19a0
Merge branch 'main' into fead/add-preprocess-dataset
markurtz Jun 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions src/guidellm/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from guidellm.backend import BackendType
from guidellm.benchmark import ProfileType, benchmark_generative_text
from guidellm.config import print_config
from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset
from guidellm.scheduler import StrategyType

STRATEGY_PROFILE_CHOICES = set(
Expand Down Expand Up @@ -290,5 +291,187 @@ def config():
print_config()


@cli.group(help="Preprocessing utilities for datasets.")
def preprocess():
    """Container group for dataset preprocessing subcommands; does no work itself."""


@preprocess.command(
    help="Convert a dataset to have specific prompt and output token sizes.\n\n"
    "INPUT_DATA: Path to the input dataset or dataset ID.\n"
    # NOTE: OUTPUT_PATH is declared with dir_okay=False below, so it must be a
    # file path (e.g. ending in .arrow), not a directory — help text matches that.
    "OUTPUT_PATH: Path of the file to save the converted dataset to, "
    "including the file suffix (e.g. '.arrow') that determines the format."
)
@click.argument(
    "input_data",
    type=str,
    metavar="INPUT_DATA",
    required=True,
)
@click.argument(
    "output_path",
    type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True),
    metavar="OUTPUT_PATH",
    required=True,
)
@click.option(
    "--processor",
    type=str,
    required=True,
    help=(
        "The processor or tokenizer to use to calculate token counts for statistics "
        "and synthetic data generation."
    ),
)
@click.option(
    "--processor-args",
    default=None,
    callback=parse_json,
    help=(
        "A JSON string containing any arguments to pass to the processor constructor "
        "as a dict with **kwargs."
    ),
)
@click.option(
    "--data-args",
    callback=parse_json,
    help=(
        "A JSON string containing any arguments to pass to the dataset creation "
        "as a dict with **kwargs."
    ),
)
@click.option(
    "--short-prompt-strategy",
    type=click.Choice([s.value for s in ShortPromptStrategy]),
    default=ShortPromptStrategy.IGNORE.value,
    show_default=True,
    help="Strategy to handle prompts shorter than the target length.",
)
@click.option(
    "--pad-token",
    type=str,
    default=None,
    help="The token to pad short prompts with when using the 'pad' strategy.",
)
@click.option(
    "--prompt-tokens-average",
    type=int,
    default=10,
    show_default=True,
    help="Average target number of tokens for prompts.",
)
@click.option(
    "--prompt-tokens-stdev",
    type=int,
    default=None,
    help="Standard deviation for prompt tokens sampling.",
)
@click.option(
    "--prompt-tokens-min",
    type=int,
    default=None,
    help="Minimum number of prompt tokens.",
)
@click.option(
    "--prompt-tokens-max",
    type=int,
    default=None,
    help="Maximum number of prompt tokens.",
)
@click.option(
    "--prompt-random-seed",
    type=int,
    default=42,
    show_default=True,
    help="Random seed for prompt token sampling.",
)
@click.option(
    "--output-tokens-average",
    type=int,
    default=10,
    show_default=True,
    help="Average target number of tokens for outputs.",
)
@click.option(
    "--output-tokens-stdev",
    type=int,
    default=None,
    help="Standard deviation for output tokens sampling.",
)
@click.option(
    "--output-tokens-min",
    type=int,
    default=None,
    help="Minimum number of output tokens.",
)
@click.option(
    "--output-tokens-max",
    type=int,
    default=None,
    help="Maximum number of output tokens.",
)
@click.option(
    "--output-random-seed",
    type=int,
    default=123,
    show_default=True,
    help="Random seed for output token sampling.",
)
@click.option(
    "--push-to-hub",
    is_flag=True,
    help="Set this flag to push the converted dataset to the Hugging Face Hub.",
)
@click.option(
    "--hub-dataset-id",
    type=str,
    default=None,
    help="The Hugging Face Hub dataset ID to push to. "
    "Required if --push-to-hub is used.",
)
def dataset(
    input_data,
    output_path,
    processor,
    processor_args,
    data_args,
    short_prompt_strategy,
    pad_token,
    prompt_tokens_average,
    prompt_tokens_stdev,
    prompt_tokens_min,
    prompt_tokens_max,
    prompt_random_seed,
    output_tokens_average,
    output_tokens_stdev,
    output_tokens_min,
    output_tokens_max,
    output_random_seed,
    push_to_hub,
    hub_dataset_id,
):
    """Convert a dataset so prompts/outputs match the requested token sizes.

    Thin CLI wrapper: every option is forwarded verbatim to
    ``process_dataset``, which performs the actual loading, resizing,
    saving, and (optionally) pushing to the Hugging Face Hub.
    Validation of option combinations (e.g. --hub-dataset-id when
    --push-to-hub is set — presumably enforced downstream; TODO confirm)
    is delegated to ``process_dataset``.
    """
    process_dataset(
        input_data=input_data,
        output_path=output_path,
        processor=processor,
        processor_args=processor_args,
        data_args=data_args,
        short_prompt_strategy=short_prompt_strategy,
        pad_token=pad_token,
        prompt_tokens_average=prompt_tokens_average,
        prompt_tokens_stdev=prompt_tokens_stdev,
        prompt_tokens_min=prompt_tokens_min,
        prompt_tokens_max=prompt_tokens_max,
        prompt_random_seed=prompt_random_seed,
        output_tokens_average=output_tokens_average,
        output_tokens_stdev=output_tokens_stdev,
        output_tokens_min=output_tokens_min,
        output_tokens_max=output_tokens_max,
        output_random_seed=output_random_seed,
        push_to_hub=push_to_hub,
        hub_dataset_id=hub_dataset_id,
    )


if __name__ == "__main__":
cli()
3 changes: 3 additions & 0 deletions src/guidellm/preprocess/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Public API of the guidellm.preprocess package: re-exports the dataset helpers."""

from .dataset import ShortPromptStrategy, process_dataset

__all__ = ["ShortPromptStrategy", "process_dataset"]
Loading