Skip to content

Commit b65baa2

Browse files
committed
fix: Address code review feedback for Bailian trace loader
Gracefully handle unrecognized 'type' field values during dataset type inference by skipping the explicit type shortcut and falling through to structural detection. This allows Bailian traces (which use "type" for the request type, not the dataset type) to auto-detect correctly. Also updates CLI descriptions to reference both trace formats and adds tests for the type-field fallback behavior. Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
1 parent 34ad3f4 commit b65baa2

File tree

6 files changed

+89
-25
lines changed

6 files changed

+89
-25
lines changed

docs/cli_options.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,11 +258,11 @@ Custom HTTP headers to include with every request. Specify as `Header:Value` pai
258258

259259
#### `--input-file` `<str>`
260260

261-
Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL trace files for `mooncake_trace`, directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.
261+
Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL for `mooncake_trace`/`bailian_trace` (timestamped traces), directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.
262262

263263
#### `--fixed-schedule`
264264

265-
Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays the exact timing pattern from the dataset. This mode is automatically enabled for `mooncake_trace` datasets.
265+
Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays the exact timing pattern from the dataset. This mode is automatically enabled for trace datasets.
266266
<br>_Flag (no value required)_
267267

268268
#### `--fixed-schedule-auto-offset`
@@ -292,7 +292,7 @@ Pre-configured public dataset to download and use for benchmarking (e.g., `share
292292

293293
#### `--custom-dataset-type` `<str>`
294294

295-
Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), `mooncake_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). Requires `--input-file`. Mutually exclusive with `--public-dataset`.
295+
Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), `mooncake_trace`/`bailian_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). Requires `--input-file`. Mutually exclusive with `--public-dataset`.
296296
<br>_Choices: [`bailian_trace`, `mooncake_trace`, `multi_turn`, `random_pool`, `single_turn`]_
297297

298298
#### `--dataset-sampling-strategy` `<str>`

src/aiperf/common/config/input_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def validate_goodput(self) -> Self:
192192
Any,
193193
Field(
194194
description="Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. "
195-
"Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL trace files for `mooncake_trace`, "
195+
"Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL for `mooncake_trace`/`bailian_trace` (timestamped traces), "
196196
"directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.",
197197
),
198198
BeforeValidator(parse_file),
@@ -208,7 +208,7 @@ def validate_goodput(self) -> Self:
208208
bool,
209209
Field(
210210
description="Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays "
211-
"the exact timing pattern from the dataset. This mode is automatically enabled for `mooncake_trace` datasets."
211+
"the exact timing pattern from the dataset. This mode is automatically enabled for trace datasets."
212212
),
213213
CLIParameter(
214214
name=(
@@ -278,7 +278,7 @@ def validate_goodput(self) -> Self:
278278
Field(
279279
description="Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. "
280280
"Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), "
281-
"`mooncake_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). "
281+
"`mooncake_trace`/`bailian_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). "
282282
"Requires `--input-file`. Mutually exclusive with `--public-dataset`.",
283283
),
284284
CLIParameter(

src/aiperf/dataset/composer/custom.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -107,25 +107,21 @@ def _infer_type(
107107
Raises:
108108
ValueError: If the type field is invalid or no loader can handle the data format
109109
"""
110-
# Check for explicit type field first (most efficient)
111-
if data is not None and "type" in data:
112-
try:
113-
# Try to convert the type string to enum
114-
explicit_type = CustomDatasetType(data["type"])
115-
LoaderClass = plugins.get_class(
116-
PluginType.CUSTOM_DATASET_LOADER, explicit_type
117-
)
118-
if not LoaderClass.can_load(data, filename):
119-
raise ValueError(
120-
f"Explicit type field {explicit_type} specified, but loader {LoaderClass.__name__} "
121-
"cannot handle the data format. Please specify --custom-dataset-type explicitly."
122-
)
123-
self.info(f"Using explicit type field: {explicit_type}")
124-
return explicit_type
125-
except (ValueError, KeyError) as e:
110+
# Check for explicit type field first (most efficient).
111+
# Skip values that aren't known dataset types (e.g. Bailian's "type": "text"
112+
# is a request type, not a dataset type) and fall through to structural detection.
113+
if data is not None and data.get("type") in CustomDatasetType:
114+
explicit_type = CustomDatasetType(data["type"])
115+
LoaderClass = plugins.get_class(
116+
PluginType.CUSTOM_DATASET_LOADER, explicit_type
117+
)
118+
if not LoaderClass.can_load(data, filename):
126119
raise ValueError(
127-
f"Invalid type field value: {data['type']}. Please specify --custom-dataset-type explicitly."
128-
) from e
120+
f"Explicit type field {explicit_type} specified, but loader {LoaderClass.__name__} "
121+
"cannot handle the data format. Please specify --custom-dataset-type explicitly."
122+
)
123+
self.info(f"Using explicit type field: {explicit_type}")
124+
return explicit_type
129125

130126
detected_type = None
131127
for entry, LoaderClass in plugins.iter_all(PluginType.CUSTOM_DATASET_LOADER):

src/aiperf/dataset/loader/base_trace_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def _preprocess_trace(self, trace: TraceT) -> None:
9191
9292
Called after parsing but before filtering. Default is a no-op.
9393
"""
94+
pass
9495

9596
@abstractmethod
9697
def _group_traces(self, items: list[TraceT]) -> dict[str, list[TraceT]]:

src/aiperf/dataset/loader/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,8 @@ class BailianTrace(AIPerfBaseModel):
263263
belong to the same session and are ordered by ``turn``.
264264
265265
Important: Bailian traces use a block size of 16 tokens per salted SipHash
266-
block. Set ``--isl-block-size 16`` when using this format.
266+
block. Use ``--isl-block-size 16`` when using this format (this is set
267+
automatically in CLI flows).
267268
268269
Examples:
269270
- Root request: ``{"chat_id": 159, "parent_chat_id": -1, "timestamp": 61.114, "input_length": 521, "output_length": 132, "type": "text", "turn": 1, "hash_ids": [1089, 1090, 1091]}``

tests/unit/dataset/loader/test_can_load.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77
from pytest import param
88

9+
from aiperf.dataset.loader.bailian_trace import BailianTraceDatasetLoader
910
from aiperf.dataset.loader.mooncake_trace import MooncakeTraceDatasetLoader
1011
from aiperf.dataset.loader.multi_turn import MultiTurnDatasetLoader
1112
from aiperf.dataset.loader.random_pool import RandomPoolDatasetLoader
@@ -167,6 +168,8 @@ class TestCustomDatasetComposerInferType:
167168
param({"input_length": 100, "output_length": 50}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_input_length"),
168169
param({"type": "mooncake_trace", "input_length": 100}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_explicit"),
169170
param({"text_input": "Hello"}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_text_input"),
171+
param({"type": "bailian_trace", "chat_id": 1, "timestamp": 0.0, "input_length": 100, "output_length": 50}, None, CustomDatasetType.BAILIAN_TRACE, id="bailian_explicit"),
172+
param({"chat_id": 1, "timestamp": 0.0, "input_length": 100, "output_length": 50, "type": "text"}, None, CustomDatasetType.BAILIAN_TRACE, id="bailian_structural_with_request_type"),
170173
],
171174
) # fmt: skip
172175
def test_infer_from_data(
@@ -201,6 +204,15 @@ def test_infer_from_data_raises(self, create_user_config_and_composer, data):
201204
with pytest.raises(ValueError, match="No loader can handle"):
202205
composer._infer_type(data)
203206

207+
def test_infer_explicit_type_loader_rejects_raises(
208+
self, create_user_config_and_composer
209+
):
210+
"""Test that a recognized type field with incompatible data raises ValueError."""
211+
_, composer = create_user_config_and_composer()
212+
data = {"type": "single_turn", "input_length": 100}
213+
with pytest.raises(ValueError, match="cannot handle the data format"):
214+
composer._infer_type(data)
215+
204216
def test_infer_random_pool_with_directory(self, create_user_config_and_composer):
205217
"""Test inferring RandomPool with directory path."""
206218
_, composer = create_user_config_and_composer()
@@ -357,3 +369,57 @@ def test_directory_path_uniquely_identifies_random_pool(self):
357369
assert SingleTurnDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
358370
assert MultiTurnDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
359371
assert MooncakeTraceDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
372+
assert BailianTraceDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
373+
374+
375+
class TestUnrecognizedTypeFieldFallback:
376+
"""Tests for graceful handling of unrecognized 'type' field values.
377+
378+
Some trace formats (e.g. Bailian) include a 'type' field that represents
379+
something other than the dataset type (e.g. request type: text/search/image).
380+
The inference logic should fall back to structural detection instead of raising."""
381+
382+
def test_bailian_type_field_falls_through_to_structural_detection(
383+
self, create_user_config_and_composer
384+
):
385+
"""Bailian data with type='text' should infer as bailian_trace, not raise."""
386+
_, composer = create_user_config_and_composer()
387+
data = {
388+
"chat_id": 159,
389+
"parent_chat_id": -1,
390+
"timestamp": 61.114,
391+
"input_length": 521,
392+
"output_length": 132,
393+
"type": "text",
394+
"turn": 1,
395+
"hash_ids": [1089, 1090, 1091],
396+
}
397+
result = composer._infer_type(data)
398+
assert result == CustomDatasetType.BAILIAN_TRACE
399+
400+
@pytest.mark.parametrize(
401+
"type_value",
402+
[
403+
param("text", id="text"),
404+
param("search", id="search"),
405+
param("image", id="image"),
406+
param("file", id="file"),
407+
param("unknown_garbage", id="garbage"),
408+
],
409+
) # fmt: skip
410+
def test_unrecognized_type_field_does_not_raise(
411+
self, create_user_config_and_composer, type_value
412+
):
413+
"""Unrecognized type field values should not raise during inference."""
414+
_, composer = create_user_config_and_composer()
415+
data = {
416+
"chat_id": 1,
417+
"parent_chat_id": -1,
418+
"timestamp": 0.0,
419+
"input_length": 100,
420+
"output_length": 50,
421+
"type": type_value,
422+
"turn": 1,
423+
}
424+
result = composer._infer_type(data)
425+
assert result == CustomDatasetType.BAILIAN_TRACE

0 commit comments

Comments
 (0)