Skip to content

Commit b65baa2

Browse files
committed
fix: Address code review feedback for Bailian trace loader
Gracefully handle unrecognized 'type' field values during dataset type inference by skipping the explicit type shortcut and falling through to structural detection. This allows Bailian traces (which use "type" for the request type, not the dataset type) to auto-detect correctly. Also updates CLI descriptions to reference both trace formats and adds tests for the type-field fallback behavior. Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
1 parent 34ad3f4 commit b65baa2

File tree

6 files changed

+89
-25
lines changed

6 files changed

+89
-25
lines changed

docs/cli_options.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,11 +258,11 @@ Custom HTTP headers to include with every request. Specify as `Header:Value` pai
258258

259259
#### `--input-file` `<str>`
260260

261-
Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL trace files for `mooncake_trace`, directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.
261+
Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL for `mooncake_trace`/`bailian_trace` (timestamped traces), directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.
262262

263263
#### `--fixed-schedule`
264264

265-
Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays the exact timing pattern from the dataset. This mode is automatically enabled for `mooncake_trace` datasets.
265+
Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays the exact timing pattern from the dataset. This mode is automatically enabled for trace datasets.
266266
<br>_Flag (no value required)_
267267

268268
#### `--fixed-schedule-auto-offset`
@@ -292,7 +292,7 @@ Pre-configured public dataset to download and use for benchmarking (e.g., `share
292292

293293
#### `--custom-dataset-type` `<str>`
294294

295-
Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), `mooncake_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). Requires `--input-file`. Mutually exclusive with `--public-dataset`.
295+
Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), `mooncake_trace`/`bailian_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). Requires `--input-file`. Mutually exclusive with `--public-dataset`.
296296
<br>_Choices: [`bailian_trace`, `mooncake_trace`, `multi_turn`, `random_pool`, `single_turn`]_
297297

298298
#### `--dataset-sampling-strategy` `<str>`

src/aiperf/common/config/input_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def validate_goodput(self) -> Self:
192192
Any,
193193
Field(
194194
description="Path to file or directory containing benchmark dataset. Required when using `--custom-dataset-type`. "
195-
"Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL trace files for `mooncake_trace`, "
195+
"Supported formats depend on dataset type: JSONL for `single_turn`/`multi_turn`, JSONL for `mooncake_trace`/`bailian_trace` (timestamped traces), "
196196
"directories for `random_pool`. File is parsed according to `--custom-dataset-type` specification.",
197197
),
198198
BeforeValidator(parse_file),
@@ -208,7 +208,7 @@ def validate_goodput(self) -> Self:
208208
bool,
209209
Field(
210210
description="Run requests according to timestamps specified in the input dataset. When enabled, AIPerf replays "
211-
"the exact timing pattern from the dataset. This mode is automatically enabled for `mooncake_trace` datasets."
211+
"the exact timing pattern from the dataset. This mode is automatically enabled for trace datasets."
212212
),
213213
CLIParameter(
214214
name=(
@@ -278,7 +278,7 @@ def validate_goodput(self) -> Self:
278278
Field(
279279
description="Format specification for custom dataset provided via `--input-file`. Determines parsing logic and expected file structure. "
280280
"Options: `single_turn` (JSONL with single exchanges), `multi_turn` (JSONL with conversation history), "
281-
"`mooncake_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). "
281+
"`mooncake_trace`/`bailian_trace` (timestamped trace files), `random_pool` (directory of reusable prompts). "
282282
"Requires `--input-file`. Mutually exclusive with `--public-dataset`.",
283283
),
284284
CLIParameter(

src/aiperf/dataset/composer/custom.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -107,25 +107,21 @@ def _infer_type(
107107
Raises:
108108
ValueError: If the type field is invalid or no loader can handle the data format
109109
"""
110-
# Check for explicit type field first (most efficient)
111-
if data is not None and "type" in data:
112-
try:
113-
# Try to convert the type string to enum
114-
explicit_type = CustomDatasetType(data["type"])
115-
LoaderClass = plugins.get_class(
116-
PluginType.CUSTOM_DATASET_LOADER, explicit_type
117-
)
118-
if not LoaderClass.can_load(data, filename):
119-
raise ValueError(
120-
f"Explicit type field {explicit_type} specified, but loader {LoaderClass.__name__} "
121-
"cannot handle the data format. Please specify --custom-dataset-type explicitly."
122-
)
123-
self.info(f"Using explicit type field: {explicit_type}")
124-
return explicit_type
125-
except (ValueError, KeyError) as e:
110+
# Check for explicit type field first (most efficient).
111+
# Skip values that aren't known dataset types (e.g. Bailian's "type": "text"
112+
# is a request type, not a dataset type) and fall through to structural detection.
113+
if data is not None and data.get("type") in CustomDatasetType:
114+
explicit_type = CustomDatasetType(data["type"])
115+
LoaderClass = plugins.get_class(
116+
PluginType.CUSTOM_DATASET_LOADER, explicit_type
117+
)
118+
if not LoaderClass.can_load(data, filename):
126119
raise ValueError(
127-
f"Invalid type field value: {data['type']}. Please specify --custom-dataset-type explicitly."
128-
) from e
120+
f"Explicit type field {explicit_type} specified, but loader {LoaderClass.__name__} "
121+
"cannot handle the data format. Please specify --custom-dataset-type explicitly."
122+
)
123+
self.info(f"Using explicit type field: {explicit_type}")
124+
return explicit_type
129125

130126
detected_type = None
131127
for entry, LoaderClass in plugins.iter_all(PluginType.CUSTOM_DATASET_LOADER):

src/aiperf/dataset/loader/base_trace_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def _preprocess_trace(self, trace: TraceT) -> None:
9191
9292
Called after parsing but before filtering. Default is a no-op.
9393
"""
94+
pass
9495

9596
@abstractmethod
9697
def _group_traces(self, items: list[TraceT]) -> dict[str, list[TraceT]]:

src/aiperf/dataset/loader/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,8 @@ class BailianTrace(AIPerfBaseModel):
263263
belong to the same session and are ordered by ``turn``.
264264
265265
Important: Bailian traces use a block size of 16 tokens per salted SipHash
266-
block. Set ``--isl-block-size 16`` when using this format.
266+
block. Use ``--isl-block-size 16`` when using this format (this is set
267+
automatically in CLI flows).
267268
268269
Examples:
269270
- Root request: ``{"chat_id": 159, "parent_chat_id": -1, "timestamp": 61.114, "input_length": 521, "output_length": 132, "type": "text", "turn": 1, "hash_ids": [1089, 1090, 1091]}``

tests/unit/dataset/loader/test_can_load.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77
from pytest import param
88

9+
from aiperf.dataset.loader.bailian_trace import BailianTraceDatasetLoader
910
from aiperf.dataset.loader.mooncake_trace import MooncakeTraceDatasetLoader
1011
from aiperf.dataset.loader.multi_turn import MultiTurnDatasetLoader
1112
from aiperf.dataset.loader.random_pool import RandomPoolDatasetLoader
@@ -167,6 +168,8 @@ class TestCustomDatasetComposerInferType:
167168
param({"input_length": 100, "output_length": 50}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_input_length"),
168169
param({"type": "mooncake_trace", "input_length": 100}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_explicit"),
169170
param({"text_input": "Hello"}, None, CustomDatasetType.MOONCAKE_TRACE, id="mooncake_text_input"),
171+
param({"type": "bailian_trace", "chat_id": 1, "timestamp": 0.0, "input_length": 100, "output_length": 50}, None, CustomDatasetType.BAILIAN_TRACE, id="bailian_explicit"),
172+
param({"chat_id": 1, "timestamp": 0.0, "input_length": 100, "output_length": 50, "type": "text"}, None, CustomDatasetType.BAILIAN_TRACE, id="bailian_structural_with_request_type"),
170173
],
171174
) # fmt: skip
172175
def test_infer_from_data(
@@ -201,6 +204,15 @@ def test_infer_from_data_raises(self, create_user_config_and_composer, data):
201204
with pytest.raises(ValueError, match="No loader can handle"):
202205
composer._infer_type(data)
203206

207+
def test_infer_explicit_type_loader_rejects_raises(
208+
self, create_user_config_and_composer
209+
):
210+
"""Test that a recognized type field with incompatible data raises ValueError."""
211+
_, composer = create_user_config_and_composer()
212+
data = {"type": "single_turn", "input_length": 100}
213+
with pytest.raises(ValueError, match="cannot handle the data format"):
214+
composer._infer_type(data)
215+
204216
def test_infer_random_pool_with_directory(self, create_user_config_and_composer):
205217
"""Test inferring RandomPool with directory path."""
206218
_, composer = create_user_config_and_composer()
@@ -357,3 +369,57 @@ def test_directory_path_uniquely_identifies_random_pool(self):
357369
assert SingleTurnDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
358370
assert MultiTurnDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
359371
assert MooncakeTraceDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
372+
assert BailianTraceDatasetLoader.can_load(data=None, filename=temp_path) is False # fmt: skip
373+
374+
375+
class TestUnrecognizedTypeFieldFallback:
376+
"""Tests for graceful handling of unrecognized 'type' field values.
377+
378+
Some trace formats (e.g. Bailian) include a 'type' field that represents
379+
something other than the dataset type (e.g. request type: text/search/image).
380+
The inference logic should fall back to structural detection instead of raising."""
381+
382+
def test_bailian_type_field_falls_through_to_structural_detection(
383+
self, create_user_config_and_composer
384+
):
385+
"""Bailian data with type='text' should infer as bailian_trace, not raise."""
386+
_, composer = create_user_config_and_composer()
387+
data = {
388+
"chat_id": 159,
389+
"parent_chat_id": -1,
390+
"timestamp": 61.114,
391+
"input_length": 521,
392+
"output_length": 132,
393+
"type": "text",
394+
"turn": 1,
395+
"hash_ids": [1089, 1090, 1091],
396+
}
397+
result = composer._infer_type(data)
398+
assert result == CustomDatasetType.BAILIAN_TRACE
399+
400+
@pytest.mark.parametrize(
401+
"type_value",
402+
[
403+
param("text", id="text"),
404+
param("search", id="search"),
405+
param("image", id="image"),
406+
param("file", id="file"),
407+
param("unknown_garbage", id="garbage"),
408+
],
409+
) # fmt: skip
410+
def test_unrecognized_type_field_does_not_raise(
411+
self, create_user_config_and_composer, type_value
412+
):
413+
"""Unrecognized type field values should not raise during inference."""
414+
_, composer = create_user_config_and_composer()
415+
data = {
416+
"chat_id": 1,
417+
"parent_chat_id": -1,
418+
"timestamp": 0.0,
419+
"input_length": 100,
420+
"output_length": 50,
421+
"type": type_value,
422+
"turn": 1,
423+
}
424+
result = composer._infer_type(data)
425+
assert result == CustomDatasetType.BAILIAN_TRACE

0 commit comments

Comments
 (0)