ai-dynamo · ajcasagrande · Feb 25, 2026 · ajcasagrande · Feb 25, 2026 · matthewkotila
diff --git a/docs/cli_options.md b/docs/cli_options.md
@@ -401,6 +401,11 @@ Image file format for generated images. Choose `png` for lossless compression (l
 | `jpeg` |  | JPEG format. Lossy compression, smaller file sizes, good for photos. |
 | `random` |  | Randomly select PNG or JPEG for each image. |
 
+#### `--image-source` `<str>`
+
+Source image generation mode. `assets` loads images from the built-in `assets/source_images` directory (ships with a small set of 4 images). `noise` generates random noise images on the fly, providing a larger and more diverse pool without requiring files on disk. Noise mode is useful for stressing multimodal pipelines more realistically. A path to a directory loads images from the given directory (e.g. `--image-source ./source_images`).
+<br>_Default: `assets`_
+
 ### Video Input
 
 #### `--video-batch-size`, `--batch-size-video` `<int>`

diff --git a/src/aiperf/common/config/__init__.py b/src/aiperf/common/config/__init__.py
@@ -5,11 +5,9 @@
 from aiperf.common.config.base_config import BaseConfig
 from aiperf.common.config.cli_parameter import CLIParameter, DisableCLI
 from aiperf.common.config.config_defaults import (
-    AudioDefaults,
     CLIDefaults,
     ConversationDefaults,
     EndpointDefaults,
-    ImageDefaults,
     InputDefaults,
     InputTokensDefaults,
     LoadGeneratorDefaults,
@@ -23,7 +21,6 @@
     TokenizerDefaults,
     TurnDefaults,
     TurnDelayDefaults,
-    VideoDefaults,
     WorkersDefaults,
 )
 from aiperf.common.config.conversation_config import (
@@ -71,7 +68,6 @@
 
 __all__ = [
     "AudioConfig",
-    "AudioDefaults",
     "AudioLengthConfig",
     "BaseConfig",
     "BaseZMQCommunicationConfig",
@@ -85,7 +81,6 @@
     "EndpointDefaults",
     "Groups",
     "ImageConfig",
-    "ImageDefaults",
     "ImageHeightConfig",
     "ImageWidthConfig",
     "InputConfig",
@@ -118,7 +113,6 @@
     "TurnDelayDefaults",
     "UserConfig",
     "VideoConfig",
-    "VideoDefaults",
     "WorkersConfig",
     "WorkersDefaults",
     "ZMQDualBindConfig",

diff --git a/src/aiperf/common/config/audio_config.py b/src/aiperf/common/config/audio_config.py
@@ -3,11 +3,11 @@
 
 from typing import Annotated
 
-from pydantic import BeforeValidator, Field
+from pydantic import BeforeValidator, Field, model_validator
+from typing_extensions import Self
 
 from aiperf.common.config.base_config import BaseConfig
 from aiperf.common.config.cli_parameter import CLIParameter
-from aiperf.common.config.config_defaults import AudioDefaults
 from aiperf.common.config.config_validators import parse_str_or_list_of_positive_values
 from aiperf.common.config.groups import Groups
 from aiperf.common.enums import AudioFormat
@@ -23,6 +23,7 @@ class AudioLengthConfig(BaseConfig):
     mean: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Mean duration in seconds for synthetically generated audio files. Audio lengths follow a normal distribution "
             "around this mean (±`--audio-length-stddev`). Used when `--audio-batch-size` > 0 for multimodal benchmarking. "
@@ -34,11 +35,12 @@ class AudioLengthConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.LENGTH_MEAN
+    ]
 
     stddev: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Standard deviation for synthetic audio duration in seconds. Creates variability in audio lengths when > 0, "
             "simulating mixed-duration audio inputs. Durations follow normal distribution. "
@@ -50,7 +52,7 @@ class AudioLengthConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.LENGTH_STDDEV
+    ]
 
 
 class AudioConfig(BaseConfig):
@@ -63,6 +65,7 @@ class AudioConfig(BaseConfig):
     batch_size: Annotated[
         int,
         Field(
+            default=1,
             ge=0,
             description="The number of audio inputs to include in each request. Supported with the `chat` endpoint type for multimodal models.",
         ),
@@ -73,13 +76,14 @@ class AudioConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.BATCH_SIZE
+    ]
 
     length: AudioLengthConfig = AudioLengthConfig()
 
     format: Annotated[
         AudioFormat,
         Field(
+            default=AudioFormat.WAV,
             description="File format for generated audio files. Supports `wav` (uncompressed PCM, larger files) and `mp3` (compressed, smaller files). "
             "Format choice affects file size in multimodal requests but not audio characteristics (sample rate, bit depth, duration).",
         ),
@@ -89,11 +93,12 @@ class AudioConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.FORMAT
+    ]
 
     depths: Annotated[
         list[int],
         Field(
+            default=[16],
             min_length=1,
             description="List of audio bit depths in bits to randomly select from when generating audio files. Each audio file is assigned "
             "a random depth from this list. Common values: `8` (low quality), `16` (CD quality), `24` (professional), `32` (high-end). "
@@ -106,11 +111,12 @@ class AudioConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.DEPTHS
+    ]
 
     sample_rates: Annotated[
         list[float],
         Field(
+            default=[16.0],
             min_length=1,
             description="A list of audio sample rates to randomly select from in kHz.\n"
             "Common sample rates are 16, 44.1, 48, 96, etc.",
@@ -122,11 +128,12 @@ class AudioConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.SAMPLE_RATES
+    ]
 
     num_channels: Annotated[
         int,
         Field(
+            default=1,
             ge=1,
             le=2,
             description="Number of audio channels for synthetic audio generation. `1` = mono (single channel), `2` = stereo (left/right channels). "
@@ -139,4 +146,18 @@ class AudioConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = AudioDefaults.NUM_CHANNELS
+    ]
+
+    @model_validator(mode="after")
+    def _validate_audio_options(self) -> Self:
+        """Raise ValueError if any audio field (on this model or on `length`) was set while `audio_enabled()` is False; otherwise return self."""
+        audio_options_set = {*self.model_fields_set, *self.length.model_fields_set}  # union of set field names from this model and its nested `length` model
+        if not self.audio_enabled() and audio_options_set:
+            raise ValueError(
+                "Audio generation is disabled but audio options were provided. Please set `--audio-batch-size` and `--audio-length-mean` to enable audio generation."
+            )
+        return self
+
+    def audio_enabled(self) -> bool:
+        """Return True only when both `length.mean` and `batch_size` are greater than zero."""
+        return self.length.mean > 0 and self.batch_size > 0
diff --git a/src/aiperf/common/config/config_defaults.py b/src/aiperf/common/config/config_defaults.py
@@ -6,14 +6,10 @@
 
 from aiperf.common.enums import (
     AIPerfLogLevel,
-    AudioFormat,
     ConnectionReuseStrategy,
     ExportLevel,
-    ImageFormat,
     ModelSelectionStrategy,
     ServerMetricsFormat,
-    VideoFormat,
-    VideoSynthType,
 )
 from aiperf.plugin.enums import (
     ArrivalPattern,
@@ -77,39 +73,6 @@ class RankingsDefaults:
     QUERY_PROMPT_TOKEN_STDDEV = 0
 
 
-@dataclass(frozen=True)
-class AudioDefaults:
-    BATCH_SIZE = 1
-    LENGTH_MEAN = 0.0
-    LENGTH_STDDEV = 0.0
-    FORMAT = AudioFormat.WAV
-    DEPTHS = [16]
-    SAMPLE_RATES = [16.0]
-    NUM_CHANNELS = 1
-
-
-@dataclass(frozen=True)
-class ImageDefaults:
-    BATCH_SIZE = 1
-    WIDTH_MEAN = 0.0
-    WIDTH_STDDEV = 0.0
-    HEIGHT_MEAN = 0.0
-    HEIGHT_STDDEV = 0.0
-    FORMAT = ImageFormat.PNG
-
-
-@dataclass(frozen=True)
-class VideoDefaults:
-    BATCH_SIZE = 1
-    DURATION = 5.0
-    FPS = 4
-    WIDTH = None
-    HEIGHT = None
-    SYNTH_TYPE = VideoSynthType.MOVING_SHAPES
-    FORMAT = VideoFormat.WEBM
-    CODEC = "libvpx-vp9"
-
-
 @dataclass(frozen=True)
 class PromptDefaults:
     BATCH_SIZE = 1

diff --git a/src/aiperf/common/config/image_config.py b/src/aiperf/common/config/image_config.py
@@ -1,15 +1,16 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+from pathlib import Path
 from typing import Annotated
 
-from pydantic import Field
+from pydantic import Field, model_validator
+from typing_extensions import Self
 
 from aiperf.common.config.base_config import BaseConfig
 from aiperf.common.config.cli_parameter import CLIParameter
-from aiperf.common.config.config_defaults import ImageDefaults
 from aiperf.common.config.groups import Groups
-from aiperf.common.enums import ImageFormat
+from aiperf.common.enums import ImageFormat, ImageSource
 
 
 class ImageHeightConfig(BaseConfig):
@@ -22,6 +23,7 @@ class ImageHeightConfig(BaseConfig):
     mean: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Mean height in pixels for synthetically generated images. Image heights follow a normal distribution "
             "around this mean (±`--image-height-stddev`). Used when `--image-batch-size` > 0 for multimodal vision benchmarking. "
@@ -33,11 +35,12 @@ class ImageHeightConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.HEIGHT_MEAN
+    ]
 
     stddev: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Standard deviation for synthetic image heights in pixels. Creates variability in vertical resolution when > 0, "
             "simulating mixed-resolution image inputs. Heights follow normal distribution. "
@@ -49,7 +52,7 @@ class ImageHeightConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.HEIGHT_STDDEV
+    ]
 
 
 class ImageWidthConfig(BaseConfig):
@@ -62,6 +65,7 @@ class ImageWidthConfig(BaseConfig):
     mean: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Mean width in pixels for synthetically generated images. Image widths follow a normal distribution "
             "around this mean (±`--image-width-stddev`). Combined with `--image-height-mean` to determine image dimensions "
@@ -73,11 +77,12 @@ class ImageWidthConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.WIDTH_MEAN
+    ]
 
     stddev: Annotated[
         float,
         Field(
+            default=0.0,
             ge=0,
             description="Standard deviation for synthetic image widths in pixels. Creates variability in horizontal resolution when > 0, "
             "simulating mixed-resolution image inputs. Widths follow normal distribution. "
@@ -89,7 +94,7 @@ class ImageWidthConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.WIDTH_STDDEV
+    ]
 
 
 class ImageConfig(BaseConfig):
@@ -104,6 +109,7 @@ class ImageConfig(BaseConfig):
     batch_size: Annotated[
         int,
         Field(
+            default=1,
             ge=0,
             description="Number of images to include in each multimodal request. Supported with `chat` endpoint type for vision-language models. "
             "Each image is generated by randomly sampling and resizing source images from `assets/source_images` directory to specified dimensions. "
@@ -116,11 +122,12 @@ class ImageConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.BATCH_SIZE
+    ]
 
     format: Annotated[
         ImageFormat,
         Field(
+            default=ImageFormat.PNG,
             description="Image file format for generated images. Choose `png` for lossless compression (larger files, best quality), "
             "`jpeg` for lossy compression (smaller files, good quality), or `random` to randomly select between PNG and JPEG for each image. "
             "Format affects file size in multimodal requests and encoding overhead.",
@@ -131,4 +138,38 @@ class ImageConfig(BaseConfig):
             ),
             group=_CLI_GROUP,
         ),
-    ] = ImageDefaults.FORMAT
+    ]
+
+    source: Annotated[
+        ImageSource | Path,
+        Field(
+            default=ImageSource.ASSETS,
+            description="Source image generation mode. `assets` loads images from the built-in `assets/source_images` directory "
+            "(ships with a small set of 4 images). `noise` generates random noise images on the fly, "
+            "providing a larger and more diverse pool without requiring files on disk. "
+            "Noise mode is useful for stressing multimodal pipelines more realistically. "
+            "A path to a directory loads images from the given directory (e.g. `--image-source ./source_images`).",
+        ),
+        CLIParameter(
+            name=("--image-source",),
+            group=_CLI_GROUP,
+        ),
+    ]
+
+    @model_validator(mode="after")
+    def _validate_image_options(self) -> Self:
+        """Raise ValueError if any image field (on this model, `width`, or `height`) was set while `images_enabled()` is False; otherwise return self."""
+        image_options_set = {  # union of set field names from this model and its nested `width` / `height` models
+            *self.model_fields_set,
+            *self.width.model_fields_set,
+            *self.height.model_fields_set,
+        }
+        if not self.images_enabled() and image_options_set:
+            raise ValueError(  # NOTE(review): message omits `--image-batch-size`, but images_enabled() also requires batch_size > 0 -- confirm intended
+                "Image generation is disabled but image options were provided. Please set `--image-width-mean` and `--image-height-mean` to enable image generation."
+            )
+        return self
+
+    def images_enabled(self) -> bool:
+        """Return True only when `width.mean`, `height.mean`, and `batch_size` are all greater than zero."""
+        return self.width.mean > 0 and self.height.mean > 0 and self.batch_size > 0