
Commit 1de2262

docs: add models module to code reference (#101)
* Add example notebook showing how to use image contexts
* change 101 -> tutorial
* update _README.md with info on the new tutorial
* add reference in mkdocs.yml
* simplify vlm tutorial
* update num_records on tutorials. Update .gitignore
* update readme info
* add models module to code reference
* fix links to generated ipynb
* change vlm in example tutorial to llama4-scout
1 parent 8ccb724 commit 1de2262

File tree

4 files changed: +117 -1 lines changed


docs/code_reference/models.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# Models
+
+The `models` module defines configuration objects for model-based generation. [ModelProvider](#data_designer.config.models.ModelProvider) specifies connection and authentication details for custom providers. [ModelConfig](#data_designer.config.models.ModelConfig) encapsulates model details, including the model alias, identifier, and inference parameters. [InferenceParameters](#data_designer.config.models.InferenceParameters) controls model behavior through settings like `temperature`, `top_p`, and `max_tokens`, with support for both fixed values and distribution-based sampling. The module also includes [ImageContext](#data_designer.config.models.ImageContext) for providing image inputs to multimodal models.
+
+For more information on how they are used, see below:
+
+- **[Model Providers](../concepts/models/model-providers.md)**
+- **[Model Configs](../concepts/models/model-configs.md)**
+- **[Image Context](/notebooks/4-providing-images-as-context/)**
+
+::: data_designer.config.models

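For orientation, here is a minimal, hypothetical sketch of how these objects fit together, using only fields visible in this commit. The import path follows the `::: data_designer.config.models` directive above; the alias and parameter values are illustrative placeholders, and the model identifier is the one used in the tutorial diff below.

```python
from data_designer.config.models import InferenceParameters, ModelConfig

# Model config with fixed inference parameters; the alias is the name that
# column configurations use to reference this model.
vision_model = ModelConfig(
    alias="vision",                                # placeholder alias
    model="meta/llama-4-scout-17b-16e-instruct",   # model id from the tutorial below
    inference_parameters=InferenceParameters(
        temperature=0.60,
        top_p=0.95,        # placeholder value
        max_tokens=1024,   # placeholder value
    ),
)
```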
docs/notebook_source/4-providing-images-as-context.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@
 model_configs = [
     ModelConfig(
         alias="vision",
-        model="nvidia/nemotron-nano-12b-v2-vl",
+        model="meta/llama-4-scout-17b-16e-instruct",
         provider=MODEL_PROVIDER,
         inference_parameters=InferenceParameters(
             temperature=0.60,

mkdocs.yml

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ nav:
     - Seeding with an External Dataset: notebooks/3-seeding-with-a-dataset.ipynb
     - Providing Images as Context: notebooks/4-providing-images-as-context.ipynb
   - Code Reference:
+    - models: code_reference/models.md
     - column_configs: code_reference/column_configs.md
     - config_builder: code_reference/config_builder.md
     - data_designer_config: code_reference/data_designer_config.md

src/data_designer/config/models.py

Lines changed: 104 additions & 0 deletions
@@ -25,15 +25,21 @@
 
 
 class Modality(str, Enum):
+    """Supported modality types for multimodal model data."""
+
     IMAGE = "image"
 
 
 class ModalityDataType(str, Enum):
+    """Data type formats for multimodal data."""
+
     URL = "url"
     BASE64 = "base64"
 
 
 class ImageFormat(str, Enum):
+    """Supported image formats for image modality."""
+
     PNG = "png"
     JPG = "jpg"
     JPEG = "jpeg"
@@ -42,6 +48,8 @@ class ImageFormat(str, Enum):
 
 
 class DistributionType(str, Enum):
+    """Types of distributions for sampling inference parameters."""
+
     UNIFORM = "uniform"
     MANUAL = "manual"

@@ -56,10 +64,27 @@ def get_context(self, record: dict) -> dict[str, Any]: ...
 
 
 class ImageContext(ModalityContext):
+    """Configuration for providing image context to multimodal models.
+
+    Attributes:
+        modality: The modality type (always "image").
+        column_name: Name of the column containing image data.
+        data_type: Format of the image data ("url" or "base64").
+        image_format: Image format (required for base64 data).
+    """
+
     modality: Modality = Modality.IMAGE
     image_format: Optional[ImageFormat] = None
 
     def get_context(self, record: dict) -> dict[str, Any]:
+        """Get the context for the image modality.
+
+        Args:
+            record: The record containing the image data.
+
+        Returns:
+            The context for the image modality.
+        """
         context = dict(type="image_url")
         context_value = record[self.column_name]
         if self.data_type == ModalityDataType.URL:
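A hedged sketch of how `ImageContext` might be used on its own, based only on the attributes and the visible part of `get_context`; the column name, record, and URL are made up.

```python
from data_designer.config.models import ImageContext, ModalityDataType

# Image context that pulls a URL from the "product_image" column of each record.
image_context = ImageContext(
    column_name="product_image",        # hypothetical column name
    data_type=ModalityDataType.URL,     # URL data needs no image_format
)

record = {"product_image": "https://example.com/sneaker.png"}
# get_context builds the "image_url"-style message part for a multimodal request.
print(image_context.get_context(record))
```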
@@ -90,6 +115,13 @@ def sample(self) -> float: ...
 
 
 class ManualDistributionParams(ConfigBase):
+    """Parameters for manual distribution sampling.
+
+    Attributes:
+        values: List of possible values to sample from.
+        weights: Optional list of weights for each value. If not provided, all values have equal probability.
+    """
+
     values: List[float] = Field(min_length=1)
     weights: Optional[List[float]] = None

@@ -107,14 +139,36 @@ def _validate_equal_lengths(self) -> Self:
 
 
 class ManualDistribution(Distribution[ManualDistributionParams]):
+    """Manual (discrete) distribution for sampling inference parameters.
+
+    Samples from a discrete set of values with optional weights. Useful for testing
+    specific values or creating custom probability distributions for temperature or top_p.
+
+    Attributes:
+        distribution_type: Type of distribution ("manual").
+        params: Distribution parameters (values, weights).
+    """
+
     distribution_type: Optional[DistributionType] = "manual"
     params: ManualDistributionParams
 
     def sample(self) -> float:
+        """Sample a value from the manual distribution.
+
+        Returns:
+            A float value sampled from the manual distribution.
+        """
         return float(np.random.choice(self.params.values, p=self.params.weights))
 
 
 class UniformDistributionParams(ConfigBase):
+    """Parameters for uniform distribution sampling.
+
+    Attributes:
+        low: Lower bound (inclusive).
+        high: Upper bound (exclusive).
+    """
+
     low: float
     high: float

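A small usage sketch for the manual distribution defined above; the values and weights are arbitrary, and `distribution_type` is left at its default.

```python
from data_designer.config.models import ManualDistribution, ManualDistributionParams

# Discrete distribution over three temperatures; 0.7 is drawn half the time.
temperature_dist = ManualDistribution(
    params=ManualDistributionParams(values=[0.3, 0.7, 1.0], weights=[0.25, 0.5, 0.25]),
)
print(temperature_dist.sample())  # e.g. 0.7
```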
@@ -126,17 +180,43 @@ def _validate_low_lt_high(self) -> Self:
 
 
 class UniformDistribution(Distribution[UniformDistributionParams]):
+    """Uniform distribution for sampling inference parameters.
+
+    Samples values uniformly between low and high bounds. Useful for exploring
+    a continuous range of values for temperature or top_p.
+
+    Attributes:
+        distribution_type: Type of distribution ("uniform").
+        params: Distribution parameters (low, high).
+    """
+
     distribution_type: Optional[DistributionType] = "uniform"
     params: UniformDistributionParams
 
     def sample(self) -> float:
+        """Sample a value from the uniform distribution.
+
+        Returns:
+            A float value sampled from the uniform distribution.
+        """
         return float(np.random.uniform(low=self.params.low, high=self.params.high, size=1)[0])
 
 
 DistributionT: TypeAlias = Union[UniformDistribution, ManualDistribution]
 
 
 class InferenceParameters(ConfigBase):
+    """Configuration for LLM inference parameters.
+
+    Attributes:
+        temperature: Sampling temperature (0.0-2.0). Can be a fixed value or a distribution for dynamic sampling.
+        top_p: Nucleus sampling probability (0.0-1.0). Can be a fixed value or a distribution for dynamic sampling.
+        max_tokens: Maximum number of tokens (includes both input and output tokens).
+        max_parallel_requests: Maximum number of parallel requests to the model API.
+        timeout: Timeout in seconds for each request.
+        extra_body: Additional parameters to pass to the model API.
+    """
+
     temperature: Optional[Union[float, DistributionT]] = None
     top_p: Optional[Union[float, DistributionT]] = None
     max_tokens: Optional[int] = Field(default=None, ge=1)
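Putting the two pieces together, a sketch of an `InferenceParameters` object whose temperature is sampled from a uniform distribution rather than fixed; the bounds and other values are illustrative.

```python
from data_designer.config.models import (
    InferenceParameters,
    UniformDistribution,
    UniformDistributionParams,
)

# Temperature drawn uniformly from [0.5, 0.9); top_p and max_tokens stay fixed.
params = InferenceParameters(
    temperature=UniformDistribution(params=UniformDistributionParams(low=0.5, high=0.9)),
    top_p=0.95,
    max_tokens=2048,
)
```

The `generate_kwargs` property in the next hunk appears to be where distribution-valued fields are resolved into concrete floats before a request is made.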
@@ -146,6 +226,11 @@ class InferenceParameters(ConfigBase):
 
     @property
     def generate_kwargs(self) -> dict[str, Union[float, int]]:
+        """Get the generate kwargs for the inference parameters.
+
+        Returns:
+            A dictionary of the generate kwargs.
+        """
         result = {}
         if self.temperature is not None:
             result["temperature"] = (
@@ -206,13 +291,32 @@ def _is_value_in_range(self, value: float, min_value: float, max_value: float) -
 
 
 class ModelConfig(ConfigBase):
+    """Configuration for a model used for generation.
+
+    Attributes:
+        alias: User-defined alias to reference in column configurations.
+        model: Model identifier (e.g., from build.nvidia.com or other providers).
+        inference_parameters: Inference parameters for the model (temperature, top_p, max_tokens, etc.).
+        provider: Optional model provider name if using custom providers.
+    """
+
     alias: str
     model: str
     inference_parameters: InferenceParameters = Field(default_factory=InferenceParameters)
     provider: Optional[str] = None
 
 
 class ModelProvider(ConfigBase):
+    """Configuration for a custom model provider.
+
+    Attributes:
+        name: Name of the model provider.
+        endpoint: API endpoint URL for the provider.
+        provider_type: Provider type (default: "openai"). Determines the API format to use.
+        api_key: Optional API key for authentication.
+        extra_body: Additional parameters to pass in API requests.
+    """
+
     name: str
     endpoint: str
     provider_type: str = "openai"

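Finally, a hedged sketch of declaring a custom OpenAI-compatible provider and referencing it by name from a `ModelConfig`. The provider name, endpoint URL, and environment variable are placeholders, and only fields shown in this commit are used.

```python
import os

from data_designer.config.models import ModelConfig, ModelProvider

# Custom provider: an OpenAI-style endpoint plus optional authentication.
custom_provider = ModelProvider(
    name="my-provider",                              # placeholder name
    endpoint="https://example.com/v1",               # placeholder endpoint URL
    provider_type="openai",
    api_key=os.environ.get("MY_PROVIDER_API_KEY"),   # placeholder env var
)

# The ModelConfig refers to the provider by its name.
scout = ModelConfig(
    alias="scout-vision",
    model="meta/llama-4-scout-17b-16e-instruct",
    provider=custom_provider.name,
)
```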