Commits
20db964  working test wireup (anupkalburgi, Sep 23, 2025)
4e9cba5  Initial code, spec and test, pushing for review (anupkalburgi, Nov 4, 2025)
d37de68  fixing tests (anupkalburgi, Nov 5, 2025)
f5214ca  changes to make file (anupkalburgi, Nov 6, 2025)
0a8fa2a  updating docs (anupkalburgi, Nov 6, 2025)
61f676d  fixing tests, removing the solved todos, targets to a diff module (anupkalburgi, Nov 17, 2025)
e139c8b  converting to camelCase (anupkalburgi, Nov 17, 2025)
52a4283  validation into a diff module (anupkalburgi, Nov 17, 2025)
a0ce13b  removing compat/scratch notes (anupkalburgi, Dec 2, 2025)
f6c9a69  marking the spec module experimental (anupkalburgi, Dec 2, 2025)
71e6451  Add methods for persisting generated data (#352) (ghanse, Sep 23, 2025)
de1bb75  Format modules (#367) (ghanse, Oct 7, 2025)
00da1cd  Example notebook for Oil and Gas industry (#363) (brettaulbaugh-db, Oct 14, 2025)
7a93b01  Example notebook for Gaming industry (#362) (ManishNamburi, Oct 15, 2025)
7cb3789  Format `data_analyzer` and `text_generation_plugins` (#370) (ghanse, Oct 20, 2025)
4eb173d  Example notebook for Retail and Consumer Packaged Goods industry (#366) (Adamdion, Oct 21, 2025)
da8bc16  renaming the keys and falling back to absolute imports (anupkalburgi, Dec 10, 2025)
99d5526  making methods static, and fixing tests (anupkalburgi, Dec 10, 2025)
da9e6ef  Merge branch 'master' into ak/spec (anupkalburgi, Dec 11, 2025)
7da16dd  renaming tables to datasets in docs and test, and black fmt changes (anupkalburgi, Dec 11, 2025)
59bd523  fixing the makefile and the project toml after merge (anupkalburgi, Dec 11, 2025)
107 changes: 107 additions & 0 deletions dbldatagen/spec/column_spec.py
@@ -0,0 +1,107 @@
from __future__ import annotations

from typing import Any, Literal

from .compat import BaseModel, root_validator


DbldatagenBasicType = Literal[
"string",
"int",
"long",
"float",
"double",
"decimal",
"boolean",
"date",
"timestamp",
"short",
"byte",
"binary",
"integer",
"bigint",
"tinyint",
]
"""Type alias representing supported basic Spark SQL data types for column definitions.

Includes both standard SQL types (e.g. string, int, double) and Spark-specific type names
(e.g. bigint, tinyint). These types are used in the ColumnDefinition to specify the data type
for generated columns.
"""


class ColumnDefinition(BaseModel):
"""Defines the specification for a single column in a synthetic data table.

This class encapsulates all the information needed to generate data for a single column,
including its name, type, constraints, and generation options. It supports both primary key
columns and derived columns that can reference other columns.

:param name: Name of the column to be generated
:param type: Spark SQL data type for the column (e.g., "string", "int", "timestamp").
If None, type may be inferred from options or baseColumn
:param primary: If True, this column will be treated as a primary key column with unique values.
Primary columns cannot have min/max options and cannot be nullable
:param options: Dictionary of additional options controlling column generation behavior.
Common options include: min, max, step, values, template, distribution, etc.
See dbldatagen documentation for full list of available options
:param nullable: If True, the column may contain NULL values. Primary columns cannot be nullable
:param omit: If True, this column will be generated internally but excluded from the final output.
Useful for intermediate columns used in calculations
:param baseColumn: Name of another column to use as the basis for generating this column's values.
Default is "id" which refers to the internal row identifier
:param baseColumnType: Method for deriving values from the baseColumn. Common values:
"auto" (infer behavior), "hash" (hash the base column values),
"values" (use base column values directly)

.. note::
Primary columns have special constraints:
- Must have a type defined
- Cannot have min/max options
- Cannot be nullable

.. note::
Columns can be chained via baseColumn references, but circular dependencies
will be caught during validation
"""
name: str
type: DbldatagenBasicType | None = None
primary: bool = False
options: dict[str, Any] | None = None
nullable: bool | None = False
omit: bool | None = False
baseColumn: str | None = "id"
baseColumnType: str | None = "auto"

@root_validator()
def check_model_constraints(cls, values: dict[str, Any]) -> dict[str, Any]:
"""Validates constraints across the entire ColumnDefinition model.

This validator runs after all individual field validators and checks for cross-field
constraints that depend on multiple fields being set. It ensures that primary key
columns meet all necessary requirements and that conflicting options are not specified.

:param values: Dictionary of all field values for this ColumnDefinition instance
:returns: The validated values dictionary, unmodified if all validations pass
Review comment (Collaborator):
Docstring might be a bit misleading here? We will only ever return the unmodified values dictionary, right?

Reply (Collaborator, Author):
Not always, we raise an exception if it is not valid.

:raises ValueError: If primary column has min/max options, or if primary column is nullable,
or if primary column doesn't have a type defined

.. note::
This is a Pydantic root validator that runs automatically during model instantiation
"""
is_primary = values.get("primary")
options = values.get("options") or {} # Handle None case
name = values.get("name")
is_nullable = values.get("nullable")
column_type = values.get("type")

if is_primary:
if "min" in options or "max" in options:
raise ValueError(f"Primary column '{name}' cannot have min/max options.")

if is_nullable:
raise ValueError(f"Primary column '{name}' cannot be nullable.")

if column_type is None:
raise ValueError(f"Primary column '{name}' must have a type defined.")
return values
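
For readers of this diff, here is a minimal usage sketch of ColumnDefinition, assuming the package layout added above (dbldatagen/spec/column_spec.py); the column names and option values are illustrative only:

```python
# Minimal usage sketch; the import path assumes the dbldatagen/spec/column_spec.py
# module added in this PR. Column names and option values are made up for illustration.
from dbldatagen.spec.column_spec import ColumnDefinition

# An ordinary derived column: type and generation options are passed through as-is.
amount = ColumnDefinition(name="amount", type="decimal", options={"min": 0, "max": 1000})

# A primary key column: must declare a type, cannot be nullable, cannot carry min/max.
customer_id = ColumnDefinition(name="customer_id", type="long", primary=True)

# Violating a primary-key constraint fails at construction time; the ValueError raised
# in check_model_constraints is surfaced by Pydantic as a ValidationError.
try:
    ColumnDefinition(name="bad_id", type="long", primary=True, options={"min": 1})
except Exception as exc:  # pydantic.ValidationError in practice
    print(exc)
```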
57 changes: 57 additions & 0 deletions dbldatagen/spec/compat.py
@@ -0,0 +1,57 @@
"""Pydantic compatibility layer for supporting both Pydantic V1 and V2.

This module provides a unified interface for Pydantic functionality that works across both
Pydantic V1.x and V2.x versions. It ensures that the dbldatagen spec API works in multiple
environments without requiring specific Pydantic version installations.

The module exports a consistent Pydantic V1-compatible API regardless of which version is installed:

- **BaseModel**: Base class for all Pydantic models
- **Field**: Field definition with metadata and validation
- **constr**: Constrained string type for validation
- **root_validator**: Decorator for model-level validation
- **validator**: Decorator for field-level validation

Usage in other modules:
Always import from this compat module, not directly from pydantic::

# Correct
from .compat import BaseModel, validator

# Incorrect - don't do this
from pydantic import BaseModel, validator

Environment Support:
- **Pydantic V2.x environments**: Imports from pydantic.v1 compatibility layer
- **Pydantic V1.x environments**: Imports directly from pydantic package
- **Databricks runtimes**: Works with pre-installed Pydantic versions without conflicts

.. note::
This approach is inspired by FastAPI's compatibility layer:
https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py

Benefits:
- **No Installation Required**: Works with whatever Pydantic version is available
- **Single Codebase**: One set of code works across both Pydantic versions
- **Environment Agnostic**: Application code doesn't need to know which version is installed
- **Future-Ready**: Easy migration path to Pydantic V2 API when ready
- **Databricks Compatible**: Avoids conflicts with pre-installed libraries

Future Migration:
When ready to migrate to native Pydantic V2 API:
1. Update application code to use V2 patterns
2. Modify this compat.py to import from native V2 locations
3. Test in both environments
4. Deploy incrementally
"""

try:
# This will succeed on environments with Pydantic V2.x
# Pydantic V2 provides a v1 compatibility layer for backwards compatibility
from pydantic.v1 import BaseModel, Field, constr, root_validator, validator
except ImportError:
# This will be executed on environments with only Pydantic V1.x
# Import directly from pydantic since v1 subpackage doesn't exist
from pydantic import BaseModel, Field, constr, root_validator, validator # type: ignore[assignment,no-redef]

__all__ = ["BaseModel", "Field", "constr", "root_validator", "validator"]
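
As a quick illustration of the import pattern this module prescribes, the sketch below defines a small model against the compat layer; the model and field names are hypothetical and only demonstrate that the V1-style decorators resolve on either Pydantic version:

```python
# Hypothetical example model built on the compat layer; DatasetName and its field are
# not part of dbldatagen, they only demonstrate the Pydantic V1-style API.
from dbldatagen.spec.compat import BaseModel, validator


class DatasetName(BaseModel):
    name: str

    @validator("name")
    def no_spaces(cls, value: str) -> str:
        # Field-level validation uses the V1-style decorator; compat resolves it to
        # pydantic.v1.validator on Pydantic 2.x and pydantic.validator on 1.x.
        if " " in value:
            raise ValueError("dataset names must not contain spaces")
        return value


print(DatasetName(name="transactions"))
```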