Ak/spec #372
First changed file (new file, +107 lines; the file path is not shown in this capture):

```python
from __future__ import annotations

from typing import Any, Literal

from .compat import BaseModel, root_validator

DbldatagenBasicType = Literal[
    "string",
    "int",
    "long",
    "float",
    "double",
    "decimal",
    "boolean",
    "date",
    "timestamp",
    "short",
    "byte",
    "binary",
    "integer",
    "bigint",
    "tinyint",
]
"""Type alias representing the supported basic Spark SQL data types for column definitions.

Includes both standard SQL types (e.g. string, int, double) and Spark-specific type names
(e.g. bigint, tinyint). These types are used in the ColumnDefinition to specify the data type
for generated columns.
"""


class ColumnDefinition(BaseModel):
    """Defines the specification for a single column in a synthetic data table.

    This class encapsulates all the information needed to generate data for a single column,
    including its name, type, constraints, and generation options. It supports both primary key
    columns and derived columns that can reference other columns.

    :param name: Name of the column to be generated
    :param type: Spark SQL data type for the column (e.g., "string", "int", "timestamp").
        If None, the type may be inferred from options or baseColumn
    :param primary: If True, this column is treated as a primary key column with unique values.
        Primary columns cannot have min/max options and cannot be nullable
    :param options: Dictionary of additional options controlling column generation behavior.
        Common options include: min, max, step, values, template, distribution, etc.
        See the dbldatagen documentation for the full list of available options
    :param nullable: If True, the column may contain NULL values. Primary columns cannot be nullable
    :param omit: If True, this column is generated internally but excluded from the final output.
        Useful for intermediate columns used in calculations
    :param baseColumn: Name of another column to use as the basis for generating this column's
        values. Default is "id", which refers to the internal row identifier
    :param baseColumnType: Method for deriving values from the baseColumn. Common values:
        "auto" (infer behavior), "hash" (hash the base column values),
        "values" (use base column values directly)

    .. note::
        Primary columns have special constraints:

        - Must have a type defined
        - Cannot have min/max options
        - Cannot be nullable

    .. note::
        Columns can be chained via baseColumn references, but circular dependencies
        will be caught during validation
    """

    name: str
    type: DbldatagenBasicType | None = None
    primary: bool = False
    options: dict[str, Any] | None = None
    nullable: bool | None = False
    omit: bool | None = False
    baseColumn: str | None = "id"
    baseColumnType: str | None = "auto"

    @root_validator()
    def check_model_constraints(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Validates constraints across the entire ColumnDefinition model.

        This validator runs after all individual field validators and checks for cross-field
        constraints that depend on multiple fields being set. It ensures that primary key
        columns meet all necessary requirements and that conflicting options are not specified.

        :param values: Dictionary of all field values for this ColumnDefinition instance
        :returns: The validated values dictionary, unmodified, if all validations pass
        :raises ValueError: If a primary column has min/max options, is nullable,
            or has no type defined

        .. note::
            This is a Pydantic root validator that runs automatically during model instantiation
        """
        is_primary = values.get("primary")
        options = values.get("options") or {}  # Handle the None case
        name = values.get("name")
        is_nullable = values.get("nullable")
        column_type = values.get("type")

        if is_primary:
            if "min" in options or "max" in options:
                raise ValueError(f"Primary column '{name}' cannot have min/max options.")

            if is_nullable:
                raise ValueError(f"Primary column '{name}' cannot be nullable.")

            if column_type is None:
                raise ValueError(f"Primary column '{name}' must have a type defined.")

        return values
```

Review thread on the validator's `:returns:` docstring line (marked resolved):

**Collaborator:** Docstring might be a bit misleading here? We will only ever return the unmodified values dictionary, right?

**Author:** Not always, we raise an exception if it is not valid.
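For illustration, here is a minimal usage sketch of the model above. It is not part of this PR, and the import path `dbldatagen.spec` is an assumption — the diff does not show the new file's path:

```python
# A sketch of how ColumnDefinition behaves; the import path below is an
# assumption, since the diff does not name the module that defines it.
from dbldatagen.spec import ColumnDefinition  # hypothetical import path

# A valid primary key column: typed, non-nullable, no min/max options.
customer_id = ColumnDefinition(name="customer_id", type="long", primary=True)

# A derived column whose values are hashes of the primary key column.
email = ColumnDefinition(
    name="email",
    type="string",
    baseColumn="customer_id",
    baseColumnType="hash",
)

# An invalid definition: primary columns may not carry min/max options, so the
# root validator raises at construction time. Under the Pydantic V1 API,
# ValidationError subclasses ValueError, so this except clause catches it.
try:
    ColumnDefinition(name="bad_id", type="long", primary=True, options={"min": 1, "max": 100})
except ValueError as exc:
    print(exc)  # message includes: Primary column 'bad_id' cannot have min/max options.
```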
Second changed file (new file, +57 lines) — the Pydantic compatibility module, compat.py:

```python
"""Pydantic compatibility layer for supporting both Pydantic V1 and V2.

This module provides a unified interface for Pydantic functionality that works across both
Pydantic V1.x and V2.x versions. It ensures that the dbldatagen spec API works in multiple
environments without requiring specific Pydantic version installations.

The module exports a consistent Pydantic V1-compatible API regardless of which version is installed:

- **BaseModel**: Base class for all Pydantic models
- **Field**: Field definition with metadata and validation
- **constr**: Constrained string type for validation
- **root_validator**: Decorator for model-level validation
- **validator**: Decorator for field-level validation

Usage in other modules:
    Always import from this compat module, not directly from pydantic::

        # Correct
        from .compat import BaseModel, validator

        # Incorrect - don't do this
        from pydantic import BaseModel, validator

Environment Support:
    - **Pydantic V2.x environments**: Imports from the pydantic.v1 compatibility layer
    - **Pydantic V1.x environments**: Imports directly from the pydantic package
    - **Databricks runtimes**: Works with pre-installed Pydantic versions without conflicts

.. note::
    This approach is inspired by FastAPI's compatibility layer:
    https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py

Benefits:
    - **No Installation Required**: Works with whatever Pydantic version is available
    - **Single Codebase**: One set of code works across both Pydantic versions
    - **Environment Agnostic**: Application code doesn't need to know which version is installed
    - **Future-Ready**: Easy migration path to the Pydantic V2 API when ready
    - **Databricks Compatible**: Avoids conflicts with pre-installed libraries

Future Migration:
    When ready to migrate to the native Pydantic V2 API:

    1. Update application code to use V2 patterns
    2. Modify this compat.py to import from native V2 locations
    3. Test in both environments
    4. Deploy incrementally
"""

try:
    # This will succeed on environments with Pydantic V2.x, which ships a
    # pydantic.v1 compatibility layer for backwards compatibility.
    from pydantic.v1 import BaseModel, Field, constr, root_validator, validator
except ImportError:
    # This runs on environments with only Pydantic V1.x; import directly from
    # pydantic, since the v1 subpackage doesn't exist there.
    from pydantic import BaseModel, Field, constr, root_validator, validator  # type: ignore[assignment,no-redef]

__all__ = ["BaseModel", "Field", "constr", "root_validator", "validator"]
```
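As a hedged illustration of the compat layer in practice (not part of the diff): a consumer model written against the V1-style API behaves identically on either Pydantic major version. In package code you would write `from .compat import ...`; here the same fallback import is inlined so the sketch runs standalone, and `TableSpec` is a hypothetical model invented for this example:

```python
# Standalone sketch; mirrors the compat module's fallback so it runs anywhere.
try:
    from pydantic.v1 import BaseModel, Field, constr, validator
except ImportError:
    from pydantic import BaseModel, Field, constr, validator


class TableSpec(BaseModel):  # hypothetical model, for illustration only
    # constr enforces a non-empty string via the V1-style constrained type.
    name: constr(min_length=1)
    rows: int = Field(1000, ge=1, description="Number of rows to generate")

    @validator("name")
    def name_must_not_be_blank(cls, v: str) -> str:
        # Field-level validation using the V1-style decorator.
        if not v.strip():
            raise ValueError("name must not be blank")
        return v


spec = TableSpec(name="customers", rows=5000)
print(spec.rows)  # 5000
```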