diff --git a/docs/code_reference/analysis.md b/docs/code_reference/analysis.md new file mode 100644 index 00000000..7c8e1034 --- /dev/null +++ b/docs/code_reference/analysis.md @@ -0,0 +1,31 @@ +# Analysis + +The `analysis` modules provide tools for profiling and analyzing generated datasets. They include statistics tracking, column profiling, and reporting capabilities. + +## Column Statistics + +Column statistics are automatically computed for every column after generation. They provide basic metrics specific to the column type. For example, LLM columns track token usage statistics, sampler columns track distribution information, and validation columns track validation success rates. + +The classes below are result objects that store the computed statistics for each column type and provide methods for formatting these results for display in reports. + +::: data_designer.config.analysis.column_statistics + +## Column Profilers + +Column profilers are optional analysis tools that provide deeper insights into specific column types. Currently, the only column profiler available is the Judge Score Profiler. + +The classes below are result objects that store the computed profiler results and provide methods for formatting these results for display in reports. + +::: data_designer.config.analysis.column_profilers + +## Dataset Profiler + +The [DatasetProfilerResults](#data_designer.config.analysis.dataset_profiler.DatasetProfilerResults) class contains complete profiling results for a generated dataset. It aggregates column-level statistics, metadata, and profiler results, and provides methods to: + +- Compute dataset-level metrics (completion percentage, column type summary) +- Filter statistics by column type +- Generate formatted analysis reports via the `to_report()` method + +Reports can be displayed in the console or exported to HTML/SVG formats. 
+ +::: data_designer.config.analysis.dataset_profiler diff --git a/docs/concepts/plugins.md b/docs/concepts/plugins.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/js/toc-toggle.js b/docs/js/toc-toggle.js index 9083e38b..22f7e079 100644 --- a/docs/js/toc-toggle.js +++ b/docs/js/toc-toggle.js @@ -7,14 +7,17 @@ if (typeof document$ !== "undefined") { // Check if this is a Concepts page (URL contains /concepts/) const isConceptsPage = window.location.pathname.includes("/concepts/"); - if (isCodeReferencePage || isConceptsPage) { - // Show TOC for Code Reference and Concepts pages by adding class to body + // Check if this is a Plugins page (URL contains /plugins/) + const isPluginsPage = window.location.pathname.includes("/plugins/"); + + if (isCodeReferencePage || isConceptsPage || isPluginsPage) { + // Show TOC for Code Reference, Concepts, and Plugins pages by adding class to body document.body.classList.add("show-toc"); - console.log("Code Reference or Concepts page detected - showing TOC"); + console.log("Code Reference, Concepts, or Plugins page detected - showing TOC"); } else { // Hide TOC for all other pages by removing class from body document.body.classList.remove("show-toc"); - console.log("Non-Code Reference/Concepts page - hiding TOC"); + console.log("Non-Code Reference/Concepts/Plugins page - hiding TOC"); } }); } else { diff --git a/docs/plugins/available.md b/docs/plugins/available.md new file mode 100644 index 00000000..2489dcfd --- /dev/null +++ b/docs/plugins/available.md @@ -0,0 +1,3 @@ +# 🚧 Coming Soon + +This page will list available Data Designer plugins. Stay tuned! diff --git a/docs/plugins/example.md b/docs/plugins/example.md new file mode 100644 index 00000000..9c6929d1 --- /dev/null +++ b/docs/plugins/example.md @@ -0,0 +1,306 @@ +!!! warning "Experimental Feature" + The plugin system is currently **experimental** and under active development. 
The documentation, examples, and plugin interface are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). + + +# Example Plugin: Index Multiplier + +In this guide, we will build a simple plugin that generates values by multiplying the row index by a user-specified multiplier. Admittedly, not the most useful plugin, but it demonstrates the required steps 😜. + +A Data Designer plugin is implemented as a Python package with three main components: + +1. **Configuration Class**: Defines the parameters users can configure +2. **Task Class**: Contains the core implementation of the plugin +3. **Plugin Object**: Connects the config and task classes to make the plugin discoverable + +Let's build the `data-designer-index-multiplier` plugin step by step. + +## Step 1: Create a Python package + +Data Designer plugins are implemented as Python packages. We recommend using a standard structure for your plugin package. + +For example, here is the structure of a `data-designer-index-multiplier` plugin: + +``` +data-designer-index-multiplier/ +├── pyproject.toml +└── src/ + └── data_designer_index_multiplier/ + ├── __init__.py + └── plugin.py +``` + +## Step 2: Create the config class + +The configuration class defines what parameters users can set when using your plugin. For column generator plugins, it must inherit from [SingleColumnConfig](../code_reference/column_configs.md#data_designer.config.column_configs.SingleColumnConfig) and include a [discriminator field](https://docs.pydantic.dev/latest/concepts/unions/#discriminated-unions). 
+ +```python +from typing import Literal +from data_designer.config.column_configs import SingleColumnConfig + +class IndexMultiplierColumnConfig(SingleColumnConfig): + """Configuration for the index multiplier column generator.""" + + # Configurable parameter for this plugin + multiplier: int = 2 + + # Required: discriminator field with a unique Literal type + # This value identifies your plugin and becomes its column_type + column_type: Literal["index-multiplier"] = "index-multiplier" +``` + +**Key points:** + +- The `column_type` field must be a `Literal` type with a string default +- This value uniquely identifies your plugin (use kebab-case) +- Add any custom parameters your plugin needs (here: `multiplier`) +- `SingleColumnConfig` is a Pydantic model, so you can leverage all of Pydantic's validation features + +## Step 3: Create the task class + +The task class implements the actual business logic of the plugin. For column generator plugins, it inherits from [ColumnGenerator](../code_reference/column_generators.md#data_designer.engine.column_generators.generators.base.ColumnGenerator) and must implement a `metadata` static method and `generate` method: + + +```python +import logging +import pandas as pd + +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, +) + +# Data Designer uses the standard Python logging module for logging +logger = logging.getLogger(__name__) + +class IndexMultiplierColumnGenerator(ColumnGenerator[IndexMultiplierColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + """Define metadata about this generator.""" + return GeneratorMetadata( + name="index-multiplier", + description="Generates values by multiplying the row index by a user-specified multiplier", + generation_strategy=GenerationStrategy.FULL_COLUMN, + required_resources=None, + ) + + def generate(self, data: pd.DataFrame) -> pd.DataFrame: + """Generate the column data. 
+ + Args: + data: The current DataFrame being built + + Returns: + The DataFrame with the new column added + """ + logger.info( + f"Generating column {self.config.name} " + f"with multiplier {self.config.multiplier}" + ) + + # Access config via self.config + data[self.config.name] = data.index * self.config.multiplier + + return data +``` + +**Key points:** + +- Generic type `ColumnGenerator[IndexMultiplierColumnConfig]` connects the task to its config +- `metadata()` describes your generator and its requirements +- `generation_strategy` can be `FULL_COLUMN`, `CELL_BY_CELL` +- You have access to the configuration parameters via `self.config` +- `required_resources` lists any required resources (models, artifact storages, etc.). This parameter will evolve in the near future, so keeping it as `None` is safe for now. That said, if your task will use the model registry, adding `data_designer.engine.resources.ResourceType.MODEL_REGISTRY` will enable automatic model health checking for your column generation task. + +!!! info "Understanding generation_strategy" + The `generation_strategy` specifies how the column generator will generate data. + + - **`FULL_COLUMN`**: Generates the full column (at the batch level) in a single call to `generate` + - `generate` must take as input a `pd.DataFrame` with all previous columns and return a `pd.DataFrame` with the generated column appended + + - **`CELL_BY_CELL`**: Generates one cell at a time + - `generate` must take as input a `dict` with key/value pairs for all previous columns and return a `dict` with an additional key/value for the generated cell + - Supports concurrent workers via a `max_parallel_requests` parameter on the configuration + +## Step 4: Create the plugin object + +Create a `Plugin` object that makes the plugin discoverable and connects the task and config classes. 
+ +```python +from data_designer.plugins import Plugin, PluginType + +# Plugin instance - this is what gets loaded via entry point +plugin = Plugin( + task_cls=IndexMultiplierColumnGenerator, + config_cls=IndexMultiplierColumnConfig, + plugin_type=PluginType.COLUMN_GENERATOR, + emoji="🔌", +) +``` + +### Complete plugin code + +Pulling it all together, here is the complete plugin code for `src/data_designer_index_multiplier/plugin.py`: + +```python +import logging +from typing import Literal + +import pandas as pd + +from data_designer.config.column_configs import SingleColumnConfig +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, +) +from data_designer.plugins import Plugin, PluginType + +# Data Designer uses the standard Python logging module for logging +logger = logging.getLogger(__name__) + + +class IndexMultiplierColumnConfig(SingleColumnConfig): + """Configuration for the index multiplier column generator.""" + + # Configurable parameter for this plugin + multiplier: int = 2 + + # Required: discriminator field with a unique Literal type + # This value identifies your plugin and becomes its column_type + column_type: Literal["index-multiplier"] = "index-multiplier" + + +class IndexMultiplierColumnGenerator(ColumnGenerator[IndexMultiplierColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + """Define metadata about this generator.""" + return GeneratorMetadata( + name="index-multiplier", + description="Generates values by multiplying the row index by a user-specified multiplier", + generation_strategy=GenerationStrategy.FULL_COLUMN, + required_resources=None, + ) + + def generate(self, data: pd.DataFrame) -> pd.DataFrame: + """Generate the column data. 
+ + Args: + data: The current DataFrame being built + + Returns: + The DataFrame with the new column added + """ + logger.info( + f"Generating column {self.config.name} " + f"with multiplier {self.config.multiplier}" + ) + + # Access config via self.config + data[self.config.name] = data.index * self.config.multiplier + + return data + + +# Plugin instance - this is what gets loaded via entry point +plugin = Plugin( + task_cls=IndexMultiplierColumnGenerator, + config_cls=IndexMultiplierColumnConfig, + plugin_type=PluginType.COLUMN_GENERATOR, + emoji="🔌", +) +``` + +## Step 5: Package your plugin + +Create a `pyproject.toml` file to define your package and register the entry point: + +```toml +[project] +name = "data-designer-index-multiplier" +version = "1.0.0" +description = "Data Designer index multiplier plugin" +requires-python = ">=3.10" +dependencies = [ + "data-designer", +] + +# Register this plugin via entry points +[project.entry-points."data_designer.plugins"] +index-multiplier = "data_designer_index_multiplier.plugin:plugin" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/data_designer_index_multiplier"] +``` + +!!! info "Entry Point Registration" + Plugins are discovered automatically using [Python entry points](https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-package-metadata). It is important to register your plugin as an entry point under the `data_designer.plugins` group. + + The entry point format is: + ```toml + [project.entry-points."data_designer.plugins"] + <plugin-name> = "<module>:<plugin-object>" + ``` + +## Step 6: Use your plugin + +Install your plugin in editable mode for testing: + +```bash +# From the plugin directory +uv pip install -e . 
+``` + +Once installed, your plugin works just like built-in column types: + +```python +from data_designer_index_multiplier.plugin import IndexMultiplierColumnConfig + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + SamplerColumnConfig, +) + +data_designer = DataDesigner() +builder = DataDesignerConfigBuilder() + +# Add a regular column +builder.add_column( + SamplerColumnConfig( + name="category", + sampler_type="category", + params=CategorySamplerParams(values=["A", "B", "C"]), + ) +) + +# Add your custom plugin column +builder.add_column( + IndexMultiplierColumnConfig( + name="multiplied-index", + multiplier=5, + ) +) + +# Generate data +results = data_designer.create(builder, num_records=10) +print(results.load_dataset()) +``` + +Output: +``` + category multiplied-index +0 B 0 +1 A 5 +2 C 10 +3 A 15 +4 B 20 +... +``` + +That's it! You have now created and used your first Data Designer plugin. The last step is to package your plugin and share it with the community 🚀 diff --git a/docs/plugins/overview.md b/docs/plugins/overview.md new file mode 100644 index 00000000..0016120f --- /dev/null +++ b/docs/plugins/overview.md @@ -0,0 +1,45 @@ +# Data Designer Plugins + +!!! warning "Experimental Feature" + The plugin system is currently **experimental** and under active development. The documentation, examples, and plugin interface are subject to significant changes in future releases. If you encounter any issues, have questions, or have ideas for improvement, please consider starting [a discussion on GitHub](https://github.com/NVIDIA-NeMo/DataDesigner/discussions). + +## What are plugins? + +Plugins are Python packages that extend Data Designer's capabilities without modifying the core library. 
Similar to [VS Code extensions](https://marketplace.visualstudio.com/vscode) and [Pytest plugins](https://docs.pytest.org/en/stable/reference/plugin_list.html), the plugin system empowers you to build specialized extensions for your specific use cases and share them with the community. + +**Current capabilities**: Data Designer currently supports plugins for column generators (the column types you pass to the config builder's [add_column](../code_reference/config_builder.md#data_designer.config.config_builder.DataDesignerConfigBuilder.add_column) method). + +**Coming soon**: Plugin support for processors, validators, and more! + +## How do you use plugins? + +A Data Designer plugin is just a Python package configured with an [entry point](https://packaging.python.org/en/latest/guides/creating-and-discovering-plugins/#using-package-metadata) that points to a Data Designer `Plugin` object. Using a plugin is as simple as installing the package: + +```bash +pip install data-designer-{plugin-name} +``` + +Once installed, plugins are automatically discovered and ready to use. See the [example plugin](example.md) for a complete walkthrough. + +## How do you create plugins? + +Creating a plugin involves three main steps: + +### 1. Implement the Plugin Components + +- Create a task class inheriting from `ColumnGenerator` +- Create a config class inheriting from `SingleColumnConfig` +- Instantiate a `Plugin` object connecting them + +### 2. Package Your Plugin + +- Set up a Python package with `pyproject.toml` +- Register your plugin using entry points +- Define dependencies (including `data-designer`) + +### 3. Share Your Plugin + +- Publish to PyPI or another package index +- Share with the community! + +**Ready to get started?** See the [Example Plugin](example.md) for a complete walkthrough! 
diff --git a/mkdocs.yml b/mkdocs.yml index db22e60d..beae5e89 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,10 @@ nav: - QA and Chat: - Product Info QA: recipes/qa_and_chat/product_info_qa.md - Multi-Turn Chat: recipes/qa_and_chat/multi_turn_chat.md + - Plugins: + - Overview: plugins/overview.md + - Example Plugin: plugins/example.md + - Available Plugin List: plugins/available.md - Code Reference: - models: code_reference/models.md - column_configs: code_reference/column_configs.md @@ -38,6 +42,7 @@ nav: - data_designer_config: code_reference/data_designer_config.md - sampler_params: code_reference/sampler_params.md - validator_params: code_reference/validator_params.md + - analysis: code_reference/analysis.md theme: name: material diff --git a/src/data_designer/config/analysis/__init__.py b/src/data_designer/config/analysis/__init__.py new file mode 100644 index 00000000..1a8431c3 --- /dev/null +++ b/src/data_designer/config/analysis/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/src/data_designer/config/analysis/column_profilers.py b/src/data_designer/config/analysis/column_profilers.py index 490b15ca..bf5437f8 100644 --- a/src/data_designer/config/analysis/column_profilers.py +++ b/src/data_designer/config/analysis/column_profilers.py @@ -27,7 +27,20 @@ class ColumnProfilerType(str, Enum): class ColumnProfilerResults(BaseModel, ABC): + """Abstract base class for column profiler results. + + Stores results from column profiling operations. Subclasses hold profiler-specific + analysis results and provide methods for generating formatted report sections for display. + """ + def create_report_section(self) -> Panel: + """Creates a Rich Panel containing the formatted profiler results for display. + + Returns: + A Rich Panel containing the formatted profiler results. 
Default implementation + returns a "Not Implemented" message; subclasses should override to provide + specific formatting. + """ return Panel( f"Report section generation not implemented for '{self.__class__.__name__}'.", title="Not Implemented", @@ -37,16 +50,50 @@ def create_report_section(self) -> Panel: class JudgeScoreProfilerConfig(ConfigBase): + """Configuration for the LLM-as-a-judge score profiler. + + Attributes: + model_alias: Alias of the LLM model to use for generating score distribution summaries. + Must match a model alias defined in the Data Designer configuration. + summary_score_sample_size: Number of score samples to include when prompting the LLM + to generate summaries. Larger sample sizes provide more context but increase + token usage. Must be at least 1. Defaults to 20. + """ + model_alias: str summary_score_sample_size: Optional[int] = Field(default=20, ge=1) class JudgeScoreSample(BaseModel): + """Container for a single judge score and its associated reasoning. + + Stores a paired score-reasoning sample extracted from an LLM-as-a-judge column. + Used when generating summaries to provide the LLM with examples of scoring patterns. + + Attributes: + score: The score value assigned by the judge. Can be numeric (int) or categorical (str). + reasoning: The reasoning or explanation provided by the judge for this score. + """ + score: Union[int, str] reasoning: str class JudgeScoreDistributions(BaseModel): + """Container for computed distributions across all judge score dimensions. + + Stores the complete distribution analysis for all score dimensions in an LLM-as-a-judge + column. Each score dimension (e.g., "relevance", "fluency") has its own distribution + computed from the generated data. + + Attributes: + scores: Mapping of each score dimension name to its list of score values. + reasoning: Mapping of each score dimension name to its list of reasoning texts. + distribution_types: Mapping of each score dimension name to its classification. 
+ distributions: Mapping of each score dimension name to its computed distribution statistics. + histograms: Mapping of each score dimension name to its histogram data. + """ + scores: dict[str, list[Union[int, str]]] reasoning: dict[str, list[str]] distribution_types: dict[str, ColumnDistributionType] @@ -55,12 +102,34 @@ class JudgeScoreDistributions(BaseModel): class JudgeScoreSummary(BaseModel): + """Container for an LLM-generated summary of a judge score dimension. + + Stores the natural language summary and sample data for a single score dimension + generated by the judge score profiler. The summary is created by an LLM analyzing + the distribution and patterns in the score-reasoning pairs. + + Attributes: + score_name: Name of the score dimension being summarized (e.g., "relevance", "fluency"). + summary: LLM-generated natural language summary describing the scoring patterns, + distribution characteristics, and notable trends for this score dimension. + score_samples: List of score-reasoning pairs that were used to generate the summary. + These are the examples of the scoring behavior that were used to generate the summary. + """ + score_name: str summary: str score_samples: list[JudgeScoreSample] class JudgeScoreProfilerResults(ColumnProfilerResults): + """Container for complete judge score profiler analysis results. + + Attributes: + column_name: Name of the judge column that was profiled. + summaries: Mapping of each score dimension name to its LLM-generated summary. + score_distributions: Complete distribution analysis across all score dimensions. 
+ """ + column_name: str summaries: dict[str, JudgeScoreSummary] score_distributions: Union[JudgeScoreDistributions, MissingValue] diff --git a/src/data_designer/config/analysis/column_statistics.py b/src/data_designer/config/analysis/column_statistics.py index 236cf200..4f0f7675 100644 --- a/src/data_designer/config/analysis/column_statistics.py +++ b/src/data_designer/config/analysis/column_statistics.py @@ -32,13 +32,42 @@ class ColumnDistributionType(str, Enum): class BaseColumnStatistics(BaseModel, ABC): + """Abstract base class for all column statistics types. + + Serves as a container for computed statistics across different column types in + Data-Designer-generated datasets. Subclasses hold column-specific statistical results + and provide methods for formatting these results for display in reports. + """ + model_config = ConfigDict(use_enum_values=True) @abstractmethod - def create_report_row_data(self) -> dict[str, str]: ... + def create_report_row_data(self) -> dict[str, str]: + """Creates a formatted dictionary of statistics for display in reports. + + Returns: + Dictionary mapping display labels to formatted statistic values. + """ + ... class GeneralColumnStatistics(BaseColumnStatistics): + """Container for general statistics applicable to all column types. + + Holds core statistical measures that apply universally across all column types, + including null counts, unique values, and data type information. Serves as the base + for more specialized column statistics classes that store additional column-specific metrics. + + Attributes: + column_name: Name of the column being analyzed. + num_records: Total number of records in the column. + num_null: Number of null/missing values in the column. + num_unique: Number of distinct values in the column. If a value is not hashable, it is converted to a string. + pyarrow_dtype: PyArrow data type of the column as a string. + simple_dtype: Simplified human-readable data type label. 
+ column_type: Discriminator field, always "general" for this statistics type. + """ + column_name: str num_records: Union[int, MissingValue] num_null: Union[int, MissingValue] @@ -84,6 +113,21 @@ def _is_missing_value(self, v: Union[float, int, MissingValue]) -> bool: class LLMTextColumnStatistics(GeneralColumnStatistics): + """Container for statistics on LLM-generated text columns. + + Inherits general statistics plus token usage metrics specific to LLM text generation. + Stores both prompt and completion token consumption data. + + Attributes: + completion_tokens_mean: Mean number of completion tokens generated per record. + completion_tokens_median: Median number of completion tokens generated per record. + completion_tokens_stddev: Standard deviation of completion tokens per record. + prompt_tokens_mean: Mean number of prompt tokens used per record. + prompt_tokens_median: Median number of prompt tokens used per record. + prompt_tokens_stddev: Standard deviation of prompt tokens per record. + column_type: Discriminator field, always "llm-text" for this statistics type. + """ + completion_tokens_mean: Union[float, MissingValue] completion_tokens_median: Union[float, MissingValue] completion_tokens_stddev: Union[float, MissingValue] @@ -123,18 +167,62 @@ def create_report_row_data(self) -> dict[str, Any]: class LLMCodeColumnStatistics(LLMTextColumnStatistics): + """Container for statistics on LLM-generated code columns. + + Inherits all token usage metrics from LLMTextColumnStatistics. Stores + statistics from columns that generate code snippets in specific programming languages. + + Attributes: + column_type: Discriminator field, always "llm-code" for this statistics type. + """ + column_type: Literal[DataDesignerColumnType.LLM_CODE.value] = DataDesignerColumnType.LLM_CODE.value class LLMStructuredColumnStatistics(LLMTextColumnStatistics): + """Container for statistics on LLM-generated structured JSON columns. 
+ + Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from + columns that generate structured data conforming to JSON schemas or Pydantic models. + + Attributes: + column_type: Discriminator field, always "llm-structured" for this statistics type. + """ + column_type: Literal[DataDesignerColumnType.LLM_STRUCTURED.value] = DataDesignerColumnType.LLM_STRUCTURED.value class LLMJudgedColumnStatistics(LLMTextColumnStatistics): + """Container for statistics on LLM-as-a-judge quality assessment columns. + + Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from + columns that evaluate and score other generated content based on defined criteria. + + Attributes: + column_type: Discriminator field, always "llm-judge" for this statistics type. + """ + column_type: Literal[DataDesignerColumnType.LLM_JUDGE.value] = DataDesignerColumnType.LLM_JUDGE.value class SamplerColumnStatistics(GeneralColumnStatistics): + """Container for statistics on sampler-generated columns. + + Inherits general statistics plus sampler-specific information including the sampler type + used and the empirical distribution of generated values. Stores both categorical and + numerical distribution results. + + Attributes: + sampler_type: Type of sampler used to generate this column (e.g., "uniform", "category", + "gaussian", "person"). + distribution_type: Classification of the column's distribution (categorical, numerical, + text, other, or unknown). + distribution: Empirical distribution statistics for the generated values. Can be + CategoricalDistribution (for discrete values), NumericalDistribution (for continuous + values), or MissingValue if distribution could not be computed. + column_type: Discriminator field, always "sampler" for this statistics type. 
+ """ + sampler_type: SamplerType distribution_type: ColumnDistributionType distribution: Optional[Union[CategoricalDistribution, NumericalDistribution, MissingValue]] @@ -148,14 +236,43 @@ def create_report_row_data(self) -> dict[str, str]: class SeedDatasetColumnStatistics(GeneralColumnStatistics): + """Container for statistics on columns sourced from seed datasets. + + Inherits general statistics and stores statistics computed from columns that originate + from existing data provided via the seed dataset functionality. + + Attributes: + column_type: Discriminator field, always "seed-dataset" for this statistics type. + """ + column_type: Literal[DataDesignerColumnType.SEED_DATASET.value] = DataDesignerColumnType.SEED_DATASET.value class ExpressionColumnStatistics(GeneralColumnStatistics): + """Container for statistics on expression-based derived columns. + + Inherits general statistics and stores statistics computed from columns that are derived + from columns that are derived from Jinja2 expressions referencing other column values. + + Attributes: + column_type: Discriminator field, always "expression" for this statistics type. + """ + column_type: Literal[DataDesignerColumnType.EXPRESSION.value] = DataDesignerColumnType.EXPRESSION.value class ValidationColumnStatistics(GeneralColumnStatistics): + """Container for statistics on validation result columns. + + Inherits general statistics plus validation-specific metrics including the count and + percentage of records that passed validation. Stores results from validation logic + (Python, SQL, or remote) executed against target columns. + + Attributes: + num_valid_records: Number of records that passed validation. + column_type: Discriminator field, always "validation" for this statistics type. 
+ """ + num_valid_records: Union[int, MissingValue] column_type: Literal[DataDesignerColumnType.VALIDATION.value] = DataDesignerColumnType.VALIDATION.value @@ -177,6 +294,15 @@ def create_report_row_data(self) -> dict[str, str]: class CategoricalHistogramData(BaseModel): + """Container for categorical distribution histogram data. + + Stores the computed frequency distribution of categorical values. + + Attributes: + categories: List of unique category values that appear in the data. + counts: List of occurrence counts for each category. + """ + categories: list[Union[float, int, str]] counts: list[int] @@ -194,6 +320,14 @@ def from_series(cls, series: Series) -> Self: class CategoricalDistribution(BaseModel): + """Container for computed categorical distribution statistics. + + Attributes: + most_common_value: The category value that appears most frequently in the data. + least_common_value: The category value that appears least frequently in the data. + histogram: Complete frequency distribution showing all categories and their counts. + """ + most_common_value: Union[str, int] least_common_value: Union[str, int] histogram: CategoricalHistogramData @@ -213,6 +347,16 @@ def from_series(cls, series: Series) -> Self: class NumericalDistribution(BaseModel): + """Container for computed numerical distribution statistics. + + Attributes: + min: Minimum value in the distribution. + max: Maximum value in the distribution. + mean: Arithmetic mean (average) of all values. + stddev: Standard deviation measuring the spread of values around the mean. + median: Median value of the distribution. 
+ """ + min: Union[float, int] max: Union[float, int] mean: float diff --git a/src/data_designer/config/analysis/dataset_profiler.py b/src/data_designer/config/analysis/dataset_profiler.py index b119fff8..f0976293 100644 --- a/src/data_designer/config/analysis/dataset_profiler.py +++ b/src/data_designer/config/analysis/dataset_profiler.py @@ -16,6 +16,21 @@ class DatasetProfilerResults(BaseModel): + """Container for complete dataset profiling and analysis results. + + Stores profiling results for a generated dataset, including statistics for all columns, + dataset-level metadata, and optional advanced profiler results. Provides methods for + computing derived metrics and generating formatted reports. + + Attributes: + num_records: Actual number of records successfully generated in the dataset. + target_num_records: Target number of records that were requested to be generated. + column_statistics: List of statistics objects for all columns in the dataset. Each + column has statistics appropriate to its type. Must contain at least one column. + side_effect_column_names: Column names that were generated as side effects of other columns. + column_profiles: Column profiler results for specific columns when configured. 
+ """ + num_records: int target_num_records: int column_statistics: list[Annotated[ColumnStatisticsT, Field(discriminator="column_type")]] = Field(..., min_length=1) @@ -28,10 +43,12 @@ def ensure_python_integers(cls, v: int) -> int: @property def percent_complete(self) -> float: + """Returns the completion percentage of the dataset.""" return 100 * self.num_records / (self.target_num_records + EPSILON) @cached_property def column_types(self) -> list[str]: + """Returns a sorted list of unique column types present in the dataset.""" display_order = get_column_display_order() return sorted( list(set([c.column_type for c in self.column_statistics])), @@ -39,6 +56,7 @@ def column_types(self) -> list[str]: ) def get_column_statistics_by_type(self, column_type: DataDesignerColumnType) -> list[ColumnStatisticsT]: + """Filters column statistics to return only those of the specified type.""" return [c for c in self.column_statistics if c.column_type == column_type] def to_report( diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index a741290c..beeedd6f 100644 --- a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -171,8 +171,6 @@ def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None: max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR if isinstance(generator, WithLLMGeneration): max_workers = generator.inference_parameters.max_parallel_requests - elif hasattr(generator.config, "max_parallel_requests"): - max_workers = generator.config.max_parallel_requests self._fan_out_with_threads(generator, max_workers=max_workers) def _run_full_column_generator(self, generator: ColumnGenerator) -> None: