Skip to content

Commit 76f11d9

Browse files
committed
make release-tag: Merge branch 'main' into stable
2 parents 0f130f8 + 18c301d commit 76f11d9

File tree

17 files changed

+650
-56
lines changed

17 files changed

+650
-56
lines changed

HISTORY.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
11
# Release Notes
22

3+
## v1.27.0 - 2025-09-15
4+
5+
### New Features
6+
7+
* Create a specific warning type for the purposes of refitting a synthesizer - Issue [#2662](https://github.com/sdv-dev/SDV/issues/2662) by @frances-h
8+
* [OneHotEncoding constraint] Allow me to specify whether to keep the one-hot columns or collapse them into one categorical column - Issue [#2650](https://github.com/sdv-dev/SDV/issues/2650) by @fealho
9+
10+
### Bugs Fixed
11+
12+
* "numerical_distributions" in HMASynthesizer get ignored - Issue [#2648](https://github.com/sdv-dev/SDV/issues/2648) by @fealho
13+
14+
### Internal
15+
16+
* Add helper method for transforming conditions - Issue [#2660](https://github.com/sdv-dev/SDV/issues/2660) by @rwedge
17+
* [OneHotEncoding Constraint] For higher quality, ensure the model creates floating point numbers - Issue [#2649](https://github.com/sdv-dev/SDV/issues/2649) by @fealho
18+
319
## v1.26.0 - 2025-08-18
420

521
### New Features

latest_requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ ctgan==0.11.0
44
deepecho==0.7.0
55
graphviz==0.21
66
numpy==2.3.2
7-
pandas==2.3.1
8-
platformdirs==4.3.8
7+
pandas==2.3.2
8+
platformdirs==4.4.0
99
rdt==1.18.0
1010
sdmetrics==0.23.0
1111
tqdm==4.67.1

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ namespaces = false
143143
version = {attr = 'sdv.__version__'}
144144

145145
[tool.bumpversion]
146-
current_version = "1.26.0"
146+
current_version = "1.27.0.dev0"
147147
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
148148
serialize = [
149149
'{major}.{minor}.{patch}.{release}{candidate}',

sdv/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
__author__ = 'DataCebo, Inc.'
88
__email__ = 'info@sdv.dev'
9-
__version__ = '1.26.0'
9+
__version__ = '1.27.0.dev0'
1010

1111

1212
import sys

sdv/cag/_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas as pd
66

77
from sdv.cag._errors import ConstraintNotMetError
8-
from sdv.errors import SynthesizerInputError, TableNameError
8+
from sdv.errors import RefitWarning, SynthesizerInputError, TableNameError
99
from sdv.metadata import Metadata
1010

1111

@@ -185,7 +185,8 @@ def _validate_constraints(constraints, synthesizer_fitted):
185185

186186
if synthesizer_fitted:
187187
warnings.warn(
188-
"For these constraints to take effect, please refit the synthesizer using 'fit'."
188+
"For these constraints to take effect, please refit the synthesizer using 'fit'.",
189+
RefitWarning,
189190
)
190191

191192
return _filter_old_style_constraints(constraints)

sdv/cag/one_hot_encoding.py

Lines changed: 73 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
"""One Hot Encoding constraint."""
22

3+
from copy import deepcopy
4+
35
import numpy as np
46

7+
from sdv._utils import _create_unique_name
58
from sdv.cag._errors import ConstraintNotMetError
69
from sdv.cag._utils import (
710
_get_is_valid_dict,
811
_is_list_of_type,
12+
_remove_columns_from_metadata,
913
_validate_table_and_column_names,
1014
_validate_table_name_if_defined,
1115
)
1216
from sdv.cag.base import BaseConstraint
1317

18+
EPSILON = float(np.finfo(np.float32).eps)
19+
1420

1521
class OneHotEncoding(BaseConstraint):
1622
"""Ensure the appropriate columns are one hot encoded.
@@ -26,20 +32,30 @@ class OneHotEncoding(BaseConstraint):
2632
table_name (str, optional):
2733
The name of the table that contains the columns. Optional if the
2834
data is only a single table. Defaults to None.
35+
learning_strategy (str, optional):
36+
Strategy for how the model should learn the one-hot fields. Supported values:
37+
- 'one_hot' (default): Learn each one-hot column separately.
38+
- 'categorical': Internally collapse the one-hot columns into a single categorical
39+
column for the model to learn, then expand back to one-hot at sampling time.
2940
"""
3041

3142
@staticmethod
32-
def _validate_init_inputs(column_names, table_name):
43+
def _validate_init_inputs(column_names, table_name, learning_strategy):
3344
if not _is_list_of_type(column_names):
3445
raise ValueError('`column_names` must be a list of strings.')
3546

3647
_validate_table_name_if_defined(table_name)
3748

38-
def __init__(self, column_names, table_name=None):
49+
if learning_strategy not in ['one_hot', 'categorical']:
50+
raise ValueError("`learning_strategy` must be either 'one_hot' or 'categorical'.")
51+
52+
def __init__(self, column_names, table_name=None, learning_strategy='one_hot'):
3953
super().__init__()
40-
self._validate_init_inputs(column_names, table_name)
54+
self._validate_init_inputs(column_names, table_name, learning_strategy)
4155
self._column_names = column_names
4256
self.table_name = table_name
57+
self.learning_strategy = learning_strategy
58+
self._categorical_column = '#'.join(self._column_names)
4359

4460
def _validate_constraint_with_metadata(self, metadata):
4561
"""Validate the constraint is compatible with the provided metadata.
@@ -88,6 +104,28 @@ def _fit(self, data, metadata):
88104
"""
89105
pass
90106

107+
def _get_updated_metadata(self, metadata):
108+
table_name = self._get_single_table_name(metadata)
109+
if self.learning_strategy == 'categorical':
110+
self._categorical_column = _create_unique_name(
111+
self._categorical_column, metadata.tables[table_name].columns
112+
)
113+
md = metadata.to_dict()
114+
md['tables'][table_name]['columns'][self._categorical_column] = {
115+
'sdtype': 'categorical'
116+
}
117+
return _remove_columns_from_metadata(md, table_name, columns_to_drop=self._column_names)
118+
119+
else:
120+
metadata = deepcopy(metadata)
121+
for column in self._column_names:
122+
if metadata.tables[table_name].columns[column]['sdtype'] in [
123+
'categorical',
124+
'boolean',
125+
]:
126+
metadata.tables[table_name].columns[column]['sdtype'] = 'numerical'
127+
return metadata
128+
91129
def _transform(self, data):
92130
"""Transform the data.
93131
@@ -99,6 +137,17 @@ def _transform(self, data):
99137
dict[str, pd.DataFrame]:
100138
Transformed data.
101139
"""
140+
table_name = self._get_single_table_name(self.metadata)
141+
if self.learning_strategy == 'categorical':
142+
table_data = data[table_name]
143+
categories = table_data[self._column_names].idxmax(axis=1)
144+
table_data[self._categorical_column] = categories
145+
data[table_name] = table_data.drop(self._column_names, axis=1)
146+
else:
147+
one_hot_data = data[table_name][self._column_names]
148+
one_hot_data = np.where(one_hot_data == 0, EPSILON, 1 - EPSILON)
149+
data[table_name][self._column_names] = one_hot_data
150+
102151
return data
103152

104153
def _reverse_transform(self, data):
@@ -116,13 +165,28 @@ def _reverse_transform(self, data):
116165
"""
117166
table_name = self._get_single_table_name(self.metadata)
118167
table_data = data[table_name]
119-
one_hot_data = table_data[self._column_names]
120-
transformed_data = np.zeros_like(one_hot_data.to_numpy())
121-
max_category_indices = np.argmax(one_hot_data.to_numpy(), axis=1)
122-
transformed_data[np.arange(len(one_hot_data)), max_category_indices] = 1
123-
table_data[self._column_names] = transformed_data
124-
data[table_name] = table_data
125168

169+
if self.learning_strategy == 'categorical':
170+
categories = table_data.pop(self._categorical_column)
171+
num_rows = len(table_data)
172+
num_cols = len(self._column_names)
173+
transformed = np.zeros((num_rows, num_cols), dtype=float)
174+
175+
column_to_index = {name: idx for idx, name in enumerate(self._column_names)}
176+
indices = categories.map(lambda x: column_to_index[x]).to_numpy()
177+
transformed[np.arange(num_rows), indices] = 1
178+
179+
for idx, col in enumerate(self._column_names):
180+
table_data[col] = transformed[:, idx]
181+
182+
else:
183+
one_hot_data = table_data[self._column_names]
184+
transformed_data = np.zeros_like(one_hot_data.to_numpy())
185+
max_category_indices = np.argmax(one_hot_data.to_numpy(), axis=1)
186+
transformed_data[np.arange(len(one_hot_data)), max_category_indices] = 1
187+
table_data[self._column_names] = transformed_data
188+
189+
data[table_name] = table_data
126190
return data
127191

128192
def _is_valid(self, data, metadata):

sdv/errors.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,11 @@ def __init__(self, message):
8383

8484

8585
TableNameError = ValueError('`table_name` must be a string or None.')
86+
87+
88+
class RefitWarning(UserWarning):
89+
"""Warning to be raised if the synthesizer needs to be refit.
90+
91+
Warning to be raised if a change to a synthesizer requires the synthesizer
92+
to be refit for the change to be applied.
93+
"""

sdv/multi_table/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from sdv.cag.programmable_constraint import ProgrammableConstraint, ProgrammableConstraintHarness
3030
from sdv.errors import (
3131
InvalidDataError,
32+
RefitWarning,
3233
SamplingError,
3334
SynthesizerInputError,
3435
)
@@ -551,10 +552,11 @@ def preprocess(self, data):
551552
self.validate(data)
552553
data = self._validate_transform_constraints(data)
553554
if self._fitted:
554-
warnings.warn(
555+
msg = (
555556
'This model has already been fitted. To use the new preprocessed data, '
556557
"please refit the model using 'fit' or 'fit_processed_data'."
557558
)
559+
warnings.warn(msg, RefitWarning)
558560

559561
processed_data = {}
560562
pbar_args = self._get_pbar_args(desc='Preprocess Tables')

sdv/multi_table/hma.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,11 @@ def _set_extended_columns_distributions(self, synthesizer, table_name, valid_col
283283
for extended_column in self._parent_extended_columns[table_name]:
284284
if extended_column in valid_columns:
285285
numerical_distributions[extended_column] = DEFAULT_EXTENDED_COLUMNS_DISTRIBUTION
286-
synthesizer._set_numerical_distributions(numerical_distributions)
286+
287+
if numerical_distributions:
288+
existing = getattr(synthesizer, 'numerical_distributions', {}) or {}
289+
merged = {**existing, **numerical_distributions}
290+
synthesizer._set_numerical_distributions(merged)
287291

288292
def _get_extension(self, child_name, child_table, foreign_key, progress_bar_desc):
289293
"""Generate the extension columns for this child table.

sdv/single_table/base.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from sdv.errors import (
4242
ConstraintsNotMetError,
4343
InvalidDataError,
44+
RefitWarning,
4445
SamplingError,
4546
SynthesizerInputError,
4647
)
@@ -306,7 +307,7 @@ def update_transformers(self, column_name_to_transformer):
306307
self._data_processor.update_transformers(column_name_to_transformer)
307308
if self._fitted:
308309
msg = 'For this change to take effect, please refit the synthesizer using `fit`.'
309-
warnings.warn(msg, UserWarning)
310+
warnings.warn(msg, RefitWarning)
310311

311312
def get_parameters(self):
312313
"""Return the parameters used to instantiate the synthesizer."""
@@ -587,10 +588,12 @@ def _preprocess_helper(self, data):
587588
"""
588589
self.validate(data)
589590
if self._fitted:
590-
warnings.warn(
591+
msg = (
591592
'This model has already been fitted. To use the new preprocessed data, '
592593
"please refit the model using 'fit' or 'fit_processed_data'."
593594
)
595+
warnings.warn(msg, RefitWarning)
596+
594597
data = self._validate_transform_constraints(data)
595598

596599
return data
@@ -1208,18 +1211,19 @@ def sample(self, num_rows, max_tries_per_batch=100, batch_size=None, output_file
12081211

12091212
return sampled_data
12101213

1214+
def _transform_conditions(self, condition_df):
1215+
return self._data_processor.transform(condition_df, is_condition=True)
1216+
12111217
def _transform_conditions_chained_constraints(self, condition_df):
12121218
try:
12131219
transformed_condition = self._validate_transform_constraints(condition_df)
1214-
transformed_condition = self._data_processor.transform(
1215-
transformed_condition, is_condition=True
1216-
)
1220+
transformed_condition = self._transform_conditions(transformed_condition)
12171221
except ConstraintNotMetError:
12181222
raise ConstraintNotMetError(
12191223
'Provided conditions are not valid for the given constraints.'
12201224
)
12211225
except Exception:
1222-
transformed_condition = self._data_processor.transform(condition_df, is_condition=True)
1226+
transformed_condition = self._transform_conditions(condition_df)
12231227

12241228
return transformed_condition
12251229

@@ -1274,13 +1278,13 @@ def _sample_with_conditions(
12741278

12751279
condition = dict(zip(condition_columns, group))
12761280
condition_df = dataframe.iloc[0].to_frame().T
1281+
dtypes = conditions.dtypes.to_dict()
1282+
condition_df = condition_df.astype(dtypes)
12771283
if hasattr(self, '_chained_constraints'):
12781284
transformed_condition = self._transform_conditions_chained_constraints(condition_df)
12791285
else:
12801286
try:
1281-
transformed_condition = self._data_processor.transform(
1282-
condition_df, is_condition=True
1283-
)
1287+
transformed_condition = self._transform_conditions(condition_df)
12841288
except ConstraintsNotMetError as error:
12851289
raise ConstraintsNotMetError(
12861290
'Provided conditions are not valid for the given constraints.'

0 commit comments

Comments
 (0)