Skip to content

Commit 3115f6e

Browse files
authored
Update integer boundaries for ID columns (#2282)
1 parent 315266f commit 3115f6e

File tree

3 files changed

+21
-20
lines changed

sdv/data_processing/data_processor.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -22,13 +22,12 @@
2222
)
2323
from sdv.data_processing.datetime_formatter import DatetimeFormatter
2424
from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError
25-
from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter
25+
from sdv.data_processing.numerical_formatter import NumericalFormatter
2626
from sdv.data_processing.utils import load_module_from_path
2727
from sdv.errors import SynthesizerInputError, log_exc_stacktrace
2828
from sdv.metadata.single_table import SingleTableMetadata
2929

3030
LOGGER = logging.getLogger(__name__)
31-
INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()}
3231

3332

3433
class DataProcessor:
@@ -70,8 +69,6 @@ class DataProcessor:
7069
'M': 'datetime',
7170
}
7271

73-
_COLUMN_RELATIONSHIP_TO_TRANSFORMER = {'address': 'RandomLocationGenerator', 'gps': 'GPSNoiser'}
74-
7572
def _update_numerical_transformer(self, enforce_rounding, enforce_min_max_values):
7673
custom_float_formatter = rdt.transformers.FloatFormatter(
7774
missing_value_replacement='mean',
@@ -124,6 +121,10 @@ def __init__(
124121
self._constraints = []
125122
self._constraints_to_reverse = []
126123
self._custom_constraint_classes = {}
124+
self._COLUMN_RELATIONSHIP_TO_TRANSFORMER = {
125+
'address': 'RandomLocationGenerator',
126+
'gps': 'GPSNoiser',
127+
}
127128

128129
self._transformers_by_sdtype = deepcopy(get_default_transformers())
129130
self._transformers_by_sdtype['id'] = rdt.transformers.RegexGenerator()
@@ -575,11 +576,11 @@ def _create_config(self, data, columns_created_by_constraints):
575576
if is_numeric:
576577
function_name = 'random_int'
577578
column_dtype = str(column_dtype).lower()
578-
function_kwargs = {'min': 0, 'max': 9999999}
579-
for key in INTEGER_BOUNDS:
580-
if key in column_dtype:
581-
_, max_value = INTEGER_BOUNDS[key]
582-
function_kwargs = {'min': 0, 'max': max_value}
579+
function_kwargs = {'min': 0, 'max': 16777216}
580+
if 'int8' in column_dtype:
581+
function_kwargs['max'] = 127
582+
elif 'int16' in column_dtype:
583+
function_kwargs['max'] = 32767
583584

584585
else:
585586
function_kwargs = {'text': 'sdv-id-??????'}

tests/integration/single_table/test_copulas.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -348,16 +348,16 @@ def test_numerical_columns_gets_pii():
348348
# Assert
349349
expected_sampled = pd.DataFrame({
350350
'id': [
351-
1089619006166876142,
352-
8373046707753416652,
353-
9070705361670139280,
354-
7227045982112645011,
355-
3461931576753619633,
356-
1005734164466301683,
357-
3312031189447929384,
358-
82456842876428117,
359-
1819741328868365520,
360-
8019169766233150107,
351+
1982005,
352+
15967014,
353+
10406639,
354+
15230483,
355+
14028549,
356+
16499516,
357+
9244156,
358+
13145920,
359+
10106629,
360+
6297216,
361361
],
362362
'city': [
363363
'Danielfort',

tests/unit/data_processing/test_data_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1258,7 +1258,7 @@ def test__create_config(self):
12581258
assert id_numeric_int_32_transformer.function_name == 'random_int'
12591259
assert id_numeric_int_32_transformer.function_kwargs == {
12601260
'min': 0,
1261-
'max': 2147483647,
1261+
'max': 16777216,
12621262
}
12631263

12641264
id_column_transformer = config['transformers']['id_column']

0 commit comments

Comments
 (0)