Skip to content

Commit eb252ea

Browse files
committed
Implement fallback
1 parent ae6e1c0 commit eb252ea

File tree

2 files changed

+170
-29
lines changed

2 files changed

+170
-29
lines changed

sdv/single_table/_dayz_utils.py

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,79 @@ def detect_table_parameters(data):
1818
return {'num_rows': len(data)}
1919

2020

21+
def _compute_missing_values_proportion(series):
22+
"""Compute missing value proportion with a safe fallback for empty series."""
23+
if len(series) == 0:
24+
return 0.0
25+
26+
value = float(series.isna().mean())
27+
return 0.0 if pd.isna(value) else value
28+
29+
30+
def _detect_numerical_column_parameters(series):
31+
"""Detect numerical-specific parameters with fallbacks when undetectable.
32+
33+
Returns only keys that can be reliably detected (no None values).
34+
"""
35+
params = {}
36+
non_null = series.dropna()
37+
if non_null.empty:
38+
return params
39+
40+
try:
41+
num_decimal_digits = learn_rounding_digits(series)
42+
if isinstance(num_decimal_digits, int) and num_decimal_digits >= 0:
43+
params['num_decimal_digits'] = num_decimal_digits
44+
except Exception:
45+
pass
46+
47+
min_value = non_null.min()
48+
max_value = non_null.max()
49+
if not pd.isna(min_value):
50+
params['min_value'] = min_value.item() if hasattr(min_value, 'item') else float(min_value)
51+
if not pd.isna(max_value):
52+
params['max_value'] = max_value.item() if hasattr(max_value, 'item') else float(max_value)
53+
54+
return params
55+
56+
57+
def _detect_datetime_column_parameters(series, column_metadata):
58+
"""Detect datetime-specific parameters with fallbacks when undetectable.
59+
60+
Returns only keys that can be reliably detected (no None values).
61+
"""
62+
params = {}
63+
datetime_format = column_metadata.get('datetime_format', None)
64+
if datetime_format:
65+
datetime_column = pd.to_datetime(series, format=datetime_format, errors='coerce')
66+
else:
67+
datetime_column = pd.to_datetime(series, errors='coerce')
68+
69+
non_na = datetime_column[~pd.isna(datetime_column)]
70+
if non_na.empty:
71+
return params
72+
73+
start_dt = non_na.min()
74+
end_dt = non_na.max()
75+
if datetime_format:
76+
params['start_timestamp'] = start_dt.strftime(datetime_format)
77+
params['end_timestamp'] = end_dt.strftime(datetime_format)
78+
else:
79+
params['start_timestamp'] = str(start_dt)
80+
params['end_timestamp'] = str(end_dt)
81+
82+
return params
83+
84+
85+
def _detect_categorical_or_boolean_column_parameters(series):
86+
"""Detect categorical/boolean parameters."""
87+
categorical_values = series.dropna().unique()
88+
if len(categorical_values) == 0:
89+
return {}
90+
91+
return {'category_values': categorical_values.tolist()}
92+
93+
2194
def detect_column_parameters(data, metadata, table_name):
2295
"""Detect all column-level Dayz parameters.
2396
@@ -37,40 +110,17 @@ def detect_column_parameters(data, metadata, table_name):
37110
table_metadata = metadata.tables[table_name]
38111
column_parameters = {}
39112
for column_name, column_metadata in table_metadata.columns.items():
40-
column_parameters[column_name] = {}
41113
sdtype = column_metadata['sdtype']
114+
params = {}
42115
if sdtype == 'numerical':
43-
column_parameters[column_name] = {
44-
'num_decimal_digits': learn_rounding_digits(data[column_name]),
45-
'min_value': data[column_name].min().item(),
46-
'max_value': data[column_name].max().item(),
47-
}
116+
params.update(_detect_numerical_column_parameters(data[column_name]))
48117
elif sdtype == 'datetime':
49-
datetime_format = column_metadata.get('datetime_format', None)
50-
if datetime_format:
51-
datetime_column = pd.to_datetime(
52-
data[column_name], format=datetime_format, errors='coerce'
53-
)
54-
start_timestamp = datetime_column.min().strftime(datetime_format)
55-
end_timestamp = datetime_column.max().strftime(datetime_format)
56-
57-
else:
58-
datetime_column = pd.to_datetime(data[column_name], errors='coerce')
59-
start_timestamp = str(datetime_column.min())
60-
end_timestamp = str(datetime_column.max())
61-
62-
column_parameters[column_name] = {
63-
'start_timestamp': start_timestamp,
64-
'end_timestamp': end_timestamp,
65-
}
118+
params.update(_detect_datetime_column_parameters(data[column_name], column_metadata))
66119
elif sdtype in ['categorical', 'boolean']:
67-
column_parameters[column_name] = {
68-
'category_values': data[column_name].dropna().unique().tolist()
69-
}
120+
params.update(_detect_categorical_or_boolean_column_parameters(data[column_name]))
70121

71-
column_parameters[column_name]['missing_values_proportion'] = (
72-
data[column_name].isna().mean().item()
73-
)
122+
params['missing_values_proportion'] = _compute_missing_values_proportion(data[column_name])
123+
column_parameters[column_name] = params
74124

75125
return {'columns': column_parameters}
76126

tests/unit/single_table/test_dayz.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
from unittest.mock import call, patch
33

4+
import numpy as np
45
import pandas as pd
56
import pytest
67

@@ -351,3 +352,93 @@ def test_validate_parameters(
351352

352353
# Assert
353354
mock__validate_parameters.assert_called_once_with(metadata, dayz_parameters)
355+
356+
def test_create_parameters_with_empty_dataframe_returns_valid_defaults(self):
    """``create_parameters`` should only report a zero missing proportion for empty data."""
    # Setup
    empty_data = pd.DataFrame({'col': []})
    metadata = Metadata.detect_from_dataframe(empty_data)
    expected_parameters = {
        'tables': {
            'table': {
                'columns': {
                    'col': {'missing_values_proportion': 0.0},
                },
                'num_rows': 0,
            },
        },
        'DAYZ_SPEC_VERSION': 'V1',
    }

    # Run
    parameters = DayZSynthesizer.create_parameters(empty_data, metadata)

    # Assert
    assert parameters == expected_parameters
377+
378+
def test_create_parameters_all_null_categorical_column(self):
    """A column holding only nulls should omit ``category_values`` entirely."""
    # Setup
    null_data = pd.DataFrame({'col': [None, None, np.nan, pd.NA]})
    metadata = Metadata.detect_from_dataframe(null_data)
    expected_parameters = {
        'tables': {
            'table': {
                'columns': {
                    'col': {'missing_values_proportion': 1.0},
                },
                'num_rows': 4,
            },
        },
        'DAYZ_SPEC_VERSION': 'V1',
    }

    # Run
    parameters = DayZSynthesizer.create_parameters(null_data, metadata)

    # Assert
    assert parameters == expected_parameters
399+
400+
def test_create_parameters_all_null_numerical_column(self):
    """An all-null numerical column should omit the rounding, min and max parameters."""
    # Setup
    null_data = pd.DataFrame({'col': [np.nan]})
    metadata = Metadata()
    metadata.add_table('table')
    metadata.add_column('col', 'table', sdtype='numerical')
    expected_parameters = {
        'tables': {
            'table': {
                'columns': {
                    'col': {'missing_values_proportion': 1.0},
                },
                'num_rows': 1,
            },
        },
        'DAYZ_SPEC_VERSION': 'V1',
    }

    # Run
    parameters = DayZSynthesizer.create_parameters(null_data, metadata)

    # Assert
    assert parameters == expected_parameters
423+
424+
def test_create_parameters_all_null_datetime_column(self):
    """An all-null datetime column should omit the start and end timestamps."""
    # Setup
    null_data = pd.DataFrame({'col': pd.to_datetime([None, None])})
    metadata = Metadata.detect_from_dataframe(null_data)
    expected_parameters = {
        'tables': {
            'table': {
                'columns': {
                    'col': {'missing_values_proportion': 1.0},
                },
                'num_rows': 2,
            },
        },
        'DAYZ_SPEC_VERSION': 'V1',
    }

    # Run
    parameters = DayZSynthesizer.create_parameters(null_data, metadata)

    # Assert
    assert parameters == expected_parameters

0 commit comments

Comments
 (0)