Skip to content

Commit ac19c4d

Browse files
authored
Fix KSComplement instability for constant float values (#654)
1 parent: 8b450df · commit: ac19c4d

File tree

5 files changed, with 54 additions (+54) and 10 deletions (-10).

sdmetrics/single_column/statistical/kscomplement.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
"""Kolmogorov-Smirnov test based Metric."""
22

3+
import sys
4+
35
import numpy as np
46
import pandas as pd
57
from scipy.stats import ks_2samp
@@ -8,6 +10,8 @@
810
from sdmetrics.single_column.base import SingleColumnMetric
911
from sdmetrics.utils import is_datetime
1012

13+
MAX_DECIMALS = sys.float_info.dig - 1
14+
1115

1216
class KSComplement(SingleColumnMetric):
1317
"""Kolmogorov-Smirnov statistic based metric.
@@ -57,6 +61,9 @@ def compute(real_data, synthetic_data):
5761
real_data = pd.to_numeric(real_data)
5862
synthetic_data = pd.to_numeric(synthetic_data)
5963

64+
real_data = real_data.round(MAX_DECIMALS)
65+
synthetic_data = synthetic_data.round(MAX_DECIMALS)
66+
6067
try:
6168
statistic, _ = ks_2samp(real_data, synthetic_data)
6269
except ValueError as e:

tests/integration/reports/multi_table/test_quality_report.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6+
from packaging import version
67

78
from sdmetrics.demos import load_demo
89
from sdmetrics.reports.multi_table.quality_report import QualityReport
@@ -299,6 +300,15 @@ def test_quality_report_with_errors():
299300
'Property': ['Column Shapes', 'Column Pair Trends', 'Cardinality', 'Intertable Trends'],
300301
'Score': [0.8165079365079364, 0.55, 0.95, 0.5833333333333334],
301302
})
303+
304+
pandas_version = version.parse(pd.__version__)
305+
if pandas_version >= version.parse('2.2.0'):
306+
err1 = "TypeError: '<' not supported between instances of 'int' and 'str'"
307+
err2 = "TypeError: '<' not supported between instances of 'Timestamp' and 'str'"
308+
err3 = "TypeError: '<' not supported between instances of 'float' and 'str'"
309+
else:
310+
err1 = err2 = err3 = "TypeError: can't multiply sequence by non-int of type 'float'"
311+
302312
expected_details = pd.DataFrame({
303313
'Table': [
304314
'users',
@@ -334,11 +344,11 @@ def test_quality_report_with_errors():
334344
'Error': [
335345
None,
336346
None,
337-
"TypeError: '<' not supported between instances of 'int' and 'str'",
347+
err1,
338348
np.nan,
339349
np.nan,
340-
"TypeError: '<' not supported between instances of 'Timestamp' and 'str'",
341-
"TypeError: '<' not supported between instances of 'float' and 'str'",
350+
err2,
351+
err3,
342352
None,
343353
],
344354
})

tests/integration/reports/single_table/_properties/test_column_shapes.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
from packaging import version
23

34
from sdmetrics.demos import load_demo
45
from sdmetrics.reports.single_table._properties import ColumnShapes
@@ -84,15 +85,22 @@ def test_get_score_errors(self):
8485

8586
# Run
8687
column_shape_property = ColumnShapes()
87-
88-
expected_message_1 = (
89-
"TypeError: '<' not supported between instances of 'Timestamp' and 'int'"
90-
)
91-
expected_message_2 = "TypeError: '<' not supported between instances of 'str' and 'float'"
92-
9388
score = column_shape_property.get_score(real_data, synthetic_data, metadata)
9489

9590
# Assert
91+
pandas_version = version.parse(pd.__version__)
92+
if pandas_version >= version.parse('2.2.0'):
93+
expected_message_1 = (
94+
"TypeError: '<' not supported between instances of 'Timestamp' and 'int'"
95+
)
96+
expected_message_2 = (
97+
"TypeError: '<' not supported between instances of 'str' and 'float'"
98+
)
99+
else:
100+
expected_message_1 = (
101+
"TypeError: unsupported operand type(s) for *: 'Timestamp' and 'float'"
102+
)
103+
expected_message_2 = "TypeError: can't multiply sequence by non-int of type 'float'"
96104

97105
details = column_shape_property.details
98106
details_nan = details.loc[pd.isna(details['Score'])]

tests/integration/single_column/statistical/test_kscomplement.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,3 +46,16 @@ def test_bad(array_like):
4646

4747
assert 0.0 <= output < 0.5
4848
assert 0.0 <= normalized < 0.5
49+
50+
51+
def test_one_float_value():
52+
"""Test KSComplement.compute when both data have the same float values GH#652."""
53+
# Setup
54+
real = pd.Series([0.3 - 0.2])
55+
synth = pd.Series([0.2 - 0.1])
56+
57+
# Run
58+
output = KSComplement.compute(real, synth)
59+
60+
# Assert
61+
assert output == 1

tests/unit/reports/single_table/_properties/test_column_shapes.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import numpy as np
44
import pandas as pd
5+
from packaging import version
56

67
from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes
78

@@ -108,7 +109,12 @@ def test__generate_details_error(self):
108109
result = column_shape_property._generate_details(real_data, synthetic_data, metadata)
109110

110111
# Assert
111-
expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'"
112+
pandas_version = version.parse(pd.__version__)
113+
if pandas_version >= version.parse('2.2.0'):
114+
expected_message = "TypeError: '<' not supported between instances of 'str' and 'int'"
115+
else:
116+
expected_message = "TypeError: can't multiply sequence by non-int of type 'float'"
117+
112118
result_nan = result.loc[pd.isna(result['Score'])]
113119
column_names_nan = result_nan['Column'].tolist()
114120
error_message = result_nan['Error'].tolist()

0 commit comments

Comments
 (0)