Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sdmetrics/reports/single_table/_properties/data_validity.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,22 @@ def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=No
error_messages = []
primary_key = metadata.get('primary_key')
alternate_keys = metadata.get('alternate_keys', [])
sequence_index = metadata.get('sequence_index')

for column_name in metadata['columns']:
sdtype = metadata['columns'][column_name]['sdtype']
primary_key_match = column_name == primary_key
alternate_key_match = column_name in alternate_keys
is_unique = primary_key_match or alternate_key_match
is_sequence_index = column_name == sequence_index

try:
if sdtype not in self._sdtype_to_metric and not is_unique:
continue

if is_sequence_index and self._sdtype_to_metric.get(sdtype) == BoundaryAdherence:
continue

metric = self._sdtype_to_metric.get(sdtype, KeyUniqueness)
column_score = metric.compute(real_data[column_name], synthetic_data[column_name])
error_message = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,53 @@ def test_get_visualization(self, mock_px):
margin={'t': 150},
font={'size': 18},
)

@patch('sdmetrics.reports.single_table._properties.data_validity.BoundaryAdherence.compute')
@patch('sdmetrics.reports.single_table._properties.data_validity.CategoryAdherence.compute')
@patch('sdmetrics.reports.single_table._properties.data_validity.KeyUniqueness.compute')
def test__generate_details_skip_sequence_index_boundary_adherence(
    self, key_uniqueness_mock, category_a_compute_mock, boundary_a_compute_mock
):
    """Check that the ``sequence_index`` column gets no BoundaryAdherence score.

    The table has a datetime column declared as the sequence index, plus one
    numerical and one categorical column. Only the latter two are expected to
    be scored, and no key-uniqueness check should run at all.
    """
    # Setup
    real_dates = pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
    synthetic_dates = pd.to_datetime(['2020-01-04', '2020-01-05', '2020-01-06'])
    real_data = pd.DataFrame({
        'date': real_dates,
        'value': [1, 2, 3],
        'category': ['a', 'b', 'c'],
    })
    synthetic_data = pd.DataFrame({
        'date': synthetic_dates,
        'value': [4, 5, 6],
        'category': ['d', 'e', 'f'],
    })

    # 'date' is a datetime column, but being the sequence index it is
    # expected to be left out of the BoundaryAdherence computation.
    column_metadata = {
        'date': {'sdtype': 'datetime'},
        'value': {'sdtype': 'numerical'},
        'category': {'sdtype': 'categorical'},
    }
    metadata = {'sequence_index': 'date', 'columns': column_metadata}

    category_a_compute_mock.return_value = 0.9
    boundary_a_compute_mock.return_value = 0.8

    # Run
    details = DataValidity()._generate_details(real_data, synthetic_data, metadata)

    # Assert
    # BoundaryAdherence runs exactly once, and only for the numerical column.
    boundary_a_compute_mock.assert_has_calls([
        call(real_data['value'], synthetic_data['value']),
    ])
    assert boundary_a_compute_mock.call_count == 1

    # CategoryAdherence runs exactly once, for the categorical column.
    category_a_compute_mock.assert_has_calls([
        call(real_data['category'], synthetic_data['category']),
    ])
    assert category_a_compute_mock.call_count == 1

    # The metadata declares no primary or alternate keys.
    key_uniqueness_mock.assert_not_called()

    # The sequence index column must not appear in the details table.
    assert list(details['Column']) == ['value', 'category']
    assert list(details['Metric']) == ['BoundaryAdherence', 'CategoryAdherence']
Loading