Skip to content

Commit 79508b7

Browse files
committed
Add threshold
1 parent 88c74d4 commit 79508b7

File tree

3 files changed

+244
-4
lines changed

3 files changed

+244
-4
lines changed

sdmetrics/column_pairs/statistical/contingency_similarity.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Contingency Similarity Metric."""
22

3+
import numpy as np
34
import pandas as pd
5+
from scipy.stats.contingency import association
46

57
from sdmetrics.column_pairs.base import ColumnPairsMetric
68
from sdmetrics.goal import Goal
@@ -28,7 +30,12 @@ class ContingencySimilarity(ColumnPairsMetric):
2830

2931
@staticmethod
3032
def _validate_inputs(
31-
real_data, synthetic_data, continuous_column_names, num_discrete_bins, num_rows_subsample
33+
real_data,
34+
synthetic_data,
35+
continuous_column_names,
36+
num_discrete_bins,
37+
num_rows_subsample,
38+
real_association_threshold,
3239
):
3340
for data in [real_data, synthetic_data]:
3441
if not isinstance(data, pd.DataFrame) or len(data.columns) != 2:
@@ -53,6 +60,14 @@ def _validate_inputs(
5360
if not isinstance(num_rows_subsample, int) or num_rows_subsample <= 0:
5461
raise ValueError('`num_rows_subsample` must be an integer greater than zero.')
5562

63+
if (
64+
not isinstance(real_association_threshold, (int, float))
65+
or real_association_threshold < 0
66+
):
67+
raise ValueError(
68+
'`real_association_threshold` must be a number greater than or equal to zero.'
69+
)
70+
5671
@classmethod
5772
def compute_breakdown(
5873
cls,
@@ -61,6 +76,7 @@ def compute_breakdown(
6176
continuous_column_names=None,
6277
num_discrete_bins=10,
6378
num_rows_subsample=None,
79+
real_association_threshold=0,
6480
):
6581
"""Compute the breakdown of this metric."""
6682
cls._validate_inputs(
@@ -69,6 +85,7 @@ def compute_breakdown(
6985
continuous_column_names,
7086
num_discrete_bins,
7187
num_rows_subsample,
88+
real_association_threshold,
7289
)
7390
columns = real_data.columns[:2]
7491

@@ -84,7 +101,14 @@ def compute_breakdown(
84101
real[column], synthetic[column], num_discrete_bins=num_discrete_bins
85102
)
86103

87-
contingency_real = real.groupby(list(columns), dropna=False).size() / len(real)
104+
contingency_real_counts = real.groupby(list(columns), dropna=False).size()
105+
if real_association_threshold > 0:
106+
contingency_2d = contingency_real_counts.unstack(fill_value=0) # noqa: PD010
107+
real_cramer = association(contingency_2d.values, method='cramer')
108+
if real_cramer < real_association_threshold:
109+
return {'score': np.nan}
110+
111+
contingency_real = contingency_real_counts / len(real)
88112
contingency_synthetic = synthetic.groupby(list(columns), dropna=False).size() / len(
89113
synthetic
90114
)
@@ -103,6 +127,7 @@ def compute(
103127
continuous_column_names=None,
104128
num_discrete_bins=10,
105129
num_rows_subsample=None,
130+
real_association_threshold=0,
106131
):
107132
"""Compare the contingency similarity of two discrete columns.
108133
@@ -120,17 +145,23 @@ def compute(
120145
num_rows_subsample (int, optional):
121146
The number of rows to subsample from the real and synthetic data before computing
122147
the metric. Defaults to ``None``.
148+
real_association_threshold (float, optional):
149+
The minimum Cramer's V association score required in the real data for the
150+
metric to be computed. If the real data's association is below this threshold,
151+
the metric returns NaN. Defaults to 0 (no threshold).
123152
124153
Returns:
125154
float:
126-
The contingency similarity of the two columns.
155+
The contingency similarity of the two columns, or NaN if the real data's
156+
association is below the threshold.
127157
"""
128158
return cls.compute_breakdown(
129159
real_data,
130160
synthetic_data,
131161
continuous_column_names,
132162
num_discrete_bins,
133163
num_rows_subsample,
164+
real_association_threshold,
134165
)['score']
135166

136167
@classmethod

tests/readme_test/README.md

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
<div align="center">
2+
<br/>
3+
<p align="center">
4+
<i>This repository is part of <a href="https://sdv.dev">The Synthetic Data Vault Project</a>, a project from <a href="https://datacebo.com">DataCebo</a>.</i>
5+
</p>
6+
7+
[![Development Status](https://img.shields.io/badge/Development%20Status-2%20--%20Pre--Alpha-yellow)](https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha)
8+
[![PyPI Shield](https://img.shields.io/pypi/v/sdmetrics.svg)](https://pypi.python.org/pypi/sdmetrics)
9+
[![Downloads](https://pepy.tech/badge/sdmetrics)](https://pepy.tech/project/sdmetrics)
10+
[![Tests](https://github.com/sdv-dev/SDMetrics/workflows/Run%20Tests/badge.svg)](https://github.com/sdv-dev/SDMetrics/actions?query=workflow%3A%22Run+Tests%22+branch%3Amain)
11+
[![Coverage Status](https://codecov.io/gh/sdv-dev/SDMetrics/branch/main/graph/badge.svg)](https://codecov.io/gh/sdv-dev/SDMetrics)
12+
[![Slack](https://img.shields.io/badge/Community-Slack-blue?style=plastic&logo=slack)](https://bit.ly/sdv-slack-invite)
13+
[![Tutorial](https://img.shields.io/badge/Demo-Get%20started-orange?style=plastic&logo=googlecolab)](https://bit.ly/sdmetrics-demo)
14+
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14279167.svg)](https://doi.org/10.5281/zenodo.14279167)
15+
16+
<div align="left">
17+
<br/>
18+
<p align="center">
19+
<a href="https://github.com/sdv-dev/SDV">
20+
<img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/stable/docs/images/SDMetrics-DataCebo.png"></img>
21+
</a>
22+
</p>
23+
</div>
24+
25+
</div>
26+
27+
# Overview
28+
29+
The SDMetrics library evaluates synthetic data by comparing it to the real data that you're trying to mimic. It includes a variety of metrics to capture different aspects of the data, for example **quality and privacy**. It also includes reports that you can run to generate insights, visualize data and share with your team.
30+
31+
The SDMetrics library is **model-agnostic**, meaning you can use any synthetic data. The library does not need to know how you created the data.
32+
33+
<img align="center" src="docs/images/column_comparison.png"></img>
34+
35+
# Install
36+
37+
Install SDMetrics using pip or conda. We recommend using a virtual environment to avoid conflicts with other software on your device.
38+
39+
```bash
40+
pip install sdmetrics
41+
```
42+
43+
```bash
44+
conda install -c conda-forge sdmetrics
45+
```
46+
47+
For more information about using SDMetrics, visit the [SDMetrics Documentation](https://docs.sdv.dev/sdmetrics).
48+
49+
# Usage
50+
51+
Get started with **SDMetrics Reports** using some demo data,
52+
53+
```python
54+
from sdmetrics import load_demo
55+
from sdmetrics.reports.single_table import QualityReport
56+
57+
real_data, synthetic_data, metadata = load_demo(modality='single_table')
58+
59+
my_report = QualityReport()
60+
my_report.generate(real_data, synthetic_data, metadata)
61+
```
62+
```
63+
Creating report: 100%|██████████| 4/4 [00:00<00:00, 5.22it/s]
64+
65+
Overall Quality Score: 82.84%
66+
67+
Properties:
68+
Column Shapes: 82.78%
69+
Column Pair Trends: 82.9%
70+
```
71+
72+
Once you generate the report, you can drill down on the details and visualize the results.
73+
74+
```python
75+
my_report.get_visualization(property_name='Column Pair Trends')
76+
```
77+
<img align="center" src="docs/images/column_pairs.png"></img>
78+
79+
Save the report and share it with your team.
80+
```python
81+
my_report.save(filepath='demo_data_quality_report.pkl')
82+
83+
# load it at any point in the future
84+
my_report = QualityReport.load(filepath='demo_data_quality_report.pkl')
85+
```
86+
87+
**Want more metrics?** You can also manually apply any of the metrics in this library to your data.
88+
89+
```python
90+
# calculate whether the synthetic data respects the min/max bounds
91+
# set by the real data
92+
from sdmetrics.single_column import BoundaryAdherence
93+
94+
BoundaryAdherence.compute(
95+
real_data['start_date'],
96+
synthetic_data['start_date']
97+
)
98+
```
99+
```
100+
0.8503937007874016
101+
```
102+
103+
```python
104+
# calculate whether the synthetic data is new or whether it's an exact copy of the real data
105+
from sdmetrics.single_table import NewRowSynthesis
106+
107+
NewRowSynthesis.compute(
108+
real_data,
109+
synthetic_data,
110+
metadata
111+
)
112+
```
113+
```
114+
1.0
115+
```
116+
117+
# What's next?
118+
119+
To learn more about the reports and metrics, visit the [SDMetrics Documentation](https://docs.sdv.dev/sdmetrics).
120+
121+
---
122+
123+
124+
<div align="center">
125+
<a href="https://datacebo.com"><img align="center" width=40% src="https://github.com/sdv-dev/SDV/blob/stable/docs/images/DataCebo.png"></img></a>
126+
</div>
127+
<br/>
128+
<br/>
129+
130+
[The Synthetic Data Vault Project](https://sdv.dev) was first created at MIT's [Data to AI Lab](
131+
https://dai.lids.mit.edu/) in 2016. After 4 years of research and enterprise traction, we
132+
created [DataCebo](https://datacebo.com) in 2020 with the goal of growing the project.
133+
Today, DataCebo is the proud developer of SDV, the largest ecosystem for
134+
synthetic data generation & evaluation. It is home to multiple libraries that support synthetic
135+
data, including:
136+
137+
* 🔄 Data discovery & transformation. Reverse the transforms to reproduce realistic data.
138+
* 🧠 Multiple machine learning models -- ranging from Copulas to Deep Learning -- to create tabular,
139+
multi-table, and time-series data.
140+
* 📊 Measuring quality and privacy of synthetic data, and comparing different synthetic data
141+
generation models.
142+
143+
[Get started using the SDV package](https://sdv.dev/SDV/getting_started/install.html) -- a fully
144+
integrated solution and your one-stop shop for synthetic data. Or, use the standalone libraries
145+
for specific needs.

tests/unit/column_pairs/statistical/test_contingency_similarity.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def test__validate_inputs(self):
2727
continuous_column_names=None,
2828
num_discrete_bins=10,
2929
num_rows_subsample=3,
30+
real_association_threshold=0,
3031
)
3132
expected_bad_data = re.escape('The data must be a pandas DataFrame with two columns.')
3233
with pytest.raises(ValueError, match=expected_bad_data):
@@ -36,6 +37,7 @@ def test__validate_inputs(self):
3637
continuous_column_names=None,
3738
num_discrete_bins=10,
3839
num_rows_subsample=3,
40+
real_association_threshold=0,
3941
)
4042

4143
expected_mismatch_columns_error = re.escape(
@@ -48,6 +50,7 @@ def test__validate_inputs(self):
4850
continuous_column_names=None,
4951
num_discrete_bins=10,
5052
num_rows_subsample=3,
53+
real_association_threshold=0,
5154
)
5255

5356
expected_bad_continous_column_error = re.escape(
@@ -60,6 +63,7 @@ def test__validate_inputs(self):
6063
continuous_column_names=bad_continous_columns,
6164
num_discrete_bins=10,
6265
num_rows_subsample=3,
66+
real_association_threshold=0,
6367
)
6468

6569
expected_bad_num_discrete_bins_error = re.escape(
@@ -72,6 +76,7 @@ def test__validate_inputs(self):
7276
continuous_column_names=['col1'],
7377
num_discrete_bins=bad_num_discrete_bins,
7478
num_rows_subsample=3,
79+
real_association_threshold=0,
7580
)
7681
expected_bad_num_rows_subsample_error = re.escape(
7782
'`num_rows_subsample` must be an integer greater than zero.'
@@ -83,6 +88,20 @@ def test__validate_inputs(self):
8388
continuous_column_names=['col1'],
8489
num_discrete_bins=10,
8590
num_rows_subsample=bad_num_rows_subsample,
91+
real_association_threshold=0,
92+
)
93+
94+
expected_bad_threshold_error = re.escape(
95+
'`real_association_threshold` must be a number greater than or equal to zero.'
96+
)
97+
with pytest.raises(ValueError, match=expected_bad_threshold_error):
98+
ContingencySimilarity._validate_inputs(
99+
real_data=real_data,
100+
synthetic_data=synthetic_data,
101+
continuous_column_names=['col1'],
102+
num_discrete_bins=10,
103+
num_rows_subsample=3,
104+
real_association_threshold=-0.1,
86105
)
87106

88107
@patch(
@@ -99,7 +118,7 @@ def test_compute_mock(self, compute_breakdown_mock):
99118
score = ContingencySimilarity.compute(real_data, synthetic_data)
100119

101120
# Assert
102-
compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None, 10, None)
121+
compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data, None, 10, None, 0)
103122
assert score == 0.25
104123

105124
@patch(
@@ -134,6 +153,7 @@ def test_compute_breakdown(self, validate_inputs_mock):
134153
None,
135154
10,
136155
None,
156+
0,
137157
)
138158
assert result == {'score': expected_score}
139159

@@ -218,3 +238,47 @@ def test_no_runtime_warning_raised(self):
218238
ContingencySimilarity.compute(
219239
real_data=real_data[['A', 'B']], synthetic_data=synthetic_data[['A', 'B']]
220240
)
241+
242+
def test_real_association_threshold_returns_nan(self):
243+
"""Test that NaN is returned when real association is below threshold."""
244+
# Setup
245+
real_data = pd.DataFrame({
246+
'col1': np.random.choice(['A', 'B', 'C'], size=100),
247+
'col2': np.random.choice(['X', 'Y', 'Z'], size=100),
248+
})
249+
synthetic_data = pd.DataFrame({
250+
'col1': np.random.choice(['A', 'B', 'C'], size=100),
251+
'col2': np.random.choice(['X', 'Y', 'Z'], size=100),
252+
})
253+
254+
# Run
255+
result = ContingencySimilarity.compute(
256+
real_data=real_data,
257+
synthetic_data=synthetic_data,
258+
real_association_threshold=0.3,
259+
)
260+
261+
# Assert
262+
assert np.isnan(result)
263+
264+
def test_real_association_threshold_computes_normally(self):
265+
"""Test that metric computes normally when real association exceeds threshold."""
266+
# Setup
267+
real_data = pd.DataFrame({
268+
'col1': ['A'] * 50 + ['B'] * 50,
269+
'col2': ['X'] * 48 + ['Y'] * 2 + ['Y'] * 48 + ['X'] * 2,
270+
})
271+
synthetic_data = pd.DataFrame({
272+
'col1': ['A'] * 50 + ['B'] * 50,
273+
'col2': ['X'] * 45 + ['Y'] * 5 + ['Y'] * 45 + ['X'] * 5,
274+
})
275+
276+
# Run
277+
result = ContingencySimilarity.compute(
278+
real_data=real_data,
279+
synthetic_data=synthetic_data,
280+
real_association_threshold=0.3,
281+
)
282+
283+
# Assert
284+
assert 0 <= result <= 1

0 commit comments

Comments
 (0)