Harmonizer: add overlap/method merge, bump version

dirreno · dirreno · commit ce661ff86c4d · 2026-03-02T11:00:43.000-05:00
Bump package version to 1.0.4 in setup.py. Refactor Harmonizer: remove the similarity_threshold attribute and its property/setter; replace with an overlap_threshold parameter and a method parameter on s4h_vertical_merge (supports 'union' and 'intersection'). Normalize column names (upper/strip) when comparing, compute overlap using the Szymkiewicz–Simpson coefficient, validate overlap_threshold and method, and adjust grouping/concatenation logic (union reindexes columns, intersection keeps common columns). Add a new ECV_test.py script for extracting and merging ECV datasets, and update tests/mytest.py to remove the old similarity_threshold assignment and call s4h_vertical_merge with explicit merge parameters.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,15 @@ The format is based on "Keep a Changelog" (https://keepachangelog.com/en/1.0.0/)
 ## [Unreleased]
 - Prepare improvements and documentation updates.
 
+## [1.0.4] - 2026-03-02
+### Fixed
+- Removed the `similarity_threshold` attribute and its property/setter.
+- Column names are now normalized (upper-cased and stripped) when comparing DataFrames.
+### Added
+- `overlap_threshold` parameter on `s4h_vertical_merge`, replacing `similarity_threshold`; overlap is computed with the Szymkiewicz–Simpson coefficient.
+- `method` parameter on `s4h_vertical_merge` (supports 'union' and 'intersection').
+
+
 ## [1.0.3] - 2026-02-23
 ### Fixed
 - Extractor now decompress deflated64.
diff --git a/setup.py b/setup.py
@@ -39,7 +39,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.0.3',  # Required
+    version='1.0.4',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
diff --git a/src/socio4health/harmonizer.py b/src/socio4health/harmonizer.py
@@ -25,8 +25,6 @@ class Harmonizer:
         ----------
         min_common_columns : int
             Minimum number of common columns required for vertical merge (default is 1).
-        similarity_threshold : float
-            Similarity threshold to consider for vertical merge (default is 0.8).
         nan_threshold : float
             Percentage threshold of ``NaN`` values to drop columns (default is 1.0).
         sample_frac : float or ``None``
@@ -58,7 +56,6 @@ class Harmonizer:
     """
     def __init__(self,
                  min_common_columns: int = 1,
-                 similarity_threshold: float = 1,
                  nan_threshold: float = 1.0,
                  sample_frac: Optional[float] = None,
                  column_mapping: Optional[Union[Type[Enum], Dict[str, Dict[str, str]], str, Path]] = None,
@@ -77,7 +74,6 @@ def __init__(self,
         Initialize the Harmonizer class with default parameters.
         """
         self.min_common_columns = min_common_columns
-        self.similarity_threshold = similarity_threshold
         self.nan_threshold = nan_threshold
         self.sample_frac = sample_frac
         self.column_mapping = column_mapping
@@ -100,12 +96,7 @@ def __init__(self,
     def min_common_columns(self) -> int:
         """Get the minimum number of common columns required for vertical merge."""
         return self._min_common_columns
-
-    @property
-    def similarity_threshold(self) -> float:
-        """Get the similarity threshold for vertical merge."""
-        return self._similarity_threshold
-
+    
     @property
     def nan_threshold(self) -> float:
         """Get the NaN threshold for column dropping."""
@@ -174,13 +165,6 @@ def min_common_columns(self, value: int):
             raise ValueError("min_common_columns must be a non-negative integer")
         self._min_common_columns = value
 
-    @similarity_threshold.setter
-    def similarity_threshold(self, value: float):
-        """Set the similarity threshold for vertical merge."""
-        if not isinstance(value, (int, float)) or not 0 <= value <= 1:
-            raise ValueError("similarity_threshold must be a float between 0 and 1")
-        self._similarity_threshold = float(value)
-
     @nan_threshold.setter
     def nan_threshold(self, value: float):
         """Set the NaN threshold for column dropping."""
@@ -257,14 +241,20 @@ def extra_cols(self, value: List[str]):
             raise ValueError("extra_cols must be a list of strings")
         self._extra_cols = value
 
-    def s4h_vertical_merge(self, ddfs: List[dd.DataFrame]) -> List[dd.DataFrame]:
+    def s4h_vertical_merge(self, ddfs: List[dd.DataFrame], overlap_threshold: float = 1, method: str = "union") -> List[dd.DataFrame]:
         """
         Merge a list of `Dask <https://docs.dask.org>`_ DataFrames vertically using instance parameters.
 
         Parameters
         ----------
         ddfs : list of `dask.dataframe.DataFrame <https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.html>`_
             List of `Dask <https://docs.dask.org>`_ DataFrames to be merged.
+        overlap_threshold : float, optional
+            Overlap coefficient (Szymkiewicz–Simpson coefficient) threshold to consider for vertical merge (default is 1).
+        method : str, optional
+            Method to use for merging (default is "union").
+                - "union": Merge all columns from all DataFrames, filling missing values with NaN.
+                - "intersection": Within each merged group, keep only the columns common to every DataFrame in that group.
 
         Returns
         -------
@@ -273,20 +263,28 @@ def s4h_vertical_merge(self, ddfs: List[dd.DataFrame]) -> List[dd.DataFrame]:
 
         Notes
         -----
-        - DataFrames are grouped and merged if they share at least ``min_common_columns`` columns and their column similarity is above ``similarity_threshold``.
+        - DataFrames are grouped and merged if they share at least ``min_common_columns`` columns and their column overlap coefficient is greater than or equal to ``overlap_threshold``.
         - Only columns with matching data types are considered compatible for merging.
         """
         if not ddfs:
             return []
 
+        if not isinstance(overlap_threshold, (int, float)) or not 0 <= overlap_threshold <= 1:
+            raise ValueError("overlap_threshold must be a float between 0 and 1")
+
         groups = []
         used_indices = set()
 
         for i, df1 in enumerate(tqdm(ddfs, desc="Grouping DataFrames")):
             if i in used_indices:
                 continue
 
-            cols1 = set(df1.columns)
+            cols1 = set(
+                df1
+                .columns
+                .str.upper()
+                .str.strip()
+            )
             dtypes1 = {col: str(df1[col].dtype) for col in df1.columns}
             current_group = [i]
             used_indices.add(i)
@@ -296,12 +294,18 @@ def s4h_vertical_merge(self, ddfs: List[dd.DataFrame]) -> List[dd.DataFrame]:
                 if j_actual in used_indices:
                     continue
 
-                cols2 = set(df2.columns)
+                cols2 = set(
+                    df2
+                    .columns
+                    .str.upper()
+                    .str.strip()
+                )
                 common_cols = cols1 & cols2
-                similarity = len(common_cols) / max(len(cols1), len(cols2))
+                
+                overlap = len(common_cols) / min(len(cols1), len(cols2)) if min(len(cols1), len(cols2)) > 0 else 0
 
                 if (len(common_cols) >= self.min_common_columns and
-                        similarity >= self.similarity_threshold):
+                        overlap >= overlap_threshold):
 
                     compatible = True
                     for col in common_cols:
@@ -325,16 +329,19 @@ def s4h_vertical_merge(self, ddfs: List[dd.DataFrame]) -> List[dd.DataFrame]:
                 merged_dfs.append(ddfs[group_indices[0]])
             else:
                 group_dfs = [ddfs[i] for i in group_indices]
-                common_cols = set(group_dfs[0].columns)
-                for df in group_dfs[1:]:
-                    common_cols.intersection_update(df.columns)
-
-                aligned_dfs = []
-                for df in group_dfs:
-                    common_cols_ordered = [col for col in df.columns if col in common_cols]
-                    other_cols = [col for col in df.columns if col not in common_cols]
-                    aligned_dfs.append(df[common_cols_ordered + other_cols])
-
+                if method == "intersection":
+                    common_cols = set(group_dfs[0].columns).intersection(*(df.columns for df in group_dfs[1:]))
+                    # Keep the first DataFrame's column order; iterating a set yields an arbitrary order.
+                    ordered_cols = [col for col in group_dfs[0].columns if col in common_cols]
+                    aligned_dfs = [df[ordered_cols] for df in group_dfs]
+                elif method == "union":
+                    all_cols = {}  # dict (not set) keeps first-seen column order
+                    for df in group_dfs:
+                        all_cols.update(dict.fromkeys(df.columns))
+                    all_cols = list(all_cols)
+                    aligned_dfs = [df.reindex(columns=all_cols) for df in group_dfs]
+                else:
+                    raise ValueError("method must be 'union' or 'intersection'")
                 merged_df = dd.concat(aligned_dfs, axis=0, ignore_index=True)
                 merged_dfs.append(merged_df)
         if len(merged_dfs) > 1:
diff --git a/tests/ECV_test.py b/tests/ECV_test.py
@@ -0,0 +1,57 @@
+from socio4health import Extractor, Harmonizer
+
+if __name__ == "__main__":
+    ecv_data = {
+        2010: "https://microdatos.dane.gov.co/index.php/catalog/201/get-microdata",
+        2011: "https://microdatos.dane.gov.co/index.php/catalog/196/get-microdata",
+        2012: "https://microdatos.dane.gov.co/index.php/catalog/124/get-microdata",
+        2013: "https://microdatos.dane.gov.co/index.php/catalog/213/get-microdata",
+        2014: "https://microdatos.dane.gov.co/index.php/catalog/342/get-microdata",
+        2015: "https://microdatos.dane.gov.co/index.php/catalog/419/get-microdata",
+        2016: "https://microdatos.dane.gov.co/index.php/catalog/456/get-microdata",
+        2017: "https://microdatos.dane.gov.co/index.php/catalog/544/get-microdata",
+        2018: "https://microdatos.dane.gov.co/index.php/catalog/607/get-microdata",
+        2019: "https://microdatos.dane.gov.co/index.php/catalog/678/get-microdata",
+        2020: "https://microdatos.dane.gov.co/index.php/catalog/718/get-microdata",
+        2021: "https://microdatos.dane.gov.co/index.php/catalog/734/get-microdata",
+        2022: "https://microdatos.dane.gov.co/index.php/catalog/793/get-microdata",
+        2023: "https://microdatos.dane.gov.co/index.php/catalog/827/get-microdata",
+        2024: "https://microdatos.dane.gov.co/index.php/catalog/861/get-microdata"
+    }
+
+    ddfs = []
+    for year, url in ecv_data.items():
+        print(f"{year}: {url}")
+        extractor = Extractor(
+            input_path=url,
+            down_ext=['.sav', '.zip'],
+            sep=' ',
+            output_path = f"data/ECV_{year}",
+            depth=0,
+            key_words=[
+                r"(?i)datos[\s_]+(?:de[\s_]+)?identificaci[oó]n",
+                r"(?i)servicios[\s_]+(?:del[\s_]+)?hogar",
+                r"(?i)composici[oó]n[\s_]+(?:del[\s_]+)?hogar",
+                r"(?i)datos[\s_]+(?:de[\s_]+)?(?:la[\s_]+)?vivienda",
+            ],
+            delete_zip_after=True
+        )
+
+        df_extracted = extractor.s4h_extract()
+        for df in df_extracted:
+            df['year'] = year
+        ddfs.extend(df_extracted)
+
+    for ddf in ddfs:
+        print(ddf.head())
+
+    har = Harmonizer()
+    dfs = har.s4h_vertical_merge(ddfs, overlap_threshold=0.6, method="union")
+
+    print(len(dfs))
+
+    for i, df in enumerate(dfs):
+        print(f"DF {i} columns:")
+        print(len(df.columns))
+        print(list(df.columns))
+        print("-" * 40)
diff --git a/tests/mytest.py b/tests/mytest.py
@@ -97,7 +97,6 @@ def test():
 
     extractor = col_online_extractor
     har.dict_df = col_dict
-    har.similarity_threshold = 0.9
 
     har.join_key = 'DIRECTORIO'
     har.aux_key = 'ORDEN'
@@ -111,10 +110,11 @@ def test():
         print(df.head())
         print("-" * 50)
 
-    """
+    
     print('Vertical merge_____________________________________')
-    dfs = har.s4h_vertical_merge(dfs)
+    dfs = har.s4h_vertical_merge(dfs, overlap_threshold=0.5, method="union")
 
+    """
     har.categories = ["Business"]
     har.key_col = 'DPTO'
     har.key_val = ['11']