Normalize & dedupe column names; bump v1.0.5

dirreno · dirreno · commit 784c376da456 · 2026-03-03T11:50:52.000-05:00
Normalize DataFrame column names (uppercase + strip) across Harmonizer code paths to ensure consistent column-matching logic and avoid duplicate-column issues. Replaced the ad-hoc column-set construction in the pairwise comparison with normalized column assignments, and added column cleaning to process_ddf, process_dataframe, the merge/preparation steps, and the final pandas conversion. Also bumped the package version to 1.0.5 and added a changelog entry noting the normalization fix.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ The format is based on "Keep a Changelog" (https://keepachangelog.com/en/1.0.0/)
 ## [Unreleased]
 - Prepare improvements and documentation updates.
 
+## [1.0.5] - 2026-03-03
+### Fixed
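+ - Normalize DataFrame column names (uppercase and strip whitespace) consistently across Harmonizer code paths to avoid column-matching and duplicate-column issues.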
+
 ## [1.0.4] - 2026-03-02
 ### Fixed
  - Remove the similarity_threshold attribute and its property/setter,
diff --git a/setup.py b/setup.py
@@ -39,7 +39,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.0.4',  # Required
+    version='1.0.5',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
diff --git a/src/socio4health/harmonizer.py b/src/socio4health/harmonizer.py
@@ -279,12 +279,8 @@ def s4h_vertical_merge(self, ddfs: List[dd.DataFrame], overlap_threshold: float
             if i in used_indices:
                 continue
 
-            cols1 = set(
-                df1
-                .columns
-                .str.upper()
-                .str.strip()
-            )
+            df1.columns = df1.columns.str.upper().str.strip()
+            cols1 = set(df1.columns)
             dtypes1 = {col: str(df1[col].dtype) for col in df1.columns}
             current_group = [i]
             used_indices.add(i)
@@ -294,12 +290,8 @@ def s4h_vertical_merge(self, ddfs: List[dd.DataFrame], overlap_threshold: float
                 if j_actual in used_indices:
                     continue
 
-                cols2 = set(
-                    df2
-                    .columns
-                    .str.upper()
-                    .str.strip()
-                )
+                df2.columns = df2.columns.str.upper().str.strip()
+                cols2 = set(df2.columns)
                 common_cols = cols1 & cols2
                 
                 overlap = len(common_cols) / min(len(cols1), len(cols2)) if min(len(cols1), len(cols2)) > 0 else 0
@@ -377,6 +369,8 @@ def drop_nan_columns(self, ddf_or_ddfs: Union[dd.DataFrame, List[dd.DataFrame]])
             raise ValueError("Threshold must be between 0 and 1")
 
         def process_ddf(ddf):
+            ddf.columns = ddf.columns.str.upper().str.strip()
+            #ddf = ddf.loc[:, ~ddf.columns.duplicated()]
             if self.sample_frac is not None:
                 if not 0 < self.sample_frac <= 1:
                     raise ValueError("sample_frac must be between 0 and 1")
@@ -432,6 +426,9 @@ def s4h_get_available_columns(df_or_dfs: Union[dd.DataFrame, pd.DataFrame, List[
         for df in df_or_dfs:
             if not isinstance(df, (dd.DataFrame, pd.DataFrame)):
                 raise TypeError("All elements in the list must be DataFrames (Dask or pandas)")
+            # Clean columns: uppercase, strip, deduplicate
+            df.columns = df.columns.str.upper().str.strip()
+            df = df.loc[:, ~df.columns.duplicated()]
             unique_columns.update(df.columns)
 
         return sorted(unique_columns)
@@ -493,6 +490,9 @@ def get_country_mapping(mapping_obj, country):
 
         def process_dataframe(df: dd.DataFrame, country: str) -> dd.DataFrame:
             """Process a single dataframe"""
+            # Clean columns: uppercase, strip, deduplicate
+            df.columns = df.columns.str.upper().str.strip()
+            df = df.loc[:, ~df.columns.duplicated()]
             # Get mappings for this country
             col_map = get_country_mapping(column_mapping, country)
             val_maps = get_country_mapping(value_mappings, country)
@@ -600,7 +600,9 @@ def s4h_data_selector(self, ddfs: List[dd.DataFrame]) -> List[dd.DataFrame]:
 
         filtered_ddfs = []
         for ddf in ddfs:
-            ddf.columns = ddf.columns.str.upper()
+            # Clean columns: uppercase, strip, deduplicate
+            ddf.columns = ddf.columns.str.upper().str.strip()
+            ddf = ddf.loc[:, ~ddf.columns.duplicated()]
 
             if self.key_col and self.key_val:
                 if key_column_upper not in ddf.columns:
@@ -664,6 +666,9 @@ def s4h_join_data(self, ddfs: List[dd.DataFrame]) -> pd.DataFrame:
             Merged DataFrame with duplicate columns removed.
         """
         pandas_dfs = [df.compute() for df in ddfs]
+        # Clean columns: uppercase, strip, deduplicate
+        pandas_dfs = [df.rename(columns=lambda x: str(x).upper().strip()) for df in pandas_dfs]
+        pandas_dfs = [df.loc[:, ~df.columns.duplicated()] for df in pandas_dfs]
 
         def identify_primary_df(dfs):
             candidates = []