Commit 549bcd8

fix: CSV merging
1 parent 88b5af5 commit 549bcd8

File tree: 2 files changed, +144 −21 lines

utils/import_python_organizers.py
utils/tidy_conf/interactive_merge.py

utils/import_python_organizers.py

Lines changed: 71 additions & 20 deletions
@@ -54,37 +54,79 @@ def map_columns(df, reverse=False):
 
 def write_csv(df, year, csv_location):
     """Write the CSV files for the conferences."""
+    from logging_config import get_tqdm_logger
+
+    logger = get_tqdm_logger(__name__)
+
+    logger.info(f"Starting write_csv for year {year} with df shape: {df.shape}")
+    logger.debug(f"write_csv input columns: {df.columns.tolist()}")
+
+    # Validate conference names before processing
+    invalid_conferences = df[~df["conference"].apply(lambda x: isinstance(x, str) and len(str(x).strip()) > 0)]
+    if not invalid_conferences.empty:
+        logger.error(f"Found {len(invalid_conferences)} rows with invalid conference names in write_csv:")
+        for idx, row in invalid_conferences.iterrows():
+            logger.error(f"  Row {idx}: conference = {row['conference']} (type: {type(row['conference'])})")
+        # Fix invalid conference names
+        df.loc[invalid_conferences.index, "conference"] = df.loc[invalid_conferences.index, "conference"].apply(
+            lambda x: str(x) if pd.notna(x) else f"Conference_{invalid_conferences.index}",
+        )
+
     df["cfp"] = df["cfp"].str.slice(stop=10).str.replace(r"\b(TBA|None)\b", "", regex=True)
     df["tutorial_deadline"] = (
         df["tutorial_deadline"].fillna("").apply(str).str.slice(stop=10).str.replace(r"\b(TBA|None)\b", "", regex=True)
     )
     df = map_columns(df, reverse=True)
+    logger.debug(f"After map_columns, df shape: {df.shape}")
+
     for y in range(year, datetime.now(tz=timezone.utc).year + 10):
         if y in df["year"].unique():
-            df.loc[
-                df["year"] == y,
-                [
-                    "Subject",
-                    "Start Date",
-                    "End Date",
-                    "Location",
-                    "Country",
-                    "Venue",
-                    "Tutorial Deadline",
-                    "Talk Deadline",
-                    "Website URL",
-                    "Proposal URL",
-                    "Sponsorship URL",
-                ],
-            ].fillna("").astype(str).sort_values(by="Start Date").to_csv(Path(csv_location, f"{y}.csv"), index=False)
+            # Extract and prepare data for this year
+            df_year_subset = df.loc[df["year"] == y]
+            logger.debug(f"Year {y} subset shape: {df_year_subset.shape}")
+
+            csv_data = (
+                df_year_subset[
+                    [
+                        "Subject",
+                        "Start Date",
+                        "End Date",
+                        "Location",
+                        "Country",
+                        "Venue",
+                        "Tutorial Deadline",
+                        "Talk Deadline",
+                        "Website URL",
+                        "Proposal URL",
+                        "Sponsorship URL",
+                    ]
+                ]
+                .fillna("")
+                .astype(str)
+                .sort_values(by=["Start Date", "End Date", "Subject"])
+            )
+
+            logger.debug(f"Writing CSV for year {y} with {len(csv_data)} conferences")
+            logger.debug(f"Sample conference names: {csv_data['Subject'].head().tolist()}")
+
+            csv_data.to_csv(Path(csv_location, f"{y}.csv"), index=False)
+            logger.info(f"Successfully wrote {Path(csv_location, f'{y}.csv')}")
 
 
 def main(year=None, base=""):
     """Import Python conferences from a csv file Github."""
+    from logging_config import get_tqdm_logger
+
+    # Setup tqdm-compatible logging for this module
+    logger = get_tqdm_logger(__name__)
+    logger.info("🚀 Starting import_python_organizers main function")
+
     # If no year is provided, use the current year
     if year is None:
         year = datetime.now(tz=timezone.utc).year
 
+    logger.info(f"Processing conferences for year: {year}")
+
     # Load current conferences
     _data_path = Path(base, "_data")
     _utils_path = Path(base, "utils")

@@ -150,15 +192,24 @@ def main(year=None, base=""):
             )
             continue
 
-        df_merged, df_remote = fuzzy_match(
-            df_yml[df_yml["year"] == y],
-            df_csv_for_merge.loc[df_csv_for_merge["year"] == y],
-        )
+        logger.info(f"Processing year {y} merge operations")
+        df_yml_year = df_yml[df_yml["year"] == y]
+        df_csv_year = df_csv_for_merge.loc[df_csv_for_merge["year"] == y]
+        logger.debug(f"Year {y}: df_yml_year shape: {df_yml_year.shape}, df_csv_year shape: {df_csv_year.shape}")
+
+        df_merged, df_remote = fuzzy_match(df_yml_year, df_csv_year)
+        logger.info(f"Fuzzy match completed for year {y}. df_merged shape: {df_merged.shape}")
+
         df_merged["year"] = y
         df_merged = df_merged.drop(["conference"], axis=1)
+        logger.debug(f"After dropping conference column: {df_merged.shape}")
+
         df_merged = deduplicate(df_merged)
         df_remote = deduplicate(df_remote)
+        logger.debug(f"After deduplication - df_merged: {df_merged.shape}, df_remote: {df_remote.shape}")
+
         df_merged = merge_conferences(df_merged, df_remote)
+        logger.info(f"Merge conferences completed for year {y}. Final shape: {df_merged.shape}")
 
         df_new = pd.concat([df_new, df_merged], ignore_index=True)
 
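
Note: the added code imports get_tqdm_logger from a local logging_config module that is not part of this commit. The sketch below is only an assumption of what such a helper could look like (a tqdm-aware logging handler so log lines do not break progress bars); it is not the repository's actual implementation.

import logging

from tqdm import tqdm


class TqdmLoggingHandler(logging.Handler):
    """Route log records through tqdm.write so they don't break progress bars."""

    def emit(self, record):
        try:
            # tqdm.write prints above any active progress bar
            tqdm.write(self.format(record))
        except Exception:
            self.handleError(record)


def get_tqdm_logger(name):
    """Return a logger with a single tqdm-aware handler attached."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = TqdmLoggingHandler()
        handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger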

utils/tidy_conf/interactive_merge.py

Lines changed: 73 additions & 1 deletion
@@ -1,3 +1,4 @@
+import logging
 import sys
 from collections import defaultdict
 

@@ -23,9 +24,16 @@ def fuzzy_match(df_yml, df_remote):
     Keeps temporary track of rejections to avoid asking the same question multiple
     times.
     """
+    logger = logging.getLogger(__name__)
+    logger.info(f"Starting fuzzy_match with df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+
     df_yml = tidy_df_names(df_yml)
     df_remote = tidy_df_names(df_remote)
 
+    logger.debug(f"After tidy_df_names - df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+    logger.debug(f"df_yml columns: {df_yml.columns.tolist()}")
+    logger.debug(f"df_remote columns: {df_remote.columns.tolist()}")
+
     _, known_rejections = load_title_mappings(path="utils/tidy_conf/data/.tmp/rejections.yml")
 
     new_mappings = defaultdict(list)

@@ -73,23 +81,46 @@ def fuzzy_match(df_yml, df_remote):
     update_title_mappings(new_rejections, path="utils/tidy_conf/data/.tmp/rejections.yml")
 
     # Combine dataframes
+    logger.info("Combining dataframes using title_match index")
     df.set_index("title_match", inplace=True)
+    logger.debug(f"df index after set_index: {df.index.tolist()[:5]}...")
+
     df_new = df.combine_first(df_remote)
+    logger.info(f"Combined dataframe shape: {df_new.shape}")
+    logger.debug(f"df_new index: {df_new.index.tolist()[:5]}...")
+
+    # Validate that the index contains actual conference names, not integers
+    integer_indices = [idx for idx in df_new.index if isinstance(idx, int)]
+    if integer_indices:
+        logger.warning(f"Found {len(integer_indices)} integer indices in df_new: {integer_indices[:5]}...")
 
     # Fill missing CFPs with "TBA"
     df_new.loc[df_new["cfp"].isna(), "cfp"] = "TBA"
 
+    logger.info("fuzzy_match completed successfully")
     return df_new, df_remote
 
 
 def merge_conferences(df_yml, df_remote):
     """Merge two dataframes on title and interactively resolve conflicts."""
+    logger = logging.getLogger(__name__)
+    logger.info(f"Starting merge_conferences with df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+
+    # Data validation before merge
+    logger.debug(f"df_yml columns: {df_yml.columns.tolist()}")
+    logger.debug(f"df_remote columns: {df_remote.columns.tolist()}")
+    logger.debug(f"df_yml index: {df_yml.index.tolist()[:5]}...")  # Show first 5 indices
+    logger.debug(f"df_remote index: {df_remote.index.tolist()[:5]}...")
+
     df_new = get_schema()
     columns = df_new.columns.tolist()
+    logger.debug(f"Schema columns: {columns}")
 
     with contextlib.suppress(KeyError):
+        logger.debug("Dropping 'conference' column from df_yml")
         df_yml = df_yml.drop(["conference"], axis=1)
     with contextlib.suppress(KeyError):
+        logger.debug("Dropping 'conference' column from df_remote")
         df_remote = df_remote.drop(["conference"], axis=1)
 
     replacements = {

@@ -98,9 +129,32 @@ def merge_conferences(df_yml, df_remote):
         "Czech Republic": "Czechia",
     }
 
+    logger.info("Performing pandas merge on 'title_match'")
     df_merge = pd.merge(left=df_yml, right=df_remote, how="outer", on="title_match", validate="one_to_one")
+    logger.info(f"Merge completed. df_merge shape: {df_merge.shape}")
+    logger.debug(f"df_merge columns: {df_merge.columns.tolist()}")
+    logger.debug(f"df_merge index: {df_merge.index.tolist()[:5]}...")
+
     for i, row in df_merge.iterrows():
-        df_new.loc[i, "conference"] = i
+        # Use the actual conference name from title_match index, not the row index
+        conference_name = df_merge.index.name if hasattr(df_merge.index, "name") and df_merge.index.name else i
+        if hasattr(row, "name") and row.name:
+            conference_name = row.name
+            logger.debug(f"Using row.name for conference: {conference_name}")
+        elif "title_match" in row and pd.notna(row["title_match"]):
+            conference_name = row["title_match"]
+            logger.debug(f"Using title_match for conference: {conference_name}")
+        else:
+            logger.warning(f"Falling back to index {i} for conference name")
+            conference_name = i
+
+        # Validate conference name is a string
+        if not isinstance(conference_name, str):
+            logger.error(f"Conference name is not a string: {type(conference_name)} = {conference_name}")
+            conference_name = str(conference_name)
+
+        df_new.loc[i, "conference"] = conference_name
+        logger.debug(f"Set conference[{i}] = {conference_name}")
         for column in columns:
             cx, cy = column + "_x", column + "_y"
             # print(i,cx,cy,cx in df_merge.columns and cy in df_merge.columns,column in df_merge.columns,)

@@ -257,4 +311,22 @@ def merge_conferences(df_yml, df_remote):
 
     # Fill in missing CFPs with TBA
     df_new.loc[df_new.cfp.isna(), "cfp"] = "TBA"
+
+    # Final validation before returning
+    logger.info(f"Merge completed. Final df_new shape: {df_new.shape}")
+    logger.debug(f"Final df_new columns: {df_new.columns.tolist()}")
+
+    # Validate conference names
+    invalid_conferences = df_new[~df_new["conference"].apply(lambda x: isinstance(x, str) and len(str(x).strip()) > 0)]
+    if not invalid_conferences.empty:
+        logger.error(f"Found {len(invalid_conferences)} rows with invalid conference names:")
+        for idx, row in invalid_conferences.iterrows():
+            logger.error(f"  Row {idx}: conference = {row['conference']} (type: {type(row['conference'])})")
+
+    # Check for null conference names
+    null_conferences = df_new[df_new["conference"].isna()]
+    if not null_conferences.empty:
+        logger.error(f"Found {len(null_conferences)} rows with null conference names")
+
+    logger.info("Merge validation completed")
     return df_new
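
Note: the single deleted line, df_new.loc[i, "conference"] = i, wrote the iteration index into the conference column, which after pd.merge can be a plain row number rather than a conference title. A small, self-contained pandas illustration of that behavior (the sample data below is made up, not from the repository):

import pandas as pd

df_yml = pd.DataFrame(
    {"title_match": ["PyCon DE", "EuroSciPy"], "cfp": ["2025-01-15", None]},
).set_index("title_match")
df_remote = pd.DataFrame(
    {"title_match": ["PyCon DE", "PyData Berlin"], "cfp": [None, "2025-02-01"]},
).set_index("title_match")

# combine_first keeps df_yml values and fills gaps from df_remote,
# so the union of conference titles survives as the index.
print(df_yml.combine_first(df_remote))

# pd.merge(..., on="title_match") returns a result with a fresh integer
# RangeIndex, which is why assigning `i` stored row numbers instead of names.
df_merge = pd.merge(df_yml.reset_index(), df_remote.reset_index(), how="outer", on="title_match")
print(df_merge.index)  # RangeIndex(start=0, stop=3, step=1)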
