1+ import logging
12import sys
23from collections import defaultdict
34
@@ -23,9 +24,16 @@ def fuzzy_match(df_yml, df_remote):
2324 Keeps temporary track of rejections to avoid asking the same question multiple
2425 times.
2526 """
27+ logger = logging .getLogger (__name__ )
28+ logger .info (f"Starting fuzzy_match with df_yml shape: { df_yml .shape } , df_remote shape: { df_remote .shape } " )
29+
2630 df_yml = tidy_df_names (df_yml )
2731 df_remote = tidy_df_names (df_remote )
2832
33+ logger .debug (f"After tidy_df_names - df_yml shape: { df_yml .shape } , df_remote shape: { df_remote .shape } " )
34+ logger .debug (f"df_yml columns: { df_yml .columns .tolist ()} " )
35+ logger .debug (f"df_remote columns: { df_remote .columns .tolist ()} " )
36+
2937 _ , known_rejections = load_title_mappings (path = "utils/tidy_conf/data/.tmp/rejections.yml" )
3038
3139 new_mappings = defaultdict (list )
@@ -73,23 +81,46 @@ def fuzzy_match(df_yml, df_remote):
7381 update_title_mappings (new_rejections , path = "utils/tidy_conf/data/.tmp/rejections.yml" )
7482
7583 # Combine dataframes
84+ logger .info ("Combining dataframes using title_match index" )
7685 df .set_index ("title_match" , inplace = True )
86+ logger .debug (f"df index after set_index: { df .index .tolist ()[:5 ]} ..." )
87+
7788 df_new = df .combine_first (df_remote )
89+ logger .info (f"Combined dataframe shape: { df_new .shape } " )
90+ logger .debug (f"df_new index: { df_new .index .tolist ()[:5 ]} ..." )
91+
92+ # Validate that the index contains actual conference names, not integers
93+ integer_indices = [idx for idx in df_new .index if isinstance (idx , int )]
94+ if integer_indices :
95+ logger .warning (f"Found { len (integer_indices )} integer indices in df_new: { integer_indices [:5 ]} ..." )
7896
7997 # Fill missing CFPs with "TBA"
8098 df_new .loc [df_new ["cfp" ].isna (), "cfp" ] = "TBA"
8199
100+ logger .info ("fuzzy_match completed successfully" )
82101 return df_new , df_remote
83102
84103
85104def merge_conferences (df_yml , df_remote ):
86105 """Merge two dataframes on title and interactively resolve conflicts."""
106+ logger = logging .getLogger (__name__ )
107+ logger .info (f"Starting merge_conferences with df_yml shape: { df_yml .shape } , df_remote shape: { df_remote .shape } " )
108+
109+ # Data validation before merge
110+ logger .debug (f"df_yml columns: { df_yml .columns .tolist ()} " )
111+ logger .debug (f"df_remote columns: { df_remote .columns .tolist ()} " )
112+ logger .debug (f"df_yml index: { df_yml .index .tolist ()[:5 ]} ..." ) # Show first 5 indices
113+ logger .debug (f"df_remote index: { df_remote .index .tolist ()[:5 ]} ..." )
114+
87115 df_new = get_schema ()
88116 columns = df_new .columns .tolist ()
117+ logger .debug (f"Schema columns: { columns } " )
89118
90119 with contextlib .suppress (KeyError ):
120+ logger .debug ("Dropping 'conference' column from df_yml" )
91121 df_yml = df_yml .drop (["conference" ], axis = 1 )
92122 with contextlib .suppress (KeyError ):
123+ logger .debug ("Dropping 'conference' column from df_remote" )
93124 df_remote = df_remote .drop (["conference" ], axis = 1 )
94125
95126 replacements = {
@@ -98,9 +129,32 @@ def merge_conferences(df_yml, df_remote):
98129 "Czech Republic" : "Czechia" ,
99130 }
100131
132+ logger .info ("Performing pandas merge on 'title_match'" )
101133 df_merge = pd .merge (left = df_yml , right = df_remote , how = "outer" , on = "title_match" , validate = "one_to_one" )
134+ logger .info (f"Merge completed. df_merge shape: { df_merge .shape } " )
135+ logger .debug (f"df_merge columns: { df_merge .columns .tolist ()} " )
136+ logger .debug (f"df_merge index: { df_merge .index .tolist ()[:5 ]} ..." )
137+
102138 for i , row in df_merge .iterrows ():
103- df_new .loc [i , "conference" ] = i
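139+ # Use the conference name from the title_match column rather than the integer row label. NOTE(review): pd.merge on a column returns a fresh integer index, so row.name from iterrows() is presumably just i here — confirm that title_match should be checked before row.name.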
140+ conference_name = df_merge .index .name if hasattr (df_merge .index , "name" ) and df_merge .index .name else i
141+ if hasattr (row , "name" ) and row .name :
142+ conference_name = row .name
143+ logger .debug (f"Using row.name for conference: { conference_name } " )
144+ elif "title_match" in row and pd .notna (row ["title_match" ]):
145+ conference_name = row ["title_match" ]
146+ logger .debug (f"Using title_match for conference: { conference_name } " )
147+ else :
148+ logger .warning (f"Falling back to index { i } for conference name" )
149+ conference_name = i
150+
151+ # Validate conference name is a string
152+ if not isinstance (conference_name , str ):
153+ logger .error (f"Conference name is not a string: { type (conference_name )} = { conference_name } " )
154+ conference_name = str (conference_name )
155+
156+ df_new .loc [i , "conference" ] = conference_name
157+ logger .debug (f"Set conference[{ i } ] = { conference_name } " )
104158 for column in columns :
105159 cx , cy = column + "_x" , column + "_y"
106160 # print(i,cx,cy,cx in df_merge.columns and cy in df_merge.columns,column in df_merge.columns,)
@@ -257,4 +311,22 @@ def merge_conferences(df_yml, df_remote):
257311
258312 # Fill in missing CFPs with TBA
259313 df_new .loc [df_new .cfp .isna (), "cfp" ] = "TBA"
314+
315+ # Final validation before returning
316+ logger .info (f"Merge completed. Final df_new shape: { df_new .shape } " )
317+ logger .debug (f"Final df_new columns: { df_new .columns .tolist ()} " )
318+
319+ # Validate conference names
320+ invalid_conferences = df_new [~ df_new ["conference" ].apply (lambda x : isinstance (x , str ) and len (str (x ).strip ()) > 0 )]
321+ if not invalid_conferences .empty :
322+ logger .error (f"Found { len (invalid_conferences )} rows with invalid conference names:" )
323+ for idx , row in invalid_conferences .iterrows ():
324+ logger .error (f" Row { idx } : conference = { row ['conference' ]} (type: { type (row ['conference' ])} )" )
325+
326+ # Check for null conference names
327+ null_conferences = df_new [df_new ["conference" ].isna ()]
328+ if not null_conferences .empty :
329+ logger .error (f"Found { len (null_conferences )} rows with null conference names" )
330+
331+ logger .info ("Merge validation completed" )
260332 return df_new
0 commit comments