@@ -66,7 +66,7 @@ def preprocess(self, df, dtypes):
6666 # insert new column in df
6767 # TODO beware of '_NaN' naming
6868 col_nan_name = col + '_NaN'
69- df .insert (df .columns .get_loc (col ), col_nan_name , 0 )
69+ df .insert (df .columns .get_loc (col ), col_nan_name , 0 ) #inserts columnName_NaN in dataframe
7070
7171 self .processing_dict [NAN_KEY ][col ] = {'col_nan_name' : col_nan_name ,
7272 'dtype' : self .spop .df_dtypes [col ],
@@ -82,9 +82,10 @@ def preprocess(self, df, dtypes):
8282 df .loc [:,col_nan_name ] = df [col_nan_name ].astype ('category' )
8383 self .spop .df_dtypes [col_nan_name ] = 'category'
8484
85- return df
85+ return df #Sex_NaN is a part of this data frame
8686
8787 def postprocess (self , synth_df ):
88+ #sex_NaN is not a column of synth_df
8889 for col , processing_numtocat_col_dict in self .processing_dict [NUMTOCAT_KEY ].items ():
8990 synth_df [col ] = synth_df [col ].astype (object )
9091 col_synth_df = synth_df [col ].copy ()
@@ -110,10 +111,12 @@ def postprocess(self, synth_df):
110111 synth_df [col ] = synth_df [col ].astype ('category' )
111112
112113 # NaNs in numerical columns
113- elif processing_nan_col_dict ['dtype' ] in NUM_COLS_DTYPES :
114- for col_nan_flag , col_nan_value in processing_nan_col_dict ['nan_flags' ].items ():
115- nan_flag_indices = synth_df [processing_nan_col_dict ['col_nan_name' ]] == col_nan_flag
116- synth_df .loc [nan_flag_indices , col ] = col_nan_value
117- synth_df .drop (columns = processing_nan_col_dict ['col_nan_name' ], inplace = True )
114+ #The code below changes NaNs in numerical columns to a given value, and removes the NaN indicator column.
115+ #The NAN_indicator columns are not synthesised.
116+ # elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
117+ # for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
118+ # nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set
119+ # synth_df.loc[nan_flag_indices, col] = col_nan_value
120+ # synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
118121
119122 return synth_df
0 commit comments