synthesises data without errors, but does not reproduce NAs in numerical columns

KoomenErina · KoomenErina · commit dbabea045ce5 · 2024-12-12T10:18:39.000+01:00
diff --git a/main.py b/main.py
@@ -3,9 +3,9 @@
 import pyreadr
 
 def synBar():
-    df = pd.read_csv("bar_pass_prediction.csv")[[ 'race1', 'ugpa', 'bar']]
+    df = pd.read_csv("bar_pass_prediction.csv")[['sex', 'race1', 'ugpa', 'bar']]
     print(df.dtypes)
-    dtype_map = { 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
+    dtype_map = {'sex': 'float', 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
 
     for (k,v) in dtype_map.items():
 
@@ -19,6 +19,7 @@ def synBar():
     synth_df = spop.generate(len(df))
 
     print(synth_df.head())
+    print(synth_df.isna().sum())
 
 def synSD2011():
     df0 = pyreadr.read_r("SD2011.rda")['SD2011']
diff --git a/synthpop/processor/processor.py b/synthpop/processor/processor.py
@@ -66,7 +66,7 @@ def preprocess(self, df, dtypes):
                         # insert new column in df
                         # TODO beware of '_NaN' naming
                         col_nan_name = col + '_NaN'
-                        df.insert(df.columns.get_loc(col), col_nan_name, 0)
+                        df.insert(df.columns.get_loc(col), col_nan_name, 0) # inserts the <column name>_NaN indicator column into the dataframe
 
                         self.processing_dict[NAN_KEY][col] = {'col_nan_name': col_nan_name,
                                                               'dtype': self.spop.df_dtypes[col],
@@ -82,9 +82,10 @@ def preprocess(self, df, dtypes):
                         df.loc[:,col_nan_name] = df[col_nan_name].astype('category')
                         self.spop.df_dtypes[col_nan_name] = 'category'
 
-        return df
+        return df # sex_NaN is a column of this data frame
 
     def postprocess(self, synth_df):
+        #sex_NaN is not a column of synth_df
         for col, processing_numtocat_col_dict in self.processing_dict[NUMTOCAT_KEY].items():
             synth_df[col] = synth_df[col].astype(object)
             col_synth_df = synth_df[col].copy()
@@ -110,10 +111,12 @@ def postprocess(self, synth_df):
                 synth_df[col] = synth_df[col].astype('category')
 
             # NaNs in numerical columns
-            elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
-                for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
-                    nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag
-                    synth_df.loc[nan_flag_indices, col] = col_nan_value
-                synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
+            # The code below sets the numerical column to the stored value wherever the NaN indicator column matches a flag, and then removes the NaN indicator column.
+            #The NAN_indicator columns are not synthesised.
+            # elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
+            #     for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
+            #         nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set
+            #         synth_df.loc[nan_flag_indices, col] = col_nan_value
+            #     synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
 
         return synth_df
diff --git a/synthpop/synthpop.py b/synthpop/synthpop.py
@@ -56,6 +56,10 @@ def fit(self, df, dtypes=None):
         # check processor
         self.validator.check_processor()
         # preprocess
+
+        # processor.preprocess has side effects on the processor object and on this (self) object; it writes to:
+        #   processor.processing_dict[NAN_KEY][col]
+        #   spop.df_dtypes[col_nan_name]
         processed_df = self.processor.preprocess(df, self.df_dtypes)
         print(processed_df)
         self.processed_df_columns = processed_df.columns.tolist()