Adding `<column>_NaN` indicator columns for float columns and synthesising them now works

KoomenErina · KoomenErina · commit 78d550b5e277 · 2024-12-12T14:39:05.000+01:00
diff --git a/main.py b/main.py
@@ -14,6 +14,7 @@ def synBar():
 
     print(df.dtypes)
     spop = Synthpop()
+
     spop.fit(df,dtype_map)
 
     synth_df = spop.generate(len(df))
@@ -26,6 +27,7 @@ def synSD2011():
     #pd.read_csv("bar_pass_prediction.csv")
     print(df0.dtypes)
     df = df0[['age', 'unempdur', 'income', 'sex']]#df0[['sex', 'race1', 'ugpa', 'bar']]
+    print(df.isna().sum())
     #df.to_excel("inputData.xlsx")
     dtype_map ={
         "age":"float",
diff --git a/synthpop/processor/processor.py b/synthpop/processor/processor.py
@@ -15,6 +15,7 @@ def __init__(self, spop):
                                 NAN_KEY: {}
                                 }
 
+
     def preprocess(self, df, dtypes):
         for col in self.spop.visited_columns:
             col_nan_indices = df[col].isna()
diff --git a/synthpop/synthpop.py b/synthpop/synthpop.py
@@ -41,14 +41,35 @@ def __init__(self,
         # check init
         self.validator.check_init()
 
+    def pre_preprocess(self,df,dtypes,nan_fill):
+
+        for column in df:
+            if dtypes[column] != 'float':
+                continue
+            maybe_nans = df[column].isnull()
+            if not maybe_nans.any():
+                continue
+
+            df.loc[maybe_nans,column] = nan_fill
+
+            nan_col_name = column+"_NaN"
+            df.loc[:,nan_col_name] = maybe_nans
+
+            dtypes[nan_col_name] = 'category'
+
+
+        return df,dtypes
+
+    def post_postprocessing(self,syn_df):
+        return syn_df
     def fit(self, df, dtypes=None):
         # TODO check df and check/EXTRACT dtypes
         # - all column names of df are unique
         # - all columns data of df are consistent
         # - all dtypes of df are correct ('int', 'float', 'datetime', 'category', 'bool'; no object)
         # - can map dtypes (if given) correctly to df
         # should create map col: dtype (self.df_dtypes)
-
+        df,dtypes = self.pre_preprocess(df,dtypes,-8)
         self.df_columns = df.columns.tolist()
         self.n_df_rows, self.n_df_columns = np.shape(df)
         self.df_dtypes = dtypes
diff --git a/synthpop/validator/validator.py b/synthpop/validator/validator.py
@@ -40,7 +40,7 @@ def check_init(self):
         self.default_method_validator(step=step)
         self.method_validator(step=step)
         self.visit_sequence_validator(step=step)
-        # self.predictor_matrix_validator(step=step)
+        self.predictor_matrix_validator(step=step)
         self.proper_validator(step=step)
         self.cont_na_validator(step=step)
         self.smoothing_validator(step=step)
diff --git a/tests_processing.py b/tests_processing.py
@@ -0,0 +1,27 @@
+import unittest
+from synthpop import Synthpop
+import pandas as pd
+import numpy as np
+
+class TestProcessing(unittest.TestCase):
+
+    def test_add_NaN_columns_for_numeric_columns(self):
+        df = pd.DataFrame({'a':[1,2,np.nan], 'b':[1,1,1], 'c':['x','y',None]})
+        spop = Synthpop()
+        dtype_map = {'a':'float','b':'float', 'c':'categorical'}
+        res,dtype_res = spop.pre_preprocess(df,dtype_map,nan_fill=-8)
+
+        self.assertTrue('a_NaN' in res,"Nan column not made")
+        self.assertFalse('b_NaN' in res,"Nan column should not be made if there are no NaNs")
+        self.assertFalse('c_NaN' in res,"Nan column should not be made for categorical columns")
+        self.assertTrue(res['a_NaN'][2])
+        self.assertEqual(res['a'][2], -8)
+        self.assertEqual(dtype_res['a_NaN'],'category')
+
+    def test_apply_and_remove_added_NaN_columns(self):
+        df = pd.DataFrame({'a':[1,2,np.nan],'a_NaN':[False,False,True], 'b':[1,1,1], 'c':['x','y',None]})
+        spop = Synthpop()
+
+
+if __name__ == '__main__':
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@ def __init__(self, spop):`
`15`	`15`	`NAN_KEY: {}`
`16`	`16`	`}`
`17`	`17`
	`18`	`+`
`18`	`19`	`def preprocess(self, df, dtypes):`
`19`	`20`	`for col in self.spop.visited_columns:`
`20`	`21`	`col_nan_indices = df[col].isna()`