Insert NaN-indicator columns into the visit sequence ahead of their source columns

KoomenErina · KoomenErina · commit 0b1e566cb562 · 2024-12-13T15:24:07.000+01:00
diff --git a/main.py b/main.py
@@ -35,7 +35,7 @@ def synSD2011():
     df0 = pyreadr.read_r("SD2011.rda")['SD2011']
     #pd.read_csv("bar_pass_prediction.csv")
     #print(df0.dtypes)
-    df = df0#df0[['sex', 'race1', 'ugpa', 'bar']]
+    df = df0[['age', 'unempdur', 'income', 'sex']]
     #print(df.isna().sum())
     #df.to_excel("inputData.xlsx")
     dtype_map ={
@@ -64,7 +64,7 @@ def synSD2011():
 
 
     r = df.dtypes.keys()
-    spop = Synthpop()
+    spop = Synthpop(visit_sequence=['age', 'unempdur', 'income_NaN','income', 'sex'])
     spop.fit(df,dtype_map)
 
     synth_df = spop.generate(len(df))
diff --git a/synthpop/synthpop.py b/synthpop/synthpop.py
@@ -40,6 +40,14 @@ def __init__(self,
         self.map_column_to_NaN_column = {}
         # check init
         self.validator.check_init()
+    def include_nan_columns(self):
+        """Place each NaN-indicator column directly before its source column in visit_sequence."""
+        for col, nan_col in self.map_column_to_NaN_column.items():
+            if col not in self.visit_sequence:
+                continue
+            if nan_col in self.visit_sequence:  # drop a stale entry so it is moved, not duplicated
+                self.visit_sequence.remove(nan_col)
+            self.visit_sequence.insert(self.visit_sequence.index(col), nan_col)
 
     def pre_preprocess(self,df,dtypes,nan_fill):
 
@@ -79,7 +87,10 @@ def fit(self, df, dtypes=None):
         # - can map dtypes (if given) correctly to df
         # should create map col: dtype (self.df_dtypes)
         df,dtypes = self.pre_preprocess(df,dtypes,-8)
+
         self.df_columns = df.columns.tolist()
+        self.visit_sequence = df.columns.tolist()
+        self.include_nan_columns()
         self.n_df_rows, self.n_df_columns = np.shape(df)
         self.df_dtypes = dtypes
 
diff --git a/tests_processing.py b/tests_processing.py
@@ -18,6 +18,16 @@ def test_add_NaN_columns_for_numeric_columns(self):
         self.assertEqual(res['a'][2], -8)
         self.assertEqual(dtype_res['a_NaN'],'category')
         self.assertEqual(spop.map_column_to_NaN_column['a'],'a_NaN')
+    def test_make_visit_sequence_when_one_is_given(self):
+        """A supplied visit sequence gets 'a_NaN' placed right before 'a'; absent 'c' is skipped."""
+        nan_columns = {'a':'a_NaN','c':'c_NaN'}
+        synthesizer = Synthpop(visit_sequence=['x','a','b'])
+        synthesizer.map_column_to_NaN_column = nan_columns
+
+        synthesizer.include_nan_columns()
+        expected_order = ['x','a_NaN','a','b']
+        self.assertSequenceEqual(synthesizer.visit_sequence,expected_order)
+
 
     def test_apply_and_remove_added_NaN_columns(self):
         df = pd.DataFrame({'a':[1,2,-8],'a_NaN':[False,True,False], 'b':[1,1,1], 'c':['x','y',None]})