Skip to content

Commit 78d550b

Browse files
committed
adding and synthesising NaN columns works
1 parent dbabea0 commit 78d550b

File tree

5 files changed

+53
-2
lines changed

5 files changed

+53
-2
lines changed

main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def synBar():
1414

1515
print(df.dtypes)
1616
spop = Synthpop()
17+
1718
spop.fit(df,dtype_map)
1819

1920
synth_df = spop.generate(len(df))
@@ -26,6 +27,7 @@ def synSD2011():
2627
#pd.read_csv("bar_pass_prediction.csv")
2728
print(df0.dtypes)
2829
df = df0[['age', 'unempdur', 'income', 'sex']]#df0[['sex', 'race1', 'ugpa', 'bar']]
30+
print(df.isna().sum())
2931
#df.to_excel("inputData.xlsx")
3032
dtype_map ={
3133
"age":"float",

synthpop/processor/processor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def __init__(self, spop):
1515
NAN_KEY: {}
1616
}
1717

18+
1819
def preprocess(self, df, dtypes):
1920
for col in self.spop.visited_columns:
2021
col_nan_indices = df[col].isna()

synthpop/synthpop.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,35 @@ def __init__(self,
4141
# check init
4242
self.validator.check_init()
4343

44+
def pre_preprocess(self, df, dtypes, nan_fill):
    """Replace NaNs in float columns with a sentinel and record where they were.

    For every column whose entry in ``dtypes`` is ``'float'`` and which
    contains at least one missing value, the missing entries are replaced by
    ``nan_fill`` and a boolean companion column ``<column>_NaN`` is added that
    is True where the value was missing. The companion column is registered
    in the dtype map as ``'category'``.

    The inputs are never modified: the work happens on a copy of ``df`` and a
    copy of ``dtypes``, so a caller that passes a slice of a larger frame does
    not get its own data overwritten with the sentinel.

    Args:
        df: input ``pandas.DataFrame``.
        dtypes: mapping of column name to dtype name, or ``None``. Columns
            missing from the mapping are left untouched.
        nan_fill: value written in place of missing entries.

    Returns:
        Tuple ``(df, dtypes)``: the processed frame and dtype map. When
        ``dtypes`` is ``None`` there is nothing to decide on, and both inputs
        are returned unchanged.
    """
    if dtypes is None:
        return df, dtypes

    # Work on copies so the caller's frame and dtype map stay intact.
    df = df.copy()
    dtypes = dict(dtypes)

    # Snapshot the column names: indicator columns are appended inside the
    # loop and must not be visited themselves.
    for column in list(df.columns):
        if dtypes.get(column) != 'float':
            continue
        maybe_nans = df[column].isnull()
        if not maybe_nans.any():
            continue

        df.loc[maybe_nans, column] = nan_fill

        nan_col_name = f"{column}_NaN"
        df[nan_col_name] = maybe_nans

        dtypes[nan_col_name] = 'category'

    return df, dtypes
62+
63+
def post_postprocessing(self, syn_df):
    """Return the synthesised frame exactly as received.

    Currently a pass-through: no columns are added, removed or altered.
    NOTE(review): presumably the future counterpart of ``pre_preprocess``
    (folding ``<column>_NaN`` indicators back into NaNs) -- confirm intent.
    """
    return syn_df
4465
def fit(self, df, dtypes=None):
4566
# TODO check df and check/EXTRACT dtypes
4667
# - all column names of df are unique
4768
# - all columns data of df are consistent
4869
# - all dtypes of df are correct ('int', 'float', 'datetime', 'category', 'bool'; no object)
4970
# - can map dtypes (if given) correctly to df
5071
# should create map col: dtype (self.df_dtypes)
51-
72+
df,dtypes = self.pre_preprocess(df,dtypes,-8)
5273
self.df_columns = df.columns.tolist()
5374
self.n_df_rows, self.n_df_columns = np.shape(df)
5475
self.df_dtypes = dtypes

synthpop/validator/validator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def check_init(self):
4040
self.default_method_validator(step=step)
4141
self.method_validator(step=step)
4242
self.visit_sequence_validator(step=step)
43-
# self.predictor_matrix_validator(step=step)
43+
self.predictor_matrix_validator(step=step)
4444
self.proper_validator(step=step)
4545
self.cont_na_validator(step=step)
4646
self.smoothing_validator(step=step)

tests_processing.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import unittest
2+
from synthpop import Synthpop
3+
import pandas as pd
4+
import numpy as np
5+
6+
class TestProcessing(unittest.TestCase):
    """Tests for the NaN-indicator handling in ``Synthpop.pre_preprocess``."""

    def test_add_NaN_columns_for_numeric_columns(self):
        """Float columns with NaNs get a sentinel fill and a ``_NaN`` column."""
        df = pd.DataFrame({'a': [1, 2, np.nan], 'b': [1, 1, 1], 'c': ['x', 'y', None]})
        spop = Synthpop()
        # 'category' is the dtype name pre_preprocess itself registers for
        # the indicator columns; keep the fixture consistent with it.
        dtype_map = {'a': 'float', 'b': 'float', 'c': 'category'}
        res, dtype_res = spop.pre_preprocess(df, dtype_map, nan_fill=-8)

        self.assertIn('a_NaN', res, "Nan column not made")
        self.assertNotIn('b_NaN', res, "Nan column should not be made if there are no NaNs")
        self.assertNotIn('c_NaN', res, "Nan column should not be made for categorical columns")
        self.assertTrue(res['a_NaN'][2])
        self.assertEqual(res['a'][2], -8)
        self.assertEqual(dtype_res['a_NaN'], 'category')

    def test_apply_and_remove_added_NaN_columns(self):
        """Placeholder: builds the fixture but asserts nothing yet."""
        # TODO: exercise the post-processing step once it is implemented;
        # as written this test passes without checking any behaviour.
        df = pd.DataFrame({'a': [1, 2, np.nan], 'a_NaN': [False, False, True], 'b': [1, 1, 1], 'c': ['x', 'y', None]})
        spop = Synthpop()


if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)