Skip to content

Commit dbabea0

Browse files
committed
synthesises data without errors, but does not reproduce NAs in numerical columns
1 parent a368a49 commit dbabea0

File tree

3 files changed

+17
-9
lines changed

3 files changed

+17
-9
lines changed

main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
import pyreadr
44

55
def synBar():
6-
df = pd.read_csv("bar_pass_prediction.csv")[[ 'race1', 'ugpa', 'bar']]
6+
df = pd.read_csv("bar_pass_prediction.csv")[['sex', 'race1', 'ugpa', 'bar']]
77
print(df.dtypes)
8-
dtype_map = { 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
8+
dtype_map = {'sex': 'float', 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
99

1010
for (k,v) in dtype_map.items():
1111

@@ -19,6 +19,7 @@ def synBar():
1919
synth_df = spop.generate(len(df))
2020

2121
print(synth_df.head())
22+
print(synth_df.isna().sum())
2223

2324
def synSD2011():
2425
df0 = pyreadr.read_r("SD2011.rda")['SD2011']

synthpop/processor/processor.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def preprocess(self, df, dtypes):
6666
# insert new column in df
6767
# TODO beware of '_NaN' naming
6868
col_nan_name = col + '_NaN'
69-
df.insert(df.columns.get_loc(col), col_nan_name, 0)
69+
df.insert(df.columns.get_loc(col), col_nan_name, 0) #inserts columnName_NaN in dataframe
7070

7171
self.processing_dict[NAN_KEY][col] = {'col_nan_name': col_nan_name,
7272
'dtype': self.spop.df_dtypes[col],
@@ -82,9 +82,10 @@ def preprocess(self, df, dtypes):
8282
df.loc[:,col_nan_name] = df[col_nan_name].astype('category')
8383
self.spop.df_dtypes[col_nan_name] = 'category'
8484

85-
return df
85+
return df #sex_NaN is a part of this data frame
8686

8787
def postprocess(self, synth_df):
88+
#sex_NaN is not a column of synth_df
8889
for col, processing_numtocat_col_dict in self.processing_dict[NUMTOCAT_KEY].items():
8990
synth_df[col] = synth_df[col].astype(object)
9091
col_synth_df = synth_df[col].copy()
@@ -110,10 +111,12 @@ def postprocess(self, synth_df):
110111
synth_df[col] = synth_df[col].astype('category')
111112

112113
# NaNs in numerical columns
113-
elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
114-
for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
115-
nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag
116-
synth_df.loc[nan_flag_indices, col] = col_nan_value
117-
synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
114+
#The code below changes NANs in numerical columns to a given value, and removes the NAN indicator column.
115+
#The NAN_indicator columns are not synthesised.
116+
# elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
117+
# for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
118+
# nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set
119+
# synth_df.loc[nan_flag_indices, col] = col_nan_value
120+
# synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
118121

119122
return synth_df

synthpop/synthpop.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ def fit(self, df, dtypes=None):
5656
# check processor
5757
self.validator.check_processor()
5858
# preprocess
59+
60+
#processor.preprocess has side effects on the processor object and on this (self) object
61+
#processor.processing_dict[NAN_KEY][col]
62+
#spop.df_dtypes[col_nan_name]
5963
processed_df = self.processor.preprocess(df, self.df_dtypes)
6064
print(processed_df)
6165
self.processed_df_columns = processed_df.columns.tolist()

0 commit comments

Comments
 (0)