Skip to content

Commit 474caf3

Browse files
committed
synthesis works on full bar data set
1 parent 949d87c commit 474caf3

File tree

2 files changed

+39
-20
lines changed

2 files changed

+39
-20
lines changed

main.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@
33
import pyreadr
44

55
def synBar():
6-
df = pd.read_csv("bar_pass_prediction.csv")[['sex', 'race1', 'ugpa', 'bar']]
6+
df = pd.read_csv("bar_pass_prediction.csv")
77
print(df.dtypes)
8-
dtype_map = {'sex': 'float', 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
8+
dtype_map = {}
99

10-
for (k,v) in dtype_map.items():
11-
12-
if v == 'category':
13-
df = df.astype({k : "category"})
10+
for k in df.dtypes.keys():
11+
match df.dtypes[k]:
12+
case 'float64':
13+
dtype_map[k] = 'float'
14+
case 'category':
15+
dtype_map[k] = 'category'
16+
df = df.astype({k : "category"})
17+
case _:
18+
dtype_map[k]= 'category'
19+
df = df.astype({k : "category"})
1420

1521
print(df.dtypes)
1622
spop = Synthpop()
@@ -28,22 +34,35 @@ def synBar():
2834
def synSD2011():
2935
df0 = pyreadr.read_r("SD2011.rda")['SD2011']
3036
#pd.read_csv("bar_pass_prediction.csv")
31-
print(df0.dtypes)
32-
df = df0[['age', 'unempdur', 'income', 'sex']]#df0[['sex', 'race1', 'ugpa', 'bar']]
33-
print(df.isna().sum())
37+
#print(df0.dtypes)
38+
df = df0#df0[['sex', 'race1', 'ugpa', 'bar']]
39+
#print(df.isna().sum())
3440
#df.to_excel("inputData.xlsx")
3541
dtype_map ={
36-
"age":"float",
37-
"unempdur":"float",
38-
"income":"float",
39-
"sex":"category"
42+
# "age":"float",
43+
# "unempdur":"float",
44+
# "income":"float",
45+
# "sex":"category"
4046
}
47+
48+
for k in df.dtypes.keys():
49+
match df.dtypes[k]:
50+
case 'float64':
51+
dtype_map[k] = 'float'
52+
case 'category':
53+
dtype_map[k] = 'category'
54+
df = df.astype({k : "category"})
55+
case _:
56+
dtype_map[k]= 'category'
57+
df = df.astype({k : "category"})
58+
59+
print(dtype_map)
4160
#{'sex': 'float', 'race1': 'category', 'ugpa': 'float', 'bar': 'category'}
4261
# for (k,v) in dtype_map.items():
4362
# if v == 'category':
4463
# df[k] = df[k].astype('category')
4564

46-
print(df.dtypes)
65+
4766
r = df.dtypes.keys()
4867
spop = Synthpop()
4968
spop.fit(df,dtype_map)
@@ -53,4 +72,4 @@ def synSD2011():
5372
print(synth_df.head())
5473

5574

56-
synSD2011()
75+
synBar()

synthpop/processor/processor.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,10 @@ def postprocess(self, synth_df):
114114
# NaNs in numerical columns
115115
#The code below changes NaNs in numerical columns to a given value and removes the NaN indicator column.
116116
#The NAN_indicator columns are not synthesised.
117-
# elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
118-
# for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
119-
# nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set
120-
# synth_df.loc[nan_flag_indices, col] = col_nan_value
121-
# synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
117+
elif processing_nan_col_dict['dtype'] in NUM_COLS_DTYPES:
118+
for col_nan_flag, col_nan_value in processing_nan_col_dict['nan_flags'].items():
119+
nan_flag_indices = synth_df[processing_nan_col_dict['col_nan_name']] == col_nan_flag #expects columnName_NAN in the synthetic data set
120+
synth_df.loc[nan_flag_indices, col] = col_nan_value
121+
synth_df.drop(columns=processing_nan_col_dict['col_nan_name'], inplace=True)
122122

123123
return synth_df

0 commit comments

Comments
 (0)