@@ -82,7 +82,7 @@ def _detect_datetime_column_parameters(series, column_metadata):
8282 return params
8383
8484
85- def _detect_categorical_or_boolean_column_parameters (series ):
85+ def _detect_categorical_column_parameters (series ):
8686 """Detect categorical/boolean parameters."""
8787 categorical_values = series .dropna ().unique ()
8888 if len (categorical_values ) == 0 :
@@ -113,37 +113,14 @@ def detect_column_parameters(data, metadata, table_name):
113113 sdtype = column_metadata ['sdtype' ]
114114 params = {}
115115 if sdtype == 'numerical' :
116- column_parameters [column_name ] = {
117- 'num_decimal_digits' : learn_rounding_digits (data [column_name ]),
118- 'min_value' : data [column_name ].min (),
119- 'max_value' : data [column_name ].max (),
120- }
116+ params .update (_detect_numerical_column_parameters (data [column_name ]))
121117 elif sdtype == 'datetime' :
122- datetime_format = column_metadata .get ('datetime_format' , None )
123- if datetime_format :
124- datetime_column = pd .to_datetime (
125- data [column_name ], format = datetime_format , errors = 'coerce'
126- )
127- start_timestamp = datetime_column .min ().strftime (datetime_format )
128- end_timestamp = datetime_column .max ().strftime (datetime_format )
129-
130- else :
131- datetime_column = pd .to_datetime (data [column_name ], errors = 'coerce' )
132- start_timestamp = str (datetime_column .min ())
133- end_timestamp = str (datetime_column .max ())
134-
135- column_parameters [column_name ] = {
136- 'start_timestamp' : start_timestamp ,
137- 'end_timestamp' : end_timestamp ,
138- }
118+ params .update (_detect_datetime_column_parameters (data [column_name ], column_metadata ))
139119 elif sdtype == 'categorical' :
140- column_parameters [column_name ] = {
141- 'category_values' : data [column_name ].dropna ().unique ().tolist ()
142- }
120+ params .update (_detect_categorical_column_parameters (data [column_name ]))
143121
144- column_parameters [column_name ]['missing_values_proportion' ] = float (
145- data [column_name ].isna ().mean ()
146- )
122+ params ['missing_values_proportion' ] = _compute_missing_values_proportion (data [column_name ])
123+ column_parameters [column_name ] = params
147124
148125 return {'columns' : column_parameters }
149126
0 commit comments