Hello, there seems to be a bug in the PowerTransformer step of the preprocessing, which I ran into on 15 out of 50 datasets when evaluating CARTE on datasets from OpenML and Kaggle. I understand that this is a known issue.
I am adding a simple reproducible example for one of the problematic datasets. The code is simplified as far as possible (e.g. no train/test split).
import openml
from huggingface_hub import hf_hub_download
from carte_ai import Table2GraphTransformer

# Download the fastText embeddings that CARTE's preprocessor relies on
model_path = hf_hub_download(repo_id="hi-paris/fastText", filename="cc.en.300.bin")
preprocessor = Table2GraphTransformer(fasttext_model_path=model_path)

# OpenML dataset 46667 is one of the datasets that triggers the error
dataset = openml.datasets.get_dataset(46667)
x, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

x = preprocessor.fit_transform(x, y=y)
Running the last line produces an overflow warning followed by a BracketError:

In [4]: x = preprocessor.fit_transform(x, y=y)
/data/home/alan.arazi/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3438: RuntimeWarning: overflow encountered in power
out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
---------------------------------------------------------------------------
BracketError Traceback (most recent call last)
Cell In[4], line 1
----> 1 x = preprocessor.fit_transform(x, y=y)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/utils/_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
314 @wraps(f)
315 def wrapped(self, X, *args, **kwargs):
--> 316 data_to_wrap = f(self, X, *args, **kwargs)
317 if isinstance(data_to_wrap, tuple):
318 # only wrap the first output for cross decomposition
319 return_tuple = (
320 _wrap_data_with_container(method, data_to_wrap[0], X, self),
321 *data_to_wrap[1:],
322 )
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/base.py:1101, in TransformerMixin.fit_transform(self, X, y, **fit_params)
1098 return self.fit(X, **fit_params).transform(X)
1099 else:
1100 # fit method of arity 2 (supervised transformation)
-> 1101 return self.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/carte_ai/src/carte_table_to_graph.py:148, in Table2GraphTransformer.fit(self, X, y)
146 num_cols_exist = [col for col in self.num_col_names if col in X.columns]
147 if num_cols_exist:
--> 148 self.num_transformer_.fit(X[num_cols_exist])
149 #print(f"Numerical columns fitted for normalization: {num_cols_exist}")
151 self.is_fitted_ = True
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3251, in PowerTransformer.fit(self, X, y)
3231 @_fit_context(prefer_skip_nested_validation=True)
3232 def fit(self, X, y=None):
3233 """Estimate the optimal parameter lambda for each feature.
3234
3235 The optimal lambda parameter for minimizing skewness is estimated on
(...)
3249 Fitted transformer.
3250 """
-> 3251 self._fit(X, y=y, force_transform=False)
3252 return self
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3304, in PowerTransformer._fit(self, X, y, force_transform)
3301 self.lambdas_[i] = 1.0
3302 continue
-> 3304 self.lambdas_[i] = optim_function(col)
3306 if self.standardize or force_transform:
3307 X[:, i] = transform_function(X[:, i], self.lambdas_[i])
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/sklearn/preprocessing/_data.py:3493, in PowerTransformer._yeo_johnson_optimize(self, x)
3491 x = x[~np.isnan(x)]
3492 # choosing bracket -2, 2 like for boxcox
-> 3493 return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2655, in brent(func, args, brack, tol, full_output, maxiter)
2583 """
2584 Given a function of one variable and a possible bracket, return
2585 a local minimizer of the function isolated to a fractional precision
(...)
2651
2652 """
2653 options = {'xtol': tol,
2654 'maxiter': maxiter}
-> 2655 res = _minimize_scalar_brent(func, brack, args, **options)
2656 if full_output:
2657 return res['x'], res['fun'], res['nit'], res['nfev']
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2697, in _minimize_scalar_brent(func, brack, args, xtol, maxiter, disp, **unknown_options)
2694 brent = Brent(func=func, args=args, tol=tol,
2695 full_output=True, maxiter=maxiter, disp=disp)
2696 brent.set_bracket(brack)
-> 2697 brent.optimize()
2698 x, fval, nit, nfev = brent.get_result(full_output=True)
2700 success = nit < maxiter and not (np.isnan(x) or np.isnan(fval))
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2462, in Brent.optimize(self)
2459 def optimize(self):
2460 # set up for optimization
2461 func = self.func
-> 2462 xa, xb, xc, fa, fb, fc, funcalls = self.get_bracket_info()
2463 _mintol = self._mintol
2464 _cg = self._cg
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:2431, in Brent.get_bracket_info(self)
2429 xa, xb, xc, fa, fb, fc, funcalls = bracket(func, args=args)
2430 elif len(brack) == 2:
-> 2431 xa, xb, xc, fa, fb, fc, funcalls = bracket(func, xa=brack[0],
2432 xb=brack[1], args=args)
2433 elif len(brack) == 3:
2434 xa, xb, xc = brack
File ~/miniconda3/envs/tabular/lib/python3.11/site-packages/scipy/optimize/_optimize.py:3070, in bracket(func, xa, xb, args, grow_limit, maxiter)
3068 e = BracketError(msg)
3069 e.data = (xa, xb, xc, fa, fb, fc, funcalls)
-> 3070 raise e
3072 return xa, xb, xc, fa, fb, fc, funcalls
BracketError: The algorithm terminated without finding a valid bracket. Consider trying different initial points.
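For context, the failure happens in scipy's bracketing step inside PowerTransformer._yeo_johnson_optimize: when a numeric column contains values of very large magnitude, the Yeo-Johnson likelihood overflows (the RuntimeWarning above) and optimize.brent cannot find a valid bracket in (-2, 2). Below is a possible workaround, sketched under the assumption that large-magnitude numeric columns are the trigger: standardize those columns before handing the table to Table2GraphTransformer. Selecting the columns with select_dtypes is my own assumption here, not what CARTE does internally (it uses its own num_col_names), so it may need adjusting per dataset.

import openml
from sklearn.preprocessing import StandardScaler
from huggingface_hub import hf_hub_download
from carte_ai import Table2GraphTransformer

model_path = hf_hub_download(repo_id="hi-paris/fastText", filename="cc.en.300.bin")
preprocessor = Table2GraphTransformer(fasttext_model_path=model_path)

dataset = openml.datasets.get_dataset(46667)
x, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Rescale numeric columns to zero mean / unit variance first, so that
# (x + 1) ** lambda stays finite during the lambda search.
# (StandardScaler tolerates NaNs, which OpenML tables often contain.)
num_cols = x.select_dtypes(include="number").columns
x[num_cols] = StandardScaler().fit_transform(x[num_cols])

x = preprocessor.fit_transform(x, y=y)

This does not fix the root cause (a more robust fix might be for carte_table_to_graph.py to fall back to a plain scaler when PowerTransformer.fit raises), and whether brent then converges is still data-dependent, but pre-scaling should avoid the overflow path that leads to the BracketError.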