@@ -79,9 +79,9 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
7979 # apply sampling on training data for pipeline enumeration
8080 # TODO why recoding/sampling twice (within getDirtyScore)
8181 print("---- class-stratified sampling of feature matrix w/ f="+sample);
82- if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # &
83- [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
84- else
82+ # if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2) # &
83+ # [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
84+ # else
8585 [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
8686 t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
8787
@@ -112,6 +112,7 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
112112 metaList['distY'] = dist
113113
114114 print("-- Cleaning - Enum Logical Pipelines: ");
115+ print("---- Data Dimension before Cleaning: "+ nrow(eXtrain) + ", " + ncol(eXtrain));
115116 [bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
116117 initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList,
117118 evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
0 commit comments