Skip to content

Commit 36eb819

Browse files
committed
[SYSTEMDS-3850] Pipeline pruning after top-K cleaning
This patch introduces a pruning technique for the cleaning pipelines returned by top-K cleaning. For each top-performing pipeline, we identify a smaller yet equally effective subset of primitives, which improves its scoring performance.
1 parent 48fd91c commit 36eb819

File tree

3 files changed

+71
-29
lines changed

3 files changed

+71
-29
lines changed

scripts/builtin/bandit.dml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
311311
{
312312
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
313313
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
314-
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
314+
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
315315
if(max(eYtrain) == min(eYtrain))
316316
print("Y contains only one class")
317317
else if(changesByPip < ref)
@@ -540,7 +540,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
540540
{
541541
[trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=as.frame(pipList['ph']),
542542
Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
543-
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
543+
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(as.frame(pipList['ph'])))
544544
#TODO double check why this is necessary
545545
mincol = min(ncol(cvChanges),ncol(changesByOp))
546546
cvChanges[cvk,1:mincol] = changesByOp[,1:mincol];
@@ -557,7 +557,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
557557
allChanges = min(allChanges)
558558
changesByOp = colMaxs(cvChanges)
559559
accuracy = mean(accuracyMatrix)
560-
print("cv accuracy: "+toString(accuracy))
560+
print("- cv accuracy: "+toString(accuracy))
561561
}
562562

563563
pruningSignal = function(Matrix[Double] pipPre, Matrix[Double] pipNew, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -670,7 +670,7 @@ run_with_hyperparamNested = function(Frame[Unknown] ph_pip, Integer r_i = 1, Mat
670670
{
671671
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
672672
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
673-
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
673+
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
674674
if(max(eYtrain) == min(eYtrain))
675675
print("Y contains only one class")
676676
else if(changesByPip < ref)
@@ -727,4 +727,4 @@ return(Boolean execute)
727727
}
728728
}
729729
execute = !(changeCount > 0)
730-
}
730+
}

scripts/builtin/executePipeline.dml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@
5151

5252
f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Matrix[Double] Ytrain,
5353
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
54-
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
54+
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose,
55+
Integer startInd, Integer endInd)
5556
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
5657
Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll, List[Unknown] internalStates)
5758
{
@@ -68,8 +69,11 @@ f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
6869
print("pipeline in execution "+toString(pipeline))
6970
print("pipeline hps "+toString(hyperParameters))
7071
}
71-
for(i in 1:ncol(pipeline)) {
72+
73+
# for(i in 1:ncol(pipeline)) {
74+
for(i in startInd:endInd) {
7275
op = as.scalar(pipeline[1,i])
76+
print("-- Applying Primitive: "+op);
7377
applyOp = toString(as.scalar(applyFunc[1,i]))
7478
Xclone = Xtrain
7579
XtestClone = Xtest
@@ -476,4 +480,4 @@ return (Matrix[Double] cmin, Matrix[Double] cmax)
476480
cmin[1, i] = min(vec)
477481
cmax[1, i] = max(vec)
478482
}
479-
}
483+
}

scripts/builtin/fit_pipeline.dml

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit;
4848

4949
f_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
5050
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
51-
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
51+
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean allCombinations=FALSE)
5252
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
5353
{
5454
externalState = list()
@@ -92,28 +92,66 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
9292
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
9393
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
9494

95+
print("Getting training score using CV")
9596
[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
9697
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
97-
print("train score cv: "+toString(trainScore))
98+
print("- train score cv: "+toString(trainScore))
9899

99-
100-
# # # now test accuracy
101-
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
102-
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
103-
104-
if(max(eYtrain) == min(eYtrain))
105-
stop("Y contains only one class")
100+
print("----------------------------");
101+
print("Getting test accuracy")
102+
primitives = matrix(0, rows=0, cols=0);
103+
if (allCombinations) {
104+
# Count number of subsets of consecutive primitives
105+
totCount = 0;
106+
n = ncol(pip);
107+
for (i in 1:n) {
108+
for (j in i:n)
109+
totCount = totCount + 1;
110+
}
111+
# List start and end indices of all those subsets
112+
primitives = matrix(0, rows=totCount, cols=2);
113+
r = 1;
114+
for (start in 1:n) {
115+
for (end in start:n) {
116+
primitives[r,1] = start;
117+
primitives[r,2] = end;
118+
r = r + 1;
119+
}
120+
}
121+
}
122+
else {
123+
# Include all primitives
124+
primitives = matrix(0, rows=1, cols=2);
125+
primitives[1,1] = 1;
126+
primitives[1,2] = ncol(pip);
127+
}
106128

107-
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
108-
# trainAccuracy = as.scalar(score[1, 1])
109-
110-
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
111-
testAccuracy = as.scalar(score[1, 1])
112-
113-
scores = matrix(0, rows=1, cols=3)
114-
scores[1, 1] = dirtyScore
115-
# scores[1, 2] = trainAccuracy
116-
scores[1, 3] = testAccuracy
117-
cleanTrain = cbind(eXtrain, eYtrain)
118-
cleanTest = cbind(eXtest, eYtest)
129+
for (r in 1:nrow(primitives)) {
130+
startInd = as.scalar(primitives[r,1]);
131+
endInd = as.scalar(primitives[r,2]);
132+
# # # now test accuracy
133+
[eXtrain_cl, eYtrain_cl, eXtest_cl, eYtest_cl, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
134+
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=startInd, endInd=endInd)
135+
136+
if(max(eYtrain_cl) == min(eYtrain_cl))
137+
stop("Y contains only one class")
138+
139+
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
140+
# trainAccuracy = as.scalar(score[1, 1])
141+
142+
score = eval(evaluationFunc, list(X=eXtrain_cl, Y=eYtrain_cl, Xtest=eXtest_cl, Ytest=eYtest_cl, Xorig=as.matrix(0), evalFunHp=evalFunHp))
143+
testAccuracy = as.scalar(score[1, 1])
144+
145+
scores = matrix(0, rows=1, cols=3)
146+
scores[1, 1] = dirtyScore
147+
# scores[1, 2] = trainAccuracy
148+
scores[1, 3] = testAccuracy
149+
cleanTrain = cbind(eXtrain_cl, eYtrain_cl)
150+
cleanTest = cbind(eXtest, eYtest)
151+
152+
header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
153+
result = as.frame(scores)
154+
writeRes = rbind(header, result)
155+
print(toString(writeRes))
156+
}
119157
}

0 commit comments

Comments (0)