[SYSTEMDS-3819] Bug fixes in sliceLineExtract builtin

phaniarnab · phaniarnab · commit 48fd91ca135f · 2025-04-06T16:24:10.000+02:00
diff --git a/scripts/builtin/sliceLineExtract.dml b/scripts/builtin/sliceLineExtract.dml
@@ -39,7 +39,7 @@
 
 m_sliceLineExtract = function(Matrix[Double] X, Matrix[Double] e,
   Matrix[Double] TK, Matrix[Double] TKC, Integer k2 = -1)
-  return(Matrix[Double] Xtk, Matrix[Double] etk)
+  return(Matrix[Double] Xtk, Matrix[Double] etk, Matrix[Double] I)
 {
   # check valid parameters
   if( k2 > nrow(TK) )
@@ -50,7 +50,7 @@ m_sliceLineExtract = function(Matrix[Double] X, Matrix[Double] e,
   # extract first k2 slices from X and e
   I = matrix(0, k2, nrow(X));
   parfor(i in 1:k2) {
-    I[i,] = t(rowSums(X == TK[i,]) == sum(TK[i,]))
+    I[i,] = t(rowSums(X == TK[i,]) == sum(TK[i,] > 0))
   }
   I = t(colSums(I)); #union
 
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
@@ -79,9 +79,9 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   # apply sampling on training data for pipeline enumeration
   # TODO why recoding/sampling twice (within getDirtyScore)
   print("---- class-stratified sampling of feature matrix w/ f="+sample);
-  if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2)  # & 
-    [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
-  else 
+#  if(nrow(eYtrain) >= rowCount & sample == 1.0 & sum(mask) > ncol(mask)/2)  # & 
+#    [eXtrain, eYtrain ] = utils::doErrorSample(eXtrain, eYtrain, lq, uq, rowCount)
+#  else 
     [eXtrain, eYtrain] = utils::doSample(eXtrain, eYtrain, sample, mask, metaR, TRUE)
   t5 = time(); print("---- finalized in: "+(t5-t4)/1e9+"s");
 
@@ -112,6 +112,7 @@ f_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   metaList['distY'] = dist
 
   print("-- Cleaning - Enum Logical Pipelines: ");
+  print("---- Data Dimension before Cleaning: "+ nrow(eXtrain) + ", " + ncol(eXtrain));
   [bestLogical, bestHp, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
   initial_population=logical, refSol=refSol, seed = seed,  max_iter=max_iter, metaList = metaList,
   evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
@@ -64,7 +64,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D
   sampledY = eY
   sampled = floor(nrow(eX) * ratio)
 
-  if(sampled > MIN_SAMPLE & ratio != 1.0)
+#  if(sampled > MIN_SAMPLE & ratio != 1.0)
+  if(ratio != 1.0)
   {
     sampleVec = sample(nrow(eX), sampled, FALSE, 23)
     P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), nrow(eX))

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D`
`64`	`64`	`sampledY = eY`
`65`	`65`	`sampled = floor(nrow(eX) * ratio)`
`66`	`66`
`67`		`- if(sampled > MIN_SAMPLE & ratio != 1.0)`
	`67`	`+# if(sampled > MIN_SAMPLE & ratio != 1.0)`
	`68`	`+ if(ratio != 1.0)`
`68`	`69`	`{`
`69`	`70`	`sampleVec = sample(nrow(eX), sampled, FALSE, 23)`
`70`	`71`	`P = table(seq(1, nrow(sampleVec)), sampleVec, nrow(sampleVec), nrow(eX))`