[SYSTEMDS-3822] Fix incorrect sampling in top-k cleaning pipelines

mboehm7 · mboehm7 · commit 41c21bf9ce22 · 2025-01-29T10:51:43.000+01:00
This patch fixes a bug in the top-k cleaning pipeline enumeration: for
datasets with more than 200K rows, the given sampling ratio was ignored
and always set to 0.6. As a result, we actually ran with larger data
than expected when people wanted to sample very large datasets.
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
@@ -62,9 +62,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D
   MIN_SAMPLE = 1000
   sampledX = eX
   sampledY = eY
-  ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
   sampled = floor(nrow(eX) * ratio)
-  
+
   if(sampled > MIN_SAMPLE & ratio != 1.0)
   {
     sampleVec = sample(nrow(eX), sampled, FALSE, 23)
@@ -76,7 +75,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D
     }
     else if(nrow(eY) == 1) { # for clustering
       sampledX = P %*% eX
-      sampledY = eY 
+      sampledY = eY
     }
     print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
   }
@@ -271,4 +270,4 @@ return(Frame[Unknown] data)
       # data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
     # }
   # }
-}
+}

Original file line number	Diff line number	Diff line change
`@@ -62,9 +62,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D`
`62`	`62`	`MIN_SAMPLE = 1000`
`63`	`63`	`sampledX = eX`
`64`	`64`	`sampledY = eY`
`65`		`- ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)`
`66`	`65`	`sampled = floor(nrow(eX) * ratio)`
`67`		`-`
	`66`	`+`
`68`	`67`	`if(sampled > MIN_SAMPLE & ratio != 1.0)`
`69`	`68`	`{`
`70`	`69`	`sampleVec = sample(nrow(eX), sampled, FALSE, 23)`
`@@ -76,7 +75,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Matrix[D`
`76`	`75`	`}`
`77`	`76`	`else if(nrow(eY) == 1) { # for clustering`
`78`	`77`	`sampledX = P %*% eX`
`79`		`- sampledY = eY`
	`78`	`+ sampledY = eY`
`80`	`79`	`}`
`81`	`80`	`print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))`
`82`	`81`	`}`
`@@ -271,4 +270,4 @@ return(Frame[Unknown] data)`
`271`	`270`	`# data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)`
`272`	`271`	`# }`
`273`	`272`	`# }`
`274`		`-}`
	`273`	`+}`