Skip to content

Commit 36eb819

Browse files
committed
[SYSTEMDS-3850] Pipeline pruning after top-K cleaning
This patch introduces a pruning technique for the cleaning pipelines returned by top-K cleaning. For each top-performing pipeline, we identify a smaller yet equally effective subset of primitives, which improves its scoring performance.
1 parent 48fd91c commit 36eb819

File tree

3 files changed

+71
-29
lines changed

3 files changed

+71
-29
lines changed

scripts/builtin/bandit.dml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
311311
{
312312
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
313313
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
314-
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
314+
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
315315
if(max(eYtrain) == min(eYtrain))
316316
print("Y contains only one class")
317317
else if(changesByPip < ref)
@@ -540,7 +540,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
540540
{
541541
[trainX, trainy, testX, testy, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=as.frame(pipList['ph']),
542542
Xtrain=trainX, Ytrain=trainy, Xtest= testX, Ytest=testy, metaList=metaList, hyperParameters=as.matrix(pipList['hp']), hpForPruning=hpForPruning,
543-
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE)
543+
changesByOp=changesByOp, flagsCount=as.scalar(pipList['flags']), test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(as.frame(pipList['ph'])))
544544
#TODO double check why this is necessary
545545
mincol = min(ncol(cvChanges),ncol(changesByOp))
546546
cvChanges[cvk,1:mincol] = changesByOp[,1:mincol];
@@ -557,7 +557,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
557557
allChanges = min(allChanges)
558558
changesByOp = colMaxs(cvChanges)
559559
accuracy = mean(accuracyMatrix)
560-
print("cv accuracy: "+toString(accuracy))
560+
print("- cv accuracy: "+toString(accuracy))
561561
}
562562

563563
pruningSignal = function(Matrix[Double] pipPre, Matrix[Double] pipNew, Matrix[Double] hp_matrix, Matrix[Double] hpForPruning, Matrix[Double] changesByOp)
@@ -670,7 +670,7 @@ run_with_hyperparamNested = function(Frame[Unknown] ph_pip, Integer r_i = 1, Mat
670670
{
671671
[eXtrain, eYtrain, eXtest, eYtest, Tr, hpForPruning, changesByOp, changesByPip] = executePipeline(pipeline=op,
672672
Xtrain=X, Ytrain=Y, Xtest=Xtest, Ytest=Ytest, metaList=metaList2, hyperParameters=hp_matrix, hpForPruning=hpForPruning,
673-
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
673+
changesByOp=changesByOp, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=1, endInd=ncol(op))
674674
if(max(eYtrain) == min(eYtrain))
675675
print("Y contains only one class")
676676
else if(changesByPip < ref)
@@ -727,4 +727,4 @@ return(Boolean execute)
727727
}
728728
}
729729
execute = !(changeCount > 0)
730-
}
730+
}

scripts/builtin/executePipeline.dml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@
5151

5252
f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Matrix[Double] Ytrain,
5353
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
54-
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
54+
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose,
55+
Integer startInd, Integer endInd)
5556
return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
5657
Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll, List[Unknown] internalStates)
5758
{
@@ -68,8 +69,11 @@ f_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat
6869
print("pipeline in execution "+toString(pipeline))
6970
print("pipeline hps "+toString(hyperParameters))
7071
}
71-
for(i in 1:ncol(pipeline)) {
72+
73+
# for(i in 1:ncol(pipeline)) {
74+
for(i in startInd:endInd) {
7275
op = as.scalar(pipeline[1,i])
76+
print("-- Applying Primitive: "+op);
7377
applyOp = toString(as.scalar(applyFunc[1,i]))
7478
Xclone = Xtrain
7579
XtestClone = Xtest
@@ -476,4 +480,4 @@ return (Matrix[Double] cmin, Matrix[Double] cmax)
476480
cmin[1, i] = min(vec)
477481
cmax[1, i] = max(vec)
478482
}
479-
}
483+
}

scripts/builtin/fit_pipeline.dml

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ source("scripts/builtin/bandit.dml") as bandit;
4848

4949
f_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
5050
Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, Integer cvk=3, String evaluationFunc, Matrix[Double] evalFunHp,
51-
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
51+
Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean allCombinations=FALSE)
5252
return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
5353
{
5454
externalState = list()
@@ -92,28 +92,66 @@ return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
9292
hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
9393
pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
9494

95+
print("Getting training score using CV")
9596
[trainScore, evalFunHp] = bandit::crossV(X=eXtrain, y=eYtrain, cvk=cvk, evalFunHp=evalFunHp,
9697
pipList=pipList, metaList=metaList, evalFunc=evaluationFunc)
97-
print("train score cv: "+toString(trainScore))
98+
print("- train score cv: "+toString(trainScore))
9899

99-
100-
# # # now test accuracy
101-
[eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
102-
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
103-
104-
if(max(eYtrain) == min(eYtrain))
105-
stop("Y contains only one class")
100+
print("----------------------------");
101+
print("Getting test accuracy")
102+
primitives = matrix(0, rows=0, cols=0);
103+
if (allCombinations) {
104+
# Count number of subsets of consecutive primitives
105+
totCount = 0;
106+
n = ncol(pip);
107+
for (i in 1:n) {
108+
for (j in i:n)
109+
totCount = totCount + 1;
110+
}
111+
# List start and end indices of all those subsets
112+
primitives = matrix(0, rows=totCount, cols=2);
113+
r = 1;
114+
for (start in 1:n) {
115+
for (end in start:n) {
116+
primitives[r,1] = start;
117+
primitives[r,2] = end;
118+
r = r + 1;
119+
}
120+
}
121+
}
122+
else {
123+
# Include all primitives
124+
primitives = matrix(0, rows=1, cols=2);
125+
primitives[1,1] = 1;
126+
primitives[1,2] = ncol(pip);
127+
}
106128

107-
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
108-
# trainAccuracy = as.scalar(score[1, 1])
109-
110-
score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
111-
testAccuracy = as.scalar(score[1, 1])
112-
113-
scores = matrix(0, rows=1, cols=3)
114-
scores[1, 1] = dirtyScore
115-
# scores[1, 2] = trainAccuracy
116-
scores[1, 3] = testAccuracy
117-
cleanTrain = cbind(eXtrain, eYtrain)
118-
cleanTest = cbind(eXtest, eYtest)
129+
for (r in 1:nrow(primitives)) {
130+
startInd = as.scalar(primitives[r,1]);
131+
endInd = as.scalar(primitives[r,2]);
132+
# # # now test accuracy
133+
[eXtrain_cl, eYtrain_cl, eXtest_cl, eYtest_cl, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
134+
Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE, startInd=startInd, endInd=endInd)
135+
136+
if(max(eYtrain_cl) == min(eYtrain_cl))
137+
stop("Y contains only one class")
138+
139+
# score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtrain, Ytest=eYtrain, Xorig=as.matrix(0), evalFunHp=evalFunHp))
140+
# trainAccuracy = as.scalar(score[1, 1])
141+
142+
score = eval(evaluationFunc, list(X=eXtrain_cl, Y=eYtrain_cl, Xtest=eXtest_cl, Ytest=eYtest_cl, Xorig=as.matrix(0), evalFunHp=evalFunHp))
143+
testAccuracy = as.scalar(score[1, 1])
144+
145+
scores = matrix(0, rows=1, cols=3)
146+
scores[1, 1] = dirtyScore
147+
# scores[1, 2] = trainAccuracy
148+
scores[1, 3] = testAccuracy
149+
cleanTrain = cbind(eXtrain_cl, eYtrain_cl)
150+
cleanTest = cbind(eXtest, eYtest)
151+
152+
header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
153+
result = as.frame(scores)
154+
writeRes = rbind(header, result)
155+
print(toString(writeRes))
156+
}
119157
}

0 commit comments

Comments (0)