Skip to content

Commit 661a98d

Browse files
committed
[SYSTEMDS-3790] Fix transformencode robustness for non-existing columns
This patch fixes endless loops in transformencode that occurred when the tfspec references columns outside the column range of the input.
1 parent 066c0aa commit 661a98d

File tree

2 files changed

+15
-8
lines changed

2 files changed

+15
-8
lines changed

src/main/java/org/apache/sysds/runtime/transform/encode/EncoderFactory.java

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,15 +127,24 @@ public static MultiColumnEncoder createEncoder(String spec, String[] colnames, i
127127
// Error out if the first level encoders have overlaps
128128
if (intersect(rcIDs, binIDs, haIDs, weIDs, bowIDs))
129129
throw new DMLRuntimeException("More than one encoders (recode, binning, hashing, word_embedding, bag_of_words) on one column is not allowed");
130-
130+
131131
List<Integer> ptIDs = except(UtilFunctions.getSeqList(1, clen, 1), naryUnionDistinct(rcIDs, haIDs, binIDs, weIDs, bowIDs));
132-
List<Integer> oIDs = Arrays.asList(ArrayUtils
133-
.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfMethod.OMIT.toString(), minCol, maxCol)));
134-
List<Integer> mvIDs = Arrays.asList(ArrayUtils.toObject(
135-
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfMethod.IMPUTE.toString(), minCol, maxCol)));
132+
List<Integer> oIDs = new ArrayList<>(Arrays.asList(ArrayUtils
133+
.toObject(TfMetaUtils.parseJsonIDList(jSpec, colnames, TfMethod.OMIT.toString(), minCol, maxCol))));
134+
List<Integer> mvIDs = new ArrayList<>(Arrays.asList(ArrayUtils.toObject(
135+
TfMetaUtils.parseJsonObjectIDList(jSpec, colnames, TfMethod.IMPUTE.toString(), minCol, maxCol))));
136136
List<Integer> udfIDs = TfMetaUtils.parseUDFColIDs(jSpec, colnames, minCol, maxCol);
137137

138-
138+
// robustness for transformencode specs w/ non-existing columns (so far, endless loops)
139+
rcIDs.removeIf(i -> i > clen);
140+
ptIDs.removeIf(i -> i > clen);
141+
oIDs.removeIf(i -> i > clen);
142+
mvIDs.removeIf(i -> i > clen);
143+
udfIDs.removeIf(i -> i > clen);
144+
binIDs.removeIf(i -> i > clen);
145+
weIDs.removeIf(i -> i > clen);
146+
bowIDs.removeIf(i -> i > clen);
147+
139148
// create individual encoders
140149
if(!rcIDs.isEmpty())
141150
for(Integer id : rcIDs)

src/test/scripts/functions/builtin/adasynRealData.dml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ tfspec = read($2, data_type="scalar", value_type="string")
2828
upsample = as.logical($3)
2929

3030
if( tfspec != " " ) {
31-
F = M[, 1:ncol(M)] # FIXME
3231
[X,meta] = transformencode(target=F, spec=tfspec);
33-
X = X[,1:ncol(X)-1];
3432
X = imputeByMode(X);
3533
}
3634
else {

0 commit comments

Comments
 (0)