Skip to content

Commit 635de5e

Browse files
authored
[ML] Truncate categorization fields (#89827) (#89961)
Truncate the raw categorization field passed to the backend at 1001 characters.
1 parent 02e0c8f commit 635de5e

File tree

10 files changed

+109
-76
lines changed

10 files changed

+109
-76
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/config/AnalysisConfig.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,14 @@ public class AnalysisConfig implements ToXContentObject, Writeable {
6969
public static final String ML_CATEGORY_FIELD = "mlcategory";
7070
public static final Set<String> AUTO_CREATED_FIELDS = new HashSet<>(Collections.singletonList(ML_CATEGORY_FIELD));
7171

72+
// Since the C++ backend truncates the categorization field at length 1000 (see model::CCategoryExamplesCollector::MAX_EXAMPLE_LENGTH),
73+
// adding an ellipsis on truncation, it makes no sense to send potentially very long strings to it. For the backend logic still to work
74+
// we need to send more than that, hence we truncate at length 1001.
75+
//
76+
// Also, because we now do the tokenization on the Java side, the tokens will still be sent correctly (separately) to the C++ backend
77+
// even if they extend beyond the length of a truncated example.
78+
public static final int MAX_CATEGORIZATION_FIELD_LENGTH = 1001;
79+
7280
// These parsers follow the pattern that metadata is parsed leniently (to allow for enhancements), whilst config is parsed strictly
7381
public static final ConstructingObjectParser<AnalysisConfig.Builder, Void> LENIENT_PARSER = createParser(true);
7482
public static final ConstructingObjectParser<AnalysisConfig.Builder, Void> STRICT_PARSER = createParser(false);

x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/CategorizationIT.java

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
*/
77
package org.elasticsearch.xpack.ml.integration;
88

9-
import org.apache.logging.log4j.LogManager;
109
import org.elasticsearch.action.bulk.BulkRequestBuilder;
1110
import org.elasticsearch.action.bulk.BulkResponse;
1211
import org.elasticsearch.action.index.IndexRequest;
@@ -32,7 +31,6 @@
3231
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.CategorizerStats;
3332
import org.elasticsearch.xpack.core.ml.job.results.CategoryDefinition;
3433
import org.elasticsearch.xpack.core.ml.job.results.Result;
35-
import org.elasticsearch.xpack.ml.MachineLearning;
3634
import org.junit.After;
3735
import org.junit.Before;
3836

@@ -342,58 +340,6 @@ public void testCategorizationStatePersistedOnSwitchToRealtime() throws Exceptio
342340
);
343341
}
344342

345-
public void testCategorizationPerformance() {
346-
// To compare Java/C++ tokenization performance:
347-
// 1. Change false to true in this assumption
348-
// 2. Run the test several times
349-
// 3. Change MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA to false
350-
// 4. Run the test several more times
351-
// 5. Check the timings that get logged
352-
// 6. Revert the changes to this assumption and MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
353-
assumeTrue("This is time consuming to run on every build - it should be run manually when comparing Java/C++ tokenization", false);
354-
355-
int testBatchSize = 1000;
356-
int testNumBatches = 1000;
357-
String[] possibleMessages = new String[] {
358-
"<sol13m-9402.1.p2ps: Info: Tue Apr 06 19:00:16 2010> Source LOTS on 33080:817 has shut down.<END>",
359-
"<lnl00m-8601.1.p2ps: Alert: Tue Apr 06 18:57:24 2010> P2PS failed to connect to the hrm server. "
360-
+ "Reason: Failed to connect to hrm server - No ACK from SIPC<END>",
361-
"<sol00m-8607.1.p2ps: Debug: Tue Apr 06 18:56:43 2010> Did not receive an image data for IDN_SELECTFEED:7630.T on 493. "
362-
+ "Recalling item. <END>",
363-
"<lnl13m-8602.1.p2ps.rrcpTransport.0.sinkSide.rrcp.transmissionBus: Warning: Tue Apr 06 18:36:32 2010> "
364-
+ "RRCP STATUS MSG: RRCP_REBOOT: node 33191 has rebooted<END>",
365-
"<sol00m-8608.1.p2ps: Info: Tue Apr 06 18:30:02 2010> Source PRISM_VOBr on 33069:757 has shut down.<END>",
366-
"<lnl06m-9402.1.p2ps: Info: Thu Mar 25 18:30:01 2010> Service PRISM_VOB has shut down.<END>" };
367-
368-
String jobId = "categorization-performance";
369-
Job.Builder job = newJobBuilder(jobId, Collections.emptyList(), false);
370-
putJob(job);
371-
openJob(job.getId());
372-
373-
long startTime = System.currentTimeMillis();
374-
375-
for (int batchNum = 0; batchNum < testNumBatches; ++batchNum) {
376-
StringBuilder json = new StringBuilder(testBatchSize * 100);
377-
for (int docNum = 0; docNum < testBatchSize; ++docNum) {
378-
json.append(
379-
String.format(Locale.ROOT, "{\"time\":1000000,\"msg\":\"%s\"}\n", possibleMessages[docNum % possibleMessages.length])
380-
);
381-
}
382-
postData(jobId, json.toString());
383-
}
384-
flushJob(jobId, false);
385-
386-
long duration = System.currentTimeMillis() - startTime;
387-
LogManager.getLogger(CategorizationIT.class)
388-
.info(
389-
"Performance test with tokenization in "
390-
+ (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA ? "Java" : "C++")
391-
+ " took "
392-
+ duration
393-
+ "ms"
394-
);
395-
}
396-
397343
public void testStopOnWarn() throws IOException {
398344

399345
long testTime = System.currentTimeMillis();

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -473,9 +473,6 @@ public class MachineLearning extends Plugin
473473

474474
private static final long DEFAULT_MODEL_CIRCUIT_BREAKER_LIMIT = (long) ((0.50) * JvmInfo.jvmInfo().getMem().getHeapMax().getBytes());
475475
private static final double DEFAULT_MODEL_CIRCUIT_BREAKER_OVERHEAD = 1.0D;
476-
// This is for performance testing. It's not exposed to the end user.
477-
// Recompile if you want to compare performance with C++ tokenization.
478-
public static final boolean CATEGORIZATION_TOKENIZATION_IN_JAVA = true;
479476

480477
public static final LicensedFeature.Persistent ML_ANOMALY_JOBS_FEATURE = LicensedFeature.persistent(
481478
MachineLearningField.ML_FEATURE_FAMILY,

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectCommunicator.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
2828
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.TimingStats;
2929
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
30-
import org.elasticsearch.xpack.ml.MachineLearning;
3130
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
3231
import org.elasticsearch.xpack.ml.job.persistence.StateStreamer;
3332
import org.elasticsearch.xpack.ml.job.process.CountingInputStream;
@@ -88,8 +87,7 @@ public class AutodetectCommunicator implements Closeable {
8887
this.onFinishHandler = onFinishHandler;
8988
this.xContentRegistry = xContentRegistry;
9089
this.autodetectWorkerExecutor = autodetectWorkerExecutor;
91-
this.includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
92-
&& job.getAnalysisConfig().getCategorizationFieldName() != null;
90+
this.includeTokensField = job.getAnalysisConfig().getCategorizationFieldName() != null;
9391
}
9492

9593
public void restoreState(ModelSnapshot modelSnapshot) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/JobModelSnapshotUpgrader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ protected final Map<String, Integer> outputFieldIndexes() {
213213
}
214214
}
215215
// field for categorization tokens
216-
if (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA && job.getAnalysisConfig().getCategorizationFieldName() != null) {
216+
if (job.getAnalysisConfig().getCategorizationFieldName() != null) {
217217
fieldIndexes.put(LengthEncodedWriter.PRETOKENISED_TOKEN_FIELD, index++);
218218
}
219219

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/NativeAutodetectProcessFactory.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,7 @@ public AutodetectProcess createAutodetectProcess(
9696
true
9797
);
9898
createNativeProcess(job, params, processPipes, filesToDelete);
99-
boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
100-
&& job.getAnalysisConfig().getCategorizationFieldName() != null;
99+
boolean includeTokensField = job.getAnalysisConfig().getCategorizationFieldName() != null;
101100
// The extra 1 is the control field
102101
int numberOfFields = job.allInputFields().size() + (includeTokensField ? 1 : 0) + 1;
103102

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriter.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ public abstract class AbstractDataToProcessWriter implements DataToProcessWriter
5656
private long latestEpochMs;
5757
private long latestEpochMsThisUpload;
5858

59+
private Set<String> termFields;
60+
5961
protected AbstractDataToProcessWriter(
6062
boolean includeControlField,
6163
boolean includeTokensField,
@@ -74,6 +76,7 @@ protected AbstractDataToProcessWriter(
7476
this.logger = Objects.requireNonNull(logger);
7577
this.latencySeconds = analysisConfig.getLatency() == null ? 0 : analysisConfig.getLatency().seconds();
7678
this.bucketSpanMs = analysisConfig.getBucketSpan().getMillis();
79+
this.termFields = analysisConfig.termFields();
7780

7881
Date date = dataCountsReporter.getLatestRecordTime();
7982
latestEpochMsThisUpload = 0;
@@ -90,6 +93,13 @@ protected AbstractDataToProcessWriter(
9093
}
9194
}
9295

96+
public String maybeTruncateCatgeorizationField(String categorizationField) {
97+
if (termFields.contains(analysisConfig.getCategorizationFieldName()) == false) {
98+
return categorizationField.substring(0, Math.min(categorizationField.length(), AnalysisConfig.MAX_CATEGORIZATION_FIELD_LENGTH));
99+
}
100+
return categorizationField;
101+
}
102+
93103
/**
94104
* Set up the field index mappings. This must be called before
95105
* {@linkplain DataToProcessWriter#write(InputStream, CategorizationAnalyzer, XContentType, BiConsumer)}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriter.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,17 @@ private void writeJson(CategorizationAnalyzer categorizationAnalyzer, XContentPa
154154

155155
for (InputOutputMap inOut : inputOutputMap) {
156156
String field = input[inOut.inputIndex];
157-
record[inOut.outputIndex] = (field == null) ? "" : field;
157+
field = (field == null) ? "" : field;
158+
if (categorizationFieldIndex != null && inOut.inputIndex == categorizationFieldIndex) {
159+
field = maybeTruncateCatgeorizationField(field);
160+
}
161+
record[inOut.outputIndex] = field;
158162
}
159163

160164
if (categorizationAnalyzer != null && categorizationFieldIndex != null) {
161165
tokenizeForCategorization(categorizationAnalyzer, input[categorizationFieldIndex], record);
162166
}
167+
163168
transformTimeAndWrite(record, inputFieldCount);
164169

165170
inputFieldCount = recordReader.read(input, gotFields);

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AbstractDataToProcessWriterTests.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,81 @@ public void testTokenizeForCategorization() throws IOException {
165165
);
166166
}
167167
}
168+
169+
public void testMaybeTruncateCategorizationField() {
170+
{
171+
DataDescription.Builder dd = new DataDescription.Builder();
172+
dd.setTimeField("time_field");
173+
174+
Detector.Builder detector = new Detector.Builder("count", "");
175+
detector.setByFieldName("mlcategory");
176+
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
177+
builder.setCategorizationFieldName("message");
178+
AnalysisConfig ac = builder.build();
179+
180+
boolean includeTokensFields = randomBoolean();
181+
AbstractDataToProcessWriter writer = new JsonDataToProcessWriter(
182+
true,
183+
includeTokensFields,
184+
autodetectProcess,
185+
dd.build(),
186+
ac,
187+
dataCountsReporter,
188+
NamedXContentRegistry.EMPTY
189+
);
190+
191+
String truncatedField = writer.maybeTruncateCatgeorizationField(randomAlphaOfLengthBetween(1002, 2000));
192+
assertEquals(AnalysisConfig.MAX_CATEGORIZATION_FIELD_LENGTH, truncatedField.length());
193+
}
194+
{
195+
DataDescription.Builder dd = new DataDescription.Builder();
196+
dd.setTimeField("time_field");
197+
198+
Detector.Builder detector = new Detector.Builder("count", "");
199+
detector.setByFieldName("mlcategory");
200+
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
201+
builder.setCategorizationFieldName("message");
202+
AnalysisConfig ac = builder.build();
203+
204+
boolean includeTokensFields = randomBoolean();
205+
AbstractDataToProcessWriter writer = new JsonDataToProcessWriter(
206+
true,
207+
includeTokensFields,
208+
autodetectProcess,
209+
dd.build(),
210+
ac,
211+
dataCountsReporter,
212+
NamedXContentRegistry.EMPTY
213+
);
214+
215+
String categorizationField = randomAlphaOfLengthBetween(1, 1000);
216+
String truncatedField = writer.maybeTruncateCatgeorizationField(categorizationField);
217+
assertEquals(categorizationField.length(), truncatedField.length());
218+
}
219+
{
220+
DataDescription.Builder dd = new DataDescription.Builder();
221+
dd.setTimeField("time_field");
222+
223+
Detector.Builder detector = new Detector.Builder("count", "");
224+
detector.setByFieldName("mlcategory");
225+
detector.setPartitionFieldName("message");
226+
AnalysisConfig.Builder builder = new AnalysisConfig.Builder(Collections.singletonList(detector.build()));
227+
builder.setCategorizationFieldName("message");
228+
AnalysisConfig ac = builder.build();
229+
230+
boolean includeTokensFields = randomBoolean();
231+
AbstractDataToProcessWriter writer = new JsonDataToProcessWriter(
232+
true,
233+
includeTokensFields,
234+
autodetectProcess,
235+
dd.build(),
236+
ac,
237+
dataCountsReporter,
238+
NamedXContentRegistry.EMPTY
239+
);
240+
241+
String truncatedField = writer.maybeTruncateCatgeorizationField(randomAlphaOfLengthBetween(1002, 2000));
242+
assertFalse(AnalysisConfig.MAX_CATEGORIZATION_FIELD_LENGTH == truncatedField.length());
243+
}
244+
}
168245
}

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/JsonDataToProcessWriterTests.java

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;
2424
import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
2525
import org.elasticsearch.xpack.core.ml.job.config.Detector;
26-
import org.elasticsearch.xpack.ml.MachineLearning;
2726
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzer;
2827
import org.elasticsearch.xpack.ml.job.categorization.CategorizationAnalyzerTests;
2928
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;
@@ -135,15 +134,10 @@ public void testWrite_GivenTimeFormatIsEpochAndCategorization() throws Exception
135134

136135
List<String[]> expectedRecords = new ArrayList<>();
137136
// The "." field is the control field; "..." is the pre-tokenized tokens field
138-
if (MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA) {
139-
expectedRecords.add(new String[] { "time", "message", "...", "." });
140-
expectedRecords.add(new String[] { "1", "Node 1 started", "Node,started", "" });
141-
expectedRecords.add(new String[] { "2", "Node 2 started", "Node,started", "" });
142-
} else {
143-
expectedRecords.add(new String[] { "time", "message", "." });
144-
expectedRecords.add(new String[] { "1", "Node 1 started", "" });
145-
expectedRecords.add(new String[] { "2", "Node 2 started", "" });
146-
}
137+
expectedRecords.add(new String[] { "time", "message", "...", "." });
138+
expectedRecords.add(new String[] { "1", "Node 1 started", "Node,started", "" });
139+
expectedRecords.add(new String[] { "2", "Node 2 started", "Node,started", "" });
140+
147141
assertWrittenRecordsEqualTo(expectedRecords);
148142

149143
verify(dataCountsReporter).finishReporting();
@@ -411,8 +405,7 @@ private static InputStream createInputStream(String input) {
411405
}
412406

413407
private JsonDataToProcessWriter createWriter() {
414-
boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
415-
&& analysisConfig.getCategorizationFieldName() != null;
408+
boolean includeTokensField = analysisConfig.getCategorizationFieldName() != null;
416409
return new JsonDataToProcessWriter(
417410
true,
418411
includeTokensField,

0 commit comments

Comments
 (0)