Skip to content

Commit 450d5d9

Browse files
Merge 322ed68 into openjdk23-bundle
2 parents 74672ab + 322ed68 commit 450d5d9

File tree

28 files changed

+967
-193
lines changed

28 files changed

+967
-193
lines changed

docs/changelog/112768.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 112768
2+
summary: Deduplicate Kuromoji User Dictionary
3+
area: Search
4+
type: enhancement
5+
issues: []

docs/plugins/analysis-kuromoji.asciidoc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ unknown words. It can be set to:
133133

134134
Whether punctuation should be discarded from the output. Defaults to `true`.
135135

136+
`lenient`::
137+
138+
Whether duplicate entries in the `user_dictionary` should be removed, using each entry's `text` to detect duplicates.
139+
Defaults to `false`, in which case duplicate entries cause an error.
140+
136141
`user_dictionary`::
137142
+
138143
--
@@ -221,7 +226,8 @@ PUT kuromoji_sample
221226
"type": "kuromoji_tokenizer",
222227
"mode": "extended",
223228
"discard_punctuation": "false",
224-
"user_dictionary": "userdict_ja.txt"
229+
"user_dictionary": "userdict_ja.txt",
230+
"lenient": "true"
225231
}
226232
},
227233
"analyzer": {

docs/plugins/analysis-nori.asciidoc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ It can be set to:
5858

5959
Whether punctuation should be discarded from the output. Defaults to `true`.
6060

61+
`lenient`::
62+
63+
Whether duplicate entries in the `user_dictionary` should be removed, using each entry's `text` to detect duplicates.
64+
Defaults to `false`, in which case duplicate entries cause an error.
65+
6166
`user_dictionary`::
6267
+
6368
--
@@ -104,7 +109,8 @@ PUT nori_sample
104109
"type": "nori_tokenizer",
105110
"decompound_mode": "mixed",
106111
"discard_punctuation": "false",
107-
"user_dictionary": "userdict_ko.txt"
112+
"user_dictionary": "userdict_ko.txt",
113+
"lenient": "true"
108114
}
109115
},
110116
"analyzer": {
@@ -299,7 +305,6 @@ Which responds with:
299305
}
300306
--------------------------------------------------
301307

302-
303308
[[analysis-nori-speech]]
304309
==== `nori_part_of_speech` token filter
305310

modules/transport-netty4/src/main/java/org/elasticsearch/http/netty4/Netty4HttpAggregator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception
4646
assert msg instanceof HttpObject;
4747
if (msg instanceof HttpRequest request) {
4848
var preReq = HttpHeadersAuthenticatorUtils.asHttpPreRequest(request);
49-
aggregating = decider.test(preReq) && IGNORE_TEST.test(preReq);
49+
aggregating = (decider.test(preReq) && IGNORE_TEST.test(preReq)) || request.decoderResult().isFailure();
5050
}
5151
if (aggregating || msg instanceof FullHttpRequest) {
5252
super.channelRead(ctx, msg);

modules/transport-netty4/src/main/java/org/elasticsearch/http/netty4/Netty4HttpRequest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import io.netty.handler.codec.http.HttpHeaderNames;
1818
import io.netty.handler.codec.http.HttpHeaders;
1919
import io.netty.handler.codec.http.HttpMethod;
20+
import io.netty.handler.codec.http.QueryStringDecoder;
2021
import io.netty.handler.codec.http.cookie.Cookie;
2122
import io.netty.handler.codec.http.cookie.ServerCookieDecoder;
2223
import io.netty.handler.codec.http.cookie.ServerCookieEncoder;
@@ -48,6 +49,7 @@ public class Netty4HttpRequest implements HttpRequest {
4849
private final Exception inboundException;
4950
private final boolean pooled;
5051
private final int sequence;
52+
private final QueryStringDecoder queryStringDecoder;
5153

5254
Netty4HttpRequest(int sequence, io.netty.handler.codec.http.HttpRequest request, Netty4HttpRequestBodyStream contentStream) {
5355
this(
@@ -94,6 +96,7 @@ private Netty4HttpRequest(
9496
this.pooled = pooled;
9597
this.released = released;
9698
this.inboundException = inboundException;
99+
this.queryStringDecoder = new QueryStringDecoder(request.uri());
97100
}
98101

99102
@Override
@@ -106,6 +109,11 @@ public String uri() {
106109
return request.uri();
107110
}
108111

112+
@Override
113+
public String rawPath() {
114+
return queryStringDecoder.rawPath();
115+
}
116+
109117
@Override
110118
public HttpBody body() {
111119
assert released.get() == false;

modules/transport-netty4/src/main/java/org/elasticsearch/http/netty4/Netty4HttpServerTransport.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -375,9 +375,8 @@ protected HttpMessage createMessage(String[] initialLine) throws Exception {
375375
final HttpObjectAggregator aggregator = new Netty4HttpAggregator(
376376
handlingSettings.maxContentLength(),
377377
httpPreRequest -> enabled.get() == false
378-
|| (httpPreRequest.uri().contains("_bulk") == false
379-
|| httpPreRequest.uri().contains("_bulk_update")
380-
|| httpPreRequest.uri().contains("/_xpack/monitoring/_bulk"))
378+
|| ((httpPreRequest.rawPath().endsWith("/_bulk") == false)
379+
|| httpPreRequest.rawPath().startsWith("/_xpack/monitoring/_bulk"))
381380
);
382381
aggregator.setMaxCumulationBufferComponents(transport.maxCompositeBufferComponents);
383382
ch.pipeline()

muted-tests.yml

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,6 @@ tests:
215215
- class: org.elasticsearch.xpack.sql.qa.security.JdbcSqlSpecIT
216216
method: test {case-functions.testSelectInsertWithLcaseAndLengthWithOrderBy}
217217
issue: https://github.com/elastic/elasticsearch/issues/112642
218-
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
219-
method: testHistogramAggregation
220-
issue: https://github.com/elastic/elasticsearch/issues/113109
221218
- class: org.elasticsearch.action.admin.cluster.node.stats.NodeStatsTests
222219
method: testChunking
223220
issue: https://github.com/elastic/elasticsearch/issues/113139
@@ -262,15 +259,9 @@ tests:
262259
- class: org.elasticsearch.index.mapper.DoubleRangeFieldMapperTests
263260
method: testSyntheticSourceKeepAll
264261
issue: https://github.com/elastic/elasticsearch/issues/113234
265-
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
266-
method: testTermsQuery
267-
issue: https://github.com/elastic/elasticsearch/issues/113246
268262
- class: org.elasticsearch.integration.KibanaUserRoleIntegTests
269263
method: testGetMappings
270264
issue: https://github.com/elastic/elasticsearch/issues/113260
271-
- class: org.elasticsearch.datastreams.logsdb.qa.StandardVersusLogsIndexModeRandomDataChallengeRestIT
272-
method: testMatchAllQuery
273-
issue: https://github.com/elastic/elasticsearch/issues/113265
274265
- class: org.elasticsearch.xpack.security.authz.SecurityScrollTests
275266
method: testSearchAndClearScroll
276267
issue: https://github.com/elastic/elasticsearch/issues/113285
@@ -286,6 +277,41 @@ tests:
286277
- class: org.elasticsearch.xpack.esql.ccq.MultiClusterSpecIT
287278
method: test {stats.DocsStatsGroupByMultipleValues}
288279
issue: https://github.com/elastic/elasticsearch/issues/113296
280+
- class: org.elasticsearch.xpack.esql.qa.mixed.MixedClusterEsqlSpecIT
281+
issue: https://github.com/elastic/elasticsearch/issues/113298
282+
- class: org.elasticsearch.integration.KibanaUserRoleIntegTests
283+
method: testGetIndex
284+
issue: https://github.com/elastic/elasticsearch/issues/113311
285+
- class: org.elasticsearch.packaging.test.WindowsServiceTests
286+
method: test81JavaOptsInJvmOptions
287+
issue: https://github.com/elastic/elasticsearch/issues/113313
288+
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
289+
method: test {p0=esql/50_index_patterns/disjoint_mappings}
290+
issue: https://github.com/elastic/elasticsearch/issues/113315
291+
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
292+
method: test {p0=wildcard/10_wildcard_basic/Query_string wildcard query}
293+
issue: https://github.com/elastic/elasticsearch/issues/113316
294+
- class: org.elasticsearch.index.mapper.LongRangeFieldMapperTests
295+
method: testSyntheticSourceKeepAll
296+
issue: https://github.com/elastic/elasticsearch/issues/113324
297+
- class: org.elasticsearch.backwards.MixedClusterClientYamlTestSuiteIT
298+
method: test {p0=mtermvectors/10_basic/Tests catching other exceptions per item}
299+
issue: https://github.com/elastic/elasticsearch/issues/113325
300+
- class: org.elasticsearch.index.mapper.IntegerRangeFieldMapperTests
301+
method: testSyntheticSourceKeepArrays
302+
issue: https://github.com/elastic/elasticsearch/issues/113326
303+
- class: org.elasticsearch.xpack.test.rest.XPackRestIT
304+
method: test {p0=transform/transforms_force_delete/Test force deleting a running transform}
305+
issue: https://github.com/elastic/elasticsearch/issues/113327
306+
- class: org.elasticsearch.integration.KibanaUserRoleIntegTests
307+
method: testValidateQuery
308+
issue: https://github.com/elastic/elasticsearch/issues/113328
309+
- class: org.elasticsearch.index.mapper.LongRangeFieldMapperTests
310+
method: testSyntheticSourceKeepArrays
311+
issue: https://github.com/elastic/elasticsearch/issues/113335
312+
- class: org.elasticsearch.xpack.security.support.SecurityIndexManagerIntegTests
313+
method: testOnIndexAvailableForSearchIndexAlreadyAvailable
314+
issue: https://github.com/elastic/elasticsearch/issues/113336
289315

290316
# Examples:
291317
#

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiTokenizerFactory.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
3333
private static final String NBEST_COST = "nbest_cost";
3434
private static final String NBEST_EXAMPLES = "nbest_examples";
3535
private static final String DISCARD_COMPOUND_TOKEN = "discard_compound_token";
36+
private static final String LENIENT = "lenient";
3637

3738
private final UserDictionary userDictionary;
3839
private final Mode mode;
@@ -58,14 +59,23 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5859
"It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
5960
);
6061
}
61-
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false, true);
62+
List<String> ruleList = Analysis.getWordList(
63+
env,
64+
settings,
65+
USER_DICT_PATH_OPTION,
66+
USER_DICT_RULES_OPTION,
67+
LENIENT,
68+
false, // typically don't want to remove comments as deduplication will provide better feedback
69+
true
70+
);
6271
if (ruleList == null || ruleList.isEmpty()) {
6372
return null;
6473
}
6574
StringBuilder sb = new StringBuilder();
6675
for (String line : ruleList) {
6776
sb.append(line).append(System.lineSeparator());
6877
}
78+
6979
try (Reader rulesReader = new StringReader(sb.toString())) {
7080
return UserDictionary.open(rulesReader);
7181
} catch (IOException e) {

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,26 @@ public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
445445
)
446446
.build();
447447
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
448-
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
448+
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [4]"));
449+
}
450+
451+
public void testKuromojiAnalyzerDuplicateUserDictRuleDeduplication() throws Exception {
452+
Settings settings = Settings.builder()
453+
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
454+
.put("index.analysis.analyzer.my_analyzer.lenient", "true")
455+
.putList(
456+
"index.analysis.analyzer.my_analyzer.user_dictionary_rules",
457+
"c++,c++,w,w",
458+
"#comment",
459+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞",
460+
"制限スピード,制限スピード,セイゲンスピード,テスト名詞"
461+
)
462+
.build();
463+
TestAnalysis analysis = createTestAnalysis(settings);
464+
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
465+
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
466+
assertTokenStreamContents(stream, new String[] { "制限スピード" });
467+
}
449468
}
450469

451470
public void testDiscardCompoundToken() throws Exception {

plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/NoriTokenizerFactory.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
public class NoriTokenizerFactory extends AbstractTokenizerFactory {
3232
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
3333
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
34+
private static final String LENIENT = "lenient";
3435

3536
private final UserDictionary userDictionary;
3637
private final KoreanTokenizer.DecompoundMode decompoundMode;
@@ -54,7 +55,8 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
5455
settings,
5556
USER_DICT_PATH_OPTION,
5657
USER_DICT_RULES_OPTION,
57-
true,
58+
LENIENT,
59+
false, // typically don't want to remove comments as deduplication will provide better feedback
5860
isSupportDuplicateCheck(indexSettings)
5961
);
6062
if (ruleList == null || ruleList.isEmpty()) {

0 commit comments

Comments
 (0)