Skip to content

Commit 20616e6

Browse files
committed
Move RawString class into separate file and rename to EncodedString
1 parent b0f701c commit 20616e6

File tree

2 files changed

+67
-50
lines changed

2 files changed

+67
-50
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.common.bytes;
11+
12+
import org.apache.lucene.util.BytesRef;
13+
14+
import java.util.Objects;
15+
16+
/**
17+
* Class that holds either a UTF-16 String or a UTF-8 BytesRef, and lazily converts between the two.
18+
*/
19+
public class EncodedString {
20+
private BytesRef bytesValue;
21+
private String stringValue;
22+
private final int charCount;
23+
24+
public EncodedString(BytesRef bytesValue, int charCount) {
25+
this.bytesValue = Objects.requireNonNull(bytesValue);
26+
this.charCount = charCount;
27+
}
28+
29+
public EncodedString(String stringValue) {
30+
this.stringValue = Objects.requireNonNull(stringValue);
31+
this.charCount = stringValue.length();
32+
}
33+
34+
public BytesRef bytesValue() {
35+
if (bytesValue != null) {
36+
return bytesValue;
37+
}
38+
39+
bytesValue = new BytesRef(stringValue);
40+
return bytesValue;
41+
}
42+
43+
public String stringValue() {
44+
if (stringValue != null) {
45+
return stringValue;
46+
}
47+
48+
stringValue = bytesValue.utf8ToString();
49+
assert stringValue.length() == charCount;
50+
return stringValue;
51+
}
52+
53+
public int length() {
54+
return charCount;
55+
}
56+
}

server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java

Lines changed: 11 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.apache.lucene.util.automaton.CompiledAutomaton;
3535
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
3636
import org.apache.lucene.util.automaton.Operations;
37+
import org.elasticsearch.common.bytes.EncodedString;
3738
import org.elasticsearch.common.lucene.BytesRefs;
3839
import org.elasticsearch.common.lucene.Lucene;
3940
import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -1104,65 +1105,25 @@ public String getOffsetFieldName() {
11041105
return offsetsFieldName;
11051106
}
11061107

1107-
/**
1108-
* Class that holds either a UTF-16 String or a UTF-8 BytesRef, and lazily converts between the two.
1109-
*/
1110-
private static class RawString {
1111-
private BytesRef bytesValue;
1112-
private String stringValue;
1113-
1114-
RawString(BytesRef bytesValue) {
1115-
this.bytesValue = Objects.requireNonNull(bytesValue);
1116-
}
1117-
1118-
RawString(String stringValue) {
1119-
this.stringValue = Objects.requireNonNull(stringValue);
1120-
}
1121-
1122-
BytesRef bytesValue() {
1123-
if (bytesValue != null) {
1124-
return bytesValue;
1125-
}
1126-
1127-
bytesValue = new BytesRef(stringValue);
1128-
return bytesValue;
1129-
}
1130-
1131-
String stringValue() {
1132-
if (stringValue != null) {
1133-
return stringValue;
1134-
}
1135-
1136-
stringValue = bytesValue.utf8ToString();
1137-
return stringValue;
1138-
}
1139-
1140-
int length() {
1141-
if (stringValue != null) {
1142-
return stringValue.length();
1143-
} else {
1144-
// This works because we currently use raw utf-8 encoding only for ascii-only strings.
1145-
return bytesValue.length;
1146-
}
1147-
}
1148-
}
1149-
11501108
protected void parseCreateField(DocumentParserContext context) throws IOException {
1151-
RawString value;
1109+
EncodedString value;
11521110
var bytesValue = context.parser().textRefOrNull();
11531111
if (bytesValue != null) {
1154-
value = new RawString(new BytesRef(bytesValue.bytes(), bytesValue.start(), bytesValue.end() - bytesValue.start()));
1112+
int len = bytesValue.end() - bytesValue.start();
1113+
// For now, we can use `len` for `charCount` because textRefOrNull only returns ascii-encoded unescaped strings,
1114+
// which means each character uses exactly 1 byte.
1115+
value = new EncodedString(new BytesRef(bytesValue.bytes(), bytesValue.start(), len), len);
11551116
} else {
11561117
var stringValue = context.parser().textOrNull();
11571118
if (stringValue != null) {
1158-
value = new RawString(stringValue);
1119+
value = new EncodedString(stringValue);
11591120
} else {
11601121
value = null;
11611122
}
11621123
}
11631124

11641125
if (value == null && fieldType().nullValue != null) {
1165-
value = new RawString(fieldType().nullValue);
1126+
value = new EncodedString(fieldType().nullValue);
11661127
}
11671128

11681129
boolean indexed = indexValue(context, value);
@@ -1186,10 +1147,10 @@ protected void indexScriptValues(
11861147
}
11871148

11881149
private boolean indexValue(DocumentParserContext context, String value) {
1189-
return indexValue(context, new RawString(value));
1150+
return indexValue(context, new EncodedString(value));
11901151
}
11911152

1192-
private boolean indexValue(DocumentParserContext context, RawString value) {
1153+
private boolean indexValue(DocumentParserContext context, EncodedString value) {
11931154
if (value == null) {
11941155
return false;
11951156
}
@@ -1210,7 +1171,7 @@ private boolean indexValue(DocumentParserContext context, RawString value) {
12101171

12111172
if (fieldType().normalizer() != Lucene.KEYWORD_ANALYZER) {
12121173
String normalizedString = normalizeValue(fieldType().normalizer(), fullPath(), value.stringValue());
1213-
value = new RawString(normalizedString);
1174+
value = new EncodedString(normalizedString);
12141175
}
12151176

12161177
BytesRef binaryValue = value.bytesValue();

0 commit comments

Comments
 (0)