3434import org .apache .lucene .util .automaton .CompiledAutomaton ;
3535import org .apache .lucene .util .automaton .CompiledAutomaton .AUTOMATON_TYPE ;
3636import org .apache .lucene .util .automaton .Operations ;
37+ import org .elasticsearch .common .bytes .EncodedString ;
3738import org .elasticsearch .common .lucene .BytesRefs ;
3839import org .elasticsearch .common .lucene .Lucene ;
3940import org .elasticsearch .common .lucene .search .AutomatonQueries ;
@@ -1104,65 +1105,25 @@ public String getOffsetFieldName() {
11041105 return offsetsFieldName ;
11051106 }
11061107
1107- /**
1108- * Class that holds either a UTF-16 String or a UTF-8 BytesRef, and lazily converts between the two.
1109- */
1110- private static class RawString {
// Exactly one of these two fields is set by a constructor; the other stays null until the
// matching accessor below computes and caches it.
// NOTE(review): no synchronization is visible in this class - confirm instances are not shared across threads.
1111- private BytesRef bytesValue ;
1112- private String stringValue ;
1113-
// Wraps an existing BytesRef as the initial representation. A null argument is rejected by
// Objects.requireNonNull (NullPointerException).
1114- RawString (BytesRef bytesValue ) {
1115- this .bytesValue = Objects .requireNonNull (bytesValue );
1116- }
1117-
// Wraps a String as the initial representation. A null argument is rejected by
// Objects.requireNonNull (NullPointerException).
1118- RawString (String stringValue ) {
1119- this .stringValue = Objects .requireNonNull (stringValue );
1120- }
1121-
// Returns the BytesRef form. If it is not yet available it is built once from the String
// via new BytesRef(stringValue), stored in the field, and reused on later calls.
1122- BytesRef bytesValue () {
1123- if (bytesValue != null ) {
1124- return bytesValue ;
1125- }
1126-
1127- bytesValue = new BytesRef (stringValue );
1128- return bytesValue ;
1129- }
1130-
// Returns the String form. If it is not yet available it is decoded once from the BytesRef
// via utf8ToString(), stored in the field, and reused on later calls.
1131- String stringValue () {
1132- if (stringValue != null ) {
1133- return stringValue ;
1134- }
1135-
1136- stringValue = bytesValue .utf8ToString ();
1137- return stringValue ;
1138- }
1139-
// Returns String.length() when the String form is present, otherwise the byte count of the
// BytesRef. The two branches measure different units (UTF-16 chars vs. bytes), so the result
// is only consistent when the content is ASCII-only, as the comment in the else branch states.
// Which branch runs also depends on whether stringValue() has already populated the String field.
1140- int length () {
1141- if (stringValue != null ) {
1142- return stringValue .length ();
1143- } else {
1144- // This works because we currently use raw utf-8 encoding only for ascii-only strings.
1145- return bytesValue .length ;
1146- }
1147- }
1148- }
1149-
11501108 protected void parseCreateField (DocumentParserContext context ) throws IOException {
1151- RawString value ;
1109+ EncodedString value ;
11521110 var bytesValue = context .parser ().textRefOrNull ();
11531111 if (bytesValue != null ) {
1154- value = new RawString (new BytesRef (bytesValue .bytes (), bytesValue .start (), bytesValue .end () - bytesValue .start ()));
1112+ int len = bytesValue .end () - bytesValue .start ();
1113+ // For now, we can use `len` for `charCount` because textRefOrNull only returns ascii-encoded unescaped strings,
1114+ // which means each character uses exactly 1 byte.
1115+ value = new EncodedString (new BytesRef (bytesValue .bytes (), bytesValue .start (), len ), len );
11551116 } else {
11561117 var stringValue = context .parser ().textOrNull ();
11571118 if (stringValue != null ) {
1158- value = new RawString (stringValue );
1119+ value = new EncodedString (stringValue );
11591120 } else {
11601121 value = null ;
11611122 }
11621123 }
11631124
11641125 if (value == null && fieldType ().nullValue != null ) {
1165- value = new RawString (fieldType ().nullValue );
1126+ value = new EncodedString (fieldType ().nullValue );
11661127 }
11671128
11681129 boolean indexed = indexValue (context , value );
@@ -1186,10 +1147,10 @@ protected void indexScriptValues(
11861147 }
11871148
// Convenience overload for callers that hold a plain String: wraps the value in the string-holder
// type and delegates to the indexValue overload that accepts that holder, returning its result.
// The minus-marked line is the previous wrapper (RawString); the plus-marked line is its replacement (EncodedString).
// NOTE(review): value is handed to the wrapper constructor without a null check here; whether
// EncodedString accepts null is not visible in this view - verify against callers.
11881149 private boolean indexValue (DocumentParserContext context , String value ) {
1189- return indexValue (context , new RawString (value ));
1150+ return indexValue (context , new EncodedString (value ));
11901151 }
11911152
1192- private boolean indexValue (DocumentParserContext context , RawString value ) {
1153+ private boolean indexValue (DocumentParserContext context , EncodedString value ) {
11931154 if (value == null ) {
11941155 return false ;
11951156 }
@@ -1210,7 +1171,7 @@ private boolean indexValue(DocumentParserContext context, RawString value) {
12101171
12111172 if (fieldType ().normalizer () != Lucene .KEYWORD_ANALYZER ) {
12121173 String normalizedString = normalizeValue (fieldType ().normalizer (), fullPath (), value .stringValue ());
1213- value = new RawString (normalizedString );
1174+ value = new EncodedString (normalizedString );
12141175 }
12151176
12161177 BytesRef binaryValue = value .bytesValue ();
0 commit comments