Move RawString class into separate file and rename to EncodedString

jordan-powers · jordan-powers · commit 20616e6f8b7c · 2025-04-17T14:26:27.000-07:00
diff --git a/server/src/main/java/org/elasticsearch/common/bytes/EncodedString.java b/server/src/main/java/org/elasticsearch/common/bytes/EncodedString.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.common.bytes;
+
+import org.apache.lucene.util.BytesRef;
+
+import java.util.Objects;
+
+/**
+ * Holds text as a UTF-16 {@link String}, a UTF-8 {@link BytesRef}, or both; the missing form is computed lazily and cached.
+ */
+public class EncodedString {
+    private final int charCount;
+    private String stringValue;
+    private BytesRef bytesValue;
+
+    /** Wraps UTF-8 bytes; {@code charCount} must equal the length of the decoded string (asserted when it is decoded). */
+    public EncodedString(BytesRef bytesValue, int charCount) {
+        this.bytesValue = Objects.requireNonNull(bytesValue);
+        this.charCount = charCount;
+    }
+
+    public EncodedString(String stringValue) {
+        this.stringValue = Objects.requireNonNull(stringValue);
+        this.charCount = this.stringValue.length();
+    }
+
+    /** Returns the UTF-8 form, encoding the string and caching the result if it is not present yet. */
+    public BytesRef bytesValue() {
+        if (bytesValue == null) {
+            bytesValue = new BytesRef(stringValue);
+        }
+        return bytesValue;
+    }
+
+    /** Returns the UTF-16 form, decoding the bytes and caching the result if it is not present yet. */
+    public String stringValue() {
+        if (stringValue == null) {
+            stringValue = bytesValue.utf8ToString();
+            assert stringValue.length() == charCount;
+        }
+        return stringValue;
+    }
+
+    /** Returns the character count recorded at construction. */
+    public int length() {
+        return charCount;
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -34,6 +34,7 @@
 import org.apache.lucene.util.automaton.CompiledAutomaton;
 import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
 import org.apache.lucene.util.automaton.Operations;
+import org.elasticsearch.common.bytes.EncodedString;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.AutomatonQueries;
@@ -1104,65 +1105,25 @@ public String getOffsetFieldName() {
         return offsetsFieldName;
     }
 
-    /**
-     * Class that holds either a UTF-16 String or a UTF-8 BytesRef, and lazily converts between the two.
-     */
-    private static class RawString {
-        private BytesRef bytesValue;
-        private String stringValue;
-
-        RawString(BytesRef bytesValue) {
-            this.bytesValue = Objects.requireNonNull(bytesValue);
-        }
-
-        RawString(String stringValue) {
-            this.stringValue = Objects.requireNonNull(stringValue);
-        }
-
-        BytesRef bytesValue() {
-            if (bytesValue != null) {
-                return bytesValue;
-            }
-
-            bytesValue = new BytesRef(stringValue);
-            return bytesValue;
-        }
-
-        String stringValue() {
-            if (stringValue != null) {
-                return stringValue;
-            }
-
-            stringValue = bytesValue.utf8ToString();
-            return stringValue;
-        }
-
-        int length() {
-            if (stringValue != null) {
-                return stringValue.length();
-            } else {
-                // This works because we currently use raw utf-8 encoding only for ascii-only strings.
-                return bytesValue.length;
-            }
-        }
-    }
-
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        RawString value;
+        EncodedString value;
         var bytesValue = context.parser().textRefOrNull();
         if (bytesValue != null) {
-            value = new RawString(new BytesRef(bytesValue.bytes(), bytesValue.start(), bytesValue.end() - bytesValue.start()));
+            int len = bytesValue.end() - bytesValue.start();
+            // For now, the byte length `len` can double as `charCount` because textRefOrNull only returns ASCII-only, unescaped
+            // strings, which means each character uses exactly 1 byte.
+            value = new EncodedString(new BytesRef(bytesValue.bytes(), bytesValue.start(), len), len);
         } else {
             var stringValue = context.parser().textOrNull();
             if (stringValue != null) {
-                value = new RawString(stringValue);
+                value = new EncodedString(stringValue);
             } else {
                 value = null;
             }
         }
 
         if (value == null && fieldType().nullValue != null) {
-            value = new RawString(fieldType().nullValue);
+            value = new EncodedString(fieldType().nullValue);
         }
 
         boolean indexed = indexValue(context, value);
@@ -1186,10 +1147,10 @@ protected void indexScriptValues(
     }
 
     private boolean indexValue(DocumentParserContext context, String value) {
-        return indexValue(context, new RawString(value));
+        return indexValue(context, new EncodedString(value));
     }
 
-    private boolean indexValue(DocumentParserContext context, RawString value) {
+    private boolean indexValue(DocumentParserContext context, EncodedString value) {
         if (value == null) {
             return false;
         }
@@ -1210,7 +1171,7 @@ private boolean indexValue(DocumentParserContext context, RawString value) {
 
         if (fieldType().normalizer() != Lucene.KEYWORD_ANALYZER) {
             String normalizedString = normalizeValue(fieldType().normalizer(), fullPath(), value.stringValue());
-            value = new RawString(normalizedString);
+            value = new EncodedString(normalizedString);
         }
 
         BytesRef binaryValue = value.bytesValue();