From 884526bba60b758592504c964ca727862be35fc1 Mon Sep 17 00:00:00 2001 From: Florian Bernd Date: Tue, 25 Nov 2025 15:46:33 +0100 Subject: [PATCH 1/2] Add custom vector data converters for high performance ingest scenarios --- .../ElasticsearchClientSettings.cs | 16 + .../IElasticsearchClientSettings.cs | 31 +- .../_Shared/Next/JsonWriterExtensions.cs | 15 + .../_Shared/Next/VectorConverters.cs | 298 ++++++++++++++++++ 4 files changed, 356 insertions(+), 4 deletions(-) create mode 100644 src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs index a8444286571..b313c76fd89 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs @@ -115,6 +115,8 @@ public abstract class ElasticsearchClientSettingsBase : private readonly Serializer _sourceSerializer; private BeforeRequestEvent? _onBeforeRequest; private bool _experimentalEnableSerializeNullInferredValues; + private FloatVectorDataEncoding _floatVectorDataEncoding = Elasticsearch.FloatVectorDataEncoding.Base64; + private ByteVectorDataEncoding _byteVectorDataEncoding = Elasticsearch.ByteVectorDataEncoding.Base64; private ExperimentalSettings _experimentalSettings = new(); private bool _defaultDisableAllInference; @@ -165,6 +167,8 @@ protected ElasticsearchClientSettingsBase( FluentDictionary IElasticsearchClientSettings.RouteProperties => _routeProperties; Serializer IElasticsearchClientSettings.SourceSerializer => _sourceSerializer; BeforeRequestEvent? 
IElasticsearchClientSettings.OnBeforeRequest => _onBeforeRequest; + FloatVectorDataEncoding IElasticsearchClientSettings.FloatVectorDataEncoding => _floatVectorDataEncoding; + ByteVectorDataEncoding IElasticsearchClientSettings.ByteVectorDataEncoding => _byteVectorDataEncoding; ExperimentalSettings IElasticsearchClientSettings.Experimental => _experimentalSettings; bool IElasticsearchClientSettings.ExperimentalEnableSerializeNullInferredValues => _experimentalEnableSerializeNullInferredValues; @@ -198,6 +202,18 @@ public TConnectionSettings DefaultFieldNameInferrer(Func fieldNa public TConnectionSettings ExperimentalEnableSerializeNullInferredValues(bool enabled = true) => Assign(enabled, (a, v) => a._experimentalEnableSerializeNullInferredValues = v); + /// + /// The default vector data encoding to use. + /// This settings instance for chaining. + public TConnectionSettings FloatVectorDataEncoding(FloatVectorDataEncoding encoding) => + Assign(encoding, (a, v) => a._floatVectorDataEncoding = v); + + /// + /// The default vector data encoding to use. + /// This settings instance for chaining. + public TConnectionSettings ByteVectorDataEncoding(ByteVectorDataEncoding encoding) => + Assign(encoding, (a, v) => a._byteVectorDataEncoding = v); + public TConnectionSettings Experimental(ExperimentalSettings settings) => Assign(settings, (a, v) => a._experimentalSettings = v); diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs index ffda6461b83..376b8d41da8 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs @@ -116,14 +116,37 @@ public interface IElasticsearchClientSettings : ITransportConfiguration BeforeRequestEvent? 
OnBeforeRequest { get; } /// - /// This is an advanced setting which controls serialization behaviour for inferred properies such as ID, routing and index name. - /// When enabled, it may reduce allocations on serialisation paths where the cost can be more significant, such as in bulk operations. + /// This is an advanced setting which controls serialization behaviour for inferred properties such as ID, routing and index name. + /// When enabled, it may reduce allocations on serialization paths where the cost can be more significant, such as in bulk operations. /// As a by-product it may cause null values to be included in the serialized data and impact payload size. This will only be a concern should some - /// typed not have inferrence mappings defined for the required properties. + /// typed not have inference mappings defined for the required properties. /// - /// This is marked as experiemental and may be removed or renamed in the future once its impact is evaluated. + /// This is marked as experimental and may be removed or renamed in the future once its impact is evaluated. bool ExperimentalEnableSerializeNullInferredValues { get; } + /// + /// Controls the vector data encoding to use for properties + /// in documents during ingestion when the is used. + /// + /// + /// Setting this value to provides backwards + /// compatibility when talking to Elasticsearch servers with a version older than 9.3.0 + /// (required for ). + /// + FloatVectorDataEncoding FloatVectorDataEncoding { get; } + + /// + /// Controls the vector data encoding to use for properties + /// in documents during ingestion when the is used. + /// + /// + /// Setting this value to provides backwards + /// compatibility when talking to Elasticsearch servers with a version older than 8.14.0 + /// (required for ) or older than 9.3.0 (required + /// for ). + /// + ByteVectorDataEncoding ByteVectorDataEncoding { get; } + /// /// Experimental settings. 
/// diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Next/JsonWriterExtensions.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Next/JsonWriterExtensions.cs index 926a284fb90..2258026d8c0 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Next/JsonWriterExtensions.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Next/JsonWriterExtensions.cs @@ -250,6 +250,21 @@ public static void WriteUnionValue(this Utf8JsonWriter writer, JsonSeria ); } + public static void WriteSpanValue(this Utf8JsonWriter writer, JsonSerializerOptions options, ReadOnlySpan span, + JsonWriteFunc? writeElement) + { + writeElement ??= static (w, o, v) => WriteValue(w, o, v); + + writer.WriteStartArray(); + + foreach (var element in span) + { + writeElement(writer, options, element); + } + + writer.WriteEndArray(); + } + #endregion Delegate Based Write Methods #region Specialized Write Methods diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs new file mode 100644 index 00000000000..f6a6bd2aebe --- /dev/null +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs @@ -0,0 +1,298 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information. + +using System; +using System.Buffers; +using System.Buffers.Binary; +using System.Runtime.InteropServices; +using System.Text.Json; +using System.Text.Json.Serialization; + +using Elastic.Clients.Elasticsearch.Serialization; + +namespace Elastic.Clients.Elasticsearch; + +/// +/// The encoding to use when serializing vector data using the converter. +/// +public enum FloatVectorDataEncoding +{ + /// + /// Legacy (JSON array) vector encoding for backwards compatibility. + /// + Legacy, + + /// + /// Base64 vector encoding. 
+ /// + /// + /// Base64 encoding is available starting from Elasticsearch 9.3.0. + /// + Base64 +} + +public sealed class FloatVectorDataConverter : + JsonConverter> +{ + private FloatVectorDataEncoding? _encoding; + + public override ReadOnlyMemory Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + return reader.TokenType switch + { + JsonTokenType.StartArray => new(reader.ReadCollectionValue(options, null)!.ToArray()), + JsonTokenType.String => ReadBase64VectorData(ref reader), + _ => throw reader.UnexpectedTokenException(JsonTokenType.StartArray, JsonTokenType.String) + }; + } + + public override void Write(Utf8JsonWriter writer, ReadOnlyMemory value, JsonSerializerOptions options) + { + var encoding = _encoding; + if (encoding is null) + { + var settings = ContextProvider.GetContext(options); + _encoding = settings.FloatVectorDataEncoding; + } + + switch (_encoding) + { + case FloatVectorDataEncoding.Legacy: + writer.WriteSpanValue(options, value.Span, null); + break; + + case FloatVectorDataEncoding.Base64: + WriteBase64VectorData(writer, value); + break; + + default: + throw new NotSupportedException(); + } + } + + private static ReadOnlyMemory ReadBase64VectorData(ref Utf8JsonReader reader) + { + var bytes = reader.GetBytesFromBase64(); + + if ((bytes.Length & 3) != 0) + { + throw new ArgumentException("Decoded vector data length is not a multiple of 4 (not valid 32-bit floats)."); + } + + var span = bytes.AsSpan(); + + if (BitConverter.IsLittleEndian) + { + // Host is little-endian. We must swap the byte order. 
+
+            var intSourceDest = MemoryMarshal.Cast<byte, int>(span);
+
+            for (var i = 0; i < intSourceDest.Length; i++)
+            {
+                intSourceDest[i] = BinaryPrimitives.ReverseEndianness(intSourceDest[i]);
+            }
+        }
+
+        var result = new float[bytes.Length / 4];
+        Buffer.BlockCopy(bytes, 0, result, 0, bytes.Length);
+
+        return new(result);
+    }
+
+    /// <summary>
+    /// Writes <paramref name="value"/> as a single base64 JSON string whose payload is the
+    /// vector's 32-bit elements in big-endian byte order.
+    /// </summary>
+    /// <param name="writer">The writer that receives the string value.</param>
+    /// <param name="value">The vector data; an empty vector is written as an empty string.</param>
+    private static void WriteBase64VectorData(Utf8JsonWriter writer, ReadOnlyMemory<float> value)
+    {
+        if (value.IsEmpty)
+        {
+            writer.WriteStringValue(string.Empty);
+            return;
+        }
+
+        // If the host is big-endian we can reinterpret the memory as bytes without copying.
+        if (!BitConverter.IsLittleEndian)
+        {
+            writer.WriteBase64StringValue(MemoryMarshal.AsBytes(value.Span));
+
+            // The value has been written completely. Without this return the code below
+            // would run as well and write the vector a second time (byte-swapped).
+            return;
+        }
+
+        // Host is little-endian. We must swap the byte order.
+
+        var pool = MemoryPool<byte>.Shared;
+        var required = checked(value.Length * sizeof(float));
+        var owner = pool.Rent(required);
+
+        try
+        {
+            // The pool may hand out a larger buffer than requested; only use the exact size.
+            var dest = owner.Memory.Span[..required];
+
+            // Reinterpret both buffers as 32-bit integers so each element can be byte-swapped.
+            var intSource = MemoryMarshal.Cast<float, int>(value.Span);
+            var intDest = MemoryMarshal.Cast<byte, int>(dest);
+
+            for (var i = 0; i < intSource.Length; i++)
+            {
+                intDest[i] = BinaryPrimitives.ReverseEndianness(intSource[i]);
+            }
+
+            writer.WriteBase64StringValue(dest);
+        }
+        finally
+        {
+            owner.Dispose();
+        }
+    }
+}
+
+/// <summary>
+/// The encoding to use when serializing vector data using the <see cref="ByteVectorDataConverter"/> converter.
+/// </summary>
+public enum ByteVectorDataEncoding
+{
+    /// <summary>
+    /// Legacy (JSON array) vector encoding for backwards compatibility.
+    /// </summary>
+    Legacy,
+
+    /// <summary>
+    /// Hexadecimal string vector encoding.
+    /// </summary>
+    /// <remarks>
+    /// Hexadecimal encoding is available starting from Elasticsearch 8.14.0.
+    /// </remarks>
+    Hex,
+
+    /// <summary>
+    /// Base64 vector encoding.
+    /// </summary>
+    /// <remarks>
+    /// Base64 encoding is available starting from Elasticsearch 9.3.0.
+    /// </remarks>
+    Base64
+}
+
+public sealed class ByteVectorDataConverter :
+    JsonConverter<ReadOnlyMemory<byte>>
+{
+    private ByteVectorDataEncoding? 
_encoding; + + public override ReadOnlyMemory Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + return reader.TokenType switch + { + JsonTokenType.StartArray => new(reader.ReadCollectionValue(options, (ref r, _) => unchecked((byte)r.GetSByte()))!.ToArray()), + JsonTokenType.String => ReadStringVectorData(ref reader), + _ => throw reader.UnexpectedTokenException(JsonTokenType.StartArray, JsonTokenType.String) + }; + } + + public override void Write(Utf8JsonWriter writer, ReadOnlyMemory value, JsonSerializerOptions options) + { + if (_encoding is null) + { + var settings = ContextProvider.GetContext(options); + _encoding = settings.ByteVectorDataEncoding; + } + + switch (_encoding) + { + case ByteVectorDataEncoding.Legacy: + writer.WriteSpanValue(options, value.Span, (w, _, b) => w.WriteNumberValue(unchecked((sbyte)b))); + break; + + case ByteVectorDataEncoding.Hex: + WriteHexVectorData(writer, value); + break; + + case ByteVectorDataEncoding.Base64: + writer.WriteBase64StringValue(value.Span); + break; + + default: + throw new NotSupportedException(); + } + } + + private static ReadOnlyMemory ReadStringVectorData(ref Utf8JsonReader reader) + { + if (reader.TryGetBytesFromBase64(out var result)) + { + return result; + } + + return ReadHexVectorData(ref reader); + } + + private static ReadOnlyMemory ReadHexVectorData(ref Utf8JsonReader reader) + { +#if NET5_0_OR_GREATER + var data = Convert.FromHexString(reader.GetString()!); +#else + var data = FromHex(reader.GetString()!); +#endif + + return new(data); + } + + private static void WriteHexVectorData(Utf8JsonWriter writer, ReadOnlyMemory value) + { + if (value.IsEmpty) + { + writer.WriteStringValue(string.Empty); + return; + } + + // We don't use Convert.ToHexString even for .NET 5.0+ to be able to use pooled memory. 
+
+        var pool = MemoryPool<char>.Shared;
+        var required = checked(value.Length * 2);
+        var owner = pool.Rent(required);
+
+        try
+        {
+            var source = value.Span;
+            // The pool may hand out a larger buffer than requested; only use the exact size.
+            var dest = owner.Memory.Span[..required];
+
+            byte b;
+
+            // Each source byte yields two characters: high nibble first, then low nibble.
+            // 0x30 is '0' and 0x37 is 'A' - 10, so the output uses upper-case hex digits.
+            for (int bx = 0, cx = 0; bx < source.Length; ++bx, ++cx)
+            {
+                b = ((byte)(source[bx] >> 4));
+                dest[cx] = (char)(b > 9 ? b + 0x37 : b + 0x30);
+                b = ((byte)(source[bx] & 0x0F));
+                dest[++cx] = (char)(b > 9 ? b + 0x37 : b + 0x30);
+            }
+
+            writer.WriteStringValue(dest);
+        }
+        finally
+        {
+            owner.Dispose();
+        }
+    }
+
+#if !NET5_0_OR_GREATER
+    /// <summary>
+    /// Decodes a hexadecimal string into bytes on targets without <c>Convert.FromHexString</c>.
+    /// </summary>
+    /// <param name="data">The hexadecimal string; two characters per byte.</param>
+    /// <returns>The decoded bytes, or an empty array for an empty string.</returns>
+    /// <exception cref="ArgumentException">The string length is not a multiple of 2.</exception>
+    // NOTE(review): characters outside [0-9A-Fa-f] are not rejected by this method and decode
+    // to arbitrary values; consider validating each digit so malformed input fails loudly.
+    public static byte[] FromHex(string data)
+    {
+        if (data.Length is 0)
+        {
+            return [];
+        }
+
+        if (data.Length % 2 != 0)
+        {
+            throw new ArgumentException("Hex vector data length is not a multiple of 2 (each byte requires two hex digits).");
+        }
+
+        var buffer = new byte[data.Length / 2];
+        char c;
+
+        for (int bx = 0, sx = 0; bx < buffer.Length; ++bx, ++sx)
+        {
+            c = data[sx];
+            buffer[bx] = (byte)((c > '9' ? (c > 'Z' ? (c - 'a' + 10) : (c - 'A' + 10)) : (c - '0')) << 4);
+            c = data[++sx];
+            buffer[bx] |= (byte)(c > '9' ? (c > 'Z' ? 
(c - 'a' + 10) : (c - 'A' + 10)) : (c - '0')); + } + + return buffer; + } +#endif +} From c223ca3182669793d32f96f40c042ae1f616638e Mon Sep 17 00:00:00 2001 From: Florian Bernd Date: Tue, 2 Dec 2025 14:59:31 +0100 Subject: [PATCH 2/2] Add documentation --- docs/reference/source-serialization.md | 58 +++++++++++++++++++ .../ElasticsearchClientSettings.cs | 4 +- .../IElasticsearchClientSettings.cs | 3 + .../_Shared/Next/VectorConverters.cs | 4 +- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/docs/reference/source-serialization.md b/docs/reference/source-serialization.md index 41d6fb074c1..f8c2c14c8f6 100644 --- a/docs/reference/source-serialization.md +++ b/docs/reference/source-serialization.md @@ -16,6 +16,9 @@ Source serialization refers to the process of (de)serializing POCO types in cons - [Registering custom `System.Text.Json` converters](#registering-custom-converters) - [Creating a custom `Serializer`](#creating-custom-serializers) - [Native AOT](#native-aot) +- [Vector data serialization](#vector-data-serialization) + - [Opt‑in on document properties](#optin-on-document-properties) + - [Configure encodings globally](#configure-encodings-globally) ## Modeling documents with types [modeling-documents-with-types] @@ -451,3 +454,58 @@ static void ConfigureOptions(JsonSerializerOptions o) o.TypeInfoResolver = UserTypeSerializerContext.Default; } ``` + +## Vector data serialization [vector-data-serialization] + +Efficient ingestion of high-dimensional vectors often benefits from compact encodings rather than verbose JSON arrays. The client provides opt‑in converters for vector properties in your source documents that serialize to either hexadecimal or `base64` strings, depending on the vector type and the Elasticsearch version you target. + +- Float vectors can use `base64` starting from Elasticsearch 9.3.0. +- Byte/bit vectors can use hexadecimal strings starting from Elasticsearch 8.14.0 and `base64` starting from Elasticsearch 9.3.0. 
+- The legacy representation (JSON arrays) remains available for backwards compatibility. + +Base64 is the preferred format for high‑throughput indexing because it minimizes payload size and reduces JSON parsing overhead. + +### Opt‑in on document properties [optin-on-document-properties] + +Vector encodings are opt‑in. Apply a `System.Text.Json` `JsonConverter` attribute on the vector property of your POCO. For best performance, model the properties as `ReadOnlyMemory<T>`. + +```csharp +using System; +using System.Text.Json.Serialization; +using Elastic.Clients.Elasticsearch.Serialization; + +public class ImageEmbedding +{ + [JsonConverter(typeof(FloatVectorDataConverter))] <1> + public ReadOnlyMemory<float> Vector { get; set; } +} + +public class ByteSignature +{ + [JsonConverter(typeof(ByteVectorDataConverter))] <2> + public ReadOnlyMemory<byte> Signature { get; set; } +} +``` + +1. `FloatVectorDataConverter` enables `base64` encoding for float vectors. +2. `ByteVectorDataConverter` enables `base64` encoding for byte vectors. + +Without these attributes, vectors are serialized using the default source serializer behavior. + +### Configure encodings globally [configure-encodings-globally] + +When the opt‑in attributes are present, you can control the actual wire encoding globally via `ElasticsearchClient` settings on a per‑type basis: + +- `FloatVectorDataEncoding`: controls float vector encoding (legacy arrays or `base64`). +- `ByteVectorDataEncoding`: controls byte/bit vector encoding (legacy arrays, hexadecimal, or `base64`). + +These settings allow a single set of document types to work against mixed clusters. For example, a library using the 8.19.x client can talk to both 8.x and 9.x servers and dynamically opt out of `base64` on older servers without maintaining duplicate POCOs (with/without converter attributes). + +::::{note} + +Set the encoding based on your effective server version: + +- Float vectors: use `base64` for 9.3.0+; otherwise use legacy arrays. 
+- Byte/bit vectors: prefer `base64` for 9.3.0+; use hexadecimal for 8.14.0–9.2.x; otherwise use legacy arrays. + +:::: diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs index b313c76fd89..5b92ec7379d 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/ElasticsearchClientSettings.cs @@ -115,8 +115,8 @@ public abstract class ElasticsearchClientSettingsBase : private readonly Serializer _sourceSerializer; private BeforeRequestEvent? _onBeforeRequest; private bool _experimentalEnableSerializeNullInferredValues; - private FloatVectorDataEncoding _floatVectorDataEncoding = Elasticsearch.FloatVectorDataEncoding.Base64; - private ByteVectorDataEncoding _byteVectorDataEncoding = Elasticsearch.ByteVectorDataEncoding.Base64; + private FloatVectorDataEncoding _floatVectorDataEncoding = Serialization.FloatVectorDataEncoding.Base64; + private ByteVectorDataEncoding _byteVectorDataEncoding = Serialization.ByteVectorDataEncoding.Base64; private ExperimentalSettings _experimentalSettings = new(); private bool _defaultDisableAllInference; diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs index 376b8d41da8..36261666485 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Core/Configuration/IElasticsearchClientSettings.cs @@ -5,7 +5,10 @@ using System; using System.Collections.Generic; using System.Reflection; + using Elastic.Clients.Elasticsearch.Requests; +using Elastic.Clients.Elasticsearch.Serialization; + using Elastic.Transport; namespace 
Elastic.Clients.Elasticsearch; diff --git a/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs b/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs index f6a6bd2aebe..4a54d60f1b1 100644 --- a/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs +++ b/src/Elastic.Clients.Elasticsearch/_Shared/Next/VectorConverters.cs @@ -9,9 +9,7 @@ using System.Text.Json; using System.Text.Json.Serialization; -using Elastic.Clients.Elasticsearch.Serialization; - -namespace Elastic.Clients.Elasticsearch; +namespace Elastic.Clients.Elasticsearch.Serialization; /// /// The encoding to use when serializing vector data using the converter.