Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions csharp/src/ComplexTypeSerializingStream.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*
* Copyright (c) 2025 ADBC Drivers Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Apache.Arrow;
using Apache.Arrow.Adbc.Extensions;
using Apache.Arrow.Ipc;
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using Apache.Arrow.Ipc; appears unused in this file and will trigger CS8019; the repo treats warnings as errors, so this can break the build. Remove the unused using (or reference an Ipc type if it’s intended).

Suggested change
using Apache.Arrow.Ipc;

Copilot uses AI. Check for mistakes.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✅ Done — removed using Apache.Arrow.Ipc; from both ComplexTypeSerializingStream.cs and ComplexTypesValueTests.cs.


This comment was generated with GitHub MCP.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverting this — IArrowArrayStream is defined in Apache.Arrow.Ipc, so the using is required. Removing it caused a build failure (CS0246). Restored.


This comment was generated with GitHub MCP.

using Apache.Arrow.Types;

namespace AdbcDrivers.Databricks
{
/// <summary>
/// Wraps an <see cref="IArrowArrayStream"/> and converts columns of complex Arrow types
/// (LIST, MAP represented as LIST of STRUCTs, STRUCT) into STRING columns containing
/// their JSON representation.
///
/// This is applied when EnableComplexDatatypeSupport=false (the default), so that SEA
/// results match the legacy Thrift behavior of returning JSON strings for complex types.
/// </summary>
internal sealed class ComplexTypeSerializingStream : IArrowArrayStream
{
    private readonly IArrowArrayStream _inner;

    // Schema with every complex-type field rewritten to StringType.
    private readonly Schema _schema;

    // Column indices (into the original schema) whose values need JSON conversion.
    private readonly HashSet<int> _complexColumnIndices;

    /// <summary>
    /// Creates a stream that lazily converts complex-typed columns of <paramref name="inner"/>
    /// into JSON string columns as batches are read.
    /// </summary>
    /// <param name="inner">The stream to wrap; disposed when this stream is disposed.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="inner"/> is null.</exception>
    public ComplexTypeSerializingStream(IArrowArrayStream inner)
    {
        _inner = inner ?? throw new ArgumentNullException(nameof(inner));
        (_schema, _complexColumnIndices) = BuildStringSchema(inner.Schema);
    }

    /// <summary>Gets the rewritten schema (complex fields replaced with StringType).</summary>
    public Schema Schema => _schema;

    /// <summary>
    /// Reads the next batch from the inner stream, converting any complex columns to JSON
    /// string columns. Returns null when the inner stream is exhausted.
    /// </summary>
    public async ValueTask<RecordBatch?> ReadNextRecordBatchAsync(CancellationToken cancellationToken = default)
    {
        RecordBatch? batch = await _inner.ReadNextRecordBatchAsync(cancellationToken).ConfigureAwait(false);
        if (batch == null)
            return null;

        // Fast path: the schema has no complex columns, pass the batch through untouched.
        if (_complexColumnIndices.Count == 0)
            return batch;

        return ConvertComplexColumns(batch);
    }

    public void Dispose() => _inner.Dispose();

    /// <summary>
    /// Builds a new batch where complex columns are replaced by their JSON serialization.
    /// Non-complex columns are shared with the source batch, so the source batch must not
    /// be disposed independently of the returned one.
    /// </summary>
    private RecordBatch ConvertComplexColumns(RecordBatch batch)
    {
        IArrowArray[] arrays = new IArrowArray[batch.ColumnCount];
        for (int i = 0; i < batch.ColumnCount; i++)
        {
            arrays[i] = _complexColumnIndices.Contains(i)
                ? SerializeToStringArray(batch.Column(i))
                : batch.Column(i);
        }
        return new RecordBatch(_schema, arrays, batch.Length);
    }

    /// <summary>
    /// Serializes every element of a complex-typed column to its JSON text,
    /// preserving nulls as null entries in the resulting StringArray.
    /// </summary>
    private static StringArray SerializeToStringArray(IArrowArray array)
    {
        StringArray.Builder builder = new StringArray.Builder();
        for (int i = 0; i < array.Length; i++)
        {
            if (array.IsNull(i))
            {
                builder.AppendNull();
            }
            else
            {
                builder.Append(JsonSerializer.Serialize(ToObject(array, i)));
            }
        }
        return builder.Build();
    }

    /// <summary>
    /// Builds a new schema where all complex-type fields are replaced with StringType,
    /// and returns the set of column indices that were converted.
    /// </summary>
    private static (Schema schema, HashSet<int> complexIndices) BuildStringSchema(Schema original)
    {
        List<Field> fields = new List<Field>(original.FieldsList.Count);
        HashSet<int> indices = new HashSet<int>();

        for (int i = 0; i < original.FieldsList.Count; i++)
        {
            Field field = original.FieldsList[i];
            if (IsComplexType(field.DataType))
            {
                // Keep name, nullability, and metadata; only the data type changes.
                fields.Add(new Field(field.Name, StringType.Default, field.IsNullable, field.Metadata));
                indices.Add(i);
            }
            else
            {
                fields.Add(field);
            }
        }

        return (new Schema(fields, original.Metadata), indices);
    }

    // LIST, MAP, and STRUCT are the types rewritten to JSON strings.
    private static bool IsComplexType(IArrowType type) =>
        type is ListType || type is MapType || type is StructType;

    // --- JSON serialization helpers ---

    /// <summary>
    /// Recursively converts one array element into a plain CLR object graph
    /// (lists, dictionaries, strings, numbers) suitable for JsonSerializer.
    /// </summary>
    private static object? ToObject(IArrowArray array, int index)
    {
        if (array.IsNull(index))
            return null;

        // Handle complex types with recursive traversal, and types needing specific
        // string formatting. All other primitives delegate to ValueAt().
        return array switch
        {
            ListArray la => ToListOrMap(la, index),
            StructArray sa => ToDict(sa, index),
            Decimal128Array dec => dec.GetString(index), // preserve precision as string
            // Format with the invariant culture so dates always use the Gregorian calendar
            // and ASCII digits regardless of host culture (e.g. th-TH's Buddhist calendar
            // would otherwise emit year 2568 for 2025).
            Date32Array d32 => d32.GetDateTime(index)?.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture),
            _ => array.ValueAt(index, StructResultType.Object) // int, long, float, bool, string, timestamp, etc.
        };
    }

    /// <summary>
    /// Converts one LIST element. Arrow MAP is physically List&lt;Struct&lt;key, value&gt;&gt;,
    /// so a list whose value child is a key/value struct is rendered as a JSON object
    /// rather than a JSON array.
    /// </summary>
    private static object ToListOrMap(ListArray listArray, int index)
    {
        IArrowArray values = listArray.Values;
        int start = (int)listArray.ValueOffsets[index];
        int end = (int)listArray.ValueOffsets[index + 1];

        // Arrow MAP is stored as List<Struct<key, value>>
        if (values is StructArray structValues && IsMapStruct(structValues))
            return ToMapDict(structValues, start, end);

        List<object?> list = new List<object?>();
        for (int i = start; i < end; i++)
            list.Add(ToObject(values, i));
        return list;
    }

    // Matches the canonical Arrow MAP entries layout: a two-field struct named key/value.
    private static bool IsMapStruct(StructArray structArray)
    {
        StructType type = (StructType)structArray.Data.DataType;
        return type.Fields.Count == 2 &&
               type.Fields[0].Name == "key" &&
               type.Fields[1].Name == "value";
    }

    /// <summary>
    /// Converts the entries struct of an Arrow MAP into a dictionary. A SortedDictionary
    /// with an ordinal comparer is used so key order — and therefore the JSON text — is
    /// deterministic across target frameworks and host cultures (the default string
    /// comparer is culture-sensitive, and plain Dictionary order is unspecified on net472).
    /// </summary>
    private static SortedDictionary<string, object?> ToMapDict(StructArray entries, int start, int end)
    {
        IArrowArray keyArray = entries.Fields[0];
        IArrowArray valueArray = entries.Fields[1];
        SortedDictionary<string, object?> result = new SortedDictionary<string, object?>(StringComparer.Ordinal);
        for (int i = start; i < end; i++)
        {
            // JSON object keys must be strings: render any key type invariantly so numeric
            // keys don't pick up culture-specific separators; null keys become "null".
            object? keyObject = ToObject(keyArray, i);
            string key = keyObject switch
            {
                null => "null",
                IFormattable formattable => formattable.ToString(null, System.Globalization.CultureInfo.InvariantCulture),
                _ => keyObject.ToString() ?? "null",
            };
            result[key] = ToObject(valueArray, i);
        }
        return result;
    }

    /// <summary>
    /// Converts one STRUCT element to a dictionary keyed by field name, inserted in schema
    /// order. NOTE(review): Dictionary enumeration follows insertion order in practice
    /// (no removals occur here), but the BCL does not guarantee it — confirm if byte-stable
    /// struct field ordering across frameworks is a hard requirement.
    /// </summary>
    private static Dictionary<string, object?> ToDict(StructArray structArray, int index)
    {
        StructType type = (StructType)structArray.Data.DataType;
        Dictionary<string, object?> dict = new Dictionary<string, object?>();
        for (int i = 0; i < type.Fields.Count; i++)
            dict[type.Fields[i].Name] = ToObject(structArray.Fields[i], index);
        return dict;
    }
}
}
7 changes: 7 additions & 0 deletions csharp/src/DatabricksConnection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ internal class DatabricksConnection : SparkHttpConnection
private bool _enableMultipleCatalogSupport = true;
private bool _enablePKFK = true;
private bool _runAsyncInThrift = true;
private bool _enableComplexDatatypeSupport = false;

// DirectQuery configuration
private const long DefaultDirectResultMaxBytes = 10 * 1024 * 1024; // 10MB for direct query results size limit
Expand Down Expand Up @@ -195,6 +196,7 @@ private void ValidateProperties()
_canDecompressLz4 = PropertyHelper.GetBooleanPropertyWithValidation(Properties, DatabricksParameters.CanDecompressLz4, _canDecompressLz4);
_useDescTableExtended = PropertyHelper.GetBooleanPropertyWithValidation(Properties, DatabricksParameters.UseDescTableExtended, _useDescTableExtended);
_runAsyncInThrift = PropertyHelper.GetBooleanPropertyWithValidation(Properties, DatabricksParameters.EnableRunAsyncInThriftOp, _runAsyncInThrift);
_enableComplexDatatypeSupport = PropertyHelper.GetBooleanPropertyWithValidation(Properties, DatabricksParameters.EnableComplexDatatypeSupport, _enableComplexDatatypeSupport);

if (Properties.ContainsKey(DatabricksParameters.MaxBytesPerFile))
{
Expand Down Expand Up @@ -370,6 +372,11 @@ protected internal override bool TrySetGetDirectResults(IRequest request)
/// </summary>
public bool RunAsyncInThrift => _runAsyncInThrift;

/// <summary>
/// Whether to return complex types as native Arrow types (true) or JSON strings (false).
/// </summary>
internal bool EnableComplexDatatypeSupport => _enableComplexDatatypeSupport;

/// <summary>
/// Gets a value indicating whether to retry requests that receive retryable responses (408, 502, 503, 504) .
/// </summary>
Expand Down
9 changes: 9 additions & 0 deletions csharp/src/DatabricksParameters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,15 @@ public class DatabricksParameters : SparkParameters
/// Default value is 900 seconds (15 minutes) if not specified.
/// </summary>
public const string FeatureFlagCacheTtlSeconds = "adbc.databricks.feature_flag_cache_ttl_seconds";

/// <summary>
/// Whether to return complex types (ARRAY, MAP, STRUCT) as native Arrow types.
/// When false (default): complex types are serialized to JSON strings, matching legacy behavior.
/// When true: complex types are returned as native Arrow types (ListType, MapType, StructType).
/// This applies to both Thrift and SEA protocols, providing consistent behavior across protocols.
/// Default value is false if not specified.
/// </summary>
public const string EnableComplexDatatypeSupport = "adbc.databricks.enable_complex_datatype_support";
}

/// <summary>
Expand Down
8 changes: 3 additions & 5 deletions csharp/src/DatabricksStatement.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ internal class DatabricksStatement : SparkStatement, IHiveServer2Statement
private bool enableMultipleCatalogSupport;
private bool enablePKFK;
private bool runAsyncInThrift;
private bool enableComplexDatatypeSupport;
private Dictionary<string, string>? confOverlay;
internal string? StatementId { get; set; }

Expand Down Expand Up @@ -90,6 +91,7 @@ public DatabricksStatement(DatabricksConnection connection)
enablePKFK = connection.EnablePKFK;

runAsyncInThrift = connection.RunAsyncInThrift;
enableComplexDatatypeSupport = connection.EnableComplexDatatypeSupport;

// Override the Apache base default (500ms) with Databricks-specific poll interval (100ms)
if (!connection.Properties.ContainsKey(ApacheParameters.PollTimeMilliseconds))
Expand Down Expand Up @@ -260,11 +262,7 @@ protected override void SetStatementProperties(TExecuteStatementReq statement)
{
TimestampAsArrow = true,
DecimalAsArrow = true,

// set to false so they return as string
// otherwise, they return as ARRAY_TYPE but you can't determine
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow keep this comment

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✅ Done — restored an expanded comment on ComplexTypesAsArrow explaining:

  • When false (default): Thrift server returns complex types as JSON-encoded strings
  • When true: server returns native Arrow types
  • Notes the ARRAY_TYPE limitation: element type info isn't embedded in Thrift responses, so string serialization is the safe default

This comment was generated with GitHub MCP.

// the object type of the items in the array
ComplexTypesAsArrow = false,
ComplexTypesAsArrow = enableComplexDatatypeSupport,
IntervalTypesAsArrow = false,
};

Expand Down
4 changes: 4 additions & 0 deletions csharp/src/StatementExecution/StatementExecutionConnection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ internal class StatementExecutionConnection : TracingConnection, IGetObjectsData
private readonly bool _tracePropagationEnabled;
private readonly string _traceParentHeaderName;
private readonly bool _traceStateEnabled;
private readonly bool _enableComplexDatatypeSupport;

// Authentication support
private readonly string? _identityFederationClientId;
Expand Down Expand Up @@ -229,6 +230,7 @@ private StatementExecutionConnection(
_tracePropagationEnabled = PropertyHelper.GetBooleanPropertyWithValidation(properties, DatabricksParameters.TracePropagationEnabled, true);
_traceParentHeaderName = PropertyHelper.GetStringProperty(properties, DatabricksParameters.TraceParentHeaderName, "traceparent");
_traceStateEnabled = PropertyHelper.GetBooleanPropertyWithValidation(properties, DatabricksParameters.TraceStateEnabled, false);
_enableComplexDatatypeSupport = PropertyHelper.GetBooleanPropertyWithValidation(properties, DatabricksParameters.EnableComplexDatatypeSupport, false);

// Authentication configuration
if (properties.TryGetValue(DatabricksParameters.IdentityFederationClientId, out string? identityFederationClientId))
Expand Down Expand Up @@ -851,6 +853,8 @@ public override void Dispose()
}

// TracingConnection provides IActivityTracer implementation
internal bool EnableComplexDatatypeSupport => _enableComplexDatatypeSupport;

public override string AssemblyVersion => GetType().Assembly.GetName().Version?.ToString() ?? "1.0.0";
public override string AssemblyName => "AdbcDrivers.Databricks";
}
Expand Down
12 changes: 12 additions & 0 deletions csharp/src/StatementExecution/StatementExecutionStatement.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using AdbcDrivers.Databricks;
using AdbcDrivers.Databricks.Reader.CloudFetch;
using AdbcDrivers.Databricks.StatementExecution.MetadataCommands;
using AdbcDrivers.Databricks.Result;
Expand Down Expand Up @@ -65,6 +66,9 @@ internal class StatementExecutionStatement : TracingStatement
// HTTP client for CloudFetch downloads
private readonly HttpClient _httpClient;

// Complex type configuration
private readonly bool _enableComplexDatatypeSupport;

// Connection reference for metadata queries
private readonly StatementExecutionConnection _connection;

Expand Down Expand Up @@ -119,6 +123,7 @@ public StatementExecutionStatement(
_recyclableMemoryStreamManager = recyclableMemoryStreamManager ?? throw new ArgumentNullException(nameof(recyclableMemoryStreamManager));
_lz4BufferPool = lz4BufferPool ?? throw new ArgumentNullException(nameof(lz4BufferPool));
_httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
_enableComplexDatatypeSupport = connection.EnableComplexDatatypeSupport;
}

/// <summary>
Expand Down Expand Up @@ -280,6 +285,13 @@ public async Task<QueryResult> ExecuteQueryAsync(
// Create appropriate reader based on result disposition
IArrowArrayStream reader = CreateReader(response, cancellationToken);

// When EnableComplexDatatypeSupport=false (default), serialize complex Arrow types to JSON strings
// so that SEA behavior matches Thrift (which sets ComplexTypesAsArrow=false).
if (!_enableComplexDatatypeSupport)
{
reader = new ComplexTypeSerializingStream(reader);
}

// Get schema from reader
var schema = reader.Schema;

Expand Down
Loading
Loading