Skip to content

Commit 0000e8d

Browse files
authored
feat(csharp/src/Drivers/Databricks): Optimize GetColumnsExtendedAsync via DESC TABLE EXTENDED (#2953)
# Optimize GetColumnsExtendedAsync in Databricks via `DESC TABLE EXTENDED` ## Motivation Currently, `HiveServer2Statement.GetColumnsExtendedAsync` makes 3 thrift calls — `GetColumns`, `GetPrimaryKeys` and `GetCrossReference` — to get all the information and then joins them into one result set. Databricks now provides a SQL statement, `DESC TABLE EXTENDED <table> AS JSON`, that returns all of this information in one query, which saves 2 extra roundtrips and improves performance. ### Description Override `HiveServer2Statement.GetColumnsExtendedAsync` in `DatabricksStatement` by executing the single SQL query `DESC TABLE EXTENDED <table> AS JSON` to get all the column info required by `GetColumnsExtendedAsync`, then combine and join this info into the result. As `DESC TABLE EXTENDED <table> AS JSON` is only available in Databricks Runtime 16.2 or above, the implementation checks the `ServerProtocolVersion`; if it does not meet the requirement, it falls back to the base class implementation. ### Change - Added `DescTableExtendedResult`, which represents the result of `DESC TABLE EXTENDED <table> AS JSON` (see format [here](https://docs.databricks.com/aws/en/sql/language-manual/sql-ref-syntax-aux-describe-table#json-formatted-output)). 
It also - Parses the string-based `table_constraints` into structured primary key and foreign key constraints - Adds calculated properties `DataType`, `ColumnSize`, and `FullTypeName`, which are derived from the column type and type-specific properties - Changed the access modifiers of some properties and methods in `HiveServer2Statement` to protected so that they can be used/overridden in the subclass `DatabricksStatement` - `PrimaryKeyFields` - `ForeignKeyFields` - `PrimaryKeyPrefix` - `ForeignKeyPrefix` - `GetColumnsExtendedAsync` - `CreateEmptyExtendedColumnsResult` - Updated `DatabricksStatement` with the changes below - Added `GetColumnsExtendedAsync` in `DatabricksStatement`; it executes the query `DESC TABLE EXTENDED <table> AS JSON` to get all the required info and then joins it into the QueryResult - Moved the column metadata schema creation logic from `GetColumnsAsync` into a reusable method `CreateColumnMetadataSchema` - Moved the column metadata data array creation logic from `GetColumnsAsync` into a reusable method `CreateColumnMetadataEmptyArray` - Added a Databricks connection parameter `adbc.databricks.use_desc_table_extended` - Added an internal calculated property `CanUseDescTableExtended` in `DatabricksConnection`; `DatabricksStatement` calls it to decide whether to override `GetColumnsExtendedAsync` - Improved the Databricks driver integration test `StatementTest:CanGetColumnsExtended` - Set up the required table resources during the test instead of depending on manual setup before the test - Support running multiple test cases from the inputs - Added a deep result check to make sure all the properties of all the columns are set correctly ### Testing - Added test class `DescTableExtendedResultTest` to cover all the deserialization and parsing cases for the raw result of `DESC TABLE EXTENDED <table> AS JSON` - Tested all the `ColumnsExtended`-relevant test cases in `Databricks/StatementTest`
1 parent cde9e7b commit 0000e8d

11 files changed

+2273
-104
lines changed

csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ internal class HiveServer2Statement : TracingStatement
4747
GetColumnsExtendedCommandName;
4848

4949
// Add constants for PK and FK field names and prefixes
50-
private static readonly string[] PrimaryKeyFields = new[] { "COLUMN_NAME" };
51-
private static readonly string[] ForeignKeyFields = new[] { "PKCOLUMN_NAME", "PKTABLE_CAT", "PKTABLE_SCHEM", "PKTABLE_NAME", "FKCOLUMN_NAME", "FK_NAME", "KEQ_SEQ" };
50+
protected static readonly string[] PrimaryKeyFields = new[] { "COLUMN_NAME" };
51+
protected static readonly string[] ForeignKeyFields = new[] { "PKCOLUMN_NAME", "PKTABLE_CAT", "PKTABLE_SCHEM", "PKTABLE_NAME", "FKCOLUMN_NAME", "FK_NAME", "KEQ_SEQ" };
5252
private static readonly string s_assemblyName = ApacheUtility.GetAssemblyName(typeof(HiveServer2Statement));
5353
private static readonly string s_assemblyVersion = ApacheUtility.GetAssemblyVersion(typeof(HiveServer2Statement));
54-
private const string PrimaryKeyPrefix = "PK_";
55-
private const string ForeignKeyPrefix = "FK_";
54+
protected const string PrimaryKeyPrefix = "PK_";
55+
protected const string ForeignKeyPrefix = "FK_";
5656

5757
internal HiveServer2Statement(HiveServer2Connection connection)
5858
: base(connection)
@@ -704,7 +704,7 @@ protected internal QueryResult EnhanceGetColumnsResult(Schema originalSchema, IR
704704
return (batches, schema, totalRows);
705705
}
706706

707-
private async Task<QueryResult> GetColumnsExtendedAsync(CancellationToken cancellationToken = default)
707+
protected virtual async Task<QueryResult> GetColumnsExtendedAsync(CancellationToken cancellationToken = default)
708708
{
709709
// 1. Get all three results at once
710710
var columnsResult = await GetColumnsAsync(cancellationToken);
@@ -797,7 +797,7 @@ await ProcessRelationshipDataSafe(fkResult, ForeignKeyPrefix, "FKCOLUMN_NAME",
797797
}
798798

799799
// Helper method to create an empty result with the complete extended columns schema
800-
private QueryResult CreateEmptyExtendedColumnsResult(Schema baseSchema)
800+
protected QueryResult CreateEmptyExtendedColumnsResult(Schema baseSchema)
801801
{
802802
// Create the complete schema with all fields
803803
var allFields = new List<Field>(baseSchema.FieldsList);
@@ -809,7 +809,8 @@ private QueryResult CreateEmptyExtendedColumnsResult(Schema baseSchema)
809809
// Add FK fields
810810
foreach (var field in ForeignKeyFields)
811811
{
812-
allFields.Add(new Field(ForeignKeyPrefix + field, StringType.Default, true));
812+
IArrowType fieldType = field != "KEQ_SEQ" ? StringType.Default : Int32Type.Default;
813+
allFields.Add(new Field(ForeignKeyPrefix + field, fieldType, true));
813814
}
814815

815816
var combinedSchema = new Schema(allFields, baseSchema.Metadata);

csharp/src/Drivers/Databricks/DatabricksConnection.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ internal class DatabricksConnection : SparkHttpConnection
5656
private long _maxBytesPerFile = DefaultMaxBytesPerFile;
5757
private const bool DefaultRetryOnUnavailable = true;
5858
private const int DefaultTemporarilyUnavailableRetryTimeout = 900;
59+
private bool _useDescTableExtended = true;
5960

6061
// Default namespace
6162
private TNamespace? _defaultNamespace;
@@ -145,6 +146,18 @@ private void ValidateProperties()
145146
}
146147
}
147148

149+
if (Properties.TryGetValue(DatabricksParameters.UseDescTableExtended, out string? useDescTableExtendedStr))
150+
{
151+
if (bool.TryParse(useDescTableExtendedStr, out bool useDescTableExtended))
152+
{
153+
_useDescTableExtended = useDescTableExtended;
154+
}
155+
else
156+
{
157+
throw new ArgumentException($"Parameter '{DatabricksParameters.UseDescTableExtended}' value '{useDescTableExtendedStr}' could not be parsed. Valid values are 'true' and 'false'.");
158+
}
159+
}
160+
148161
if (Properties.TryGetValue(DatabricksParameters.MaxBytesPerFile, out string? maxBytesPerFileStr))
149162
{
150163
if (!long.TryParse(maxBytesPerFileStr, out long maxBytesPerFileValue))
@@ -223,6 +236,11 @@ private void ValidateProperties()
223236
/// </summary>
224237
internal bool EnableMultipleCatalogSupport => _enableMultipleCatalogSupport;
225238

239+
/// <summary>
240+
/// Check if current connection can use `DESC TABLE EXTENDED` query
241+
/// </summary>
242+
internal bool CanUseDescTableExtended => _useDescTableExtended && ServerProtocolVersion != null && ServerProtocolVersion >= TProtocolVersion.SPARK_CLI_SERVICE_PROTOCOL_V7;
243+
226244
/// <summary>
227245
/// Gets whether PK/FK metadata call is enabled
228246
/// </summary>

csharp/src/Drivers/Databricks/DatabricksParameters.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ public class DatabricksParameters : SparkParameters
173173
/// Default value is true if not specified.
174174
/// </summary>
175175
public const string EnablePKFK = "adbc.databricks.enable_pk_fk";
176+
177+
/// <summary>
178+
/// Whether to use query DESC TABLE EXTENDED to get extended column metadata when the current DBR supports it
179+
/// Default value is true if not specified.
180+
/// </summary>
181+
public const string UseDescTableExtended = "adbc.databricks.use_desc_table_extended";
176182
}
177183

178184
/// <summary>

0 commit comments

Comments
 (0)