Skip to content

Commit ad52ed4

Browse files
init col normalisation
Signed-off-by: varun-edachali-dbx <[email protected]>
1 parent 83e45ae commit ad52ed4

File tree

4 files changed

+375
-0
lines changed

4 files changed

+375
-0
lines changed

src/databricks/sql/backend/sea/backend.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,14 @@ def get_catalogs(
694694
enforce_embedded_schema_correctness=False,
695695
)
696696
assert result is not None, "execute_command returned None in synchronous mode"
697+
698+
# Normalize column names to match JDBC/thrift backend
699+
from .metadata_constants import CATALOG_COLUMNS, normalize_metadata_description
700+
701+
result.description = normalize_metadata_description(
702+
result.description, CATALOG_COLUMNS
703+
)
704+
697705
return result
698706

699707
def get_schemas(
@@ -727,6 +735,14 @@ def get_schemas(
727735
enforce_embedded_schema_correctness=False,
728736
)
729737
assert result is not None, "execute_command returned None in synchronous mode"
738+
739+
# Normalize column names to match JDBC/thrift backend
740+
from .metadata_constants import SCHEMA_COLUMNS, normalize_metadata_description
741+
742+
result.description = normalize_metadata_description(
743+
result.description, SCHEMA_COLUMNS
744+
)
745+
730746
return result
731747

732748
def get_tables(
@@ -769,6 +785,13 @@ def get_tables(
769785
)
770786
assert result is not None, "execute_command returned None in synchronous mode"
771787

788+
# Normalize column names to match JDBC/thrift backend
789+
from .metadata_constants import TABLE_COLUMNS, normalize_metadata_description
790+
791+
result.description = normalize_metadata_description(
792+
result.description, TABLE_COLUMNS
793+
)
794+
772795
# Apply client-side filtering by table_types
773796
from databricks.sql.backend.sea.utils.filters import ResultSetFilter
774797

@@ -815,4 +838,10 @@ def get_columns(
815838
enforce_embedded_schema_correctness=False,
816839
)
817840
assert result is not None, "execute_command returned None in synchronous mode"
841+
842+
# Normalize column names to match JDBC/thrift backend
843+
from .metadata_constants import normalize_columns_metadata_description
844+
845+
result.description = normalize_columns_metadata_description(result.description)
846+
818847
return result
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
"""
Column normalization constants for SEA backend metadata queries.

Each constant below is an ordered list of ``(jdbc_name, sea_source_name)``
pairs: the JDBC DatabaseMetaData column name to expose, and the SEA backend
result column it is sourced from. These drive the renaming that makes SEA
metadata results match the JDBC/thrift backend behavior.
"""

from typing import List, Tuple, Dict, Any


# get_catalogs(): the single JDBC catalog column.
CATALOG_COLUMNS: List[Tuple[str, str]] = [
    ("TABLE_CAT", "catalog"),
]

# get_schemas(): schema name plus its parent catalog.
SCHEMA_COLUMNS: List[Tuple[str, str]] = [
    ("TABLE_SCHEM", "databaseName"),
    ("TABLE_CATALOG", "catalogName"),
]

# get_tables(): the ten JDBC table columns. The TYPE_*/SELF_REFERENCING/
# REF_GENERATION source names are not expected to appear in SEA results,
# so those columns fall back to default metadata during normalization.
TABLE_COLUMNS: List[Tuple[str, str]] = [
    ("TABLE_CAT", "catalogName"),
    ("TABLE_SCHEM", "namespace"),
    ("TABLE_NAME", "tableName"),
    ("TABLE_TYPE", "tableType"),
    ("REMARKS", "remarks"),
    ("TYPE_CAT", "TYPE_CATALOG_COLUMN"),
    ("TYPE_SCHEM", "TYPE_SCHEMA_COLUMN"),
    ("TYPE_NAME", "TYPE_NAME"),
    (
        "SELF_REFERENCING_COL_NAME",
        "SELF_REFERENCING_COLUMN_NAME",
    ),
    (
        "REF_GENERATION",
        "REF_GENERATION_COLUMN",
    ),
]

# get_columns(): the twenty-four JDBC column-description columns.
COLUMN_COLUMNS: List[Tuple[str, str]] = [
    ("TABLE_CAT", "catalogName"),
    ("TABLE_SCHEM", "namespace"),
    ("TABLE_NAME", "tableName"),
    ("COLUMN_NAME", "col_name"),
    ("DATA_TYPE", "dataType"),
    ("TYPE_NAME", "columnType"),
    ("COLUMN_SIZE", "columnSize"),
    ("BUFFER_LENGTH", "bufferLength"),
    ("DECIMAL_DIGITS", "decimalDigits"),
    ("NUM_PREC_RADIX", "radix"),
    ("NULLABLE", "Nullable"),
    ("REMARKS", "remarks"),
    ("COLUMN_DEF", "columnType"),
    ("SQL_DATA_TYPE", "SQLDataType"),
    ("SQL_DATETIME_SUB", "SQLDateTimeSub"),
    ("CHAR_OCTET_LENGTH", "CharOctetLength"),
    ("ORDINAL_POSITION", "ordinalPosition"),
    ("IS_NULLABLE", "isNullable"),
    ("SCOPE_CATALOG", "ScopeCatalog"),
    ("SCOPE_SCHEMA", "ScopeSchema"),
    ("SCOPE_TABLE", "ScopeTable"),
    ("SOURCE_DATA_TYPE", "SourceDataType"),
    ("IS_AUTOINCREMENT", "isAutoIncrement"),
    ("IS_GENERATEDCOLUMN", "isGenerated"),
]

# COLUMN_DEF and TYPE_NAME intentionally share the "columnType" source, so
# no special-case handling is needed when normalizing.
# NOTE(review): per the JDBC spec COLUMN_DEF is the column's *default value*,
# not its type name — confirm this aliasing matches the thrift backend.
# Helpers for working with (jdbc_name, sea_source_name) column definitions.
def get_column_names(columns: List[Tuple[str, str]]) -> List[str]:
    """Return just the JDBC names, in order, from column definitions."""
    return [pair[0] for pair in columns]
def get_column_mapping(columns: List[Tuple[str, str]]) -> Dict[str, str]:
    """Build a SEA-name -> JDBC-name lookup, skipping entries with no source."""
    mapping: Dict[str, str] = {}
    for jdbc_name, sea_name in columns:
        if sea_name is not None:
            mapping[sea_name] = jdbc_name
    return mapping
def normalize_metadata_description(
    original_description: List[Tuple], column_definitions: List[Tuple[str, str]]
) -> List[Tuple]:
    """
    Rewrite a PEP-249 result description to use JDBC-standard column names.

    Args:
        original_description: Description tuples from the SEA backend, i.e.
            [(name, type_code, display_size, internal_size, precision,
              scale, null_ok), ...]
        column_definitions: (jdbc_name, sea_source_name) pairs specifying,
            in order, the JDBC columns of the normalized description.

    Returns:
        A new description with one entry per column definition. Definitions
        whose SEA source column is absent get default metadata ("string"
        type, everything else None). Empty/None input is returned unchanged.
    """
    if not original_description:
        return original_description

    # Position of each SEA column name within the original description.
    position_by_name = {desc[0]: i for i, desc in enumerate(original_description)}

    # Fallback metadata (everything after the name) for missing columns.
    default_tail = ("string", None, None, None, None, None)

    normalized = []
    for jdbc_name, sea_name in column_definitions:
        if sea_name and sea_name in position_by_name:
            # Source column present: keep its metadata, swap in the JDBC name.
            source = original_description[position_by_name[sea_name]]
            normalized.append((jdbc_name,) + source[1:])
        else:
            normalized.append((jdbc_name,) + default_tail)

    return normalized
def normalize_columns_metadata_description(
    original_description: List[Tuple],
) -> List[Tuple]:
    """
    Normalize a get_columns() description to the JDBC COLUMN_COLUMNS layout.

    TYPE_NAME and COLUMN_DEF both draw from the same "columnType" source,
    which the generic normalization already handles, so this is a thin
    convenience wrapper.

    Args:
        original_description: Original PEP-249 description from the SEA backend.

    Returns:
        Description renamed/padded to match JDBC COLUMN_COLUMNS.
    """
    return normalize_metadata_description(original_description, COLUMN_COLUMNS)
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
"""
2+
Unit tests for metadata column normalization constants and functions.
3+
"""
4+
5+
import unittest
6+
from databricks.sql.backend.sea.metadata_constants import (
7+
CATALOG_COLUMNS,
8+
SCHEMA_COLUMNS,
9+
TABLE_COLUMNS,
10+
COLUMN_COLUMNS,
11+
get_column_names,
12+
get_column_mapping,
13+
normalize_metadata_description,
14+
normalize_columns_metadata_description,
15+
)
16+
17+
18+
class TestMetadataConstants(unittest.TestCase):
    """Verify the metadata column constants and normalization helpers."""

    def test_catalog_columns_structure(self):
        """CATALOG_COLUMNS contains exactly the single catalog mapping."""
        self.assertEqual(CATALOG_COLUMNS, [("TABLE_CAT", "catalog")])

    def test_schema_columns_structure(self):
        """SCHEMA_COLUMNS maps schema then catalog, in order."""
        self.assertEqual(
            SCHEMA_COLUMNS,
            [("TABLE_SCHEM", "databaseName"), ("TABLE_CATALOG", "catalogName")],
        )

    def test_table_columns_structure(self):
        """TABLE_COLUMNS has ten entries with the key JDBC columns first."""
        self.assertEqual(len(TABLE_COLUMNS), 10)
        self.assertEqual(
            TABLE_COLUMNS[:5],
            [
                ("TABLE_CAT", "catalogName"),
                ("TABLE_SCHEM", "namespace"),
                ("TABLE_NAME", "tableName"),
                ("TABLE_TYPE", "tableType"),
                ("REMARKS", "remarks"),
            ],
        )

    def test_column_columns_structure(self):
        """COLUMN_COLUMNS has 24 entries; COLUMN_DEF shares columnType."""
        self.assertEqual(len(COLUMN_COLUMNS), 24)
        self.assertEqual(
            COLUMN_COLUMNS[:6],
            [
                ("TABLE_CAT", "catalogName"),
                ("TABLE_SCHEM", "namespace"),
                ("TABLE_NAME", "tableName"),
                ("COLUMN_NAME", "col_name"),
                ("DATA_TYPE", "dataType"),
                ("TYPE_NAME", "columnType"),
            ],
        )
        # COLUMN_DEF deliberately reads from the same source as TYPE_NAME.
        self.assertEqual(COLUMN_COLUMNS[12], ("COLUMN_DEF", "columnType"))

    def test_get_column_names(self):
        """get_column_names extracts the JDBC names in order."""
        defs = [("JDBC_NAME1", "sea_name1"), ("JDBC_NAME2", "sea_name2")]
        self.assertEqual(get_column_names(defs), ["JDBC_NAME1", "JDBC_NAME2"])

    def test_get_column_mapping(self):
        """get_column_mapping inverts the pairs and drops None sources."""
        defs = [
            ("JDBC_NAME1", "sea_name1"),
            ("JDBC_NAME2", "sea_name2"),
            ("JDBC_NAME3", None),  # no SEA source -> excluded from the map
        ]
        self.assertEqual(
            get_column_mapping(defs),
            {"sea_name1": "JDBC_NAME1", "sea_name2": "JDBC_NAME2"},
        )

    def test_normalize_metadata_description_basic(self):
        """A present source column is renamed; its metadata is untouched."""
        source = [("catalog", "string", None, None, None, None, True)]
        self.assertEqual(
            normalize_metadata_description(source, CATALOG_COLUMNS),
            [("TABLE_CAT", "string", None, None, None, None, True)],
        )

    def test_normalize_metadata_description_with_missing_columns(self):
        """Absent source columns are padded with default metadata."""
        source = [("databaseName", "string", None, None, None, None, True)]
        self.assertEqual(
            normalize_metadata_description(source, SCHEMA_COLUMNS),
            [
                ("TABLE_SCHEM", "string", None, None, None, None, True),
                ("TABLE_CATALOG", "string", None, None, None, None, None),
            ],
        )

    def test_normalize_metadata_description_empty_input(self):
        """An empty description comes back unchanged."""
        self.assertEqual(normalize_metadata_description([], CATALOG_COLUMNS), [])

    def test_normalize_columns_metadata_description(self):
        """Columns normalization yields all 24 JDBC columns, mapped correctly."""
        source = [
            ("catalogName", "string", None, None, None, None, True),
            ("namespace", "string", None, None, None, None, True),
            ("tableName", "string", None, None, None, None, True),
            ("col_name", "string", None, None, None, None, True),
            ("dataType", "int", None, None, None, None, True),
            ("columnType", "string", None, None, None, None, True),
        ]
        normalized = normalize_columns_metadata_description(source)

        self.assertEqual(len(normalized), 24)
        self.assertEqual(normalized[0][0], "TABLE_CAT")
        self.assertEqual(normalized[1][0], "TABLE_SCHEM")
        self.assertEqual(normalized[5][0], "TYPE_NAME")
        self.assertEqual(normalized[12][0], "COLUMN_DEF")
        # TYPE_NAME and COLUMN_DEF share one source, so metadata matches.
        self.assertEqual(normalized[5][1:], normalized[12][1:])

    def test_normalize_metadata_description_preserves_metadata(self):
        """Non-name description fields survive the renaming intact."""
        source = [("catalog", "varchar", 100, 50, 10, 2, False)]
        self.assertEqual(
            normalize_metadata_description(source, CATALOG_COLUMNS),
            [("TABLE_CAT", "varchar", 100, 50, 10, 2, False)],
        )

    def test_columns_with_duplicate_source_mapping(self):
        """Two JDBC columns may legitimately share one SEA source column."""
        source = [("columnType", "string", None, None, None, None, True)]
        dual = [("TYPE_NAME", "columnType"), ("COLUMN_DEF", "columnType")]
        normalized = normalize_metadata_description(source, dual)
        self.assertEqual(
            normalized,
            [
                ("TYPE_NAME", "string", None, None, None, None, True),
                ("COLUMN_DEF", "string", None, None, None, None, True),
            ],
        )
        # Identical metadata apart from the name itself.
        self.assertEqual(normalized[0][1:], normalized[1][1:])
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()

0 commit comments

Comments
 (0)