Skip to content

Commit 0ebcdb7

Browse files
kssenii authored and Enmk committed
Merge pull request ClickHouse#82114 from ClickHouse/fix_parser_for_complex_types
Fix support for nested data types with decimal subfields in glue catalog
1 parent 3635c44 commit 0ebcdb7

File tree

2 files changed

+91
-19
lines changed

2 files changed

+91
-19
lines changed

src/Databases/DataLake/GlueCatalog.cpp

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -64,20 +64,28 @@ String trim(const String & str)
6464
std::vector<String> splitTypeArguments(const String & type_str)
6565
{
6666
std::vector<String> args;
67-
int depth = 0;
67+
int angle_depth = 0;
68+
int paren_depth = 0;
6869
size_t start = 0;
69-
for (size_t i = 0; i < type_str.size(); i++)
70+
71+
for (size_t i = 0; i < type_str.size(); ++i)
7072
{
71-
if (type_str[i] == '<')
72-
depth++;
73-
else if (type_str[i] == '>')
74-
depth--;
75-
else if (type_str[i] == ',' && depth == 0)
73+
char c = type_str[i];
74+
if (c == '<')
75+
angle_depth++;
76+
else if (c == '>')
77+
angle_depth--;
78+
else if (c == '(')
79+
paren_depth++;
80+
else if (c == ')')
81+
paren_depth--;
82+
else if (c == ',' && angle_depth == 0 && paren_depth == 0)
7683
{
7784
args.push_back(trim(type_str.substr(start, i - start)));
7885
start = i + 1;
7986
}
8087
}
88+
8189
args.push_back(trim(type_str.substr(start)));
8290
return args;
8391
}

tests/integration/test_database_glue/test.py

Lines changed: 76 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -12,19 +12,23 @@
1212
import pytest
1313
import requests
1414
import urllib3
15+
from datetime import datetime, timedelta
1516
from minio import Minio
1617
from pyiceberg.catalog import load_catalog
1718
from pyiceberg.partitioning import PartitionField, PartitionSpec
1819
from pyiceberg.schema import Schema
1920
from pyiceberg.table.sorting import SortField, SortOrder
2021
from pyiceberg.transforms import DayTransform, IdentityTransform
22+
from helpers.config_cluster import minio_access_key, minio_secret_key
23+
import decimal
2124
from pyiceberg.types import (
2225
DoubleType,
23-
FloatType,
2426
NestedField,
2527
StringType,
2628
StructType,
2729
TimestampType,
30+
MapType,
31+
DecimalType,
2832
)
2933

3034
from helpers.cluster import ClickHouseCluster, ClickHouseInstance, is_arm
@@ -39,6 +43,11 @@
3943
BASE_URL = "http://glue:3000"
4044
BASE_URL_LOCAL_HOST = "http://localhost:3000"
4145

46+
def generate_decimal(precision=9, scale=2):
47+
max_value = 10**(precision - scale) - 1
48+
value = random.uniform(0, max_value)
49+
return round(decimal.Decimal(value), scale)
50+
4251
DEFAULT_SCHEMA = Schema(
4352
NestedField(
4453
field_id=1, name="datetime", field_type=TimestampType(), required=False
@@ -59,9 +68,21 @@
5968
),
6069
required=False,
6170
),
71+
NestedField(
72+
field_id=6,
73+
name="map_string_decimal",
74+
field_type=MapType(
75+
key_type=StringType(),
76+
value_type=DecimalType(9, 2),
77+
key_id=7,
78+
value_id=8,
79+
value_required=False,
80+
),
81+
required=False,
82+
),
6283
)
6384

64-
DEFAULT_CREATE_TABLE = "CREATE TABLE {}.`{}.{}`\\n(\\n `datetime` Nullable(DateTime64(6)),\\n `symbol` Nullable(String),\\n `bid` Nullable(Float64),\\n `ask` Nullable(Float64),\\n `details` Tuple(created_by Nullable(String))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n"
85+
DEFAULT_CREATE_TABLE = "CREATE TABLE {}.`{}.{}`\\n(\\n `datetime` Nullable(DateTime64(6)),\\n `symbol` Nullable(String),\\n `bid` Nullable(Float64),\\n `ask` Nullable(Float64),\\n `details` Tuple(created_by Nullable(String)),\\n `map_string_decimal` Map(String, Nullable(Decimal(9, 2)))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse-glue/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n"
6586

6687
DEFAULT_PARTITION_SPEC = PartitionSpec(
6788
PartitionField(
@@ -107,15 +128,59 @@ def create_table(
107128
)
108129

109130

110-
def generate_record():
111-
return {
112-
"datetime": datetime.now(),
113-
"symbol": str("kek"),
114-
"bid": round(random.uniform(100, 200), 2),
115-
"ask": round(random.uniform(200, 300), 2),
116-
"details": {"created_by": "Alice Smith"},
117-
}
118131

132+
def generate_arrow_data(num_rows=5):
133+
datetimes = []
134+
symbols = []
135+
bids = []
136+
asks = []
137+
details_created_by = []
138+
map_keys = []
139+
map_values = []
140+
141+
offsets = [0]
142+
143+
for _ in range(num_rows):
144+
datetimes.append(datetime.utcnow() - timedelta(minutes=random.randint(0, 60)))
145+
symbols.append(random.choice(["AAPL", "GOOG", "MSFT"]))
146+
bids.append(random.uniform(100, 150))
147+
asks.append(random.uniform(150, 200))
148+
details_created_by.append(random.choice(["alice", "bob", "carol"]))
149+
150+
# map<string, decimal(9,2)>
151+
keys = []
152+
values = []
153+
for i in range(random.randint(1, 3)):
154+
keys.append(f"key{i}")
155+
values.append(generate_decimal())
156+
map_keys.extend(keys)
157+
map_values.extend(values)
158+
offsets.append(offsets[-1] + len(keys))
159+
160+
# Struct for 'details'
161+
struct_array = pa.StructArray.from_arrays(
162+
[pa.array(details_created_by, type=pa.string())],
163+
names=["created_by"]
164+
)
165+
166+
# Map array
167+
map_array = pa.MapArray.from_arrays(
168+
offsets=pa.array(offsets, type=pa.int32()),
169+
keys=pa.array(map_keys, type=pa.string()),
170+
items=pa.array(map_values, type=pa.decimal128(9, 2))
171+
)
172+
173+
# Final table
174+
table = pa.table({
175+
"datetime": pa.array(datetimes, type=pa.timestamp("us")),
176+
"symbol": pa.array(symbols, type=pa.string()),
177+
"bid": pa.array(bids, type=pa.float64()),
178+
"ask": pa.array(asks, type=pa.float64()),
179+
"details": struct_array,
180+
"map_string_decimal": map_array,
181+
})
182+
183+
return table
119184

120185
def create_clickhouse_glue_database(
121186
started_cluster, node, name, additional_settings={}
@@ -259,8 +324,7 @@ def test_select(started_cluster):
259324
table = create_table(catalog, namespace, table_name)
260325

261326
num_rows = 10
262-
data = [generate_record() for _ in range(num_rows)]
263-
df = pa.Table.from_pylist(data)
327+
df = generate_arrow_data(num_rows)
264328
table.append(df)
265329

266330
create_clickhouse_glue_database(started_cluster, node, CATALOG_NAME)

0 commit comments

Comments
 (0)