Skip to content

Commit e65cc31

Browse files
authored
Merge pull request #264 from chdb-io/fixArrowTypes
Fix arrow types handling
2 parents c550182 + b524be0 commit e65cc31

File tree

4 files changed

+97
-5
lines changed

4 files changed

+97
-5
lines changed

src/Processors/Sources/PythonSource.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ void PythonSource::convert_string_array_to_block(
164164
void PythonSource::insert_obj_to_string_column(PyObject * obj, ColumnString * string_column)
165165
{
166166
// check if the object is None or NaN
167-
if (PyFloat_Check(obj) && Py_IS_NAN(PyFloat_AS_DOUBLE(obj)))
167+
if (obj == Py_None || (PyFloat_Check(obj) && Py_IS_NAN(PyFloat_AS_DOUBLE(obj))))
168168
{
169169
// insert default value for string column, which is empty string
170170
string_column->insertDefault();
@@ -494,6 +494,8 @@ Chunk PythonSource::scanDataToChunk()
494494
columns[i] = convert_and_insert_array<UInt16>(col, cursor, count);
495495
else if (which.isString())
496496
columns[i] = convert_and_insert_array<String>(col, cursor, count);
497+
else if (which.isNullable())
498+
columns[i] = convert_and_insert_array<String>(col, cursor, count);
497499
else
498500
throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {} for column {}", type->getName(), col.name);
499501

src/Storages/StoragePython.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,15 @@ ColumnsDescription StoragePython::getTableStructureFromData(py::object data_sour
156156
RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))");
157157
RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))");
158158
RE2 pattern_date32(R"(\bdate32\b)");
159-
RE2 pattern_date64(R"(\bdate64\b)");
159+
RE2 pattern_datatime64s(R"(\bdatetime64\[s\]|timestamp\[s\])");
160+
RE2 pattern_date64(R"(\bdate64\b|datetime64\[ms\]|timestamp\[ms\])");
160161
RE2 pattern_time32(R"(\btime32\b)");
161-
RE2 pattern_time64_us(R"(\btime64\[us\]\b)");
162-
RE2 pattern_time64_ns(R"(\btime64\[ns\]\b|<M8\[ns\])");
162+
RE2 pattern_time64_us(R"(\btime64\[us\]\b|datetime64\[us\]|<M8\[us\])");
163+
RE2 pattern_time64_ns(R"(\btime64\[ns\]\b|datetime64\[ns\]|<M8\[ns\])");
163164
RE2 pattern_string_binary(
164165
R"(\bstring\b|<class 'str'>|str|DataType\(string\)|DataType\(binary\)|binary\[pyarrow\]|dtype\[object_\]|
165166
dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<class 'numpy.bytes_'>|<class 'numpy.str_'>|<class 'numpy.void)");
167+
RE2 pattern_null(R"(\bnull\b)");
166168

167169
// Iterate through each pair of name and type string in the schema
168170
for (const auto & [name, typeStr] : schema)
@@ -231,6 +233,10 @@ dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<cl
231233
{
232234
data_type = std::make_shared<DataTypeDate32>();
233235
}
236+
else if (RE2::PartialMatch(typeStr, pattern_datatime64s))
237+
{
238+
data_type = std::make_shared<DataTypeDateTime64>(0); // datetime64[s] corresponds to DateTime64(0)
239+
}
234240
else if (RE2::PartialMatch(typeStr, pattern_date64))
235241
{
236242
data_type = std::make_shared<DataTypeDateTime64>(3); // date64 corresponds to DateTime64(3)
@@ -251,9 +257,18 @@ dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<cl
251257
{
252258
data_type = std::make_shared<DataTypeString>();
253259
}
260+
else if (RE2::PartialMatch(typeStr, pattern_null))
261+
{
262+
// ClickHouse uses a separate file with NULL masks in addition to normal file with values.
263+
// Entries in masks file allow ClickHouse to distinguish between NULL and a default value of
264+
// corresponding data type for each table row. Because that additional masks file cannot be
265+
// produced from the Python side, we have to use the String type for NULLs.
266+
// https://clickhouse.com/docs/en/sql-reference/data-types/nullable#storage-features
267+
data_type = std::make_shared<DataTypeString>();
268+
}
254269
else
255270
{
256-
throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {}", typeStr);
271+
throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {} on column {}", typeStr, name);
257272
}
258273

259274
names_and_types.push_back({name, data_type});
10 KB
Binary file not shown.

tests/test_query_py.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
#!python3
22

33
import io
4+
import json
45
import random
56
import unittest
67
import numpy as np
78
import pandas as pd
89
import pyarrow as pa
910
from pyarrow import csv
11+
import pyarrow.json
12+
import pyarrow.parquet
1013
import chdb
1114

1215

@@ -42,6 +45,11 @@
4245
639860,win,1989-06-30
4346
"""
4447

48+
ARROW_DATA_JSONL = """{"match_id": 3943077, "match_date": "2024-07-15", "kick_off": "04:15:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 779, "home_team_name": "Argentina", "home_team_gender": "male", "home_team_group": null, "country": {"id": 11, "name": "Argentina"}, "managers": [{"id": 5677, "name": "Lionel Sebasti\u00e1n Scaloni", "nickname": null, "dob": "1978-05-16", "country": {"id": 11, "name": "Argentina"}}]}, "away_team": {"away_team_id": 769, "away_team_name": "Colombia", "away_team_gender": "male", "away_team_group": null, "country": {"id": 49, "name": "Colombia"}, "managers": [{"id": 5905, "name": "N\u00e9stor Gabriel Lorenzo", "nickname": null, "dob": "1966-02-28", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 1, "away_score": 0, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T15:50:08.671355", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, "competition_stage": {"id": 26, "name": "Final"}, "stadium": {"id": 5337, "name": "Hard Rock Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 2638, "name": "Raphael Claus", "country": {"id": 31, "name": "Brazil"}}}
49+
{"match_id": 3943076, "match_date": "2024-07-14", "kick_off": "03:00:00.000", "competition": {"competition_id": 223, "country_name": "South America", "competition_name": "Copa America"}, "season": {"season_id": 282, "season_name": "2024"}, "home_team": {"home_team_id": 1833, "home_team_name": "Canada", "home_team_gender": "male", "home_team_group": null, "country": {"id": 40, "name": "Canada"}, "managers": [{"id": 165, "name": "Jesse Marsch", "nickname": null, "dob": "1973-11-08", "country": {"id": 241, "name": "United States of America"}}]}, "away_team": {"away_team_id": 783, "away_team_name": "Uruguay", "away_team_gender": "male", "away_team_group": null, "country": {"id": 242, "name": "Uruguay"}, "managers": [{"id": 269, "name": "Marcelo Alberto Bielsa Caldera", "nickname": "Marcelo Bielsa", "dob": "1955-07-21", "country": {"id": 11, "name": "Argentina"}}]}, "home_score": 2, "away_score": 2, "match_status": "available", "match_status_360": "unscheduled", "last_updated": "2024-07-15T07:57:02.660641", "last_updated_360": null, "metadata": {"data_version": "1.1.0", "shot_fidelity_version": "2", "xy_fidelity_version": "2"}, "match_week": 6, "competition_stage": {"id": 25, "name": "3rd Place Final"}, "stadium": {"id": 52985, "name": "Bank of America Stadium", "country": {"id": 241, "name": "United States of America"}}, "referee": {"id": 1849, "name": "Alexis Herrera", "country": {"id": 246, "name": "Venezuela\u00a0(Bolivarian Republic)"}}}
50+
"""
51+
52+
4553
class myReader(chdb.PyReader):
4654
def __init__(self, data):
4755
self.data = data
@@ -58,6 +66,7 @@ def read(self, col_names, count):
5866

5967

6068
class TestQueryPy(unittest.TestCase):
69+
6170
# def test_query_np(self):
6271
# t3 = {
6372
# "a": np.array([1, 2, 3, 4, 5, 6]),
@@ -135,6 +144,72 @@ def test_query_arrow3(self):
135144
"5872873,587287.3,553446.5,470878.25,3,0,7,10\n",
136145
)
137146

147+
def test_query_arrow4(self):
148+
arrow_table = pa.json.read_json(io.BytesIO(ARROW_DATA_JSONL.encode()))
149+
# print(arrow_table.schema)
150+
ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 10", "JSONEachRow")
151+
# print(ret)
152+
self.assertEqual("", ret.error_message())
153+
154+
def test_query_arrow5(self):
155+
arrow_table = pa.parquet.read_table(
156+
"data/sample_2021-04-01_performance_mobile_tiles.parquet"
157+
)
158+
# print("Arrow Schema:\n", arrow_table.schema)
159+
ret = chdb.query("SELECT * FROM Python(arrow_table) LIMIT 1", "JSONCompact")
160+
# print("JSON:\n", ret)
161+
schema = json.loads(str(ret)).get("meta")
162+
# schema is an array like:
163+
# [{"name":"quadkey","type":"String"},{"name":"tile","type":"String"}]
164+
schema_dict = {x["name"]: x["type"] for x in schema}
165+
self.assertDictEqual(
166+
schema_dict,
167+
{
168+
"quadkey": "String",
169+
"tile": "String",
170+
"tile_x": "Float64",
171+
"tile_y": "Float64",
172+
"avg_d_kbps": "Int64",
173+
"avg_u_kbps": "Int64",
174+
"avg_lat_ms": "Int64",
175+
"avg_lat_down_ms": "Float64",
176+
"avg_lat_up_ms": "Float64",
177+
"tests": "Int64",
178+
"devices": "Int64",
179+
},
180+
)
181+
ret = chdb.query(
182+
"""
183+
WITH numericColumns AS (
184+
SELECT * EXCEPT ('tile.*') EXCEPT(quadkey)
185+
FROM Python(arrow_table)
186+
)
187+
SELECT * APPLY(max), * APPLY(median) APPLY(x -> round(x, 2))
188+
FROM numericColumns
189+
""",
190+
"JSONCompact",
191+
)
192+
# print("JSONCompact:\n", ret)
193+
self.assertDictEqual(
194+
{x["name"]: x["type"] for x in json.loads(str(ret)).get("meta")},
195+
{
196+
"max(avg_d_kbps)": "Int64",
197+
"max(avg_lat_down_ms)": "Float64",
198+
"max(avg_lat_ms)": "Int64",
199+
"max(avg_lat_up_ms)": "Float64",
200+
"max(avg_u_kbps)": "Int64",
201+
"max(devices)": "Int64",
202+
"max(tests)": "Int64",
203+
"round(median(avg_d_kbps), 2)": "Float64",
204+
"round(median(avg_lat_down_ms), 2)": "Float64",
205+
"round(median(avg_lat_ms), 2)": "Float64",
206+
"round(median(avg_lat_up_ms), 2)": "Float64",
207+
"round(median(avg_u_kbps), 2)": "Float64",
208+
"round(median(devices), 2)": "Float64",
209+
"round(median(tests), 2)": "Float64",
210+
},
211+
)
212+
138213
def test_random_float(self):
139214
x = {"col1": [random.uniform(0, 1) for _ in range(0, 100000)]}
140215
ret = chdb.sql(

0 commit comments

Comments
 (0)