Skip to content

Commit 3ac2b09

Browse files
committed
Fix arrow null type handling
1 parent c550182 commit 3ac2b09

File tree

2 files changed

+22
-5
lines changed

2 files changed

+22
-5
lines changed

src/Processors/Sources/PythonSource.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ void PythonSource::convert_string_array_to_block(
164164
void PythonSource::insert_obj_to_string_column(PyObject * obj, ColumnString * string_column)
165165
{
166166
// check if the object is None or NaN
167-
if (PyFloat_Check(obj) && Py_IS_NAN(PyFloat_AS_DOUBLE(obj)))
167+
if (obj == Py_None || (PyFloat_Check(obj) && Py_IS_NAN(PyFloat_AS_DOUBLE(obj))))
168168
{
169169
// insert default value for string column, which is empty string
170170
string_column->insertDefault();
@@ -494,6 +494,8 @@ Chunk PythonSource::scanDataToChunk()
494494
columns[i] = convert_and_insert_array<UInt16>(col, cursor, count);
495495
else if (which.isString())
496496
columns[i] = convert_and_insert_array<String>(col, cursor, count);
497+
else if (which.isNullable())
498+
columns[i] = convert_and_insert_array<String>(col, cursor, count);
497499
else
498500
throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {} for column {}", type->getName(), col.name);
499501

src/Storages/StoragePython.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,15 @@ ColumnsDescription StoragePython::getTableStructureFromData(py::object data_sour
156156
RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))");
157157
RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))");
158158
RE2 pattern_date32(R"(\bdate32\b)");
159-
RE2 pattern_date64(R"(\bdate64\b)");
159+
RE2 pattern_datatime64s(R"(\bdatetime64\[s\]|timestamp\[s\])");
160+
RE2 pattern_date64(R"(\bdate64\b|datetime64\[ms\]|timestamp\[ms\])");
160161
RE2 pattern_time32(R"(\btime32\b)");
161-
RE2 pattern_time64_us(R"(\btime64\[us\]\b)");
162-
RE2 pattern_time64_ns(R"(\btime64\[ns\]\b|<M8\[ns\])");
162+
RE2 pattern_time64_us(R"(\btime64\[us\]\b|datetime64\[us\]|<M8\[us\])");
163+
RE2 pattern_time64_ns(R"(\btime64\[ns\]\b|datetime64\[ns\]|<M8\[ns\])");
163164
RE2 pattern_string_binary(
164165
R"(\bstring\b|<class 'str'>|str|DataType\(string\)|DataType\(binary\)|binary\[pyarrow\]|dtype\[object_\]|
165166
dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<class 'numpy.bytes_'>|<class 'numpy.str_'>|<class 'numpy.void)");
167+
RE2 pattern_null(R"(\bnull\b)");
166168

167169
// Iterate through each pair of name and type string in the schema
168170
for (const auto & [name, typeStr] : schema)
@@ -231,6 +233,10 @@ dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<cl
231233
{
232234
data_type = std::make_shared<DataTypeDate32>();
233235
}
236+
else if (RE2::PartialMatch(typeStr, pattern_datatime64s))
237+
{
238+
data_type = std::make_shared<DataTypeDateTime64>(0); // datetime64[s] corresponds to DateTime64(0)
239+
}
234240
else if (RE2::PartialMatch(typeStr, pattern_date64))
235241
{
236242
data_type = std::make_shared<DataTypeDateTime64>(3); // date64 corresponds to DateTime64(3)
@@ -251,9 +257,18 @@ dtype\('S|dtype\('O|<class 'bytes'>|<class 'bytearray'>|<class 'memoryview'>|<cl
251257
{
252258
data_type = std::make_shared<DataTypeString>();
253259
}
260+
else if (RE2::PartialMatch(typeStr, pattern_null))
261+
{
262+
// ClickHouse uses a separate file with NULL masks in addition to normal file with values.
263+
// Entries in masks file allow ClickHouse to distinguish between NULL and a default value of
264+
// corresponding data type for each table row. That additional masks file cannot be provided
265+
// from the Python side, so we have to use the String type for NULLs.
266+
// https://clickhouse.com/docs/en/sql-reference/data-types/nullable#storage-features
267+
data_type = std::make_shared<DataTypeString>();
268+
}
254269
else
255270
{
256-
throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {}", typeStr);
271+
throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {} on column {}", typeStr, name);
257272
}
258273

259274
names_and_types.push_back({name, data_type});

0 commit comments

Comments
 (0)