Skip to content

Commit b141e64

Browse files
authored
Merge pull request #989 from Altinity/backports/25.6/81090_parquet_enum_as_byte_array
Antalya 25.6 : Backports of ClickHouse#81090 - Support writing parquet enum as byte array
2 parents d4f9aa1 + eb466f6 commit b141e64

File tree

12 files changed

+91
-8
lines changed

12 files changed

+91
-8
lines changed

src/Client/BuzzHouse/Generator/ServerSettings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,7 @@ static std::unordered_map<String, CHSetting> serverSettings2 = {
702702
},
703703
{},
704704
false)},
705+
{"output_format_parquet_enum_as_byte_array", CHSetting(trueOrFalse, {}, false)},
705706
{"output_format_parquet_datetime_as_uint32", trueOrFalseSettingNoOracle},
706707
{"output_format_parquet_fixed_string_as_fixed_byte_array", trueOrFalseSettingNoOracle},
707708
{"output_format_parquet_parallel_encoding", trueOrFalseSettingNoOracle},

src/Core/FormatFactorySettings.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,9 @@ Where in the parquet file to place the bloom filters. Bloom filters will be writ
10231023
)", 0) \
10241024
DECLARE(Bool, output_format_parquet_datetime_as_uint32, false, R"(
10251025
Write DateTime values as raw unix timestamp (read back as UInt32), instead of converting to milliseconds (read back as DateTime64(3)).
1026+
)", 0) \
1027+
DECLARE(Bool, output_format_parquet_enum_as_byte_array, true, R"(
1028+
Write enum using parquet physical type: BYTE_ARRAY and logical type: ENUM
10261029
)", 0) \
10271030
DECLARE(String, output_format_avro_codec, "", R"(
10281031
Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.

src/Core/SettingsChangesHistory.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
6767
/// controls new feature and it's 'true' by default, use 'false' as previous_value).
6868
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
6969
/// Note: please check if the key already exists to prevent duplicate entries.
70+
71+
addSettingsChanges(settings_changes_history, "25.6.5.2000",
72+
{
73+
{"output_format_parquet_enum_as_byte_array", true, true, "Enable writing Enum as byte array in Parquet by default"},
74+
});
7075
addSettingsChanges(settings_changes_history, "25.6",
7176
{
7277
{"output_format_native_use_flattened_dynamic_and_json_serialization", false, false, "Add flattened Dynamic/JSON serializations to Native format"},

src/Formats/FormatFactory.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
200200
format_settings.parquet.output_string_as_string = settings[Setting::output_format_parquet_string_as_string];
201201
format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings[Setting::output_format_parquet_fixed_string_as_fixed_byte_array];
202202
format_settings.parquet.output_datetime_as_uint32 = settings[Setting::output_format_parquet_datetime_as_uint32];
203+
format_settings.parquet.output_enum_as_byte_array = settings[Setting::output_format_parquet_enum_as_byte_array];
203204
format_settings.parquet.max_block_size = settings[Setting::input_format_parquet_max_block_size];
204205
format_settings.parquet.prefer_block_bytes = settings[Setting::input_format_parquet_prefer_block_bytes];
205206
format_settings.parquet.output_compression_method = settings[Setting::output_format_parquet_compression_method];

src/Formats/FormatSettings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ struct FormatSettings
288288
bool output_string_as_string = false;
289289
bool output_fixed_string_as_fixed_byte_array = true;
290290
bool output_datetime_as_uint32 = false;
291+
bool output_enum_as_byte_array = false;
291292
bool preserve_order = false;
292293
bool use_custom_encoder = true;
293294
bool parallel_encoding = true;

src/Processors/Formats/Impl/Parquet/PrepareForWrite.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,22 @@ void preparePrimitiveColumn(ColumnPtr column, DataTypePtr type, const std::strin
322322
case TypeIndex::Int64: types(T::INT64); break;
323323
case TypeIndex::Float32: types(T::FLOAT); break;
324324
case TypeIndex::Float64: types(T::DOUBLE); break;
325-
326-
/// These don't have suitable parquet logical types, so we write them as plain numbers.
327-
/// (Parquet has "enums" but they're just strings, with nowhere to declare all possible enum
328-
/// values in advance as part of the data type.)
329-
case TypeIndex::Enum8: types(T::INT32, C::INT_8, int_type(8, true)); break; // Int8
330-
case TypeIndex::Enum16: types(T::INT32, C::INT_16, int_type(16, true)); break; // Int16
325+
case TypeIndex::Enum8:
326+
case TypeIndex::Enum16:
327+
{
328+
if (options.output_enum_as_byte_array)
329+
{
330+
parq::LogicalType t;
331+
t.__set_ENUM({});
332+
types(T::BYTE_ARRAY, C::ENUM, t);
333+
}
334+
else if (type->getTypeId() == TypeIndex::Enum8)
335+
types(T::INT32, C::INT_8, int_type(8, true));
336+
else
337+
types(T::INT32, C::INT_16, int_type(16, true));
338+
break;
339+
}
340+
/// IPv4 does not have suitable parquet logical types, so we write them as plain numbers.
331341
case TypeIndex::IPv4: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32
332342

333343
/// Parquet doesn't have 16-bit date type, so we cast Date to 32 bits.

src/Processors/Formats/Impl/Parquet/Write.cpp

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
#include <Common/config_version.h>
1919
#include <Common/formatReadable.h>
2020
#include <Common/HashTable/HashSet.h>
21+
#include <DataTypes/DataTypeEnum.h>
22+
#include <Core/Block.h>
23+
#include <DataTypes/DataTypeCustom.h>
24+
2125

2226
#if USE_SNAPPY
2327
#include <snappy.h>
@@ -337,6 +341,34 @@ struct ConverterString
337341
}
338342
};
339343

344+
template <typename T>
345+
struct ConverterEnumAsString
346+
{
347+
using Statistics = StatisticsStringRef;
348+
349+
explicit ConverterEnumAsString(const ColumnPtr & c, const DataTypePtr & enum_type_)
350+
: column(assert_cast<const ColumnVector<T> &>(*c)), enum_type(assert_cast<const DataTypeEnum<T> *>(enum_type_.get())) {}
351+
352+
const ColumnVector<T> & column;
353+
const DataTypeEnum<T> * enum_type;
354+
PODArray<parquet::ByteArray> buf;
355+
356+
const parquet::ByteArray * getBatch(size_t offset, size_t count)
357+
{
358+
buf.resize(count);
359+
360+
const auto & data = column.getData();
361+
362+
for (size_t i = 0; i < count; ++i)
363+
{
364+
const T value = data[offset + i];
365+
const StringRef s = enum_type->getNameForValue(value);
366+
buf[i] = parquet::ByteArray(static_cast<UInt32>(s.size), reinterpret_cast<const uint8_t *>(s.data));
367+
}
368+
return buf.data();
369+
}
370+
};
371+
340372
struct ConverterFixedString
341373
{
342374
using Statistics = StatisticsFixedStringRef;
@@ -991,8 +1023,24 @@ void writeColumnChunkBody(
9911023
break;
9921024
case TypeIndex::UInt16 : N(UInt16, Int32Type); break;
9931025
case TypeIndex::UInt64 : N(UInt64, Int64Type); break;
994-
case TypeIndex::Int8 : N(Int8, Int32Type); break;
995-
case TypeIndex::Int16 : N(Int16, Int32Type); break;
1026+
case TypeIndex::Int8:
1027+
{
1028+
if (options.output_enum_as_byte_array && isEnum8(s.type))
1029+
writeColumnImpl<parquet::ByteArrayType>(
1030+
s, options, out, ConverterEnumAsString<Int8>(s.primitive_column, s.type));
1031+
else
1032+
N(Int8, Int32Type);
1033+
break;
1034+
}
1035+
case TypeIndex::Int16:
1036+
{
1037+
if (options.output_enum_as_byte_array && isEnum16(s.type))
1038+
writeColumnImpl<parquet::ByteArrayType>(
1039+
s, options, out, ConverterEnumAsString<Int16>(s.primitive_column, s.type));
1040+
else
1041+
N(Int16, Int32Type);
1042+
break;
1043+
}
9961044
case TypeIndex::Int32 : N(Int32, Int32Type); break;
9971045
case TypeIndex::Int64 : N(Int64, Int64Type); break;
9981046

src/Processors/Formats/Impl/Parquet/Write.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ struct WriteOptions
2323
bool output_string_as_string = false;
2424
bool output_fixed_string_as_fixed_byte_array = true;
2525
bool output_datetime_as_uint32 = false;
26+
bool output_enum_as_byte_array = false;
2627

2728
CompressionMethod compression = CompressionMethod::Lz4;
2829
int compression_level = 3;

src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Blo
105105
options.output_string_as_string = format_settings.parquet.output_string_as_string;
106106
options.output_fixed_string_as_fixed_byte_array = format_settings.parquet.output_fixed_string_as_fixed_byte_array;
107107
options.output_datetime_as_uint32 = format_settings.parquet.output_datetime_as_uint32;
108+
options.output_enum_as_byte_array = format_settings.parquet.output_enum_as_byte_array;
108109
options.data_page_size = format_settings.parquet.data_page_size;
109110
options.write_batch_size = format_settings.parquet.write_batch_size;
110111
options.write_page_index = format_settings.parquet.write_page_index;

tests/queries/0_stateless/02735_parquet_encoder.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ set output_format_parquet_batch_size = 100;
77
set output_format_parquet_row_group_size_bytes = 1000000000;
88
set engine_file_truncate_on_insert=1;
99
set allow_suspicious_low_cardinality_types=1;
10+
set output_format_parquet_enum_as_byte_array=0;
1011

1112
-- Write random data to parquet file, then read from it and check that it matches what we wrote.
1213
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),

0 commit comments

Comments
 (0)