12
12
#include < yql/essentials/public/udf/arrow/block_builder.h>
13
13
#include < yql/essentials/public/udf/arrow/block_item.h>
14
14
#include < yql/essentials/public/udf/arrow/block_reader.h>
15
+ #include < yql/essentials/public/udf/udf_data_type.h>
15
16
#include < yql/essentials/utils/yql_panic.h>
16
17
17
18
#include < arrow/api.h>
@@ -89,9 +90,9 @@ ui32 GetMultiplierForDatetime(arrow::TimeUnit::type unit) {
89
90
case arrow::TimeUnit::SECOND:
90
91
return 1 ;
91
92
case arrow::TimeUnit::MILLI:
92
- throw parquet::ParquetException ( TStringBuilder () << " millisecond accuracy does not fit into the datetime " ) ;
93
+ return 1000 ;
93
94
case arrow::TimeUnit::MICRO:
94
- throw parquet::ParquetException ( TStringBuilder () << " microsecond accuracy does not fit into the datetime " ) ;
95
+ return 1000000 ;
95
96
case arrow::TimeUnit::NANO:
96
97
throw parquet::ParquetException (TStringBuilder () << " nanosecond accuracy does not fit into the datetime" );
97
98
}
@@ -127,6 +128,35 @@ std::shared_ptr<arrow::Array> ArrowTypeAsYqlDatetime(const std::shared_ptr<arrow
127
128
return builder.Build (true ).make_array ();
128
129
}
129
130
131
+ template <bool isOptional, typename TArrowType>
132
+ std::shared_ptr<arrow::Array> ArrowTimestampAsYqlDatetime (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, ui32 multiplier) {
133
+ ::NYql::NUdf::TFixedSizeArrayBuilder<TArrowType, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
134
+ ::NYql::NUdf::TFixedSizeBlockReader<i64 , isOptional> reader;
135
+ for (i64 i = 0 ; i < value->length (); ++i) {
136
+ const NUdf::TBlockItem item = reader.GetItem (*value->data (), i);
137
+ if constexpr (isOptional) {
138
+ if (!item) {
139
+ builder.Add (item);
140
+ continue ;
141
+ }
142
+ } else if (!item) {
143
+ throw parquet::ParquetException (TStringBuilder () << " null value for datetime could not be represented in non-optional type" );
144
+ }
145
+
146
+ const i64 baseValue = item.As <i64 >();
147
+ if (baseValue < 0 && baseValue > static_cast <int64_t >(::NYql::NUdf::MAX_DATETIME)) {
148
+ throw parquet::ParquetException (TStringBuilder () << " datetime in parquet is out of range [0, " << ::NYql::NUdf::MAX_DATETIME << " ]: " << baseValue);
149
+ }
150
+
151
+ if (baseValue % multiplier) {
152
+ throw parquet::ParquetException (TStringBuilder () << " datetime in parquet should have integer amount of seconds, have: " << baseValue * 1.0 / multiplier);
153
+ }
154
+ const TArrowType v = baseValue / static_cast <ui64>(multiplier);
155
+ builder.Add (NUdf::TBlockItem (static_cast <TArrowType>(v)));
156
+ }
157
+ return builder.Build (true ).make_array ();
158
+ }
159
+
130
160
template <bool isOptional>
131
161
std::shared_ptr<arrow::Array> ArrowStringAsYqlDateTime (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) {
132
162
::NYql::NUdf::TFixedSizeArrayBuilder<ui32, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
@@ -330,8 +360,16 @@ TColumnConverter ArrowDate64AsYqlDatetime(const std::shared_ptr<arrow::DataType>
330
360
TColumnConverter ArrowTimestampAsYqlDatetime (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
331
361
return [targetType, isOptional, multiplier = GetMultiplierForDatetime (timeUnit)](const std::shared_ptr<arrow::Array>& value) {
332
362
return isOptional
333
- ? ArrowTypeAsYqlDatetime<true , i64 >(targetType, value, multiplier)
334
- : ArrowTypeAsYqlDatetime<false , i64 >(targetType, value, multiplier);
363
+ ? ArrowTimestampAsYqlDatetime<true , ui32>(targetType, value, multiplier)
364
+ : ArrowTimestampAsYqlDatetime<false , ui32>(targetType, value, multiplier);
365
+ };
366
+ }
367
+
368
+ TColumnConverter ArrowTimestampAsYqlDatetime64 (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
369
+ return [targetType, isOptional, multiplier = GetMultiplierForDatetime (timeUnit)](const std::shared_ptr<arrow::Array>& value) {
370
+ return isOptional
371
+ ? ArrowTimestampAsYqlDatetime<true , i64 >(targetType, value, multiplier)
372
+ : ArrowTimestampAsYqlDatetime<false , i64 >(targetType, value, multiplier);
335
373
};
336
374
}
337
375
@@ -624,6 +662,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
624
662
switch (slotItem) {
625
663
case NUdf::EDataSlot::Datetime:
626
664
return ArrowTimestampAsYqlDatetime (targetType, isOptional, timestampType.unit ());
665
+ case NUdf::EDataSlot::Datetime64:
666
+ return ArrowTimestampAsYqlDatetime64 (targetType, isOptional, timestampType.unit ());
627
667
case NUdf::EDataSlot::Timestamp:
628
668
return ArrowTimestampAsYqlTimestamp (targetType, isOptional, timestampType.unit ());
629
669
case NUdf::EDataSlot::String:
@@ -654,7 +694,7 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
654
694
) {
655
695
return [](const std::shared_ptr<arrow::Array>& value) {
656
696
auto decimals = std::static_pointer_cast<arrow::Decimal128Array>(value);
657
- auto output = std::make_shared<arrow::FixedSizeBinaryArray>(arrow::fixed_size_binary (16 ), decimals->length (), decimals->values ());
697
+ auto output = std::make_shared<arrow::FixedSizeBinaryArray>(arrow::fixed_size_binary (16 ), decimals->length (), decimals->values (), decimals-> null_bitmap (), decimals-> null_count () );
658
698
return output;
659
699
};
660
700
}
@@ -691,6 +731,39 @@ TColumnConverter YqlBlockTzDateToArrow(const std::string& columnName, const std:
691
731
};
692
732
}
693
733
734
+ template <bool isOptional>
735
+ TColumnConverter DecimalToArrowBaseConverter (const std::shared_ptr<arrow::DataType>& targetType) {
736
+ return [targetType](const std::shared_ptr<arrow::Array>& value) {
737
+ arrow::Decimal128Builder builder (targetType, arrow::default_memory_pool ());
738
+ ::NYql::NUdf::TFixedSizeBlockReader<NYql::NDecimal::TInt128, isOptional> reader;
739
+
740
+ for (i64 i = 0 ; i < value->length (); ++i) {
741
+ NUdf::TBlockItem item = reader.GetItem (*value->data (), i);
742
+
743
+ if (!item) {
744
+ THROW_ARROW_NOT_OK (builder.AppendNull ());
745
+ continue ;
746
+ }
747
+
748
+ NYql::NDecimal::TInt128 val = item.GetInt128 ();
749
+ arrow::Decimal128 newValue ((uint8_t *)(&val));
750
+ THROW_ARROW_NOT_OK (builder.Append (newValue));
751
+ }
752
+
753
+ std::shared_ptr<arrow::Array> array;
754
+ THROW_ARROW_NOT_OK (builder.Finish (&array));
755
+ return array;
756
+ };
757
+ }
758
+
759
+ TColumnConverter DecimalToArrowConverter (bool isOptional, const std::shared_ptr<arrow::DataType>& targetType) {
760
+ if (isOptional) {
761
+ return DecimalToArrowBaseConverter<true >(targetType);
762
+ } else {
763
+ return DecimalToArrowBaseConverter<false >(targetType);
764
+ }
765
+ }
766
+
694
767
}
695
768
696
769
namespace NYql ::NDq {
@@ -728,7 +801,9 @@ TColumnConverter BuildOutputColumnConverter(const std::string& columnName, NKiki
728
801
YQL_ENSURE (ConvertArrowType (columnType, yqlArrowType), " Got unsupported yql block type: " << *columnType << " in column " << columnName);
729
802
YQL_ENSURE (S3ConvertArrowOutputType (columnType, s3OutputType), " Got unsupported s3 output block type: " << *columnType << " in column " << columnName);
730
803
731
- if (columnType->IsOptional ()) {
804
+ bool isOptional = columnType->IsOptional ();
805
+
806
+ if (isOptional) {
732
807
columnType = AS_TYPE (TOptionalType, columnType)->GetItemType ();
733
808
}
734
809
YQL_ENSURE (columnType->IsData (), " Allowed only data types for S3 output, but got: " << *columnType << " in column " << columnName);
@@ -752,13 +827,18 @@ TColumnConverter BuildOutputColumnConverter(const std::string& columnName, NKiki
752
827
case NUdf::EDataSlot::Datetime:
753
828
case NUdf::EDataSlot::Timestamp:
754
829
return {};
830
+ case NUdf::EDataSlot::Date32:
831
+ case NUdf::EDataSlot::Datetime64:
832
+ case NUdf::EDataSlot::Timestamp64:
755
833
case NUdf::EDataSlot::Utf8:
756
834
case NUdf::EDataSlot::Json:
757
835
return ArrowComputeConvertor (columnName, yqlArrowType, s3OutputType);
758
836
case NUdf::EDataSlot::TzDate:
759
837
case NUdf::EDataSlot::TzDatetime:
760
838
case NUdf::EDataSlot::TzTimestamp:
761
839
return YqlBlockTzDateToArrow (columnName, yqlArrowType);
840
+ case NUdf::EDataSlot::Decimal:
841
+ return DecimalToArrowConverter (isOptional, s3OutputType);
762
842
default :
763
843
YQL_ENSURE (false , " Got unsupported s3 output block type: " << *columnType << " in column " << columnName);
764
844
}
@@ -815,7 +895,7 @@ std::shared_ptr<arrow::RecordBatch> ConvertArrowColumns(std::shared_ptr<arrow::R
815
895
}
816
896
817
897
// Type conversion same as in ClickHouseClient.SerializeFormat udf
818
- bool S3ConvertArrowOutputType (NUdf::EDataSlot slot, std::shared_ptr<arrow::DataType>& type) {
898
+ bool S3ConvertArrowOutputType (NUdf::EDataSlot slot, std::shared_ptr<arrow::DataType>& type, TType* itemType ) {
819
899
switch (slot) {
820
900
case NUdf::EDataSlot::Int8:
821
901
type = arrow::int8 ();
@@ -835,11 +915,17 @@ bool S3ConvertArrowOutputType(NUdf::EDataSlot slot, std::shared_ptr<arrow::DataT
835
915
case NUdf::EDataSlot::Int32:
836
916
type = arrow::int32 ();
837
917
return true ;
918
+ case NUdf::EDataSlot::Date32:
919
+ type = arrow::date32 ();
920
+ return true ;
838
921
case NUdf::EDataSlot::Datetime:
839
922
case NUdf::EDataSlot::TzDatetime:
840
923
case NUdf::EDataSlot::Uint32:
841
924
type = arrow::uint32 ();
842
925
return true ;
926
+ case NUdf::EDataSlot::Datetime64:
927
+ type = arrow::timestamp (arrow::TimeUnit::SECOND, " UTC" );
928
+ return true ;
843
929
case NUdf::EDataSlot::Int64:
844
930
type = arrow::int64 ();
845
931
return true ;
@@ -857,7 +943,17 @@ bool S3ConvertArrowOutputType(NUdf::EDataSlot slot, std::shared_ptr<arrow::DataT
857
943
case NUdf::EDataSlot::Json:
858
944
type = arrow::binary ();
859
945
return true ;
946
+ case NUdf::EDataSlot::Decimal: {
947
+ if (itemType) {
948
+ auto [precision, scale] = static_cast <TDataDecimalType*>(itemType)->GetParams ();
949
+ type = arrow::decimal128 (precision, scale);
950
+ } else {
951
+ type = arrow::decimal128 (22 , 9 );
952
+ }
953
+ return true ;
954
+ }
860
955
case NUdf::EDataSlot::Timestamp:
956
+ case NUdf::EDataSlot::Timestamp64:
861
957
case NUdf::EDataSlot::TzTimestamp:
862
958
type = arrow::timestamp (arrow::TimeUnit::MICRO, " UTC" );
863
959
return true ;
@@ -880,7 +976,7 @@ bool S3ConvertArrowOutputType(TType* itemType, std::shared_ptr<arrow::DataType>&
880
976
return false ;
881
977
}
882
978
883
- return S3ConvertArrowOutputType (*slot, type);
979
+ return S3ConvertArrowOutputType (*slot, type, itemType );
884
980
}
885
981
886
982
void BuildOutputColumnConverters (const NKikimr::NMiniKQL::TStructType* outputStructType, std::vector<TColumnConverter>& columnConverters) {
0 commit comments