1
1
#include " arrow_parser.h"
2
+ #include " arrow_metadata_constants.h"
2
3
3
4
#include < yt/yt/client/formats/parser.h>
4
5
@@ -635,6 +636,31 @@ i64 CheckAndTransformTimestamp(i64 arrowValue, arrow20::TimeUnit::type timeUnit,
635
636
636
637
// //////////////////////////////////////////////////////////////////////////////
637
638
639
+ std::optional<std::string> GetYtTypeFromMetadata (const std::shared_ptr<arrow20::Field>& schemaField)
640
+ {
641
+ auto columnMetadata = schemaField->metadata ();
642
+ if (!columnMetadata) {
643
+ return std::nullopt ;
644
+ }
645
+ auto valueResult = columnMetadata->Get (YtTypeMetadataKey);
646
+ if (valueResult.ok ()) {
647
+ return *valueResult;
648
+ }
649
+ return std::nullopt ;
650
+ }
651
+
652
+ bool HasEmptyStructTypeInMetadata (const std::shared_ptr<arrow20::Field>& schemaField)
653
+ {
654
+ return GetYtTypeFromMetadata (schemaField) == YtTypeMetadataValueEmptyStruct;
655
+ }
656
+
657
+ bool HasNestedOptionalTypeInMetadata (const std::shared_ptr<arrow20::Field>& schemaField)
658
+ {
659
+ return GetYtTypeFromMetadata (schemaField) == YtTypeMetadataValueNestedOptional;
660
+ }
661
+
662
+ // //////////////////////////////////////////////////////////////////////////////
663
+
638
664
class TArraySimpleVisitor
639
665
: public arrow20::TypeVisitor
640
666
{
@@ -1076,11 +1102,13 @@ class TArrayCompositeVisitor
1076
1102
TArrayCompositeVisitor (
1077
1103
TLogicalTypePtr ytType,
1078
1104
const std::shared_ptr<arrow20::Array>& array,
1105
+ const std::shared_ptr<arrow20::Field>& schemaField,
1079
1106
NYson::TCheckedInDebugYsonTokenWriter* writer,
1080
1107
int rowIndex)
1081
1108
: YTType_(DenullifyLogicalType(ytType))
1082
1109
, RowIndex_(rowIndex)
1083
1110
, Array_(array)
1111
+ , SchemaField_(schemaField)
1084
1112
, Writer_(writer)
1085
1113
{
1086
1114
YT_VERIFY (writer != nullptr );
@@ -1098,6 +1126,7 @@ class TArrayCompositeVisitor
1098
1126
TArrayCompositeVisitor visitor (
1099
1127
YTType_,
1100
1128
dictionary,
1129
+ SchemaField_,
1101
1130
Writer_,
1102
1131
dictionaryArrayColumn->GetValueIndex (RowIndex_));
1103
1132
ThrowOnError (dictionary->type ()->Accept (&visitor));
@@ -1293,6 +1322,7 @@ class TArrayCompositeVisitor
1293
1322
const int RowIndex_;
1294
1323
1295
1324
std::shared_ptr<arrow20::Array> Array_;
1325
+ std::shared_ptr<arrow20::Field> SchemaField_;
1296
1326
NYson::TCheckedInDebugYsonTokenWriter* Writer_ = nullptr ;
1297
1327
1298
1328
template <typename ArrayType>
@@ -1513,7 +1543,7 @@ class TArrayCompositeVisitor
1513
1543
1514
1544
auto listValue = array->value_slice (RowIndex_);
1515
1545
for (int offset = 0 ; offset < listValue->length (); ++offset) {
1516
- TArrayCompositeVisitor visitor (YTType_->AsListTypeRef ().GetElement (), listValue, Writer_, offset);
1546
+ TArrayCompositeVisitor visitor (YTType_->AsListTypeRef ().GetElement (), listValue, array-> type ()-> field ( 0 ), Writer_, offset);
1517
1547
try {
1518
1548
ThrowOnError (listValue->type ()->Accept (&visitor));
1519
1549
} catch (const std::exception& ex) {
@@ -1548,12 +1578,15 @@ class TArrayCompositeVisitor
1548
1578
auto keyList = allKeys->Slice (offset, length);
1549
1579
auto valueList = allValues->Slice (offset, length);
1550
1580
1581
+ // Map is represented as list of pairs.
1582
+ auto pairType = array->type ()->field (0 )->type ();
1583
+
1551
1584
Writer_->WriteBeginList ();
1552
1585
1553
1586
for (int offset = 0 ; offset < keyList->length (); ++offset) {
1554
1587
Writer_->WriteBeginList ();
1555
1588
1556
- TArrayCompositeVisitor keyVisitor (YTType_->AsDictTypeRef ().GetKey (), keyList, Writer_, offset);
1589
+ TArrayCompositeVisitor keyVisitor (YTType_->AsDictTypeRef ().GetKey (), keyList, pairType-> field ( 0 ), Writer_, offset);
1557
1590
try {
1558
1591
ThrowOnError (keyList->type ()->Accept (&keyVisitor));
1559
1592
} catch (const std::exception& ex) {
@@ -1564,7 +1597,7 @@ class TArrayCompositeVisitor
1564
1597
1565
1598
Writer_->WriteItemSeparator ();
1566
1599
1567
- TArrayCompositeVisitor valueVisitor (YTType_->AsDictTypeRef ().GetValue (), valueList, Writer_, offset);
1600
+ TArrayCompositeVisitor valueVisitor (YTType_->AsDictTypeRef ().GetValue (), valueList, pairType-> field ( 1 ), Writer_, offset);
1568
1601
try {
1569
1602
ThrowOnError (valueList->type ()->Accept (&valueVisitor));
1570
1603
} catch (const std::exception& ex) {
@@ -1584,29 +1617,41 @@ class TArrayCompositeVisitor
1584
1617
return arrow20::Status::OK ();
1585
1618
}
1586
1619
1587
- arrow20::Status ParseStruct ()
1620
+ void ParseStructForStruct ()
1588
1621
{
1589
- if (YTType_->GetMetatype () != ELogicalMetatype::Struct) {
1590
- THROW_ERROR_EXCEPTION (" Unexpected arrow type \" struct\" for YT metatype %Qlv" ,
1591
- YTType_->GetMetatype ());
1592
- }
1593
1622
auto array = std::static_pointer_cast<arrow20::StructArray>(Array_);
1594
1623
if (array->IsNull (RowIndex_)) {
1595
1624
Writer_->WriteEntity ();
1596
1625
} else {
1597
1626
Writer_->WriteBeginList ();
1598
- auto structFields = YTType_->AsStructTypeRef ().GetFields ();
1599
- if (std::ssize (structFields) != array->num_fields ()) {
1600
- THROW_ERROR_EXCEPTION (" The number of fields in the Arrow \" struct\" type does not match the number of fields in the YT \" struct\" type" )
1601
- << TErrorAttribute (" arrow_field_count" , array->num_fields ())
1602
- << TErrorAttribute (" yt_field_count" , std::ssize (structFields));
1627
+
1628
+ const auto & structFields = YTType_->AsStructTypeRef ().GetFields ();
1629
+
1630
+ if (structFields.empty ()) {
1631
+ if (!HasEmptyStructTypeInMetadata (SchemaField_)) {
1632
+ THROW_ERROR_EXCEPTION (
1633
+ " YT \" struct\" type has no fields, but no metadata found with the key \' %v\' and the value \' %v\' " ,
1634
+ YtTypeMetadataKey,
1635
+ YtTypeMetadataValueEmptyStruct);
1636
+ }
1637
+ if (array->num_fields () != 1 && array->field (0 )->type ()->Equals (arrow20::null ())) {
1638
+ THROW_ERROR_EXCEPTION (" YT \" struct\" type has no fields, but Arrow \" struct\" type does not have a single dummy null field" );
1639
+ }
1640
+ } else {
1641
+ if (std::ssize (structFields) != array->num_fields ()) {
1642
+ THROW_ERROR_EXCEPTION (" The number of fields in the Arrow \" struct\" type does not match the number of fields in the YT \" struct\" type" )
1643
+ << TErrorAttribute (" arrow_field_count" , array->num_fields ())
1644
+ << TErrorAttribute (" yt_field_count" , std::ssize (structFields));
1645
+ }
1603
1646
}
1647
+
1648
+ const auto & structType = std::static_pointer_cast<arrow20::StructType>(array->type ());
1604
1649
for (const auto & field : structFields) {
1605
1650
auto arrowField = array->GetFieldByName (field.Name );
1606
1651
if (!arrowField) {
1607
1652
THROW_ERROR_EXCEPTION (" Field %Qv is not found in arrow type \" struct\" " , field.Name );
1608
1653
}
1609
- TArrayCompositeVisitor visitor (field.Type , arrowField, Writer_, RowIndex_);
1654
+ TArrayCompositeVisitor visitor (field.Type , arrowField, structType-> GetFieldByName (field. Name ), Writer_, RowIndex_);
1610
1655
try {
1611
1656
ThrowOnError (arrowField->type ()->Accept (&visitor));
1612
1657
} catch (const std::exception& ex) {
@@ -1619,6 +1664,53 @@ class TArrayCompositeVisitor
1619
1664
1620
1665
Writer_->WriteEndList ();
1621
1666
}
1667
+ }
1668
+
1669
+ void ParseStructForOptional ()
1670
+ {
1671
+ auto array = std::static_pointer_cast<arrow20::StructArray>(Array_);
1672
+ if (array->IsNull (RowIndex_)) {
1673
+ Writer_->WriteEntity ();
1674
+ } else {
1675
+ Writer_->WriteBeginList ();
1676
+ if (!HasNestedOptionalTypeInMetadata (SchemaField_)) {
1677
+ THROW_ERROR_EXCEPTION (
1678
+ " The element of YT \" optional\" type is nullable, but no metadata found with the key \' %v\' and the value \' %v\' " ,
1679
+ YtTypeMetadataKey,
1680
+ YtTypeMetadataValueNestedOptional);
1681
+ }
1682
+ if (array->num_fields () != 1 ) {
1683
+ THROW_ERROR_EXCEPTION (" The number of fields in the Arrow \" struct\" type is not equal to 1 for the YT \" optional\" type" )
1684
+ << TErrorAttribute (" arrow_field_count" , array->num_fields ());
1685
+ }
1686
+
1687
+ auto arrowField = array->field (0 );
1688
+ TArrayCompositeVisitor visitor (YTType_->GetElement (), arrowField, array->type ()->field (0 ), Writer_, RowIndex_);
1689
+ try {
1690
+ ThrowOnError (arrowField->type ()->Accept (&visitor));
1691
+ } catch (const std::exception& ex) {
1692
+ THROW_ERROR_EXCEPTION (" Failed to parse arrow struct field for the YT \" optional\" type" )
1693
+ << ex;
1694
+ }
1695
+
1696
+ Writer_->WriteItemSeparator ();
1697
+ Writer_->WriteEndList ();
1698
+ }
1699
+ }
1700
+
1701
+ arrow20::Status ParseStruct ()
1702
+ {
1703
+ switch (YTType_->GetMetatype ()) {
1704
+ case ELogicalMetatype::Struct:
1705
+ ParseStructForStruct ();
1706
+ break ;
1707
+ case ELogicalMetatype::Optional:
1708
+ ParseStructForOptional ();
1709
+ break ;
1710
+ default :
1711
+ THROW_ERROR_EXCEPTION (" Unexpected arrow type \" struct\" for YT metatype %Qlv" ,
1712
+ YTType_->GetMetatype ());
1713
+ }
1622
1714
return arrow20::Status::OK ();
1623
1715
}
1624
1716
@@ -1650,6 +1742,7 @@ void PrepareArrayForComplexType(
1650
1742
const TLogicalTypePtr& denullifiedLogicalType,
1651
1743
const std::shared_ptr<TChunkedOutputStream>& bufferForStringLikeValues,
1652
1744
const std::shared_ptr<arrow20::Array>& column,
1745
+ const std::shared_ptr<arrow20::Field>& schemaField,
1653
1746
TUnversionedRowValues& rowValues,
1654
1747
int columnId)
1655
1748
{
@@ -1699,6 +1792,16 @@ void PrepareArrayForComplexType(
1699
1792
break ;
1700
1793
1701
1794
case ELogicalMetatype::Optional:
1795
+ CheckArrowType (
1796
+ metatype,
1797
+ {
1798
+ arrow20::Type::STRUCT,
1799
+ arrow20::Type::BINARY
1800
+ },
1801
+ column->type ()->name (),
1802
+ column->type_id ());
1803
+ break ;
1804
+
1702
1805
case ELogicalMetatype::Tuple:
1703
1806
case ELogicalMetatype::VariantTuple:
1704
1807
case ELogicalMetatype::VariantStruct:
@@ -1735,7 +1838,7 @@ void PrepareArrayForComplexType(
1735
1838
TBufferOutput out (valueBuffer);
1736
1839
NYson::TCheckedInDebugYsonTokenWriter writer (&out);
1737
1840
1738
- TArrayCompositeVisitor visitor (denullifiedLogicalType, column, &writer, rowIndex);
1841
+ TArrayCompositeVisitor visitor (denullifiedLogicalType, column, schemaField, &writer, rowIndex);
1739
1842
1740
1843
ThrowOnError (column->type ()->Accept (&visitor));
1741
1844
@@ -1760,14 +1863,15 @@ void PrepareArray(
1760
1863
const TLogicalTypePtr& denullifiedLogicalType,
1761
1864
const std::shared_ptr<TChunkedOutputStream>& bufferForStringLikeValues,
1762
1865
const std::shared_ptr<arrow20::Array>& column,
1866
+ const std::shared_ptr<arrow20::Field>& schemaField,
1763
1867
TUnversionedRowValues& rowValues,
1764
1868
int columnId)
1765
1869
{
1766
1870
if (column->type ()->id () == arrow20::Type::DICTIONARY) {
1767
1871
auto dictionaryArrayColumn = std::static_pointer_cast<arrow20::DictionaryArray>(column);
1768
1872
auto dictionary = dictionaryArrayColumn->dictionary ();
1769
1873
TUnversionedRowValues dictionaryValues (dictionary->length ());
1770
- PrepareArray (denullifiedLogicalType, bufferForStringLikeValues, dictionary, dictionaryValues, columnId);
1874
+ PrepareArray (denullifiedLogicalType, bufferForStringLikeValues, dictionary, schemaField, dictionaryValues, columnId);
1771
1875
1772
1876
for (int offset = 0 ; offset < std::ssize (rowValues); ++offset) {
1773
1877
if (dictionaryArrayColumn->IsNull (offset)) {
@@ -1802,6 +1906,7 @@ void PrepareArray(
1802
1906
denullifiedLogicalType,
1803
1907
bufferForStringLikeValues,
1804
1908
column,
1909
+ schemaField,
1805
1910
rowValues,
1806
1911
columnId);
1807
1912
@@ -1869,6 +1974,7 @@ class TListener
1869
1974
denullifiedColumnType,
1870
1975
bufferForStringLikeValues,
1871
1976
batch->column (columnIndex),
1977
+ batch->schema ()->field (columnIndex),
1872
1978
rowsValues[columnIndex],
1873
1979
columnId);
1874
1980
} catch (const std::exception& ex) {
0 commit comments