diff --git a/dwio/nimble/velox/FieldWriter.cpp b/dwio/nimble/velox/FieldWriter.cpp index edbd2787..fc75b31c 100644 --- a/dwio/nimble/velox/FieldWriter.cpp +++ b/dwio/nimble/velox/FieldWriter.cpp @@ -1614,6 +1614,11 @@ class FlatMapFieldWriter : public FieldWriter { NIMBLE_CHECK( currentValueFields_.empty() && allValueFields_.empty(), "Mixing map and flatmap vectors in the FlatMapFieldWriter is not supported"); + // Mark that we have passthrough flatmap writes (ROW vector written as MAP). + // This is used to skip the raw size check in writeColumnStats() since + // getRawSizeFromVector doesn't properly align with column stats collection + // for passthrough flatmaps. + context_.setHasPassthroughFlatMapWrites(true); const auto& rowVector = vector->as(); NIMBLE_CHECK_NOT_NULL( rowVector, diff --git a/dwio/nimble/velox/FieldWriter.h b/dwio/nimble/velox/FieldWriter.h index 31e78a16..8aac3979 100644 --- a/dwio/nimble/velox/FieldWriter.h +++ b/dwio/nimble/velox/FieldWriter.h @@ -213,6 +213,14 @@ class FieldWriterContext { ignoreTopLevelNulls_ = value; } + inline bool hasPassthroughFlatMapWrites() const { + return hasPassthroughFlatMapWrites_; + } + + inline void setHasPassthroughFlatMapWrites(bool value) { + hasPassthroughFlatMapWrites_ = value; + } + inline std::unique_ptr& inputBufferGrowthPolicy() { return inputBufferGrowthPolicy_; } @@ -424,6 +432,9 @@ class FieldWriterContext { folly::F14FastSet deduplicatedMapNodeIds_; bool ignoreTopLevelNulls_{false}; bool disableSharedStringBuffers_{false}; + // Set to true when a flatmap passthrough write is detected + // (ROW vector written as MAP via flatMapColumns configuration). + bool hasPassthroughFlatMapWrites_{false}; std::unique_ptr inputBufferGrowthPolicy_; std::unique_ptr stringBufferGrowthPolicy_; diff --git a/dwio/nimble/velox/RawSizeUtils.cpp b/dwio/nimble/velox/RawSizeUtils.cpp index 3bec52e4..1b4d954f 100644 --- a/dwio/nimble/velox/RawSizeUtils.cpp +++ b/dwio/nimble/velox/RawSizeUtils.cpp @@ -31,10 +31,9 @@ namespace facebook::nimble { // kTimestampLogicalSize. constexpr uint64_t kTimestampLogicalSize = 12; -namespace { - // Returns the size in bytes for a given TypeKind. -// Used for calculating key sizes in passthrough flatmaps. +// Used for calculating key sizes in passthrough flatmaps and for +// handling type mismatches between vector types and schema types. // Returns std::nullopt for variable-length types (VARCHAR, VARBINARY, etc.) std::optional getTypeSizeFromKind(velox::TypeKind kind) { switch (kind) { @@ -61,6 +60,96 @@ std::optional getTypeSizeFromKind(velox::TypeKind kind) { } } +namespace { + +// Checks if upcasting from vectorType to schemaType is valid. +// Only allows same-family promotions: integer->integer, float->float +// where the schema type is larger or equal. +bool isValidUpcast(velox::TypeKind vectorType, velox::TypeKind schemaType) { + auto vectorSize = getTypeSizeFromKind(vectorType); + auto schemaSize = getTypeSizeFromKind(schemaType); + + if (!vectorSize.has_value() || !schemaSize.has_value()) { + return false; + } + + // Integer type family + const std::unordered_set integerTypes = { + velox::TypeKind::BOOLEAN, + velox::TypeKind::TINYINT, + velox::TypeKind::SMALLINT, + velox::TypeKind::INTEGER, + velox::TypeKind::BIGINT, + }; + + // Floating point type family + const std::unordered_set floatTypes = { + velox::TypeKind::REAL, + velox::TypeKind::DOUBLE, + }; + + // Integer to integer upcast + if (integerTypes.contains(vectorType) && integerTypes.contains(schemaType)) { + return *schemaSize >= *vectorSize; + } + + // Float to float upcast + if (floatTypes.contains(vectorType) && floatTypes.contains(schemaType)) { + return *schemaSize >= *vectorSize; + } + + return false; +} + +// Computes raw size for scalar types when the vector type differs from the +// schema type. Counts nulls from the vector's actual data but uses the schema +// type's size for computing the final raw size. Only supports scalar types +// (fixed-width numeric types). +uint64_t getUpcastedRawSizeFromVector( + const velox::VectorPtr& vector, + const velox::common::Ranges& ranges, + RawSizeContext& context, + size_t schemaTypeSize) { + VELOX_CHECK_NOT_NULL(vector); + + const auto& encoding = vector->encoding(); + uint64_t nullCount = 0; + + switch (encoding) { + case velox::VectorEncoding::Simple::FLAT: { + if (vector->mayHaveNulls()) { + for (const auto& row : ranges) { + if (vector->isNullAt(row)) { + ++nullCount; + } + } + } + break; + } + case velox::VectorEncoding::Simple::CONSTANT: { + nullCount = vector->mayHaveNulls() ? ranges.size() : 0; + break; + } + default: { + auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( + context.getDecodedVectorManager()); + velox::DecodedVector& decodedVector = localDecodedVector.get(); + decodedVector.decode(*vector); + if (decodedVector.mayHaveNulls()) { + for (const auto& row : ranges) { + if (decodedVector.isNullAt(row)) { + ++nullCount; + } + } + } + } + } + + context.nullCount = nullCount; + uint64_t nonNullCount = ranges.size() - nullCount; + return (nonNullCount * schemaTypeSize) + (nullCount * kNullSize); +} + } // namespace template @@ -98,7 +187,7 @@ uint64_t getRawSizeFromFixedWidthVector( context.nullCount = nullCount; return ((ranges.size() - nullCount) * sizeof(T)) + - (nullCount * NULL_SIZE); + (nullCount * kNullSize); } case velox::VectorEncoding::Simple::CONSTANT: { const auto* constVector = vector->as>(); @@ -109,21 +198,15 @@ uint64_t getRawSizeFromFixedWidthVector( vector->typeKind()); context.nullCount = constVector->mayHaveNulls() ? ranges.size() : 0; - return constVector->mayHaveNulls() ? ranges.size() * NULL_SIZE + return constVector->mayHaveNulls() ? ranges.size() * kNullSize : ranges.size() * sizeof(T); } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictVector = vector->as>(); - VELOX_CHECK_NOT_NULL( - dictVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - + default: { + // Decode the vector to handle any encoding (FLAT, DICTIONARY, etc.) auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( context.getDecodedVectorManager()); velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictVector); + decodedVector.decode(*vector); uint64_t nullCount = 0; if (decodedVector.mayHaveNulls()) { @@ -133,26 +216,22 @@ uint64_t getRawSizeFromFixedWidthVector( } } } - context.nullCount = nullCount; return ((ranges.size() - nullCount) * sizeof(T)) + - (nullCount * NULL_SIZE); - } - default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); + (nullCount * kNullSize); } } } -// Specialized function for TIMESTAMP vectors that uses kTimestampLogicalSize -// instead of sizeof(velox::Timestamp) to match DWRF and Nimble FieldWriter. uint64_t getRawSizeFromTimestampVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context) { VELOX_CHECK_NOT_NULL(vector); - using T = velox::Timestamp; - + VELOX_DCHECK( + vector->typeKind() == velox::TypeKind::TIMESTAMP, + "Wrong vector type. Expected TIMESTAMP."); + using T = typename velox::TypeTraits::NativeType; const auto& encoding = vector->encoding(); switch (encoding) { case velox::VectorEncoding::Simple::FLAT: { @@ -174,7 +253,7 @@ uint64_t getRawSizeFromTimestampVector( context.nullCount = nullCount; return ((ranges.size() - nullCount) * kTimestampLogicalSize) + - (nullCount * NULL_SIZE); + (nullCount * kNullSize); } case velox::VectorEncoding::Simple::CONSTANT: { const auto* constVector = vector->as>(); @@ -186,21 +265,15 @@ uint64_t getRawSizeFromTimestampVector( context.nullCount = constVector->mayHaveNulls() ? ranges.size() : 0; return constVector->mayHaveNulls() - ? ranges.size() * NULL_SIZE + ? ranges.size() * kNullSize : ranges.size() * kTimestampLogicalSize; } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictVector = vector->as>(); - VELOX_CHECK_NOT_NULL( - dictVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - + default: { + // Decode the vector to handle any encoding (FLAT, DICTIONARY, etc.) auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( context.getDecodedVectorManager()); velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictVector); + decodedVector.decode(*vector); uint64_t nullCount = 0; if (decodedVector.mayHaveNulls()) { @@ -210,13 +283,9 @@ uint64_t getRawSizeFromTimestampVector( } } } - context.nullCount = nullCount; return ((ranges.size() - nullCount) * kTimestampLogicalSize) + - (nullCount * NULL_SIZE); - } - default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); + (nullCount * kNullSize); } } } @@ -226,6 +295,11 @@ uint64_t getRawSizeFromStringVector( const velox::common::Ranges& ranges, RawSizeContext& context) { VELOX_CHECK_NOT_NULL(vector); + VELOX_DCHECK( + vector->typeKind() == velox::TypeKind::VARCHAR || + vector->typeKind() == velox::TypeKind::VARBINARY, + "Wrong vector type. Expected VARCHAR | VARBINARY."); + const auto& encoding = vector->encoding(); switch (encoding) { case velox::VectorEncoding::Simple::FLAT: { @@ -236,24 +310,24 @@ uint64_t getRawSizeFromStringVector( encoding, vector->typeKind()); - uint64_t rawSize = 0; uint64_t nullCount = 0; + uint64_t totalStringSize = 0; if (flatVector->mayHaveNulls()) { for (const auto& row : ranges) { if (flatVector->isNullAt(row)) { ++nullCount; } else { - rawSize += flatVector->valueAt(row).size(); + totalStringSize += flatVector->valueAt(row).size(); } } } else { for (const auto& row : ranges) { - rawSize += flatVector->valueAt(row).size(); + totalStringSize += flatVector->valueAt(row).size(); } } context.nullCount = nullCount; - return rawSize + (nullCount * NULL_SIZE); + return totalStringSize + (nullCount * kNullSize); } case velox::VectorEncoding::Simple::CONSTANT: { const auto* constVector = @@ -266,46 +340,36 @@ uint64_t getRawSizeFromStringVector( context.nullCount = constVector->mayHaveNulls() ? ranges.size() : 0; return constVector->mayHaveNulls() - ? ranges.size() * NULL_SIZE + ? ranges.size() * kNullSize : ranges.size() * constVector->value().size(); } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - + default: { + // Decode the vector to handle any encoding (FLAT, DICTIONARY, etc.) auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( context.getDecodedVectorManager()); velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictVector); + decodedVector.decode(*vector); - uint64_t rawSize = 0; uint64_t nullCount = 0; - const auto* indices = decodedVector.indices(); - const auto decodedVectorData = decodedVector.data(); + uint64_t totalStringSize = 0; if (decodedVector.mayHaveNulls()) { for (const auto& row : ranges) { if (decodedVector.isNullAt(row)) { ++nullCount; } else { - rawSize += decodedVectorData[indices[row]].size(); + totalStringSize += + decodedVector.valueAt(row).size(); } } } else { for (const auto& row : ranges) { - rawSize += decodedVectorData[indices[row]].size(); + totalStringSize += + decodedVector.valueAt(row).size(); } } context.nullCount = nullCount; - return rawSize + (nullCount * NULL_SIZE); - } - default: { - VELOX_FAIL("Unsupported encoding: {}", encoding); + return totalStringSize + (nullCount * kNullSize); } } } @@ -314,18 +378,22 @@ uint64_t getRawSizeFromConstantComplexVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - bool topLevelRow = false) { + const velox::dwio::common::TypeWithId* type, + const folly::F14FastSet& flatMapNodeIds, + const bool topLevel = false) { VELOX_CHECK_NOT_NULL(vector); + const auto& encoding = vector->encoding(); VELOX_DCHECK( - velox::VectorEncoding::Simple::CONSTANT == vector->encoding(), - "Wrong vector encoding. Expected VectorEncoding::Simple::CONSTANT."); + encoding == velox::VectorEncoding::Simple::CONSTANT, + "Expected encoding CONSTANT. Encoding: {}.", + encoding); const auto* constantVector = vector->as>(); VELOX_CHECK_NOT_NULL( constantVector, "Encoding mismatch on ConstantVector. Encoding: {}. TypeKind: {}.", - vector->encoding(), + encoding, vector->typeKind()); const auto& valueVector = constantVector->valueVector(); @@ -334,28 +402,43 @@ uint64_t getRawSizeFromConstantComplexVector( childRanges.add(index, index + 1); uint64_t rawSize = 0; - if (topLevelRow) { + if (topLevel && vector->typeKind() == velox::TypeKind::ROW) { VELOX_CHECK_EQ( velox::TypeKind::ROW, valueVector->typeKind(), "Value vector should be a RowVector"); rawSize = getRawSizeFromRowVector( - valueVector, childRanges, context, /*topLevel=*/true); - for (int idx = 0; idx < context.columnCount(); ++idx) { - context.setSizeAt(idx, context.sizeAt(idx) * ranges.size()); - context.setNullsAt(idx, context.nullsAt(idx) * ranges.size()); - } + valueVector, + childRanges, + context, + type, + flatMapNodeIds, + /*topLevel=*/true); } else { - rawSize = getRawSizeFromVector(valueVector, childRanges, context); + rawSize = getRawSizeFromVector( + valueVector, childRanges, context, type, flatMapNodeIds); + } + + context.nullCount = 0; + if (topLevel) { + // Scale up all column sizes by the number of rows + for (size_t i = 0; i < context.columnCount(); ++i) { + context.setSizeAt(i, context.sizeAt(i) * ranges.size()); + context.setNullsAt(i, context.nullsAt(i) * ranges.size()); + } } - context.nullCount = constantVector->mayHaveNulls() ? ranges.size() : 0; return rawSize * ranges.size(); } +// Unified getRawSizeFromArrayVector that works with optional TypeWithId* +// When type is nullptr, uses vector's actual type for calculations. +// When type is non-null, uses schema type for size calculations. uint64_t getRawSizeFromArrayVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, - RawSizeContext& context) { + RawSizeContext& context, + const velox::dwio::common::TypeWithId* type, + const folly::F14FastSet& flatMapNodeIds) { VELOX_CHECK_NOT_NULL(vector); const auto& encoding = vector->encoding(); const velox::ArrayVector* arrayVector; @@ -402,23 +485,16 @@ uint64_t getRawSizeFromArrayVector( break; } case velox::VectorEncoding::Simple::CONSTANT: { - return getRawSizeFromConstantComplexVector(vector, ranges, context); + return getRawSizeFromConstantComplexVector( + vector, ranges, context, type, flatMapNodeIds); } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictionaryArrayVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictionaryArrayVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - + default: { + // Decode the vector to handle any encoding (ARRAY, DICTIONARY, etc.) auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( context.getDecodedVectorManager()); velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictionaryArrayVector); + decodedVector.decode(*vector); - // Decoded ComplexVectors are stored in baseVector. arrayVector = decodedVector.base()->as(); VELOX_CHECK_NOT_NULL( arrayVector, @@ -445,21 +521,22 @@ uint64_t getRawSizeFromArrayVector( break; } - default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); - } } - // ARRAY and DICTIONARY encodings should only reach here uint64_t rawSize = 0; if (childRanges.size()) { - rawSize += - getRawSizeFromVector(arrayVector->elements(), childRanges, context); + // Use schema's element type for computing sizes + rawSize += getRawSizeFromVector( + arrayVector->elements(), + childRanges, + context, + type ? type->childAt(0).get() : nullptr, + flatMapNodeIds); } context.nullCount = nullCount; if (nullCount) { - rawSize += nullCount * NULL_SIZE; + rawSize += nullCount * kNullSize; } return rawSize; @@ -467,13 +544,27 @@ uint64_t getRawSizeFromArrayVector( namespace { +// Unified getRawSizeFromMapVector that works with optional TypeWithId* uint64_t getRawSizeFromMapVector( const velox::MapVector& mapVector, const velox::common::Ranges& childRanges, - RawSizeContext& context) { + RawSizeContext& context, + const velox::dwio::common::TypeWithId* type, + const folly::F14FastSet& flatMapNodeIds) { uint64_t rawSize = 0; - rawSize += getRawSizeFromVector(mapVector.mapKeys(), childRanges, context); - rawSize += getRawSizeFromVector(mapVector.mapValues(), childRanges, context); + // Use schema's key/value types for computing sizes + rawSize += getRawSizeFromVector( + mapVector.mapKeys(), + childRanges, + context, + type ? type->childAt(0).get() : nullptr, + flatMapNodeIds); + rawSize += getRawSizeFromVector( + mapVector.mapValues(), + childRanges, + context, + type ? type->childAt(1).get() : nullptr, + flatMapNodeIds); return rawSize; } @@ -523,12 +614,15 @@ uint64_t getRawSizeFromFlatMapVector( } // namespace +// Unified getRawSizeFromMap that works with optional TypeWithId* uint64_t getRawSizeFromMap( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, - RawSizeContext& context) { + RawSizeContext& context, + const velox::dwio::common::TypeWithId* type, + const folly::F14FastSet& flatMapNodeIds) { VELOX_CHECK_NOT_NULL(vector); - const auto& encoding = vector->encoding(); + auto encoding = vector->encoding(); const velox::MapVector* mapVector; const velox::vector_size_t* offsets; @@ -549,10 +643,23 @@ uint64_t getRawSizeFromMap( } }; + if (encoding == velox::VectorEncoding::Simple::CONSTANT) { + return getRawSizeFromConstantComplexVector( + vector, ranges, context, type, flatMapNodeIds); + } + + const velox::BaseVector* decoded = vector.get(); + auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( + context.getDecodedVectorManager()); + velox::DecodedVector& decodedVector = localDecodedVector.get(); + decodedVector.decode(*vector); + decoded = decodedVector.base(); + encoding = decoded->encoding(); + switch (encoding) { - // Handle top-level (regular) Map vectors. + // Handle regular map vectors. case velox::VectorEncoding::Simple::MAP: { - mapVector = vector->as(); + mapVector = decoded->as(); VELOX_CHECK_NOT_NULL( mapVector, "Encoding mismatch on MapVector. Encoding: {}. TypeKind: {}.", @@ -562,329 +669,81 @@ uint64_t getRawSizeFromMap( offsets = mapVector->rawOffsets(); sizes = mapVector->rawSizes(); - if (mapVector->mayHaveNulls()) { - const uint64_t* nulls = mapVector->rawNulls(); + if (decodedVector.mayHaveNulls()) { for (const auto& row : ranges) { - if (velox::bits::isBitNull(nulls, row)) { + if (decodedVector.isNullAt(row)) { ++nullCount; } else { - processMapRow(row); + processMapRow(decodedVector.index(row)); } } } else { for (const auto& row : ranges) { - processMapRow(row); + processMapRow(decodedVector.index(row)); } } - rawSize += getRawSizeFromMapVector(*mapVector, childRanges, context); + rawSize += getRawSizeFromMapVector( + *mapVector, childRanges, context, type, flatMapNodeIds); break; } - // Handle top-level Flat Map vectors. + // Handle flat map vectors. case velox::VectorEncoding::Simple::FLAT_MAP: { - auto flatMapVector = vector->as(); + auto flatMapVector = decoded->as(); VELOX_CHECK_NOT_NULL( flatMapVector, "Encoding mismatch on FlatMapVector. Encoding: {}. TypeKind: {}.", encoding, vector->typeKind()); - if (flatMapVector->mayHaveNulls()) { - const uint64_t* nulls = flatMapVector->rawNulls(); - for (const auto& row : ranges) { - if (velox::bits::isBitNull(nulls, row)) { - ++nullCount; - } else { - childRanges.add(row, row + 1); - } - } - rawSize += - getRawSizeFromFlatMapVector(*flatMapVector, childRanges, context); - } else { - rawSize += getRawSizeFromFlatMapVector(*flatMapVector, ranges, context); - } - break; - } - - // Cases when maps or flat maps are wrapped by a constant. - case velox::VectorEncoding::Simple::CONSTANT: { - return getRawSizeFromConstantComplexVector(vector, ranges, context); - } - - // Cases when maps or flat maps are wrapped by a dictionary. - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictionaryMapVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictionaryMapVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - - auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( - context.getDecodedVectorManager()); - velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictionaryMapVector); - - // Now switch on the inner type of the dictionary; must be either a map - // or a flat map. - switch (decodedVector.base()->encoding()) { - // Dictionary wrapped around a map: - case velox::VectorEncoding::Simple::MAP: { - mapVector = decodedVector.base()->as(); - VELOX_CHECK_NOT_NULL( - mapVector, - "Encoding mismatch on FlatVector. MapVector: {}. TypeKind: {}.", - decodedVector.base()->encoding(), - decodedVector.base()->typeKind()); - - offsets = mapVector->rawOffsets(); - sizes = mapVector->rawSizes(); - - if (decodedVector.mayHaveNulls()) { - for (const auto& row : ranges) { - if (decodedVector.isNullAt(row)) { - ++nullCount; - } else { - processMapRow(decodedVector.index(row)); - } - } - } else { - for (const auto& row : ranges) { - processMapRow(decodedVector.index(row)); - } - } - rawSize += getRawSizeFromMapVector(*mapVector, childRanges, context); - break; - } - // Dictionary wrapped around a flat map: - case velox::VectorEncoding::Simple::FLAT_MAP: { - auto flatMapVector = decodedVector.base()->as(); - VELOX_CHECK_NOT_NULL( - flatMapVector, - "Encoding mismatch on FlatMapVector. Encoding: {}. TypeKind: {}.", - decodedVector.base()->encoding(), - decodedVector.base()->typeKind()); - - if (decodedVector.mayHaveNulls()) { - for (const auto& row : ranges) { - if (decodedVector.isNullAt(row)) { - ++nullCount; - } else { - auto idx = decodedVector.index(row); - childRanges.add(idx, idx + 1); - } - } - } else { - for (const auto& row : ranges) { - auto idx = decodedVector.index(row); - childRanges.add(idx, idx + 1); - } - } - rawSize += - getRawSizeFromFlatMapVector(*flatMapVector, childRanges, context); - break; - } - default: - VELOX_FAIL( - "Unsupported map encoding wrapped by DICTIONARY: {}.", encoding); - } - break; - } - default: - VELOX_FAIL("Unsupported map encoding: {}.", encoding); - } - - context.nullCount = nullCount; - if (nullCount) { - rawSize += nullCount * NULL_SIZE; - } - return rawSize; -} - -uint64_t getRawSizeFromRowVector( - const velox::VectorPtr& vector, - const velox::common::Ranges& ranges, - RawSizeContext& context, - const bool topLevel) { - VELOX_CHECK_NOT_NULL(vector); - const auto& encoding = vector->encoding(); - const velox::RowVector* rowVector; - uint64_t nullCount = 0; - velox::common::Ranges childRanges; - const velox::common::Ranges* childRangesPtr; - - switch (encoding) { - case velox::VectorEncoding::Simple::ROW: { - rowVector = vector->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - - childRangesPtr = &childRanges; - if (rowVector->mayHaveNulls()) { - const auto& nulls = rowVector->rawNulls(); - for (const auto& row : ranges) { - if (velox::bits::isBitNull(nulls, row)) { - ++nullCount; - } else { - childRanges.add(row, row + 1); - } - } - } else { - childRangesPtr = &ranges; - } - break; - } - case velox::VectorEncoding::Simple::CONSTANT: { - return getRawSizeFromConstantComplexVector( - vector, ranges, context, topLevel); - } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictionaryRowVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictionaryRowVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); - - auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( - context.getDecodedVectorManager()); - velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictionaryRowVector); - - rowVector = decodedVector.base()->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - decodedVector.base()->encoding(), - decodedVector.base()->typeKind()); - - childRangesPtr = &childRanges; if (decodedVector.mayHaveNulls()) { for (const auto& row : ranges) { if (decodedVector.isNullAt(row)) { ++nullCount; } else { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } } } else { - for (auto& row : ranges) { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); + for (const auto& row : ranges) { + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } } + rawSize += + getRawSizeFromFlatMapVector(*flatMapVector, childRanges, context); break; } default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); - } - } - - uint64_t rawSize = 0; - const auto nonNullCount = (*childRangesPtr).size(); - if (nonNullCount) { - const auto childrenSize = rowVector->childrenSize(); - - for (size_t i = 0; i < childrenSize; ++i) { - auto childRawSize = - getRawSizeFromVector(rowVector->childAt(i), *childRangesPtr, context); - rawSize += childRawSize; - if (topLevel) { - context.appendSize(childRawSize); - context.appendNullCount(context.nullCount); - } - } - } else if (topLevel) { - for (size_t i = 0; i < rowVector->childrenSize(); ++i) { - context.appendSize(0); - context.appendNullCount(0); + VELOX_FAIL("Unexpected encoding for decoded vector base: {}.", encoding); } } context.nullCount = nullCount; if (nullCount) { - rawSize += nullCount * NULL_SIZE; + rawSize += nullCount * kNullSize; } return rawSize; } -// Returns uint64_t bytes of raw data in the vector. -uint64_t getRawSizeFromVector( - const velox::VectorPtr& vector, - const velox::common::Ranges& ranges, - RawSizeContext& context) { - VELOX_CHECK_NOT_NULL(vector); - const auto& typeKind = vector->typeKind(); - switch (typeKind) { - case velox::TypeKind::BOOLEAN: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::TINYINT: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::SMALLINT: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::INTEGER: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::BIGINT: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::REAL: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::DOUBLE: { - return getRawSizeFromFixedWidthVector( - vector, ranges, context); - } - case velox::TypeKind::TIMESTAMP: { - return getRawSizeFromTimestampVector(vector, ranges, context); - } - case velox::TypeKind::VARCHAR: - case velox::TypeKind::VARBINARY: { - return getRawSizeFromStringVector(vector, ranges, context); - } - case velox::TypeKind::ARRAY: { - return getRawSizeFromArrayVector(vector, ranges, context); - } - case velox::TypeKind::MAP: { - return getRawSizeFromMap(vector, ranges, context); - } - case velox::TypeKind::ROW: { - return getRawSizeFromRowVector(vector, ranges, context); - } - default: { - VELOX_FAIL("Unsupported type: {}.", typeKind); - } - } -} - -uint64_t getRawSizeFromVector( - const velox::VectorPtr& vector, - const velox::common::Ranges& ranges) { - RawSizeContext context; - return getRawSizeFromVector(vector, ranges, context); -} - -// Computes raw size for a passthrough flatmap. -// A passthrough flatmap is a ROW vector that represents a MAP in the schema, -// where each ROW field name becomes a map key and each ROW child becomes the -// corresponding map value. +// Passthrough flatmap handling: Nimble treats ROW vectors at certain node IDs +// as "passthrough flatmaps" - maps where keys are field names and values +// correspond to the map value type. This is indicated when: +// 1. The node's id is in flatMapNodeIds, AND +// 2. The vector at that level is a ROW vector +// +// For passthrough flatmaps, we need to compute: +// - Key sizes: Fixed-size based on declared key type in schema (VARCHAR, +// BIGINT, etc.) +// - Value sizes: Computed recursively for each field, using the map's value +// type +// from schema +// +// The raw size is: sum of (key_size * field_count + value_sizes) for each row +// where the field is present, plus null handling. uint64_t getRawSizeFromPassthroughFlatMap( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, @@ -893,139 +752,100 @@ uint64_t getRawSizeFromPassthroughFlatMap( const bool topLevel) { VELOX_CHECK_NOT_NULL(vector); const auto& encoding = vector->encoding(); - const velox::RowVector* rowVector = nullptr; - uint64_t nullCount = 0; - velox::common::Ranges childRanges; - const velox::common::Ranges* childRangesPtr = nullptr; - - // For passthrough flatmaps, the schema type is MAP. - // Get the key type from type.childAt(0) which is the MAP's key type. - const auto& keyType = type.childAt(0); - std::optional keyTypeSize = - getTypeSizeFromKind(keyType->type()->kind()); - - // For VARCHAR keys, compute the total string key size from ROW field names - size_t stringKeySize = 0; - if (!keyTypeSize.has_value() && - keyType->type()->kind() == velox::TypeKind::VARCHAR) { - const velox::RowVector* passthroughRow = nullptr; - if (encoding == velox::VectorEncoding::Simple::ROW) { - passthroughRow = vector->as(); - } else if (encoding == velox::VectorEncoding::Simple::DICTIONARY) { - auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( - context.getDecodedVectorManager()); - velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*vector); - passthroughRow = decodedVector.base()->as(); - } - if (passthroughRow) { - const auto& rowType = passthroughRow->type()->asRow(); - for (size_t i = 0; i < rowType.size(); ++i) { - stringKeySize += rowType.nameOf(i).size(); - } - } + + if (encoding == velox::VectorEncoding::Simple::CONSTANT) { + return getRawSizeFromConstantComplexVector( + vector, ranges, context, &type, {}, topLevel); } - switch (encoding) { - case velox::VectorEncoding::Simple::ROW: { - rowVector = vector->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); + // Decode the vector to handle any encoding (ROW, DICTIONARY, etc.) + auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( + context.getDecodedVectorManager()); + velox::DecodedVector& decodedVector = localDecodedVector.get(); + decodedVector.decode(*vector); - childRangesPtr = &childRanges; - if (rowVector->mayHaveNulls()) { - const auto& nulls = rowVector->rawNulls(); - for (const auto& row : ranges) { - if (velox::bits::isBitNull(nulls, row)) { - ++nullCount; - } else { - childRanges.add(row, row + 1); - } - } + const velox::RowVector* rowVector = + decodedVector.base()->as(); + VELOX_CHECK_NOT_NULL( + rowVector, + "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", + decodedVector.base()->encoding(), + decodedVector.base()->typeKind()); + + uint64_t nullCount = 0; + velox::common::Ranges childRanges; + + // For passthrough flatmap, we NEVER ignore top-level nulls + // because the ROW represents a map entry, and null means "empty map" + if (decodedVector.mayHaveNulls()) { + for (const auto& row : ranges) { + if (decodedVector.isNullAt(row)) { + ++nullCount; } else { - childRangesPtr = &ranges; + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } - break; } - case velox::VectorEncoding::Simple::CONSTANT: { - return getRawSizeFromConstantComplexVector( - vector, ranges, context, topLevel); + } else { + for (const auto& row : ranges) { + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictionaryRowVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictionaryRowVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); + } - auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( - context.getDecodedVectorManager()); - velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictionaryRowVector); + // For passthrough flatmap: type is MAP, and we treat ROW children as + // map entries. + // Schema: MAP + // - type.childAt(0) is the key type (determines key size) + // - type.childAt(1) is the value type (used for computing value sizes) + const auto& schemaKeyType = type.childAt(0)->type(); + const auto& schemaValueType = type.childAt(1); + auto keyTypeSize = getTypeSizeFromKind(schemaKeyType->kind()); - rowVector = decodedVector.base()->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - decodedVector.base()->encoding(), - decodedVector.base()->typeKind()); + uint64_t rawSize = 0; + const auto nonNullCount = childRanges.size(); - childRangesPtr = &childRanges; - if (decodedVector.mayHaveNulls()) { - for (const auto& row : ranges) { - if (decodedVector.isNullAt(row)) { - ++nullCount; - } else { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); + if (nonNullCount) { + // ROW children represent the "values" in the passthrough flatmap + // Each field in the ROW is a key-value pair + const auto childrenSize = rowVector->childrenSize(); + for (size_t i = 0; i < childrenSize; ++i) { + const auto& child = rowVector->childAt(i); + // Key size is fixed based on schema key type + // We add key size for each present (non-null) value + velox::common::Ranges presentRanges; + if (child->mayHaveNulls()) { + for (const auto& row : childRanges) { + if (!child->isNullAt(row)) { + presentRanges.add(row, row + 1); } } } else { - for (auto& row : ranges) { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); - } + presentRanges = childRanges; } - break; - } - default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); - } - } - - uint64_t rawSize = 0; - const auto nonNullCount = (*childRangesPtr).size(); - // Use vector's children count (each child is a key/value pair) - const auto childrenSize = rowVector->childrenSize(); - if (nonNullCount) { - // Add key sizes: each ROW field name is written as a key for each non-null - // row - if (keyTypeSize.has_value()) { - rawSize += *keyTypeSize * childrenSize * nonNullCount; - } else if (stringKeySize > 0) { - // For VARCHAR keys, use the total string key size multiplied by non-null - // count - rawSize += stringKeySize * nonNullCount; - } + // Add key size for present values + if (keyTypeSize.has_value()) { + rawSize += *keyTypeSize * presentRanges.size(); + } else { + // For VARCHAR keys, use the field name length + const auto& fieldName = rowVector->type()->asRow().nameOf(i); + rawSize += fieldName.size() * presentRanges.size(); + } - // Add value sizes: all children use the same value type from the MAP schema - for (size_t i = 0; i < childrenSize; ++i) { - // Value types are not flatmaps themselves, so use the non-flatmap version - uint64_t childRawSize = - getRawSizeFromVector(rowVector->childAt(i), *childRangesPtr, context); + // Compute value sizes using schema's value type + uint64_t childRawSize = getRawSizeFromVector( + child, childRanges, context, schemaValueType.get(), {}); rawSize += childRawSize; + if (topLevel) { context.appendSize(childRawSize); context.appendNullCount(context.nullCount); } } } else if (topLevel) { + // No non-null rows, but we still need to record sizes for each child + const auto childrenSize = rowVector->childrenSize(); for (size_t i = 0; i < childrenSize; ++i) { context.appendSize(0); context.appendNullCount(0); @@ -1034,115 +854,89 @@ uint64_t getRawSizeFromPassthroughFlatMap( context.nullCount = nullCount; if (nullCount) { - rawSize += nullCount * NULL_SIZE; + rawSize += nullCount * kNullSize; } return rawSize; } -// Computes raw size for a regular ROW vector (not a passthrough flatmap). +// Unified getRawSizeFromRegularRowVector that works with optional TypeWithId* // ignoreNulls: when true, nulls at this level are ignored uint64_t getRawSizeFromRegularRowVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - const velox::dwio::common::TypeWithId& type, + const velox::dwio::common::TypeWithId* type, const folly::F14FastSet& flatMapNodeIds, const bool topLevel, bool ignoreNulls) { VELOX_CHECK_NOT_NULL(vector); const auto& encoding = vector->encoding(); - const velox::RowVector* rowVector = nullptr; - uint64_t nullCount = 0; - velox::common::Ranges childRanges; - const velox::common::Ranges* childRangesPtr = nullptr; - switch (encoding) { - case velox::VectorEncoding::Simple::ROW: { - rowVector = vector->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); + if (encoding == velox::VectorEncoding::Simple::CONSTANT) { + return getRawSizeFromConstantComplexVector( + vector, ranges, context, type, flatMapNodeIds, topLevel); + } - childRangesPtr = &childRanges; - if (!ignoreNulls && rowVector->mayHaveNulls()) { - const auto& nulls = rowVector->rawNulls(); - for (const auto& row : ranges) { - if (velox::bits::isBitNull(nulls, row)) { - ++nullCount; - } else { - childRanges.add(row, row + 1); - } - } - } else { - childRangesPtr = &ranges; - } - break; - } - case velox::VectorEncoding::Simple::CONSTANT: { - return getRawSizeFromConstantComplexVector( - vector, ranges, context, topLevel); - } - case velox::VectorEncoding::Simple::DICTIONARY: { - const auto* dictionaryRowVector = - vector->as>(); - VELOX_CHECK_NOT_NULL( - dictionaryRowVector, - "Encoding mismatch on DictionaryVector. Encoding: {}. TypeKind: {}.", - encoding, - vector->typeKind()); + // Decode the vector to handle any encoding (ROW, DICTIONARY, etc.) + auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( + context.getDecodedVectorManager()); + velox::DecodedVector& decodedVector = localDecodedVector.get(); + decodedVector.decode(*vector); - auto localDecodedVector = DecodedVectorManager::LocalDecodedVector( - context.getDecodedVectorManager()); - velox::DecodedVector& decodedVector = localDecodedVector.get(); - decodedVector.decode(*dictionaryRowVector); + const velox::RowVector* rowVector = + decodedVector.base()->as(); + VELOX_CHECK_NOT_NULL( + rowVector, + "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", + decodedVector.base()->encoding(), + decodedVector.base()->typeKind()); - rowVector = decodedVector.base()->as(); - VELOX_CHECK_NOT_NULL( - rowVector, - "Encoding mismatch on RowVector. Encoding: {}. TypeKind: {}.", - decodedVector.base()->encoding(), - decodedVector.base()->typeKind()); + uint64_t nullCount = 0; + velox::common::Ranges childRanges; - childRangesPtr = &childRanges; - if (!ignoreNulls && decodedVector.mayHaveNulls()) { - for (const auto& row : ranges) { - if (decodedVector.isNullAt(row)) { - ++nullCount; - } else { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); - } - } + if (!ignoreNulls && decodedVector.mayHaveNulls()) { + for (const auto& row : ranges) { + if (decodedVector.isNullAt(row)) { + ++nullCount; } else { - for (auto& row : ranges) { - childRanges.add( - decodedVector.index(row), decodedVector.index(row) + 1); - } + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } - break; } - default: { - VELOX_FAIL("Unsupported encoding: {}.", encoding); + } else { + for (const auto& row : ranges) { + auto baseIndex = decodedVector.index(row); + childRanges.add(baseIndex, baseIndex + 1); } } uint64_t rawSize = 0; - const auto nonNullCount = (*childRangesPtr).size(); - // Use schema's children count (input may have extra columns not in schema) - const auto childrenSize = type.size(); + const auto nonNullCount = childRanges.size(); + + // Determine children count - use schema if available, otherwise use vector + const size_t childrenSize = type ? type->size() : rowVector->childrenSize(); if (nonNullCount) { - // Each child has its own type in the schema for (size_t i = 0; i < childrenSize; ++i) { - uint64_t childRawSize = getRawSizeFromVector( - rowVector->childAt(i), - *childRangesPtr, - context, - *type.childAt(i), - flatMapNodeIds); + uint64_t childRawSize; + if (type) { + // Use schema's child type for computing sizes + childRawSize = getRawSizeFromVector( + rowVector->childAt(i), + childRanges, + context, + type->childAt(i).get(), + flatMapNodeIds); + } else { + // No schema, use vector's actual type + childRawSize = getRawSizeFromVector( + rowVector->childAt(i), + childRanges, + context, + nullptr, + flatMapNodeIds); + } rawSize += childRawSize; if (topLevel) { context.appendSize(childRawSize); @@ -1158,13 +952,13 @@ uint64_t getRawSizeFromRegularRowVector( context.nullCount = nullCount; if (nullCount) { - rawSize += nullCount * NULL_SIZE; + rawSize += nullCount * kNullSize; } return rawSize; } -// TypeWithId version of getRawSizeFromRowVector. +// Unified getRawSizeFromRowVector that works with optional TypeWithId* // Dispatches to either passthrough flatmap or regular row handling. // ignoreTopLevelNulls: when true and topLevel is true, nulls at the top level // are ignored @@ -1172,21 +966,22 @@ uint64_t getRawSizeFromRowVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - const velox::dwio::common::TypeWithId& type, + const velox::dwio::common::TypeWithId* type, const folly::F14FastSet& flatMapNodeIds, const bool topLevel, bool ignoreTopLevelNulls) { VELOX_CHECK_NOT_NULL(vector); - // Check if this is a passthrough flatmap: - // 1. The node's id is in flatMapNodeIds (configured as flatmap) + // Check if this is a passthrough flatmap (only when we have schema info): + // 1. The type's node ID is in flatMapNodeIds (configured as flatmap) // 2. The vector at that level is a ROW vector - const bool isPassthroughFlatMap = flatMapNodeIds.contains(type.id()) && + const bool isPassthroughFlatMap = type && + flatMapNodeIds.contains(type->id()) && vector->typeKind() == velox::TypeKind::ROW; if (isPassthroughFlatMap) { return getRawSizeFromPassthroughFlatMap( - vector, ranges, context, type, topLevel); + vector, ranges, context, *type, topLevel); } else { // When ignoreTopLevelNulls is true and this is the top level, treat all // rows as non-null @@ -1196,19 +991,52 @@ uint64_t getRawSizeFromRowVector( } } -// TypeWithId version of getRawSizeFromVector that passes schema and -// flatMapNodeIds recursively. +// Unified getRawSizeFromVector that works with optional TypeWithId* +// When type is nullptr, uses vector's actual type for calculations. +// When type is non-null, uses schema type for size calculations (handles type +// mismatches like int32_t vector with BIGINT schema). uint64_t getRawSizeFromVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - const velox::dwio::common::TypeWithId& type, + const velox::dwio::common::TypeWithId* type, const folly::F14FastSet& flatMapNodeIds, bool ignoreTopLevelNulls) { VELOX_CHECK_NOT_NULL(vector); - const auto& typeKind = vector->typeKind(); - switch (typeKind) { + auto vectorTypeKind = vector->typeKind(); + + // If we have schema info and there's a type mismatch, handle it. + // Type mismatch is only allowed for leaf (scalar) types. + // Complex type mismatches (e.g., ROW vs MAP for passthrough flatmaps) are + // handled in the type-specific functions. + if (type) { + const auto schemaTypeKind = type->type()->kind(); + if (vectorTypeKind != schemaTypeKind) { + auto schemaTypeSize = getTypeSizeFromKind(schemaTypeKind); + + // Only validate and handle type mismatch for scalar (leaf) types. + // Complex types (ROW, MAP, ARRAY) are allowed to mismatch for cases like + // passthrough flatmaps and we let upstream decide whether to run the + // compatibility check. + if (schemaTypeSize.has_value()) { + // Validate that the upcast is valid (schema type must be same or larger + // in same type family) + VELOX_CHECK( + isValidUpcast(vectorTypeKind, schemaTypeKind), + "Invalid type coercion from {} to {}. Only upcasting within the same type family is allowed.", + vectorTypeKind, + schemaTypeKind); + + // Use the dedicated upcasting function to count nulls from the vector's + // actual data and compute size using the schema type. + return getUpcastedRawSizeFromVector( + vector, ranges, context, *schemaTypeSize); + } + } + } + + switch (vectorTypeKind) { case velox::TypeKind::BOOLEAN: { return getRawSizeFromFixedWidthVector( vector, ranges, context); @@ -1245,24 +1073,19 @@ uint64_t getRawSizeFromVector( return getRawSizeFromStringVector(vector, ranges, context); } case velox::TypeKind::ARRAY: { - // For arrays, pass the element type to child - // Note: Current implementation doesn't fully support TypeWithId for - // arrays This can be enhanced later to pass type.childAt(0) to elements - return getRawSizeFromArrayVector(vector, ranges, context); + return getRawSizeFromArrayVector( + vector, ranges, context, type, flatMapNodeIds); } case velox::TypeKind::MAP: { - // For maps, pass key/value types to children - // Note: Current implementation doesn't fully support TypeWithId for maps - // This can be enhanced later - return getRawSizeFromMap(vector, ranges, context); + return getRawSizeFromMap(vector, ranges, context, type, flatMapNodeIds); } case velox::TypeKind::ROW: { - // type.id() == 0 means this is the root/top-level ROW - // Handle passthrough flatmap: node id is in flatMapNodeIds AND vector is - // ROW - // Passthrough flatmaps are treated as top-level for stats purposes. - const bool isTopLevel = - (type.id() == 0 || flatMapNodeIds.contains(type.id())); + // Determine if this is a top-level ROW: + // - If no schema, treat node 0 (first call) as top level + // - If schema, use type->id() == 0 or passthrough flatmap detection + const bool isTopLevel = type + ? (type->id() == 0 || flatMapNodeIds.contains(type->id())) + : true; // Without schema, first ROW is top level return getRawSizeFromRowVector( vector, ranges, @@ -1273,9 +1096,25 @@ uint64_t getRawSizeFromVector( isTopLevel ? ignoreTopLevelNulls : false); } default: { - VELOX_FAIL("Unsupported type: {}.", typeKind); + VELOX_FAIL("Unsupported type: {}.", vectorTypeKind); } } } +// Convenience overload with context but without schema +uint64_t getRawSizeFromVector( + const velox::VectorPtr& vector, + const velox::common::Ranges& ranges, + RawSizeContext& context) { + return getRawSizeFromVector(vector, ranges, context, nullptr, {}); +} + +// Convenience overload without context for simple use cases +uint64_t getRawSizeFromVector( + const velox::VectorPtr& vector, + const velox::common::Ranges& ranges) { + RawSizeContext context; + return getRawSizeFromVector(vector, ranges, context, nullptr, {}); +} + } // namespace facebook::nimble diff --git a/dwio/nimble/velox/RawSizeUtils.h b/dwio/nimble/velox/RawSizeUtils.h index 2b1ab848..2afd4a3f 100644 --- a/dwio/nimble/velox/RawSizeUtils.h +++ b/dwio/nimble/velox/RawSizeUtils.h @@ -25,7 +25,13 @@ namespace facebook::nimble { -constexpr uint64_t NULL_SIZE = 1; +constexpr uint64_t kNullSize = 1; + +// Returns the size in bytes for a given TypeKind. +// Used for calculating key sizes in passthrough flatmaps and for +// handling type mismatches between vector types and schema types. +// Returns std::nullopt for variable-length types (VARCHAR, VARBINARY, etc.) +std::optional getTypeSizeFromKind(velox::TypeKind kind); // Get raw size from vector with schema and flatmap node IDs. // flatMapNodeIds contains the node IDs that are configured as flatmaps. @@ -34,11 +40,12 @@ constexpr uint64_t NULL_SIZE = 1; // 2. The vector at that level is a ROW vector // ignoreTopLevelNulls: when true, top-level row nulls are ignored (treated as // non-null) to match FieldWriter behavior when that option is enabled. +// type can be nullptr when schema is not available. uint64_t getRawSizeFromVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - const velox::dwio::common::TypeWithId& type, + const velox::dwio::common::TypeWithId* type, const folly::F14FastSet& flatMapNodeIds, bool ignoreTopLevelNulls = false); @@ -46,7 +53,7 @@ uint64_t getRawSizeFromRowVector( const velox::VectorPtr& vector, const velox::common::Ranges& ranges, RawSizeContext& context, - const velox::dwio::common::TypeWithId& type, + const velox::dwio::common::TypeWithId* type, const folly::F14FastSet& flatMapNodeIds, bool topLevel = false, bool ignoreTopLevelNulls = false); diff --git a/dwio/nimble/velox/VeloxWriter.cpp b/dwio/nimble/velox/VeloxWriter.cpp index 18e6cc84..43e34bc1 100644 --- a/dwio/nimble/velox/VeloxWriter.cpp +++ b/dwio/nimble/velox/VeloxWriter.cpp @@ -709,18 +709,24 @@ bool VeloxWriter::write(const velox::VectorPtr& input) { NIMBLE_CHECK_NOT_NULL(file_, "Writer is already closed"); try { const auto numRows = input->size(); - // Calculate raw size using schema information to correctly handle - // passthrough flatmaps (ROW vectors written as MAP). - RawSizeContext context; - const auto rawSize = nimble::getRawSizeFromVector( - input, - velox::common::Ranges::of(0, numRows), - context, - *schema_, - context_->flatMapNodeIds(), - context_->ignoreTopLevelNulls()); - NIMBLE_CHECK_GE(rawSize, 0, "Invalid raw size"); - context_->updateFileRawSize(rawSize); + // When enableStatsConsistencyCheck is true, compute raw size using + // RawSizeUtils to verify consistency with column statistics. + // Otherwise, skip this computation as column statistics will provide + // the raw size. + if (context_->options().enableStatsConsistencyCheck) { + // Calculate raw size using schema information to correctly handle + // passthrough flatmaps (ROW vectors written as MAP). + RawSizeContext context; + const auto rawSize = nimble::getRawSizeFromVector( + input, + velox::common::Ranges::of(0, numRows), + context, + schema_.get(), + context_->flatMapNodeIds(), + context_->ignoreTopLevelNulls()); + NIMBLE_CHECK_GE(rawSize, 0, "Invalid raw size"); + context_->updateFileRawSize(rawSize); + } if (context_->options().writeExecutor) { velox::dwio::common::ExecutorBarrier barrier{ @@ -784,7 +790,22 @@ void VeloxWriter::writeMetadata() { void VeloxWriter::writeColumnStats() { flatbuffers::FlatBufferBuilder builder; - builder.Finish(serialization::CreateStats(builder, context_->fileRawSize())); + // When enableStatsConsistencyCheck is true, verify that fileRawSize + // (accumulated via RawSizeUtils) matches the root column statistics. + // Skip the check when passthrough flatmaps are detected because + // getRawSizeFromVector doesn't properly account for flatmap key + // statistics in a way that aligns with column stats collection. + // See FieldWriterStatsTests for similar workaround. + if (context_->options().enableStatsConsistencyCheck && + !context_->hasPassthroughFlatMapWrites()) { + NIMBLE_CHECK_EQ( + context_->fileRawSize(), + context_->columnStats().front()->getLogicalSize(), + "Mismatched raw sizes!"); + } + builder.Finish( + serialization::CreateStats( + builder, context_->columnStats().front()->getLogicalSize())); tabletWriter_->writeOptionalSection( std::string(kStatsSection), {reinterpret_cast(builder.GetBufferPointer()), diff --git a/dwio/nimble/velox/VeloxWriterOptions.h b/dwio/nimble/velox/VeloxWriterOptions.h index c3baf305..d1eb49f6 100644 --- a/dwio/nimble/velox/VeloxWriterOptions.h +++ b/dwio/nimble/velox/VeloxWriterOptions.h @@ -190,6 +190,13 @@ struct VeloxWriterOptions { // When true, string fields use per-field buffers instead of a shared buffer. // This enables incremental memory reclamation during chunking. bool disableSharedStringBuffers{false}; + + // When true, enables consistency check between fileRawSize (accumulated via + // RawSizeUtils) and the root column statistics during file close. + // This is used to validate that column statistics accurately track raw sizes, + // with the goal of eventually replacing RawSizeUtils accumulation with column + // statistics for non-deduplicated columns. + bool enableStatsConsistencyCheck{true}; }; } // namespace facebook::nimble diff --git a/dwio/nimble/velox/stats/ColumnStatsUtils.cpp b/dwio/nimble/velox/stats/ColumnStatsUtils.cpp index 1b7eb1d6..2ddc7ae1 100644 --- a/dwio/nimble/velox/stats/ColumnStatsUtils.cpp +++ b/dwio/nimble/velox/stats/ColumnStatsUtils.cpp @@ -81,7 +81,7 @@ void aggregateStats( const auto& arrayWithOffsets = builder.asArrayWithOffsets(); offset = arrayWithOffsets.lengthsDescriptor().offset(); columnStats[offset.value()].dedupedLogicalSize = - columnStats[offset.value()].nullCount * NULL_SIZE; + columnStats[offset.value()].nullCount * kNullSize; updateStats( arrayWithOffsets.offsetsDescriptor().offset(), offset.value()); aggregateStats(arrayWithOffsets.elements(), columnStats, offset); @@ -108,7 +108,7 @@ void aggregateStats( const auto& slidingWindowMap = builder.asSlidingWindowMap(); offset = slidingWindowMap.lengthsDescriptor().offset(); columnStats[offset.value()].dedupedLogicalSize = - columnStats[offset.value()].nullCount * NULL_SIZE; + columnStats[offset.value()].nullCount * kNullSize; updateStats( slidingWindowMap.offsetsDescriptor().offset(), offset.value()); aggregateStats(slidingWindowMap.keys(), columnStats, offset); diff --git a/dwio/nimble/velox/tests/FieldWriterStatsTests.cpp b/dwio/nimble/velox/tests/FieldWriterStatsTests.cpp index 577b45f8..361761ae 100644 --- a/dwio/nimble/velox/tests/FieldWriterStatsTests.cpp +++ b/dwio/nimble/velox/tests/FieldWriterStatsTests.cpp @@ -152,7 +152,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { c1->setNull(1, true); uint64_t columnSize = c1->size(); auto stat1 = ColumnStats{ - .logicalSize = sizeof(int32_t) * (columnSize - 1) + nimble::NULL_SIZE, + .logicalSize = sizeof(int32_t) * (columnSize - 1) + nimble::kNullSize, .physicalSize = 45, .nullCount = 1, .valueCount = columnSize}; @@ -162,7 +162,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { c2->setNull(0, true); c2->setNull(2, true); auto stat2 = ColumnStats{ - .logicalSize = std::string("bbbb").size() + 2 * nimble::NULL_SIZE, + .logicalSize = std::string("bbbb").size() + 2 * nimble::kNullSize, .physicalSize = 44, .nullCount = 2, .valueCount = columnSize}; @@ -210,7 +210,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { columnSize, leafPool_.get(), velox::bits::kNull); vector->setNulls(nulls); auto rootStat = ColumnStats{ - .logicalSize = nimble::NULL_SIZE * columnSize, + .logicalSize = nimble::kNullSize * columnSize, .physicalSize = 12, // Null Stream. .nullCount = columnSize, .valueCount = columnSize}; @@ -223,7 +223,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { vectorMaker_->flatVectorNullable({0, 0, 1, std::nullopt}); uint64_t columnSize = c1->size(); auto stat1 = ColumnStats{ - .logicalSize = sizeof(int64_t) * (columnSize - 2) + nimble::NULL_SIZE, + .logicalSize = sizeof(int64_t) * (columnSize - 2) + nimble::kNullSize, .physicalSize = 46, .nullCount = 1, .valueCount = columnSize - 1, @@ -242,7 +242,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { auto c2 = velox::BaseVector::wrapInDictionary( velox::BufferPtr(nullptr), indices, columnSize, valueVector); auto stat2 = ColumnStats{ - .logicalSize = sizeof(int64_t) + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int64_t) + nimble::kNullSize * 2, .physicalSize = 45, .nullCount = 2, .valueCount = columnSize - 1, @@ -250,7 +250,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { auto c3 = velox::BaseVector::wrapInConstant(columnSize, 3, c1); auto stat3 = ColumnStats{ - .logicalSize = nimble::NULL_SIZE * 3, + .logicalSize = nimble::kNullSize * 3, .physicalSize = 0, .nullCount = 3, .valueCount = columnSize - 1, @@ -261,7 +261,7 @@ TEST_F(FieldWriterStatsTests, simpleFieldWriterStats) { auto rootStat = rollupChildrenStats({stat1, stat2, stat3}); rootStat.valueCount = columnSize; rootStat.nullCount = 1; - rootStat.logicalSize += rootStat.nullCount * nimble::NULL_SIZE; + rootStat.logicalSize += rootStat.nullCount * nimble::kNullSize; rootStat.physicalSize += 20; // Null Stream. verifyReturnedColumnStats(vector, {rootStat, stat1, stat2, stat3}); } @@ -274,7 +274,7 @@ TEST_F(FieldWriterStatsTests, arrayFieldWriterStats) { auto elementsStat1 = ColumnStats{ .logicalSize = sizeof(int8_t) * 3, .physicalSize = 15, .valueCount = 3}; auto topLevelStat1 = ColumnStats{ - .logicalSize = elementsStat1.logicalSize + nimble::NULL_SIZE, + .logicalSize = elementsStat1.logicalSize + nimble::kNullSize, .physicalSize = elementsStat1.physicalSize + 41, .nullCount = 1, .valueCount = 2, @@ -316,7 +316,7 @@ TEST_F(FieldWriterStatsTests, arrayFieldWriterStats) { auto elementsStat4 = ColumnStats{ .logicalSize = sizeof(int8_t) * 3, .physicalSize = 15, .valueCount = 3}; auto topLevelStat4 = ColumnStats{ - .logicalSize = elementsStat4.logicalSize + nimble::NULL_SIZE, + .logicalSize = elementsStat4.logicalSize + nimble::kNullSize, .physicalSize = elementsStat4.physicalSize + 41, // Length Stream. .nullCount = 1, .valueCount = 2}; @@ -336,7 +336,7 @@ TEST_F(FieldWriterStatsTests, arrayFieldWriterStats) { }); rootStat.nullCount = 1; rootStat.valueCount = columnSize; - rootStat.logicalSize += rootStat.nullCount * nimble::NULL_SIZE; + rootStat.logicalSize += rootStat.nullCount * nimble::kNullSize; rootStat.physicalSize += 20; verifyReturnedColumnStats( vector, @@ -435,7 +435,7 @@ TEST_F(FieldWriterStatsTests, arrayWithOffsetFieldWriterStats) { topLevelStat4.dedupedLogicalSize.value(); rootStat.nullCount = 1; rootStat.valueCount = columnSize; - rootStat.logicalSize += rootStat.nullCount * nimble::NULL_SIZE; + rootStat.logicalSize += rootStat.nullCount * nimble::kNullSize; rootStat.physicalSize += 20; verifyReturnedColumnStats( vector, @@ -480,14 +480,14 @@ TEST_F(FieldWriterStatsTests, mapFieldWriterStats) { }; auto mapStat = ColumnStats{ .logicalSize = (sizeof(int32_t) * 6) + (sizeof(int8_t) * 6) + - (2 * nimble::NULL_SIZE), + (2 * nimble::kNullSize), .physicalSize = keyStat.physicalSize + valueStat.physicalSize + 40, .nullCount = 2, .valueCount = 5}; auto vector = vectorMaker_->rowVector({mapVector}); vector->setNull(5, true); auto rootStat = ColumnStats{ - .logicalSize = mapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = mapStat.logicalSize + nimble::kNullSize, .physicalSize = mapStat.physicalSize + 20, .nullCount = 1, .valueCount = columnSize}; @@ -629,12 +629,12 @@ TEST_F(FieldWriterStatsTests, flatMapFieldWriterStats) { // nullCount = 2 (flatmap nulls) // valueCount = 5 (non-null flatmap rows) // logicalSize = keyStat.logicalSize + valueStat.logicalSize + nullCount * - // NULL_SIZE + // kNullSize // physicalSize includes overhead for key null streams (3 keys * 20 bytes) // plus flatmap null stream (20 bytes) auto flatmapStat = ColumnStats{ .logicalSize = - keyStat.logicalSize + valueStat.logicalSize + 2 * nimble::NULL_SIZE, + keyStat.logicalSize + valueStat.logicalSize + 2 * nimble::kNullSize, .physicalSize = valueStat.physicalSize + 80, // 3 key null streams + flatmap null stream .nullCount = 2, @@ -643,7 +643,7 @@ TEST_F(FieldWriterStatsTests, flatMapFieldWriterStats) { // Root stat: flatmap logicalSize + 1 null byte for row5. auto rootStat = ColumnStats{ - .logicalSize = flatmapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = flatmapStat.logicalSize + nimble::kNullSize, .physicalSize = flatmapStat.physicalSize + 20, // Null Stream. .nullCount = 1, .valueCount = columnSize, @@ -707,21 +707,21 @@ TEST_F(FieldWriterStatsTests, flatMapPassThroughValueFieldWriterStats) { auto key0ValueStat = ColumnStats{ .logicalSize = - sizeof(int32_t) * 2 + nimble::NULL_SIZE, // 2 values + 1 null + sizeof(int32_t) * 2 + nimble::kNullSize, // 2 values + 1 null .physicalSize = 40, .valueCount = 3, .nullCount = 1, }; auto key2ValueStat = ColumnStats{ .logicalSize = - sizeof(int32_t) * 1 + 2 * nimble::NULL_SIZE, // 1 value + 2 nulls + sizeof(int32_t) * 1 + 2 * nimble::kNullSize, // 1 value + 2 nulls .physicalSize = 41, .valueCount = 3, .nullCount = 2, }; auto key10ValueStat = ColumnStats{ .logicalSize = - sizeof(int32_t) * 1 + 2 * nimble::NULL_SIZE, // 1 value + 2 nulls + sizeof(int32_t) * 1 + 2 * nimble::kNullSize, // 1 value + 2 nulls .physicalSize = 41, .valueCount = 3, .nullCount = 2, @@ -757,7 +757,7 @@ TEST_F(FieldWriterStatsTests, flatMapPassThroughValueFieldWriterStats) { // physicalSize includes overhead for key null streams (3 keys * ~18-19 bytes) auto flatmapStat = ColumnStats{ .logicalSize = - 2 * nimble::NULL_SIZE + keyStat.logicalSize + valueStat.logicalSize, + 2 * nimble::kNullSize + keyStat.logicalSize + valueStat.logicalSize, .physicalSize = valueStat.physicalSize + 56, // 3 key null streams overhead .nullCount = 2, @@ -766,7 +766,7 @@ TEST_F(FieldWriterStatsTests, flatMapPassThroughValueFieldWriterStats) { // Root stat. auto rootStat = ColumnStats{ - .logicalSize = flatmapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = flatmapStat.logicalSize + nimble::kNullSize, .physicalSize = flatmapStat.physicalSize + 20, // Null Stream. .nullCount = 1, .valueCount = columnSize, @@ -829,13 +829,13 @@ TEST_F(FieldWriterStatsTests, flatMapWithNullValuesFieldWriterStats) { // - Key 2: appears in row0, row1, row2, row4 → 4 entries (2 nulls) auto key0ValueStat = ColumnStats{ - .logicalSize = sizeof(int32_t) * 2 + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int32_t) * 2 + nimble::kNullSize * 2, .physicalSize = 45, // 41 + 4 for null stream overhead .nullCount = 2, .valueCount = 4, }; auto key2ValueStat = ColumnStats{ - .logicalSize = sizeof(int32_t) * 2 + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int32_t) * 2 + nimble::kNullSize * 2, .physicalSize = 45, // 41 + 4 for null stream overhead .nullCount = 2, .valueCount = 4, @@ -862,7 +862,7 @@ TEST_F(FieldWriterStatsTests, flatMapWithNullValuesFieldWriterStats) { // plus flatmap null stream (20 bytes) = 44 additional bytes auto flatmapStat = ColumnStats{ .logicalSize = - keyStat.logicalSize + valueStat.logicalSize + 1 * nimble::NULL_SIZE, + keyStat.logicalSize + valueStat.logicalSize + 1 * nimble::kNullSize, .physicalSize = valueStat.physicalSize + 44, // 2 key null streams + flatmap null stream .nullCount = 1, @@ -871,7 +871,7 @@ TEST_F(FieldWriterStatsTests, flatMapWithNullValuesFieldWriterStats) { // Root stat: flatmap logicalSize + 1 null byte for row5. auto rootStat = ColumnStats{ - .logicalSize = flatmapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = flatmapStat.logicalSize + nimble::kNullSize, .physicalSize = flatmapStat.physicalSize + 20, .nullCount = 1, .valueCount = columnSize, @@ -918,13 +918,13 @@ TEST_F( // Total: 4 values, 4 nulls auto key0ValueStat = ColumnStats{ - .logicalSize = sizeof(int32_t) * 2 + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int32_t) * 2 + nimble::kNullSize * 2, .physicalSize = 45, // 41 + 4 for null stream overhead .nullCount = 2, .valueCount = 4, }; auto key2ValueStat = ColumnStats{ - .logicalSize = sizeof(int32_t) * 2 + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int32_t) * 2 + nimble::kNullSize * 2, .physicalSize = 45, // 41 + 4 for null stream overhead .nullCount = 2, .valueCount = 4, @@ -955,7 +955,7 @@ TEST_F( // plus flatmap null stream (20 bytes) = 44 additional bytes auto flatmapStat = ColumnStats{ .logicalSize = - 1 * nimble::NULL_SIZE + keyStat.logicalSize + valueStat.logicalSize, + 1 * nimble::kNullSize + keyStat.logicalSize + valueStat.logicalSize, .physicalSize = valueStat.physicalSize + 44, // 2 key null streams + flatmap null stream .nullCount = 1, @@ -964,7 +964,7 @@ TEST_F( // Root stat. auto rootStat = ColumnStats{ - .logicalSize = flatmapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = flatmapStat.logicalSize + nimble::kNullSize, .physicalSize = flatmapStat.physicalSize + 20, .nullCount = 1, .valueCount = columnSize, @@ -1013,7 +1013,7 @@ TEST_F(FieldWriterStatsTests, slidingWindowMapFieldWriterStats) { // Deduped: 4 keys + 4 values (nulls not included in dedup size) auto mapStat = ColumnStats{ .logicalSize = (sizeof(int32_t) * 6) + (sizeof(int8_t) * 6) + - (2 * nimble::NULL_SIZE), + (2 * nimble::kNullSize), .dedupedLogicalSize = valueStat.logicalSize + keyStat.logicalSize, .physicalSize = keyStat.physicalSize + valueStat.physicalSize + 43 + // Offsets stream @@ -1047,7 +1047,7 @@ TEST_F(FieldWriterStatsTests, slidingWindowMapFieldWriterStats) { // Root stat: includes map's logical size + 1 null byte for row5. auto rootStat = ColumnStats{ - .logicalSize = mapStat.logicalSize + nimble::NULL_SIZE, + .logicalSize = mapStat.logicalSize + nimble::kNullSize, .dedupedLogicalSize = mapStat.dedupedLogicalSize.value(), .physicalSize = mapStat.physicalSize + 20, // Null Stream. .nullCount = 1, @@ -1070,7 +1070,7 @@ TEST_F(FieldWriterStatsTests, mixedColumnsFieldWriterStats) { auto intColumn = vectorMaker_->flatVectorNullable( {1, 2, std::nullopt, 4, 5, std::nullopt}); auto intStat = ColumnStats{ - .logicalSize = sizeof(int32_t) * 4 + nimble::NULL_SIZE * 2, + .logicalSize = sizeof(int32_t) * 4 + nimble::kNullSize * 2, .physicalSize = 44, .nullCount = 2, .valueCount = columnSize, @@ -1079,7 +1079,7 @@ TEST_F(FieldWriterStatsTests, mixedColumnsFieldWriterStats) { auto floatColumn = vectorMaker_->flatVectorNullable( {1.0f, 2.0f, 3.0f, std::nullopt, 5.0f, 6.0f}); auto floatStat = ColumnStats{ - .logicalSize = sizeof(float) * 5 + nimble::NULL_SIZE, + .logicalSize = sizeof(float) * 5 + nimble::kNullSize, .physicalSize = 57, .nullCount = 1, .valueCount = columnSize, @@ -1137,7 +1137,7 @@ TEST_F(FieldWriterStatsTests, mixedColumnsFieldWriterStats) { .logicalSize = sizeof(int32_t) * 8, .physicalSize = 32, .valueCount = 8}; auto mapStat = ColumnStats{ .logicalSize = mapKeyStat.logicalSize + mapValueStat.logicalSize + - nimble::NULL_SIZE * 2, + nimble::kNullSize * 2, .physicalSize = mapKeyStat.physicalSize + mapValueStat.physicalSize + 40, // length .nullCount = 2, @@ -1165,7 +1165,7 @@ TEST_F(FieldWriterStatsTests, mixedColumnsFieldWriterStats) { // - dedupedLogicalSize: 6 + 24 = 30 auto dedupedMapStat = ColumnStats{ .logicalSize = (sizeof(int8_t) * 8) + (sizeof(int32_t) * 8) + - (2 * nimble::NULL_SIZE), + (2 * nimble::kNullSize), .dedupedLogicalSize = dedupedMapKeyStat.logicalSize + dedupedMapValueStat.logicalSize, .physicalSize = dedupedMapKeyStat.physicalSize + @@ -1405,7 +1405,7 @@ TEST_F(FieldWriterStatsTests, rowFieldWriterStats) { }); uint64_t columnSize = c1->size(); auto c1SubFieldStat1 = ColumnStats{ - .logicalSize = sizeof(int8_t) * (columnSize - 1) + nimble::NULL_SIZE, + .logicalSize = sizeof(int8_t) * (columnSize - 1) + nimble::kNullSize, .physicalSize = 42, .nullCount = 1, .valueCount = columnSize}; @@ -1416,7 +1416,7 @@ TEST_F(FieldWriterStatsTests, rowFieldWriterStats) { auto c2 = velox::BaseVector::wrapInConstant(columnSize, 5, c1); auto c2SubFieldStat1 = ColumnStats{ - .logicalSize = columnSize * nimble::NULL_SIZE, + .logicalSize = columnSize * nimble::kNullSize, .nullCount = columnSize, .valueCount = columnSize}; auto c2SubFieldStat2 = ColumnStats{ @@ -1451,7 +1451,7 @@ TEST_F(FieldWriterStatsTests, rowFieldWriterStats) { auto stat3 = rollupChildrenStats({c3SubFieldStat1, c3SubFieldStat2}); stat3.physicalSize += 20; // Null Stream. stat3.nullCount = 2; - stat3.logicalSize += stat3.nullCount * nimble::NULL_SIZE; + stat3.logicalSize += stat3.nullCount * nimble::kNullSize; stat3.valueCount = columnSize; auto vector = vectorMaker_->rowVector({c1, c2, c3}); diff --git a/dwio/nimble/velox/tests/RawSizeTests.cpp b/dwio/nimble/velox/tests/RawSizeTests.cpp index 19aed3c0..64d175d0 100644 --- a/dwio/nimble/velox/tests/RawSizeTests.cpp +++ b/dwio/nimble/velox/tests/RawSizeTests.cpp @@ -123,7 +123,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { uint64_t expectedRawSize = 0; for (velox::vector_size_t i = 1; i <= VECTOR_SIZE; ++i) { if (isNullAt(i)) { - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; vec.emplace_back(std::nullopt); } else { auto value = getValue(i); @@ -145,7 +145,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { uint64_t expectedRawSize = 0; if (isNullAt(1)) { vec.assign(VECTOR_SIZE, std::nullopt); - expectedRawSize += nimble::NULL_SIZE * VECTOR_SIZE; + expectedRawSize += nimble::kNullSize * VECTOR_SIZE; } else { auto valueAt = getValue(1); expectedRawSize += getSize(valueAt) * VECTOR_SIZE; @@ -177,7 +177,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { const velox::vector_size_t* data = indices->as(); for (auto i = 0; i < VECTOR_SIZE; ++i) { if (vec[data[i]] == std::nullopt) { - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { expectedRawSize += getSize(vec[data[i]].value()); } @@ -198,7 +198,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { for (velox::vector_size_t i = 1; i <= VECTOR_SIZE; ++i) { if (isNullAt(i)) { vec.emplace_back(std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { std::vector> innerVec; velox::vector_size_t innerSize = @@ -206,7 +206,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { for (velox::vector_size_t j = 0; j < innerSize; ++j) { if (isNullAt(j)) { innerVec.emplace_back(std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { auto value = getValue(j); expectedRawSize += getSize(value); @@ -233,7 +233,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { for (velox::vector_size_t j = 0; j < innerSize; ++j) { if (isNullAt(j)) { innerVec.emplace_back(std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { auto value = getValue(j); expectedRawSize += getSize(value); @@ -268,7 +268,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { folly::Random::rand32() % VECTOR_SIZE + 1; for (velox::vector_size_t j = 0; j < innerSize; ++j) { if (isNullAt(j)) { - size += nimble::NULL_SIZE; + size += nimble::kNullSize; innerVec.emplace_back(std::nullopt); } else { auto value = getValue(j); @@ -285,7 +285,7 @@ class RawSizeTestFixture : public RawSizeBaseTestFixture { const velox::vector_size_t* data = indices->as(); for (auto i = 0; i < VECTOR_SIZE; ++i) { if (vec[data[i]] == std::nullopt) { - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { expectedRawSize += indexToSize[data[i]]; } @@ -325,7 +325,7 @@ class RawSizeMapTestFixture for (velox::vector_size_t i = 1; i <= VECTOR_SIZE; ++i) { if (isNullAt(i)) { vec.emplace_back(std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { std::vector>> innerVec; std::unordered_set keysSeen; @@ -343,7 +343,7 @@ class RawSizeMapTestFixture if (isNullAt(j)) { innerVec.emplace_back(key, std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { auto value = getValue(j); expectedRawSize += getSize(value); @@ -384,7 +384,7 @@ class RawSizeMapTestFixture if (isNullAt(j)) { innerVec.emplace_back(key, std::nullopt); - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { auto value = getValue(j); expectedRawSize += getSize(value); @@ -435,7 +435,7 @@ class RawSizeMapTestFixture keysSeen.insert(key); if (isNullAt(j)) { - size += nimble::NULL_SIZE; + size += nimble::kNullSize; innerVec.emplace_back(key, std::nullopt); } else { auto value = getValue(j); @@ -456,7 +456,7 @@ class RawSizeMapTestFixture for (auto i = 0; i < VECTOR_SIZE; ++i) { if (vec[data[i]] == std::nullopt) { - expectedRawSize += nimble::NULL_SIZE; + expectedRawSize += nimble::kNullSize; } else { expectedRawSize += indexToSize[data[i]]; } @@ -1460,7 +1460,7 @@ TEST_F(RawSizeTestFixture, RowSameTypes) { {"1", "2", "3"}, {childVector1, childVector2, childVector3}); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); ASSERT_EQ(sizeof(int64_t) * 18, rawSize); size_t expectedChildCount = 3; @@ -1479,7 +1479,7 @@ TEST_F(RawSizeTestFixture, RowDifferentTypes) { {"1", "2", "3"}, {childVector1, childVector2, childVector3}); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = sizeof(int64_t) * 6 + sizeof(bool) * 6 + sizeof(int16_t) * 6; @@ -1503,7 +1503,7 @@ TEST_F(RawSizeTestFixture, RowDifferentTypes2) { {"1", "2", "3"}, {childVector1, childVector2, childVector3}); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = sizeof(int64_t) * 6 + sizeof(int16_t) * 6 + 21; @@ -1536,10 +1536,10 @@ TEST_F(RawSizeTestFixture, RowNulls) { children); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = sizeof(int64_t) * 5 + sizeof(bool) * 5 + - sizeof(int16_t) * 5 + nimble::NULL_SIZE * 1; + sizeof(int16_t) * 5 + nimble::kNullSize * 1; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(3, context_.columnCount()); ASSERT_EQ(sizeof(int64_t) * 5, context_.sizeAt(0)); @@ -1569,9 +1569,9 @@ TEST_F(RawSizeTestFixture, RowAllNulls) { children); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); - constexpr auto expectedRawSize = nimble::NULL_SIZE * VECTOR_TEST_SIZE; + constexpr auto expectedRawSize = nimble::kNullSize * VECTOR_TEST_SIZE; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(3, context_.columnCount()); for (size_t i = 0; i < 3; ++i) { @@ -1590,15 +1590,15 @@ TEST_F(RawSizeTestFixture, RowNestedNull) { {"1", "2", "3"}, {childVector1, childVector2, childVector3}); this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = sizeof(int64_t) * 5 + (1 + 2 + 3 + 4 + 5) + - sizeof(int16_t) * 5 + nimble::NULL_SIZE * 3; + sizeof(int16_t) * 5 + nimble::kNullSize * 3; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(3, context_.columnCount()); - ASSERT_EQ(sizeof(int64_t) * 5 + nimble::NULL_SIZE, context_.sizeAt(0)); - ASSERT_EQ(15 + nimble::NULL_SIZE, context_.sizeAt(1)); - ASSERT_EQ(sizeof(int16_t) * 5 + nimble::NULL_SIZE, context_.sizeAt(2)); + ASSERT_EQ(sizeof(int64_t) * 5 + nimble::kNullSize, context_.sizeAt(0)); + ASSERT_EQ(15 + nimble::kNullSize, context_.sizeAt(1)); + ASSERT_EQ(sizeof(int16_t) * 5 + nimble::kNullSize, context_.sizeAt(2)); ASSERT_EQ(0, context_.nullCount); for (size_t i = 0; i < 3; ++i) { @@ -1656,7 +1656,7 @@ TEST_F(RawSizeTestFixture, RowDictionaryChildren) { this->ranges_.add(0, rowVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - rowVector, this->ranges_, context_, /*topLevel=*/true); + rowVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); const uint64_t expectedSize = expectedArrayRawSize + expectedMapRawSize; ASSERT_EQ(expectedSize, rawSize); @@ -1682,7 +1682,7 @@ TEST_F(RawSizeTestFixture, ConstRow) { velox::BaseVector::wrapInConstant(CONST_VECTOR_SIZE, 5, rowVector); this->ranges_.add(0, CONST_VECTOR_SIZE); auto rawSize = nimble::getRawSizeFromRowVector( - constVector, this->ranges_, context_, /*topLevel=*/true); + constVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = (sizeof(int64_t) + 6 + sizeof(int16_t)) * CONST_VECTOR_SIZE; @@ -1712,15 +1712,15 @@ TEST_F(RawSizeTestFixture, ConstRowNestedNull) { velox::BaseVector::wrapInConstant(CONST_VECTOR_SIZE, 5, rowVector); this->ranges_.add(0, constVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - constVector, this->ranges_, context_, /*topLevel=*/true); + constVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = - (sizeof(int64_t) + nimble::NULL_SIZE * 2) * CONST_VECTOR_SIZE; + (sizeof(int64_t) + nimble::kNullSize * 2) * CONST_VECTOR_SIZE; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(3, context_.columnCount()); ASSERT_EQ(sizeof(int64_t) * CONST_VECTOR_SIZE, context_.sizeAt(0)); - ASSERT_EQ(nimble::NULL_SIZE * CONST_VECTOR_SIZE, context_.sizeAt(1)); - ASSERT_EQ(nimble::NULL_SIZE * CONST_VECTOR_SIZE, context_.sizeAt(2)); + ASSERT_EQ(nimble::kNullSize * CONST_VECTOR_SIZE, context_.sizeAt(1)); + ASSERT_EQ(nimble::kNullSize * CONST_VECTOR_SIZE, context_.sizeAt(2)); ASSERT_EQ(0, context_.nullCount); ASSERT_EQ(0, context_.nullsAt(0)); @@ -1752,7 +1752,7 @@ TEST_F(RawSizeTestFixture, DictRow) { velox::BufferPtr(nullptr), indices, VECTOR_TEST_SIZE, rowVector); this->ranges_.add(0, dictVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - dictVector, this->ranges_, context_, /*topLevel=*/true); + dictVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = sizeof(int64_t) * 5 + sizeof(int16_t) * 5 + 11; @@ -1793,13 +1793,13 @@ TEST_F(RawSizeTestFixture, DictRowNull) { velox::BufferPtr(nullptr), indices, VECTOR_TEST_SIZE, rowVector); this->ranges_.add(0, dictVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - dictVector, this->ranges_, context_, /*topLevel=*/true); + dictVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = - sizeof(int64_t) * 3 + sizeof(int16_t) * 5 + 11 + nimble::NULL_SIZE * 2; + sizeof(int64_t) * 3 + sizeof(int16_t) * 5 + 11 + nimble::kNullSize * 2; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(3, context_.columnCount()); - ASSERT_EQ(nimble::NULL_SIZE * 2 + sizeof(int64_t) * 3, context_.sizeAt(0)); + ASSERT_EQ(nimble::kNullSize * 2 + sizeof(int64_t) * 3, context_.sizeAt(0)); ASSERT_EQ(11, context_.sizeAt(1)); ASSERT_EQ(sizeof(int16_t) * VECTOR_TEST_SIZE, context_.sizeAt(2)); @@ -1838,9 +1838,9 @@ TEST_F(RawSizeTestFixture, DictRowNullTopLevel) { nulls, indices, VECTOR_TEST_SIZE, rowVector); this->ranges_.add(0, dictVector->size()); auto rawSize = nimble::getRawSizeFromRowVector( - dictVector, this->ranges_, context_, /*topLevel=*/true); + dictVector, this->ranges_, context_, nullptr, {}, /*topLevel=*/true); constexpr auto expectedRawSize = - sizeof(int64_t) * 4 + sizeof(int16_t) * 4 + 9 + nimble::NULL_SIZE * 1; + sizeof(int64_t) * 4 + sizeof(int16_t) * 4 + 9 + nimble::kNullSize * 1; ASSERT_EQ(expectedRawSize, rawSize); ASSERT_EQ(sizeof(int64_t) * (VECTOR_TEST_SIZE - 1), context_.sizeAt(0)); @@ -1883,6 +1883,375 @@ TEST_F(RawSizeTestFixture, ThrowOnDefaultEncodingVariableWidth) { velox::VeloxRuntimeError); } +// Test type compatibility helper functions +TEST_F(RawSizeTestFixture, TypeSizeFromKind) { + // Fixed-width types should return their sizes + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::BOOLEAN), sizeof(bool)); + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::TINYINT), sizeof(int8_t)); + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::SMALLINT), sizeof(int16_t)); + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::INTEGER), sizeof(int32_t)); + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::BIGINT), sizeof(int64_t)); + EXPECT_EQ(nimble::getTypeSizeFromKind(velox::TypeKind::REAL), sizeof(float)); + EXPECT_EQ( + nimble::getTypeSizeFromKind(velox::TypeKind::DOUBLE), sizeof(double)); + // Timestamp uses 12 bytes (8 for seconds + 4 for nanos) + EXPECT_EQ(nimble::getTypeSizeFromKind(velox::TypeKind::TIMESTAMP), 12); + + // Variable-width types should return std::nullopt + EXPECT_FALSE( + nimble::getTypeSizeFromKind(velox::TypeKind::VARCHAR).has_value()); + EXPECT_FALSE( + nimble::getTypeSizeFromKind(velox::TypeKind::VARBINARY).has_value()); + EXPECT_FALSE(nimble::getTypeSizeFromKind(velox::TypeKind::ARRAY).has_value()); + EXPECT_FALSE(nimble::getTypeSizeFromKind(velox::TypeKind::MAP).has_value()); + EXPECT_FALSE(nimble::getTypeSizeFromKind(velox::TypeKind::ROW).has_value()); +} + +// Tests for schema-aware raw size calculation with type mismatches +class RawSizeTypeCompatibilityTestFixture : public RawSizeBaseTestFixture { + protected: + std::shared_ptr makeTypeWithId( + const velox::TypePtr& type) { + return velox::dwio::common::TypeWithId::create(type); + } +}; + +// Test scalar type mismatch: int32_t vector with BIGINT schema +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarIntegerToBigint) { + constexpr velox::vector_size_t SIZE = 10; + auto vector = + vectorMaker_->flatVector({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: int32_t * 10 = 40 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * SIZE); + + // With schema (BIGINT): int64_t * 10 = 80 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * SIZE); +} + +// Test scalar type mismatch with nulls: int32_t vector with BIGINT schema +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarIntegerToBigintWithNulls) { + auto vector = vectorMaker_->flatVectorNullable( + {1, std::nullopt, 3, 4, std::nullopt, 6, 7, 8, 9, 10}); + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 10); + + // Without schema: (int32_t * 8) + (kNullSize * 2) = 32 + 2 = 34 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * 8 + nimble::kNullSize * 2); + + // With schema (BIGINT): (int64_t * 8) + (kNullSize * 2) = 64 + 2 = 66 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * 8 + nimble::kNullSize * 2); + EXPECT_EQ(context_.nullCount, 2); +} + +// Test REAL to DOUBLE promotion +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarRealToDouble) { + constexpr velox::vector_size_t SIZE = 5; + auto vector = vectorMaker_->flatVector({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); + auto schemaType = makeTypeWithId(velox::DOUBLE()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: float * 5 = 20 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(float) * SIZE); + + // With schema (DOUBLE): double * 5 = 40 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(double) * SIZE); +} + +// Test SMALLINT to INTEGER promotion +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarSmallintToInteger) { + constexpr velox::vector_size_t SIZE = 4; + auto vector = vectorMaker_->flatVector({1, 2, 3, 4}); + auto schemaType = makeTypeWithId(velox::INTEGER()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: int16_t * 4 = 8 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int16_t) * SIZE); + + // With schema (INTEGER): int32_t * 4 = 16 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int32_t) * SIZE); +} + +// Test TINYINT to BIGINT promotion (largest gap) +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarTinyintToBigint) { + constexpr velox::vector_size_t SIZE = 8; + auto vector = vectorMaker_->flatVector({1, 2, 3, 4, 5, 6, 7, 8}); + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: int8_t * 8 = 8 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int8_t) * SIZE); + + // With schema (BIGINT): int64_t * 8 = 64 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * SIZE); +} + +// Test BOOLEAN to INTEGER promotion (boolean is now in integer family) +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarBooleanToInteger) { + constexpr velox::vector_size_t SIZE = 6; + auto vector = + vectorMaker_->flatVector({true, false, true, true, false, true}); + auto schemaType = makeTypeWithId(velox::INTEGER()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: bool * 6 = 6 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(bool) * SIZE); + + // With schema (INTEGER): int32_t * 6 = 24 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int32_t) * SIZE); +} + +// Test BOOLEAN to BIGINT promotion +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarBooleanToBigint) { + constexpr velox::vector_size_t SIZE = 4; + auto vector = vectorMaker_->flatVector({true, false, true, false}); + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: bool * 4 = 4 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(bool) * SIZE); + + // With schema (BIGINT): int64_t * 4 = 32 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * SIZE); +} + +// Test BOOLEAN to INTEGER with nulls +TEST_F(RawSizeTypeCompatibilityTestFixture, ScalarBooleanToIntegerWithNulls) { + auto vector = vectorMaker_->flatVectorNullable( + {true, std::nullopt, false, true, std::nullopt}); + auto schemaType = makeTypeWithId(velox::INTEGER()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 5); + + // Without schema: bool * 3 + null * 2 = 3 + 2 = 5 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(vector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(bool) * 3 + nimble::kNullSize * 2); + + // With schema (INTEGER): int32_t * 3 + null * 2 = 12 + 2 = 14 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + vector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int32_t) * 3 + nimble::kNullSize * 2); + EXPECT_EQ(context_.nullCount, 2); +} + +// Test array with element type mismatch +TEST_F(RawSizeTypeCompatibilityTestFixture, ArrayWithTypeMismatch) { + // Create ARRAY vector with 2 rows: + // Row 0: [1, 2, 3] (3 elements) + // Row 1: [4, 5] (2 elements) + // Total: 5 int32_t elements = 20 bytes without schema + auto arrayVector = vectorMaker_->arrayVector({{1, 2, 3}, {4, 5}}); + auto schemaType = makeTypeWithId(velox::ARRAY(velox::BIGINT())); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 2); + + // Without schema: int32_t * 5 = 20 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(arrayVector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * 5); + + // With schema (ARRAY): int64_t * 5 = 40 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + arrayVector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * 5); +} + +// Test map with value type mismatch +TEST_F(RawSizeTypeCompatibilityTestFixture, MapWithValueTypeMismatch) { + // Create MAP vector with 2 entries: + // Entry 0: {"a" -> 1} + // Entry 1: {"bb" -> 2} + // Key sizes: 1 + 2 = 3 bytes + // Value sizes: int32_t * 2 = 8 bytes + // Total without schema: 11 bytes + auto mapVector = vectorMaker_->mapVector( + {{{velox::StringView("a"), 1}}, {{velox::StringView("bb"), 2}}}); + auto schemaType = + makeTypeWithId(velox::MAP(velox::VARCHAR(), velox::BIGINT())); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 2); + + // Without schema: 3 (keys) + 8 (int32_t values) = 11 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(mapVector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, 3 + sizeof(int32_t) * 2); + + // With schema (MAP): 3 (keys) + 16 (int64_t values) = 19 + // bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + mapVector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, 3 + sizeof(int64_t) * 2); +} + +// Test map with key type mismatch +TEST_F(RawSizeTypeCompatibilityTestFixture, MapWithKeyTypeMismatch) { + // Create MAP vector with 2 entries: + // Entry 0: {1 -> "a"} + // Entry 1: {2 -> "bb"} + // Key sizes (int32_t): 4 * 2 = 8 bytes + // Value sizes: 1 + 2 = 3 bytes + // Total without schema: 11 bytes + auto mapVector = vectorMaker_->mapVector( + {{{1, velox::StringView("a")}}, {{2, velox::StringView("bb")}}}); + auto schemaType = + makeTypeWithId(velox::MAP(velox::BIGINT(), velox::VARCHAR())); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 2); + + // Without schema: 8 (int32_t keys) + 3 (varchar values) = 11 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(mapVector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * 2 + 3); + + // With schema (MAP): 16 (int64_t keys) + 3 (varchar values) + // = 19 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + mapVector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * 2 + 3); +} + +// Test constant vector with type mismatch +TEST_F(RawSizeTypeCompatibilityTestFixture, ConstantVectorWithTypeMismatch) { + constexpr velox::vector_size_t SIZE = 100; + std::vector> vec(SIZE, 42); + auto constVector = vectorMaker_->constantVector(vec); + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: int32_t * 100 = 400 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(constVector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * SIZE); + + // With schema (BIGINT): int64_t * 100 = 800 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + constVector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * SIZE); +} + +// Test dictionary vector with type mismatch +TEST_F(RawSizeTypeCompatibilityTestFixture, DictionaryVectorWithTypeMismatch) { + constexpr velox::vector_size_t SIZE = 10; + auto flatVector = + vectorMaker_->flatVector({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + auto indices = randomIndices(SIZE); + auto dictVector = velox::BaseVector::wrapInDictionary( + velox::BufferPtr(nullptr), indices, SIZE, flatVector); + + auto schemaType = makeTypeWithId(velox::BIGINT()); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, SIZE); + + // Without schema: int32_t * 10 = 40 bytes + auto rawSizeWithoutSchema = + nimble::getRawSizeFromVector(dictVector, ranges_, context_); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * SIZE); + + // With schema (BIGINT): int64_t * 10 = 80 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromVector( + dictVector, ranges_, context_, schemaType.get(), flatMapNodeIds); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * SIZE); +} + +// Test row vector with nested type mismatches +TEST_F(RawSizeTypeCompatibilityTestFixture, RowVectorWithTypeMismatch) { + // Create ROW with: + // - col0: int32_t values [1, 2] + // - col1: float values [1.0, 2.0] + auto childVector1 = vectorMaker_->flatVector({1, 2}); + auto childVector2 = vectorMaker_->flatVector({1.0f, 2.0f}); + auto rowVector = + vectorMaker_->rowVector({"col0", "col1"}, {childVector1, childVector2}); + + // Schema declares BIGINT and DOUBLE + auto schemaType = makeTypeWithId( + velox::ROW({{"col0", velox::BIGINT()}, {"col1", velox::DOUBLE()}})); + folly::F14FastSet flatMapNodeIds; + + ranges_.add(0, 2); + + // Without schema: (int32_t * 2) + (float * 2) = 8 + 8 = 16 bytes + auto rawSizeWithoutSchema = nimble::getRawSizeFromRowVector( + rowVector, ranges_, context_, nullptr, {}, true); + EXPECT_EQ(rawSizeWithoutSchema, sizeof(int32_t) * 2 + sizeof(float) * 2); + + // With schema: (int64_t * 2) + (double * 2) = 16 + 16 = 32 bytes + context_ = nimble::RawSizeContext(); + auto rawSizeWithSchema = nimble::getRawSizeFromRowVector( + rowVector, ranges_, context_, schemaType.get(), flatMapNodeIds, true); + EXPECT_EQ(rawSizeWithSchema, sizeof(int64_t) * 2 + sizeof(double) * 2); +} + TEST_F(RawSizeTestFixture, LocalDecodedVectorMoveConstructor) { auto localDecodedVector1 = facebook::nimble::DecodedVectorManager::LocalDecodedVector(