diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index c9e665d4d..bd2111f04 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -55,10 +55,11 @@ set(ICEBERG_SOURCES manifest_reader_internal.cc manifest_writer.cc arrow_c_data_guard_internal.cc + util/conversions.cc util/decimal.cc + util/gzip_internal.cc util/murmurhash3_internal.cc util/timepoint.cc - util/gzip_internal.cc util/uuid.cc) set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS) diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index e3abb6a66..adfe5355a 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -23,6 +23,8 @@ #include #include "iceberg/exception.h" +#include "iceberg/util/conversions.h" +#include "iceberg/util/macros.h" namespace iceberg { @@ -149,13 +151,18 @@ Literal Literal::Binary(std::vector value) { return {Value{std::move(value)}, binary()}; } +Literal Literal::Fixed(std::vector value) { + auto length = static_cast(value.size()); + return {Value{std::move(value)}, fixed(length)}; +} + Result Literal::Deserialize(std::span data, std::shared_ptr type) { - return NotImplemented("Deserialization of Literal is not implemented yet"); + return Conversions::FromBytes(std::move(type), data); } Result> Literal::Serialize() const { - return NotImplemented("Serialization of Literal is not implemented yet"); + return Conversions::ToBytes(*this); } // Getters @@ -189,7 +196,7 @@ bool Literal::operator==(const Literal& other) const { return (*this <=> other) // Three-way comparison operator std::partial_ordering Literal::operator<=>(const Literal& other) const { // If types are different, comparison is unordered - if (type_->type_id() != other.type_->type_id()) { + if (*type_ != *other.type_) { return std::partial_ordering::unordered; } @@ -216,6 +223,7 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { } case TypeId::kLong: + case TypeId::kTime: case TypeId::kTimestamp: case TypeId::kTimestampTz: { auto this_val = std::get(value_); @@ -249,6 +257,12 @@ std::partial_ordering Literal::operator<=>(const Literal& other) const { return this_val <=> other_val; } + case TypeId::kFixed: { + auto& this_val = std::get>(value_); + auto& other_val = std::get>(other.value_); + return this_val <=> other_val; + } + default: // For unsupported types, return unordered return std::partial_ordering::unordered; @@ -294,9 +308,17 @@ std::string Literal::ToString() const { } return result; } + case TypeId::kFixed: { + const auto& fixed_data = std::get>(value_); + std::string result; + result.reserve(fixed_data.size() * 2); // 2 chars per byte + for (const auto& byte : fixed_data) { + std::format_to(std::back_inserter(result), "{:02X}", byte); + } + return result; + } case TypeId::kDecimal: case TypeId::kUuid: - case TypeId::kFixed: case TypeId::kDate: case TypeId::kTime: case TypeId::kTimestamp: diff --git a/src/iceberg/expression/literal.h b/src/iceberg/expression/literal.h index 1c16b8ed4..c11d48f5b 100644 --- a/src/iceberg/expression/literal.h +++ b/src/iceberg/expression/literal.h @@ -72,6 +72,7 @@ class ICEBERG_EXPORT Literal : public util::Formattable { static Literal Double(double value); static Literal String(std::string value); static Literal Binary(std::vector value); + static Literal Fixed(std::vector value); /// \brief Create a literal representing a null value. static Literal Null(std::shared_ptr type) { @@ -144,11 +145,76 @@ class ICEBERG_EXPORT Literal : public util::Formattable { private: Literal(Value value, std::shared_ptr type); + friend class Conversions; friend class LiteralCaster; - private: Value value_; std::shared_ptr type_; }; +template +struct LiteralTraits { + using ValueType = void; +}; + +template <> +struct LiteralTraits { + using ValueType = bool; +}; + +template <> +struct LiteralTraits { + using ValueType = int32_t; +}; + +template <> +struct LiteralTraits { + using ValueType = int32_t; +}; + +template <> +struct LiteralTraits { + using ValueType = int64_t; +}; + +template <> +struct LiteralTraits { + using ValueType = int64_t; +}; + +template <> +struct LiteralTraits { + using ValueType = int64_t; +}; + +template <> +struct LiteralTraits { + using ValueType = int64_t; +}; + +template <> +struct LiteralTraits { + using ValueType = float; +}; + +template <> +struct LiteralTraits { + using ValueType = double; +}; + +template <> +struct LiteralTraits { + using ValueType = std::string; +}; + +template <> +struct LiteralTraits { + using ValueType = std::vector; +}; + +template <> +struct LiteralTraits { + using ValueType = std::vector; +}; + } // namespace iceberg diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index e9ddd47a3..bd7544bfa 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -81,7 +81,7 @@ TEST(LiteralTest, IntCastTo) { auto long_result = int_literal.CastTo(iceberg::int64()); ASSERT_THAT(long_result, IsOk()); EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); - EXPECT_EQ(long_result->ToString(), "42"); + EXPECT_EQ(std::get(long_result->value()), 42L); // Cast to Float auto float_result = int_literal.CastTo(iceberg::float32()); @@ -137,7 +137,6 @@ TEST(LiteralTest, LongCastTo) { } TEST(LiteralTest, LongCastToIntOverflow) { - // Test overflow cases auto max_long = Literal::Long(static_cast(std::numeric_limits::max()) + 1); auto min_long = @@ -383,4 +382,208 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +struct LiteralParam { + std::string test_name; + std::vector serialized; + Literal value; + std::shared_ptr type; +}; + +class LiteralSerDeParam : public ::testing::TestWithParam {}; + +TEST_P(LiteralSerDeParam, RoundTrip) { + const auto& param = GetParam(); + + // Deserialize from bytes + Result literal_result = Literal::Deserialize(param.serialized, param.type); + ASSERT_TRUE(literal_result.has_value()) + << "Deserialization failed: " << literal_result.error().message; + + // Check type and value + EXPECT_EQ(*literal_result, param.value); + + // Serialize back to bytes + Result> bytes_result = literal_result->Serialize(); + ASSERT_TRUE(bytes_result.has_value()) + << "Serialization failed: " << bytes_result.error().message; + EXPECT_EQ(*bytes_result, param.serialized); + + // Deserialize again to verify idempotency + Result final_literal = Literal::Deserialize(*bytes_result, param.type); + ASSERT_TRUE(final_literal.has_value()) + << "Final deserialization failed: " << final_literal.error().message; + EXPECT_EQ(*final_literal, param.value); +} + +INSTANTIATE_TEST_SUITE_P( + BinarySerialization, LiteralSerDeParam, + ::testing::Values( + // Basic types + LiteralParam{"BooleanTrue", {1}, Literal::Boolean(true), boolean()}, + LiteralParam{"BooleanFalse", {0}, Literal::Boolean(false), boolean()}, + + LiteralParam{"Int", {32, 0, 0, 0}, Literal::Int(32), int32()}, + LiteralParam{ + "IntMaxValue", {255, 255, 255, 127}, Literal::Int(2147483647), int32()}, + LiteralParam{"IntMinValue", {0, 0, 0, 128}, Literal::Int(-2147483648), int32()}, + LiteralParam{"NegativeInt", {224, 255, 255, 255}, Literal::Int(-32), int32()}, + + LiteralParam{"Long", {32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), int64()}, + LiteralParam{"LongMaxValue", + {255, 255, 255, 255, 255, 255, 255, 127}, + Literal::Long(std::numeric_limits::max()), + int64()}, + LiteralParam{"LongMinValue", + {0, 0, 0, 0, 0, 0, 0, 128}, + Literal::Long(std::numeric_limits::min()), + int64()}, + LiteralParam{"NegativeLong", + {224, 255, 255, 255, 255, 255, 255, 255}, + Literal::Long(-32), + int64()}, + + LiteralParam{"Float", {0, 0, 128, 63}, Literal::Float(1.0f), float32()}, + LiteralParam{"FloatNegativeInfinity", + {0, 0, 128, 255}, + Literal::Float(-std::numeric_limits::infinity()), + float32()}, + LiteralParam{"FloatMaxValue", + {255, 255, 127, 127}, + Literal::Float(std::numeric_limits::max()), + float32()}, + LiteralParam{"FloatMinValue", + {255, 255, 127, 255}, + Literal::Float(std::numeric_limits::lowest()), + float32()}, + + LiteralParam{ + "Double", {0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), float64()}, + LiteralParam{"DoubleNegativeInfinity", + {0, 0, 0, 0, 0, 0, 240, 255}, + Literal::Double(-std::numeric_limits::infinity()), + float64()}, + LiteralParam{"DoubleMaxValue", + {255, 255, 255, 255, 255, 255, 239, 127}, + Literal::Double(std::numeric_limits::max()), + float64()}, + LiteralParam{"DoubleMinValue", + {255, 255, 255, 255, 255, 255, 239, 255}, + Literal::Double(std::numeric_limits::lowest()), + float64()}, + + LiteralParam{"String", + {105, 99, 101, 98, 101, 114, 103}, + Literal::String("iceberg"), + string()}, + LiteralParam{"StringLong", + {65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65}, + Literal::String("AAAAAAAAAAAAAAAA"), + string()}, + + LiteralParam{"BinaryData", + {0x01, 0x02, 0x03, 0xFF}, + Literal::Binary({0x01, 0x02, 0x03, 0xFF}), + binary()}, + LiteralParam{"BinarySingleByte", {42}, Literal::Binary({42}), binary()}, + + // Fixed type + LiteralParam{"FixedLength4", + {0x01, 0x02, 0x03, 0x04}, + Literal::Fixed({0x01, 0x02, 0x03, 0x04}), + fixed(4)}, + LiteralParam{"FixedLength8", + {0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 0x11}, + Literal::Fixed({0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x00, 0x11}), + fixed(8)}, + LiteralParam{"FixedLength16", + {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, + 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}, + Literal::Fixed({0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}), + fixed(16)}, + LiteralParam{"FixedSingleByte", {0xFF}, Literal::Fixed({0xFF}), fixed(1)}, + + // Temporal types + LiteralParam{"DateEpoch", {0, 0, 0, 0}, Literal::Date(0), date()}, + LiteralParam{"DateNextDay", {1, 0, 0, 0}, Literal::Date(1), date()}, + LiteralParam{"DateY2K", {205, 42, 0, 0}, Literal::Date(10957), date()}, + LiteralParam{"DateNegative", {255, 255, 255, 255}, Literal::Date(-1), date()}, + + LiteralParam{"TimeMidnight", {0, 0, 0, 0, 0, 0, 0, 0}, Literal::Time(0), time()}, + LiteralParam{"TimeNoon", + {128, 9, 230, 124, 10, 0, 0, 0}, + Literal::Time(45045123456), + time()}, + LiteralParam{ + "TimeOneSecond", {64, 66, 15, 0, 0, 0, 0, 0}, Literal::Time(1000000), time()}, + + LiteralParam{"TimestampEpoch", + {0, 0, 0, 0, 0, 0, 0, 0}, + Literal::Timestamp(0), + timestamp()}, + LiteralParam{"TimestampOneSecond", + {64, 66, 15, 0, 0, 0, 0, 0}, + Literal::Timestamp(1000000), + timestamp()}, + LiteralParam{"TimestampNoon2024", + {128, 9, 230, 124, 10, 0, 0, 0}, + Literal::Timestamp(45045123456), + timestamp()}, + + LiteralParam{"TimestampTzEpoch", + {0, 0, 0, 0, 0, 0, 0, 0}, + Literal::TimestampTz(0), + timestamp_tz()}, + LiteralParam{"TimestampTzOneHour", + {0, 164, 147, 214, 0, 0, 0, 0}, + Literal::TimestampTz(3600000000), + timestamp_tz()}, + + // Empty values + LiteralParam{"EmptyString", {}, Literal::String(""), string()}, + LiteralParam{"EmptyBinary", {}, Literal::Binary({}), binary()}), + + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + +TEST(LiteralSerDeTest, EmptyString) { + auto empty_string = Literal::String(""); + auto empty_bytes = empty_string.Serialize(); + ASSERT_TRUE(empty_bytes.has_value()); + EXPECT_TRUE(empty_bytes->empty()); + + auto deserialize_result = Literal::Deserialize(*empty_bytes, string()); + ASSERT_THAT(deserialize_result, IsOk()); + EXPECT_TRUE(std::get(deserialize_result->value()).empty()); +} + +TEST(LiteralSerDeTest, EmptyBinary) { + auto empty_binary = Literal::Binary({}); + auto empty_bytes = empty_binary.Serialize(); + ASSERT_TRUE(empty_bytes.has_value()); + EXPECT_TRUE(empty_bytes->empty()); + + auto deserialize_result = Literal::Deserialize(*empty_bytes, binary()); + ASSERT_THAT(deserialize_result, IsOk()); + EXPECT_TRUE(std::get>(deserialize_result->value()).empty()); +} + +// Type promotion tests +TEST(LiteralSerDeTest, TypePromotion) { + // 4-byte int data can be deserialized as long + std::vector int_data = {32, 0, 0, 0}; + auto long_result = Literal::Deserialize(int_data, int64()); + ASSERT_TRUE(long_result.has_value()); + EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); + EXPECT_EQ(std::get(long_result->value()), 32L); + + // 4-byte float data can be deserialized as double + std::vector float_data = {0, 0, 128, 63}; + auto double_result = Literal::Deserialize(float_data, float64()); + ASSERT_TRUE(double_result.has_value()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); + EXPECT_DOUBLE_EQ(std::get(double_result->value()), 1.0); +} + } // namespace iceberg diff --git a/src/iceberg/test/manifest_list_reader_test.cc b/src/iceberg/test/manifest_list_reader_test.cc index a3c08c35c..9fd6e4c16 100644 --- a/src/iceberg/test/manifest_list_reader_test.cc +++ b/src/iceberg/test/manifest_list_reader_test.cc @@ -23,6 +23,7 @@ #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_register.h" +#include "iceberg/expression/literal.h" #include "iceberg/manifest_list.h" #include "iceberg/manifest_reader.h" #include "temp_file_test_base.h" @@ -76,43 +77,38 @@ class ManifestListReaderV1Test : public ManifestListReaderTestBase { std::vector file_size = {6185, 6113}; std::vector snapshot_id = {7532614258660258098, 7532614258660258098}; - std::vector> lower_bounds = { - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32}, - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}}; - - std::vector> upper_bounds = { - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}, - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}}; - - return {{.manifest_path = paths[0], - .manifest_length = file_size[0], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[0], - .added_files_count = 4, - .existing_files_count = 0, - .deleted_files_count = 0, - .added_rows_count = 6, - .existing_rows_count = 0, - .deleted_rows_count = 0, - .partitions = {{.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[0], - .upper_bound = upper_bounds[0]}}}, - - {.manifest_path = paths[1], - .manifest_length = file_size[1], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[1], - .added_files_count = 0, - .existing_files_count = 0, - .deleted_files_count = 2, - .added_rows_count = 0, - .existing_rows_count = 0, - .deleted_rows_count = 6, - .partitions = {{.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[1], - .upper_bound = upper_bounds[1]}}}}; + return { + {.manifest_path = paths[0], + .manifest_length = file_size[0], + .partition_spec_id = 0, + .added_snapshot_id = snapshot_id[0], + .added_files_count = 4, + .existing_files_count = 0, + .deleted_files_count = 0, + .added_rows_count = 6, + .existing_rows_count = 0, + .deleted_rows_count = 0, + .partitions = {{.contains_null = false, + .contains_nan = false, + .lower_bound = Literal::String("2022-02-22").Serialize().value(), + .upper_bound = + Literal::String("2022-2-23").Serialize().value()}}}, + + {.manifest_path = paths[1], + .manifest_length = file_size[1], + .partition_spec_id = 0, + .added_snapshot_id = snapshot_id[1], + .added_files_count = 0, + .existing_files_count = 0, + .deleted_files_count = 2, + .added_rows_count = 0, + .existing_rows_count = 0, + .deleted_rows_count = 6, + .partitions = { + {.contains_null = false, + .contains_nan = false, + .lower_bound = Literal::String("2022-2-22").Serialize().value(), + .upper_bound = Literal::String("2022-2-23").Serialize().value()}}}}; } std::vector PrepareComplexTypeTestData() { diff --git a/src/iceberg/test/manifest_reader_test.cc b/src/iceberg/test/manifest_reader_test.cc index db703c172..7381b2981 100644 --- a/src/iceberg/test/manifest_reader_test.cc +++ b/src/iceberg/test/manifest_reader_test.cc @@ -94,24 +94,33 @@ class ManifestReaderV1Test : public ManifestReaderTestBase { "order_ts_hour=2021-01-26-00/" "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"}; std::vector partitions = {447696, 473976, 465192, 447672}; + + // TODO(Li Feiyang): The Decimal type and its serialization logic are not yet fully + // implemented to support variable-length encoding as required by the Iceberg + // specification. Using Literal::Binary as a temporary substitute to represent the raw + // bytes for the decimal values. std::vector>> bounds = { - {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {3, {0x12, 0xe2}}, - {4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}}, - {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {3, {0x12, 0xe3}}, - {4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}}, - {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {3, {0x0e, '"'}}, - {4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}}, - {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {3, {0x0e, '!'}}, - {4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}}, + {{1, Literal::Long(1234).Serialize().value()}, + {2, Literal::Long(5678).Serialize().value()}, + {3, Literal::Binary({0x12, 0xe2}).Serialize().value()}, + + {4, Literal::Timestamp(1611706223000000LL).Serialize().value()}}, + {{1, Literal::Long(1234).Serialize().value()}, + {2, Literal::Long(5678).Serialize().value()}, + {3, Literal::Binary({0x12, 0xe3}).Serialize().value()}, + + {4, Literal::Timestamp(1706314223000000LL).Serialize().value()}}, + {{1, Literal::Long(123).Serialize().value()}, + {2, Literal::Long(456).Serialize().value()}, + {3, Literal::Binary({0x0e, 0x22}).Serialize().value()}, + + {4, Literal::Timestamp(1674691823000000LL).Serialize().value()}}, + {{1, Literal::Long(123).Serialize().value()}, + {2, Literal::Long(456).Serialize().value()}, + {3, Literal::Binary({0x0e, 0x21}).Serialize().value()}, + {4, Literal::Timestamp(1611619823000000LL).Serialize().value()}}, }; + for (int i = 0; i < 4; ++i) { ManifestEntry entry; entry.status = ManifestStatus::kAdded; @@ -159,16 +168,16 @@ class ManifestReaderV2Test : public ManifestReaderTestBase { std::vector record_counts = {4}; std::vector>> lower_bounds = { - {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}}, - {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '1'}}, - {4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}}; + {{1, Literal::Long(1).Serialize().value()}, + {2, Literal::String("record_four").Serialize().value()}, + {3, Literal::String("data_content_1").Serialize().value()}, + {4, Literal::Double(123.45).Serialize().value()}}}; std::vector>> upper_bounds = { - {{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, - {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}}, - {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '4'}}, - {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}}; + {{1, Literal::Long(4).Serialize().value()}, + {2, Literal::String("record_two").Serialize().value()}, + {3, Literal::String("data_content_4").Serialize().value()}, + {4, Literal::Double(456.78).Serialize().value()}}}; DataFile data_file{.file_path = test_dir_prefix + paths[0], .file_format = FileFormatType::kParquet, diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 7b0f09454..ddb328585 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include "iceberg/exception.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep @@ -386,4 +387,45 @@ std::shared_ptr struct_(std::vector fields) { return std::make_shared(std::move(fields)); } +std::string_view ToString(TypeId id) { + switch (id) { + case TypeId::kStruct: + return "struct"; + case TypeId::kList: + return "list"; + case TypeId::kMap: + return "map"; + case TypeId::kBoolean: + return "boolean"; + case TypeId::kInt: + return "int"; + case TypeId::kLong: + return "long"; + case TypeId::kFloat: + return "float"; + case TypeId::kDouble: + return "double"; + case TypeId::kDecimal: + return "decimal"; + case TypeId::kDate: + return "date"; + case TypeId::kTime: + return "time"; + case TypeId::kTimestamp: + return "timestamp"; + case TypeId::kTimestampTz: + return "timestamptz"; + case TypeId::kString: + return "string"; + case TypeId::kUuid: + return "uuid"; + case TypeId::kFixed: + return "fixed"; + case TypeId::kBinary: + return "binary"; + } + + std::unreachable(); +} + } // namespace iceberg diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 01c911dd8..256526834 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -531,4 +531,13 @@ ICEBERG_EXPORT std::shared_ptr map(SchemaField key, SchemaField value); /// @} +/// \brief Get the lowercase string representation of a TypeId. +/// +/// This returns the same lowercase string as used by Type::ToString() methods. +/// For example: TypeId::kBoolean -> "boolean", TypeId::kInt -> "int", etc. +/// +/// \param id The TypeId to convert to string +/// \return A string_view containing the lowercase type name +ICEBERG_EXPORT std::string_view ToString(TypeId id); + } // namespace iceberg diff --git a/src/iceberg/util/conversions.cc b/src/iceberg/util/conversions.cc new file mode 100644 index 000000000..c5dbcf359 --- /dev/null +++ b/src/iceberg/util/conversions.cc @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include +#include +#include + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format and return as vector. +template +std::vector WriteLittleEndian(T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast(&value); + std::vector result; + result.insert(result.end(), bytes, bytes + sizeof(T)); + return result; +} + +/// \brief Read a value in little-endian format from the data. +template +Result ReadLittleEndian(std::span data) { + if (data.size() != sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +template +Result> ToBytesImpl(const Literal::Value& value) { + using CppType = typename LiteralTraits::ValueType; + return WriteLittleEndian(std::get(value)); +} + +template <> +Result> ToBytesImpl(const Literal::Value& value) { + return std::vector{std::get(value) ? static_cast(0x01) + : static_cast(0x00)}; +} + +template <> +Result> ToBytesImpl(const Literal::Value& value) { + const auto& str = std::get(value); + return std::vector(str.begin(), str.end()); +} + +template <> +Result> ToBytesImpl(const Literal::Value& value) { + return std::get>(value); +} + +template <> +Result> ToBytesImpl(const Literal::Value& value) { + return std::get>(value); +} + +#define DISPATCH_LITERAL_TO_BYTES(type_id) \ + case type_id: \ + return ToBytesImpl(value); + +Result> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + const auto type_id = type.type_id(); + + switch (type_id) { + DISPATCH_LITERAL_TO_BYTES(TypeId::kInt) + DISPATCH_LITERAL_TO_BYTES(TypeId::kDate) + DISPATCH_LITERAL_TO_BYTES(TypeId::kLong) + DISPATCH_LITERAL_TO_BYTES(TypeId::kTime) + DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestamp) + DISPATCH_LITERAL_TO_BYTES(TypeId::kTimestampTz) + DISPATCH_LITERAL_TO_BYTES(TypeId::kFloat) + DISPATCH_LITERAL_TO_BYTES(TypeId::kDouble) + DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean) + DISPATCH_LITERAL_TO_BYTES(TypeId::kString) + DISPATCH_LITERAL_TO_BYTES(TypeId::kBinary) + DISPATCH_LITERAL_TO_BYTES(TypeId::kFixed) + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +#undef DISPATCH_LITERAL_TO_BYTES + +Result> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result Conversions::FromBytes(const PrimitiveType& type, + std::span data) { + const auto type_id = type.type_id(); + switch (type_id) { + case TypeId::kBoolean: { + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian(data)); + return Literal::Value{static_cast(value != 0x00)}; + } + case TypeId::kInt: { + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian(data)); + return Literal::Value{value}; + } + case TypeId::kDate: { + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian(data)); + return Literal::Value{value}; + } + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + int64_t value; + if (data.size() < 8) { + // Type was promoted from int to long + ICEBERG_ASSIGN_OR_RAISE(auto int_value, ReadLittleEndian(data)); + value = static_cast(int_value); + } else { + ICEBERG_ASSIGN_OR_RAISE(auto long_value, ReadLittleEndian(data)); + value = long_value; + } + return Literal::Value{value}; + } + case TypeId::kFloat: { + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian(data)); + return Literal::Value{value}; + } + case TypeId::kDouble: { + if (data.size() < 8) { + // Type was promoted from float to double + ICEBERG_ASSIGN_OR_RAISE(auto float_value, ReadLittleEndian(data)); + return Literal::Value{static_cast(float_value)}; + } else { + ICEBERG_ASSIGN_OR_RAISE(auto double_value, ReadLittleEndian(data)); + return Literal::Value{double_value}; + } + } + case TypeId::kString: + return Literal::Value{ + std::string(reinterpret_cast(data.data()), data.size())}; + case TypeId::kBinary: + return Literal::Value{std::vector(data.begin(), data.end())}; + case TypeId::kFixed: { + const auto& fixed_type = static_cast(type); + if (data.size() != fixed_type.length()) { + return InvalidArgument("Invalid data size for Fixed literal, got size: {}", + data.size()); + } + return Literal::Value{std::vector(data.begin(), data.end())}; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + default: + return NotSupported("Deserialization for type {} is not supported", + type.ToString()); + } +} + +Result Conversions::FromBytes(std::shared_ptr type, + std::span data) { + if (!type) { + return InvalidArgument("Type cannot be null"); + } + + ICEBERG_ASSIGN_OR_RAISE(auto value, FromBytes(*type, data)); + return Literal(std::move(value), std::move(type)); +} + +} // namespace iceberg diff --git a/src/iceberg/util/conversions.h b/src/iceberg/util/conversions.h new file mode 100644 index 000000000..fe383bc5a --- /dev/null +++ b/src/iceberg/util/conversions.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +/// \file iceberg/util/conversions.h +/// \brief Conversion utilities for primitive types + +namespace iceberg { + +/// \brief Conversion utilities for primitive types +class ICEBERG_EXPORT Conversions { + public: + /// \brief Serializes a raw literal value into a byte vector according to its type. + /// \param type The primitive type of the value. + /// \param value The std::variant holding the raw literal value to serialize. + /// \return A Result containing the serialized value. + static Result> ToBytes(const PrimitiveType& type, + const Literal::Value& value); + + /// \brief Serializes a complete Literal object into a byte vector. + /// \param literal The Literal object to serialize. + /// \return A Result containing the serialized value. + static Result> ToBytes(const Literal& literal); + + /// \brief Deserializes a span of bytes into a raw literal value based on the given + /// type. + /// \param type The target primitive type to interpret the bytes as. + /// \param data A std::span of bytes representing the serialized value. + /// \return A Result containing the deserialized value. + static Result FromBytes(const PrimitiveType& type, + std::span data); + + /// \brief Deserializes a span of bytes into a complete Literal object. + /// \param type A shared pointer to the target primitive type. + /// \param data A std::span of bytes representing the serialized value. + /// \return A Result containing the deserialized value. + static Result FromBytes(std::shared_ptr type, + std::span data); +}; + +} // namespace iceberg