diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index c8fb07721..747d4c40e 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
     util/decimal.cc
     util/murmurhash3_internal.cc
     util/timepoint.cc
-    util/gzip_internal.cc)
+    util/gzip_internal.cc
+    util/uuid.cc)
 
 set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
 set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
diff --git a/src/iceberg/util/uuid.cc b/src/iceberg/util/uuid.cc
new file mode 100644
--- /dev/null
+++ b/src/iceberg/util/uuid.cc
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <format>
+#include <random>
+#include <string>
+#include <string_view>
+
+#include "iceberg/exception.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/int128.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+namespace {
+
+// Builds a lookup table mapping every byte value to its hexadecimal digit value
+// (0-15), or to 0xFF when the byte is not one of [0-9a-fA-F].
+constexpr std::array<uint8_t, 256> BuildHexTable() {
+  std::array<uint8_t, 256> buf{};
+  for (int32_t i = 0; i < 256; i++) {
+    if (i >= '0' && i <= '9') {
+      buf[i] = static_cast<uint8_t>(i - '0');
+    } else if (i >= 'a' && i <= 'f') {
+      buf[i] = static_cast<uint8_t>(i - 'a' + 10);
+    } else if (i >= 'A' && i <= 'F') {
+      buf[i] = static_cast<uint8_t>(i - 'A' + 10);
+    } else {
+      buf[i] = 0xFF;
+    }
+  }
+  return buf;
+}
+
+// Builds a lookup table mapping every byte value to that value shifted left by four
+// bits and truncated to 8 bits, i.e. a hex digit value moved into the high nibble.
+constexpr std::array<uint8_t, 256> BuildShl4Table() {
+  std::array<uint8_t, 256> buf{};
+  for (int32_t i = 0; i < 256; i++) {
+    buf[i] = static_cast<uint8_t>(i << 4);
+  }
+  return buf;
+}
+
+constexpr auto kHexTable = BuildHexTable();
+constexpr auto kShl4Table = BuildShl4Table();
+
+// Returns 64 pseudo-random bits from a per-thread engine.
+//
+// The engine is thread_local so that concurrent calls to the UUID generators never
+// share engine state, and it is seeded from several std::random_device words instead
+// of a single 32-bit one so that separately seeded engines are unlikely to start
+// from the same state.
+uint64_t NextRandom64() {
+  thread_local std::mt19937_64 engine = [] {
+    std::random_device rd;
+    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
+    return std::mt19937_64(seed);
+  }();
+  return engine();
+}
+
+// Parse a UUID string without dashes, e.g. "67e5504410b1426f9247bb680e5fe0c8"
+inline Result<Uuid> ParseSimple(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");
+
+  std::array<uint8_t, Uuid::kLength> uuid{};
+  for (size_t i = 0; i < Uuid::kLength; i++) {
+    uint8_t h1 = kHexTable[static_cast<uint8_t>(s[i * 2])];
+    uint8_t h2 = kHexTable[static_cast<uint8_t>(s[i * 2 + 1])];
+
+    // Digit values never exceed 0x0F, so the OR is 0xFF only if a lookup failed.
+    if ((h1 | h2) == 0xFF) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    uuid[i] = static_cast<uint8_t>(kShl4Table[h1] | h2);
+  }
+  return Uuid(uuid);
+}
+
+// Parse a UUID string with dashes, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+inline Result<Uuid> ParseHyphenated(std::string_view s) {
+  ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");
+
+  // Check that dashes are in the right places
+  if (!(s[8] == '-' && s[13] == '-' && s[18] == '-' && s[23] == '-')) [[unlikely]] {
+    return InvalidArgument("Invalid UUID string: {}", s);
+  }
+
+  // Start offset of each group of four hex digits (two output bytes per group)
+  constexpr std::array<size_t, 8> positions = {0, 4, 9, 14, 19, 24, 28, 32};
+  std::array<uint8_t, Uuid::kLength> uuid{};
+
+  for (size_t j = 0; j < positions.size(); j++) {
+    size_t i = positions[j];
+    uint8_t h1 = kHexTable[static_cast<uint8_t>(s[i])];
+    uint8_t h2 = kHexTable[static_cast<uint8_t>(s[i + 1])];
+    uint8_t h3 = kHexTable[static_cast<uint8_t>(s[i + 2])];
+    uint8_t h4 = kHexTable[static_cast<uint8_t>(s[i + 3])];
+
+    // Digit values never exceed 0x0F, so the OR is 0xFF only if a lookup failed.
+    if ((h1 | h2 | h3 | h4) == 0xFF) [[unlikely]] {
+      return InvalidArgument("Invalid UUID string: {}", s);
+    }
+
+    uuid[j * 2] = static_cast<uint8_t>(kShl4Table[h1] | h2);
+    uuid[j * 2 + 1] = static_cast<uint8_t>(kShl4Table[h3] | h4);
+  }
+
+  return Uuid(uuid);
+}
+
+}  // namespace
+
+Uuid::Uuid(std::array<uint8_t, kLength> data) : data_(data) {}
+
+Uuid Uuid::GenerateV4() {
+  // Generate two random 64-bit integers
+  const uint64_t high_bits = NextRandom64();
+  const uint64_t low_bits = NextRandom64();
+
+  // Combine them into a uint128_t
+  const uint128_t random_128_bit_number =
+      (static_cast<uint128_t>(high_bits) << 64) | low_bits;
+
+  // Copy the bytes into the uuid array
+  std::array<uint8_t, kLength> uuid;
+  std::memcpy(uuid.data(), &random_128_bit_number, kLength);
+
+  // Set magic numbers for a "version 4" (pseudorandom) UUID and variant,
+  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
+  uuid[6] = (uuid[6] & 0x0F) | 0x40;
+  // Set variant field, top two bits are 1, 0
+  uuid[8] = (uuid[8] & 0x3F) | 0x80;
+
+  return Uuid(uuid);
+}
+
+Uuid Uuid::GenerateV7() {
+  // Get the current time in milliseconds since the Unix epoch
+  auto now = std::chrono::system_clock::now();
+  auto duration_since_epoch = now.time_since_epoch();
+  auto unix_ts_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(duration_since_epoch).count();
+
+  return GenerateV7(static_cast<uint64_t>(unix_ts_ms));
+}
+
+Uuid Uuid::GenerateV7(uint64_t unix_ts_ms) {
+  std::array<uint8_t, kLength> uuid = {};
+
+  // Set the timestamp (in milliseconds since Unix epoch), most significant byte
+  // first. Only the low 48 bits of unix_ts_ms are stored.
+  uuid[0] = static_cast<uint8_t>((unix_ts_ms >> 40) & 0xFF);
+  uuid[1] = static_cast<uint8_t>((unix_ts_ms >> 32) & 0xFF);
+  uuid[2] = static_cast<uint8_t>((unix_ts_ms >> 24) & 0xFF);
+  uuid[3] = static_cast<uint8_t>((unix_ts_ms >> 16) & 0xFF);
+  uuid[4] = static_cast<uint8_t>((unix_ts_ms >> 8) & 0xFF);
+  uuid[5] = static_cast<uint8_t>(unix_ts_ms & 0xFF);
+
+  // Generate random bytes for the remaining fields, two bytes per draw
+  for (size_t i = 6; i < kLength; i += 2) {
+    const auto rand = static_cast<uint16_t>(NextRandom64());
+    uuid[i] = static_cast<uint8_t>((rand >> 8) & 0xFF);
+    uuid[i + 1] = static_cast<uint8_t>(rand & 0xFF);
+  }
+
+  // Set magic numbers for a "version 7" UUID and variant,
+  // see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
+  uuid[6] = (uuid[6] & 0x0F) | 0x70;
+  // set variant field, top two bits are 1, 0
+  uuid[8] = (uuid[8] & 0x3F) | 0x80;
+
+  return Uuid(uuid);
+}
+
+Result<Uuid> Uuid::FromString(std::string_view str) {
+  if (str.size() == 32) {
+    return ParseSimple(str);
+  } else if (str.size() == 36) {
+    return ParseHyphenated(str);
+  } else {
+    return InvalidArgument("Invalid UUID string: {}", str);
+  }
+}
+
+Result<Uuid> Uuid::FromBytes(std::span<const uint8_t> bytes) {
+  if (bytes.size() != kLength) [[unlikely]] {
+    return InvalidArgument("UUID byte array must be exactly {} bytes, was {}", kLength,
+                           bytes.size());
+  }
+  std::array<uint8_t, kLength> data;
+  std::memcpy(data.data(), bytes.data(), kLength);
+  return Uuid(data);
+}
+
+uint8_t Uuid::operator[](size_t index) const {
+  ICEBERG_CHECK(index < kLength, "UUID index out of range: {}", index);
+  return data_[index];
+}
+
+std::string Uuid::ToString() const {
+  return std::format(
+      "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}"
+      "{:02x}{:02x}{:02x}",
+      data_[0], data_[1], data_[2], data_[3], data_[4], data_[5], data_[6], data_[7],
+      data_[8], data_[9], data_[10], data_[11], data_[12], data_[13], data_[14],
+      data_[15]);
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/util/uuid.h b/src/iceberg/util/uuid.h
new file mode 100644
--- /dev/null
+++ b/src/iceberg/util/uuid.h
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <span>
+#include <string>
+#include <string_view>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/util/formattable.h"
+
+/// \file iceberg/util/uuid.h
+/// \brief UUID (Universally Unique Identifier) representation.
+
+namespace iceberg {
+
+class ICEBERG_EXPORT Uuid : public util::Formattable {
+ public:
+  Uuid() = delete;
+  constexpr static size_t kLength = 16;
+
+  explicit Uuid(std::array<uint8_t, kLength> data);
+
+  /// \brief Generate a random UUID (version 4).
+  static Uuid GenerateV4();
+
+  /// \brief Generate UUID version 7 per RFC 9562, with the current timestamp.
+  static Uuid GenerateV7();
+
+  /// \brief Generate UUID version 7 per RFC 9562, with the given timestamp.
+  ///
+  /// UUID version 7 consists of a Unix timestamp in milliseconds (48 bits) and
+  /// 74 random bits, excluding the required version and variant bits.
+  ///
+  /// \param unix_ts_ms number of milliseconds since start of the UNIX epoch
+  ///
+  /// \note unix_ts_ms cannot be negative per RFC. Only its low 48 bits are stored.
+  static Uuid GenerateV7(uint64_t unix_ts_ms);
+
+  /// \brief Create a UUID from a string in standard format.
+  ///
+  /// Accepts 32 hexadecimal digits, optionally split into 8-4-4-4-12 groups by
+  /// dashes; digits may be upper or lower case.
+  static Result<Uuid> FromString(std::string_view str);
+
+  /// \brief Create a UUID from a 16-byte array.
+  static Result<Uuid> FromBytes(std::span<const uint8_t> bytes);
+
+  /// \brief Get the raw bytes of the UUID.
+  std::span<const uint8_t> bytes() const { return data_; }
+
+  /// \brief Access individual bytes of the UUID.
+  /// \param index The index of the byte to access (0-15).
+  /// \return The byte at the specified index.
+  /// \throw IcebergError if index is out of bounds.
+  uint8_t operator[](size_t index) const;
+
+  /// \brief Convert the UUID to a string in standard format.
+  std::string ToString() const override;
+
+  friend bool operator==(const Uuid& lhs, const Uuid& rhs) {
+    return lhs.data_ == rhs.data_;
+  }
+
+ private:
+  std::array<uint8_t, kLength> data_;
+};
+
+}  // namespace iceberg
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d70b4f85b..3c7473522 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -91,6 +91,7 @@ add_iceberg_test(util_test
                  endian_test.cc
                  formatter_test.cc
                  string_util_test.cc
+                 uuid_test.cc
                  visit_type_test.cc)
 
 add_iceberg_test(roaring_test SOURCES roaring_test.cc)
diff --git a/test/uuid_test.cc b/test/uuid_test.cc
new file mode 100644
--- /dev/null
+++ b/test/uuid_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/util/uuid.h"
+
+#include <array>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "matchers.h"
+
+namespace iceberg {
+
+TEST(UUIDUtilTest, GenerateV4) {
+  auto uuid = Uuid::GenerateV4();
+  // just ensure it runs and produces a value
+  EXPECT_EQ(uuid.bytes().size(), Uuid::kLength);
+  // Version 4 UUIDs have the version number (4) in the 7th byte
+  EXPECT_EQ((uuid[6] >> 4) & 0x0F, 4);
+  // Variant is in the 9th byte, the two most significant bits should be 10
+  EXPECT_EQ((uuid[8] >> 6) & 0x03, 0b10);
+}
+
+TEST(UUIDUtilTest, GenerateV7) {
+  auto uuid = Uuid::GenerateV7();
+  // just ensure it runs and produces a value
+  EXPECT_EQ(uuid.bytes().size(), Uuid::kLength);
+  // Version 7 UUIDs have the version number (7) in the 7th byte
+  EXPECT_EQ((uuid[6] >> 4) & 0x0F, 7);
+  // Variant is in the 9th byte, the two most significant bits should be 10
+  EXPECT_EQ((uuid[8] >> 6) & 0x03, 0b10);
+}
+
+TEST(UUIDUtilTest, FromString) {
+  std::vector<std::string> uuid_strings = {
+      "123e4567-e89b-12d3-a456-426614174000",
+      "550e8400-e29b-41d4-a716-446655440000",
+      "f47ac10b-58cc-4372-a567-0e02b2c3d479",
+  };
+
+  for (const auto& uuid_str : uuid_strings) {
+    auto result = Uuid::FromString(uuid_str);
+    // ASSERT so that value() is never read from a failed result
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), uuid_str);
+  }
+
+  std::vector<std::pair<std::string, std::string>> uuid_string_pairs = {
+      {"123e4567e89b12d3a456426614174000", "123e4567-e89b-12d3-a456-426614174000"},
+      {"550E8400E29B41D4A716446655440000", "550e8400-e29b-41d4-a716-446655440000"},
+      {"F47AC10B58CC4372A5670E02B2C3D479", "f47ac10b-58cc-4372-a567-0e02b2c3d479"},
+  };
+
+  for (const auto& [input_str, expected_str] : uuid_string_pairs) {
+    auto result = Uuid::FromString(input_str);
+    ASSERT_THAT(result, IsOk());
+    auto uuid = result.value();
+    EXPECT_EQ(uuid.ToString(), expected_str);
+  }
+}
+
+TEST(UUIDUtilTest, FromStringInvalid) {
+  std::vector<std::string> invalid_uuid_strings = {
+      "123e4567-e89b-12d3-a456-42661417400",    // too short
+      "123e4567-e89b-12d3-a456-4266141740000",  // too long
+      "g23e4567-e89b-12d3-a456-426614174000",   // invalid character
+      "123e4567e89b12d3a45642661417400",        // too short without dashes
+      "123e4567e89b12d3a4564266141740000",      // too long without dashes
+      "550e8400-e29b-41d4-a716-44665544000Z",   // invalid character at end
+      "550e8400-e29b-41d4-a716-44665544000-",   // invalid character at end
+      "550e8400-e29b-41d4-a716-4466554400",     // too short
+  };
+
+  for (const auto& uuid_str : invalid_uuid_strings) {
+    auto result = Uuid::FromString(uuid_str);
+    EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
+    EXPECT_THAT(result, HasErrorMessage("Invalid UUID string"));
+  }
+}
+
+TEST(UUIDUtilTest, FromBytes) {
+  std::array<uint8_t, Uuid::kLength> bytes = {0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b,
+                                              0x12, 0xd3, 0xa4, 0x56, 0x42, 0x66,
+                                              0x14, 0x17, 0x40, 0x00};
+  auto result = Uuid::FromBytes(bytes);
+  ASSERT_THAT(result, IsOk());
+  auto uuid = result.value();
+  EXPECT_EQ(uuid.ToString(), "123e4567-e89b-12d3-a456-426614174000");
+  EXPECT_EQ(uuid, Uuid(bytes));
+}
+
+TEST(UUIDUtilTest, FromBytesInvalid) {
+  std::array<uint8_t, Uuid::kLength - 1> short_bytes = {0x12, 0x3e, 0x45, 0x67, 0xe8,
+                                                        0x9b, 0x12, 0xd3, 0xa4, 0x56,
+                                                        0x42, 0x66, 0x14, 0x17, 0x40};
+  auto result = Uuid::FromBytes(short_bytes);
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(result, HasErrorMessage("UUID byte array must be exactly 16 bytes"));
+}
+
+}  // namespace iceberg