Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
util/decimal.cc
util/murmurhash3_internal.cc
util/timepoint.cc
util/gzip_internal.cc)
util/gzip_internal.cc
util/uuid_util.cc)

set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)
Expand Down
207 changes: 207 additions & 0 deletions src/iceberg/util/uuid_util.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/util/uuid_util.h"

#include <chrono>
#include <cstdint>
#include <cstring>
#include <random>
#include <string>

#include "iceberg/result.h"
#include "iceberg/util/int128.h"
#include "iceberg/util/macros.h"

namespace iceberg {

/// Generate a random (version 4) UUID as 16 raw bytes.
///
/// All 128 bits are drawn from a pseudo-random engine, then the version
/// nibble and the variant bits are overwritten as required by RFC 9562.
///
/// \return 16 bytes where the high nibble of byte 6 is 4 and the two most
/// significant bits of byte 8 are binary 10; every other bit is random.
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV4() {
  // One engine per thread: the engine is mutated on every draw, so a single
  // shared static instance would be a data race under concurrent callers.
  // It is seeded with 256 bits from std::random_device (via std::seed_seq)
  // rather than a single 32-bit word, which would cap the number of distinct
  // UUID streams at 2^32.
  thread_local std::mt19937_64 gen = [] {
    std::random_device rd;
    std::array<uint32_t, 8> entropy{};
    for (auto& word : entropy) {
      word = static_cast<uint32_t>(rd());
    }
    std::seed_seq seq(entropy.begin(), entropy.end());
    return std::mt19937_64(seq);
  }();

  // std::mt19937_64 yields uniformly distributed 64-bit values, so two draws
  // provide the 128 random bits directly.
  const uint64_t high_bits = gen();
  const uint64_t low_bits = gen();

  // Serialize both words most-significant byte first. The byte order is
  // irrelevant for random data, but doing it explicitly keeps the result
  // independent of host endianness.
  std::array<uint8_t, 16> uuid{};
  for (size_t i = 0; i < 8; ++i) {
    const unsigned shift = static_cast<unsigned>(8 * (7 - i));
    uuid[i] = static_cast<uint8_t>((high_bits >> shift) & 0xFF);
    uuid[8 + i] = static_cast<uint8_t>((low_bits >> shift) & 0xFF);
  }

  // Set magic numbers for a "version 4" (pseudorandom) UUID and variant,
  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x40);
  // Set variant field, top two bits are 1, 0
  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);

  return uuid;
}

/// Generate a version 7 UUID stamped with the current wall-clock time.
///
/// Reads std::chrono::system_clock, converts the time since its epoch to
/// whole milliseconds and delegates to the overload taking a timestamp.
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV7() {
  using std::chrono::milliseconds;
  using std::chrono::system_clock;

  // Milliseconds elapsed since the clock's epoch, truncated toward zero.
  const auto elapsed = system_clock::now().time_since_epoch();
  const auto elapsed_ms = std::chrono::duration_cast<milliseconds>(elapsed);

  return GenerateUuidV7(static_cast<uint64_t>(elapsed_ms.count()));
}

/// Generate a version 7 UUID for the given timestamp.
///
/// Layout of the returned bytes:
///   - bytes 0-5: the low 48 bits of `unix_ts_ms`, most significant byte first
///     (any higher bits of `unix_ts_ms` are discarded);
///   - byte 6: version nibble 7 in the high 4 bits, random low 4 bits;
///   - byte 7: random;
///   - byte 8: variant bits binary 10 in the top 2 bits, random low 6 bits;
///   - bytes 9-15: random.
///
/// \param unix_ts_ms number of milliseconds since the Unix epoch.
/// \return the 16 UUID bytes.
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV7(uint64_t unix_ts_ms) {
  // One engine per thread: the engine is mutated on every draw, so a single
  // shared static instance would be a data race under concurrent callers.
  // It is seeded with 256 bits from std::random_device (via std::seed_seq)
  // rather than a single 32-bit word, which would cap the number of distinct
  // random streams at 2^32.
  thread_local std::mt19937_64 gen = [] {
    std::random_device rd;
    std::array<uint32_t, 8> entropy{};
    for (auto& word : entropy) {
      word = static_cast<uint32_t>(rd());
    }
    std::seed_seq seq(entropy.begin(), entropy.end());
    return std::mt19937_64(seq);
  }();

  std::array<uint8_t, 16> uuid = {};

  // Set the timestamp (in milliseconds since Unix epoch), big-endian,
  // into the first six bytes.
  for (size_t i = 0; i < 6; ++i) {
    const unsigned shift = static_cast<unsigned>(8 * (5 - i));
    uuid[i] = static_cast<uint8_t>((unix_ts_ms >> shift) & 0xFF);
  }

  // Generate random bytes for the remaining fields. std::mt19937_64 yields
  // uniformly distributed 64-bit values, so two draws cover the ten bytes.
  const uint64_t rand_a = gen();
  const uint64_t rand_b = gen();
  uuid[6] = static_cast<uint8_t>((rand_a >> 8) & 0xFF);
  uuid[7] = static_cast<uint8_t>(rand_a & 0xFF);
  for (size_t i = 0; i < 8; ++i) {
    const unsigned shift = static_cast<unsigned>(8 * (7 - i));
    uuid[8 + i] = static_cast<uint8_t>((rand_b >> shift) & 0xFF);
  }

  // Set magic numbers for a "version 7" UUID and variant,
  // see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x70);
  // Set variant field, top two bits are 1, 0
  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);

  return uuid;
}

namespace {

// Builds a 256-entry table indexed by character code. Entries for '0'-'9',
// 'a'-'f' and 'A'-'F' hold the digit's numeric value (0-15); every other
// entry holds the sentinel 0xff.
constexpr std::array<uint8_t, 256> BuildHexTable() {
  std::array<uint8_t, 256> table{};

  // Start with every character marked as "not a hex digit".
  for (size_t code = 0; code < table.size(); ++code) {
    table[code] = 0xff;
  }

  // Decimal digits map to 0..9.
  for (int digit = 0; digit < 10; ++digit) {
    table['0' + digit] = static_cast<uint8_t>(digit);
  }

  // Letters map to 10..15 in both lower and upper case.
  for (int offset = 0; offset < 6; ++offset) {
    table['a' + offset] = static_cast<uint8_t>(10 + offset);
    table['A' + offset] = static_cast<uint8_t>(10 + offset);
  }

  return table;
}

// Builds a 256-entry table where entry v holds (v << 4) truncated to 8 bits,
// i.e. the low nibble of v moved into the high nibble.
constexpr std::array<uint8_t, 256> BuildShl4Table() {
  std::array<uint8_t, 256> table{};
  for (size_t value = 0; value < table.size(); ++value) {
    // Only the low nibble survives the 8-bit truncation of a 4-bit shift.
    table[value] = static_cast<uint8_t>((value & 0x0F) << 4);
  }
  return table;
}

// Compile-time lookup table: character code -> hex digit value (0-15), or
// 0xff when the character is not a hexadecimal digit.
constexpr auto HEX_TABLE = BuildHexTable();
// Compile-time lookup table: byte value v -> (v << 4) truncated to 8 bits.
// Used to move a decoded nibble into the high half of an output byte.
constexpr auto SHL4_TABLE = BuildShl4Table();

// Parse a UUID string without dashes, e.g. "67e5504410b1426f9247bb680e5fe0c8"
// Parse a UUID string without dashes, e.g. "67e5504410b1426f9247bb680e5fe0c8".
//
// The caller must pass exactly 32 characters. Each consecutive pair of hex
// digits (upper or lower case) becomes one output byte; any other character
// yields an InvalidArgument error.
inline Result<std::array<uint8_t, 16>> ParseSimple(std::string_view s) {
  ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");

  std::array<uint8_t, 16> bytes{};
  size_t cursor = 0;
  for (auto& out : bytes) {
    const uint8_t hi = HEX_TABLE[static_cast<uint8_t>(s[cursor])];
    const uint8_t lo = HEX_TABLE[static_cast<uint8_t>(s[cursor + 1])];
    cursor += 2;

    // Valid nibbles are at most 0x0f, so the OR can only reach 0xff when at
    // least one lookup returned the "not a hex digit" sentinel.
    if ((hi | lo) == 0xff) {
      return InvalidArgument("Invalid UUID string: {}", s);
    }

    out = static_cast<uint8_t>(SHL4_TABLE[hi] | lo);
  }
  return bytes;
}

// Parse a UUID string with dashes, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
// Parse a UUID string with dashes, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx".
//
// The caller must pass exactly 36 characters. Dashes are required at offsets
// 8, 13, 18 and 23; every other character must be a hex digit (upper or lower
// case). Anything else yields an InvalidArgument error.
inline Result<std::array<uint8_t, 16>> ParseHyphenated(std::string_view s) {
  ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");

  // Reject input whose group separators are missing or misplaced.
  if (s[8] != '-' || s[13] != '-' || s[18] != '-' || s[23] != '-') {
    return InvalidArgument("Invalid UUID string: {}", s);
  }

  // Start offsets of the eight 4-digit runs that lie between the dashes;
  // each run decodes to two output bytes.
  constexpr std::array<size_t, 8> quad_starts = {0, 4, 9, 14, 19, 24, 28, 32};

  std::array<uint8_t, 16> bytes{};
  auto out = bytes.begin();
  for (const size_t start : quad_starts) {
    const uint8_t n0 = HEX_TABLE[static_cast<uint8_t>(s[start])];
    const uint8_t n1 = HEX_TABLE[static_cast<uint8_t>(s[start + 1])];
    const uint8_t n2 = HEX_TABLE[static_cast<uint8_t>(s[start + 2])];
    const uint8_t n3 = HEX_TABLE[static_cast<uint8_t>(s[start + 3])];

    // Valid nibbles are at most 0x0f, so the OR can only reach 0xff when at
    // least one lookup returned the "not a hex digit" sentinel.
    if ((n0 | n1 | n2 | n3) == 0xff) {
      return InvalidArgument("Invalid UUID string: {}", s);
    }

    *out++ = static_cast<uint8_t>(SHL4_TABLE[n0] | n1);
    *out++ = static_cast<uint8_t>(SHL4_TABLE[n2] | n3);
  }

  return bytes;
}

} // namespace

/// Parse a textual UUID into its 16 raw bytes.
///
/// The input length selects the format: 32 characters are parsed as plain hex
/// digits, 36 characters as the hyphenated 8-4-4-4-12 form. Any other length
/// is rejected with an InvalidArgument error.
Result<std::array<uint8_t, 16>> UUIDUtils::FromString(std::string_view str) {
  switch (str.size()) {
    case 32:
      return ParseSimple(str);
    case 36:
      return ParseHyphenated(str);
    default:
      return InvalidArgument("Invalid UUID string: {}", str);
  }
}

/// Format 16 UUID bytes as the lowercase hyphenated 8-4-4-4-12 string.
///
/// \param uuid the UUID bytes; must contain exactly 16 elements (checked with
/// ICEBERG_DCHECK).
/// \return a 36-character string such as "123e4567-e89b-12d3-a456-426614174000".
std::string UUIDUtils::ToString(std::span<uint8_t> uuid) {
  static const char* digits = "0123456789abcdef";
  ICEBERG_DCHECK(uuid.size() == 16, "uuid must be 16 bytes long");

  // Pre-fill with dashes so that only the hex digits need to be written.
  std::string text(36, '-');

  size_t out = 0;
  for (size_t i = 0; i < 16; i++) {
    // A dash precedes bytes 4, 6, 8 and 10; step over the pre-filled one.
    if (i == 4 || i == 6 || i == 8 || i == 10) {
      ++out;
    }
    const uint8_t byte = uuid[i];
    text[out++] = digits[(byte >> 4) & 0x0F];
    text[out++] = digits[byte & 0x0F];
  }

  return text;
}

} // namespace iceberg
60 changes: 60 additions & 0 deletions src/iceberg/util/uuid_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <array>
#include <cstdint>
#include <span>
#include <string_view>

#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"

/// \file iceberg/util/uuid_util.h
/// \brief UUID (Universally Unique Identifier) utilities.

namespace iceberg {

/// \brief Static helpers for generating, parsing and formatting UUIDs that are
/// represented as 16 raw bytes.
class ICEBERG_EXPORT UUIDUtils {
public:
/// \brief Generate a random UUID (version 4).
///
/// \return 16 bytes in which the high nibble of byte 6 is 4 (version) and the
/// two most significant bits of byte 8 are binary 10 (variant).
static std::array<uint8_t, 16> GenerateUuidV4();

/// \brief Generate UUID version 7 per RFC 9562, with the current timestamp.
///
/// The timestamp is read from std::chrono::system_clock, in milliseconds.
static std::array<uint8_t, 16> GenerateUuidV7();

/// \brief Generate UUID version 7 per RFC 9562, with the given timestamp.
///
/// UUID version 7 consists of a Unix timestamp in milliseconds (48 bits) and
/// 74 random bits, excluding the required version and variant bits.
///
/// \param unix_ts_ms number of milliseconds since start of the UNIX epoch;
/// only the low 48 bits are stored, in big-endian order, in bytes 0-5.
///
/// \note unix_ts_ms cannot be negative per RFC.
static std::array<uint8_t, 16> GenerateUuidV7(uint64_t unix_ts_ms);

/// \brief Create a UUID from a string in standard format.
///
/// Accepts either 32 hexadecimal digits or the 36-character hyphenated
/// 8-4-4-4-12 form; hex digits may be upper or lower case.
///
/// \param str the textual UUID.
/// \return the 16 decoded bytes, or an InvalidArgument error when the length,
/// the dash positions or any digit is invalid.
static Result<std::array<uint8_t, 16>> FromString(std::string_view str);

/// \brief Convert a UUID to a string in standard format.
///
/// \param uuid the UUID bytes; must contain exactly 16 elements.
/// \return the lowercase hyphenated 36-character representation.
static std::string ToString(std::span<uint8_t> uuid);
};

} // namespace iceberg
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ add_iceberg_test(util_test
endian_test.cc
formatter_test.cc
string_util_test.cc
uuid_util_test.cc
visit_type_test.cc)

add_iceberg_test(roaring_test SOURCES roaring_test.cc)
Expand Down
97 changes: 97 additions & 0 deletions test/uuid_util_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/util/uuid_util.h"

#include <vector>

#include <gtest/gtest.h>

#include "matchers.h"

namespace iceberg {

// A generated v4 UUID must carry the RFC 9562 version and variant markers.
TEST(UUIDUtilTest, GenerateV4) {
  const auto bytes = UUIDUtils::GenerateUuidV4();

  // Sanity check: the call succeeded and produced a full UUID.
  EXPECT_EQ(bytes.size(), 16);

  // The version number (4) lives in the high nibble of the 7th byte.
  const int version = (bytes[6] >> 4) & 0x0F;
  EXPECT_EQ(version, 4);

  // The variant lives in the two most significant bits of the 9th byte and
  // must be binary 10.
  const int variant = (bytes[8] >> 6) & 0x03;
  EXPECT_EQ(variant, 0b10);
}

// A generated v7 UUID must carry the RFC 9562 version and variant markers.
TEST(UUIDUtilTest, GenerateV7) {
  const auto bytes = UUIDUtils::GenerateUuidV7();

  // Sanity check: the call succeeded and produced a full UUID.
  EXPECT_EQ(bytes.size(), 16);

  // The version number (7) lives in the high nibble of the 7th byte.
  const int version = (bytes[6] >> 4) & 0x0F;
  EXPECT_EQ(version, 7);

  // The variant lives in the two most significant bits of the 9th byte and
  // must be binary 10.
  const int variant = (bytes[8] >> 6) & 0x03;
  EXPECT_EQ(variant, 0b10);
}

// Valid UUID strings must parse, and formatting the parsed bytes must yield
// the canonical lowercase hyphenated representation.
TEST(UUIDUtilTest, FromString) {
  // Canonical (hyphenated, lowercase) inputs round-trip unchanged.
  std::vector<std::string> uuid_strings = {
      "123e4567-e89b-12d3-a456-426614174000",
      "550e8400-e29b-41d4-a716-446655440000",
      "f47ac10b-58cc-4372-a567-0e02b2c3d479",
  };

  for (const auto& uuid_str : uuid_strings) {
    auto result = UUIDUtils::FromString(uuid_str);
    // Fatal assertion: value() must not be called on a failed Result.
    ASSERT_THAT(result, IsOk());
    auto uuid = result.value();
    EXPECT_EQ(UUIDUtils::ToString(uuid), uuid_str);
  }

  // Dash-less inputs (any letter case) normalize to the canonical form.
  std::vector<std::pair<std::string, std::string>> uuid_string_pairs = {
      {"123e4567e89b12d3a456426614174000", "123e4567-e89b-12d3-a456-426614174000"},
      {"550E8400E29B41D4A716446655440000", "550e8400-e29b-41d4-a716-446655440000"},
      {"F47AC10B58CC4372A5670E02B2C3D479", "f47ac10b-58cc-4372-a567-0e02b2c3d479"},
  };

  for (const auto& [input_str, expected_str] : uuid_string_pairs) {
    auto result = UUIDUtils::FromString(input_str);
    // Fatal assertion: value() must not be called on a failed Result.
    ASSERT_THAT(result, IsOk());
    auto uuid = result.value();
    EXPECT_EQ(UUIDUtils::ToString(uuid), expected_str);
  }
}

// Malformed UUID strings must be rejected with an InvalidArgument error whose
// message mentions "Invalid UUID string".
TEST(UUIDUtilTest, FromStringInvalid) {
  std::vector<std::string> invalid_uuid_strings = {
      "123e4567-e89b-12d3-a456-42661417400",    // too short
      "123e4567-e89b-12d3-a456-4266141740000",  // too long
      "g23e4567-e89b-12d3-a456-426614174000",   // invalid character
      "123e4567e89b12d3a45642661417400",        // too short without dashes
      "123e4567e89b12d3a4564266141740000",      // too long without dashes
      "550e8400-e29b-41d4-a716-44665544000Z",   // invalid character at end
      "550e8400-e29b-41d4-a716-44665544000-",   // invalid character at end
      "550e8400-e29b-41d4-a716-4466554400",     // too short
      "123e456-7e89b-12d3-a456-426614174000",   // right length, first dash misplaced
      "g23e4567e89b12d3a456426614174000",       // invalid character without dashes
  };

  for (const auto& uuid_str : invalid_uuid_strings) {
    auto result = UUIDUtils::FromString(uuid_str);
    EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
    EXPECT_THAT(result, HasErrorMessage("Invalid UUID string"));
  }
}

} // namespace iceberg
Loading