Skip to content

Commit 8056114

Browse files
committed
feat: add uuid utils
UUID generators for v4 and v7, plus FromString and ToString utilities
1 parent cb4998c commit 8056114

File tree

5 files changed

+367
-1
lines changed

5 files changed

+367
-1
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
5252
util/decimal.cc
5353
util/murmurhash3_internal.cc
5454
util/timepoint.cc
55-
util/gzip_internal.cc)
55+
util/gzip_internal.cc
56+
util/uuid_util.cc)
5657

5758
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
5859
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)

src/iceberg/util/uuid_util.cc

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/uuid_util.h"
21+
22+
#include <chrono>
23+
#include <cstdint>
24+
#include <cstring>
25+
#include <random>
26+
#include <string>
27+
28+
#include "iceberg/result.h"
29+
#include "iceberg/util/int128.h"
30+
#include "iceberg/util/macros.h"
31+
32+
namespace iceberg {
33+
34+
// Generates a version 4 (random) UUID as described in RFC 9562.
//
// All 16 bytes are first filled from a pseudo-random engine; the version nibble
// (high half of byte 6) and the variant bits (top two bits of byte 8) are then
// overwritten, which leaves 122 random bits.
//
// Note: std::mt19937_64 is not a cryptographically secure generator, so the
// result must not be relied on as an unguessable secret.
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV4() {
  // Standard random engines are not safe for concurrent use. Giving every thread
  // its own engine (instead of one shared function-local static) avoids a data
  // race when several threads generate UUIDs at once. The engine is seeded from
  // several std::random_device draws rather than from a single 32-bit value.
  thread_local std::mt19937_64 gen = [] {
    std::random_device rd;
    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
    return std::mt19937_64(seed);
  }();

  // std::mt19937_64 produces values over the full 64-bit range, so the raw engine
  // output can be used without an additional distribution object.
  const uint64_t high_bits = gen();
  const uint64_t low_bits = gen();

  // Spread the two 64-bit values over the 16 bytes with explicit shifts so the
  // byte layout does not depend on host endianness.
  std::array<uint8_t, 16> uuid{};
  for (size_t i = 0; i < 8; ++i) {
    const size_t shift = 56 - 8 * i;
    uuid[i] = static_cast<uint8_t>(high_bits >> shift);
    uuid[8 + i] = static_cast<uint8_t>(low_bits >> shift);
  }

  // Set the version field to 4,
  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x40);
  // Set the variant field: the top two bits are 1, 0
  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);

  return uuid;
}
59+
60+
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV7() {
61+
// Get the current time in milliseconds since the Unix epoch
62+
auto now = std::chrono::system_clock::now();
63+
auto duration_since_epoch = now.time_since_epoch();
64+
auto unix_ts_ms =
65+
std::chrono::duration_cast<std::chrono::milliseconds>(duration_since_epoch).count();
66+
67+
return GenerateUuidV7(static_cast<uint64_t>(unix_ts_ms));
68+
}
69+
70+
// Generates a version 7 (Unix-time-ordered) UUID for the given timestamp, as
// described in RFC 9562.
//
// Layout of the result:
//   bytes 0-5  : low 48 bits of unix_ts_ms, most significant byte first
//   byte  6    : version nibble (7) followed by 4 random bits
//   byte  7    : 8 random bits
//   byte  8    : variant bits (1, 0) followed by 6 random bits
//   bytes 9-15 : 56 random bits
//
// Only the low 48 bits of unix_ts_ms fit in the timestamp field; any higher bits
// are silently dropped.
//
// Note: std::mt19937_64 is not a cryptographically secure generator.
std::array<uint8_t, 16> UUIDUtils::GenerateUuidV7(uint64_t unix_ts_ms) {
  std::array<uint8_t, 16> uuid = {};

  // Set the timestamp (in milliseconds since Unix epoch), big-endian
  for (size_t i = 0; i < 6; ++i) {
    uuid[i] = static_cast<uint8_t>((unix_ts_ms >> (40 - 8 * i)) & 0xFF);
  }

  // Standard random engines are not safe for concurrent use. Giving every thread
  // its own engine (instead of one shared function-local static) avoids a data
  // race when several threads generate UUIDs at once. The engine is seeded from
  // several std::random_device draws rather than from a single 32-bit value.
  thread_local std::mt19937_64 gen = [] {
    std::random_device rd;
    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
    return std::mt19937_64(seed);
  }();

  // Generate random bytes for the remaining fields: two bytes from the first
  // draw (bytes 6-7) and eight bytes from the second draw (bytes 8-15).
  const uint64_t rand_a = gen();
  const uint64_t rand_b = gen();
  uuid[6] = static_cast<uint8_t>((rand_a >> 8) & 0xFF);
  uuid[7] = static_cast<uint8_t>(rand_a & 0xFF);
  for (size_t i = 0; i < 8; ++i) {
    uuid[8 + i] = static_cast<uint8_t>((rand_b >> (56 - 8 * i)) & 0xFF);
  }

  // Set the version field to 7,
  // see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
  uuid[6] = static_cast<uint8_t>((uuid[6] & 0x0F) | 0x70);
  // Set the variant field: the top two bits are 1, 0
  uuid[8] = static_cast<uint8_t>((uuid[8] & 0x3F) | 0x80);

  return uuid;
}
102+
103+
namespace {
104+
105+
// Builds a 256-entry table indexed by character code. Entries for '0'-'9',
// 'a'-'f' and 'A'-'F' hold the hexadecimal digit value (0-15); every other
// entry holds the sentinel 0xff, marking the character as not a hex digit.
constexpr std::array<uint8_t, 256> BuildHexTable() {
  std::array<uint8_t, 256> table{};

  // Start with every character marked invalid ...
  for (auto& entry : table) {
    entry = 0xff;
  }

  // ... then fill in the decimal digits ...
  for (int digit = 0; digit < 10; ++digit) {
    table['0' + digit] = static_cast<uint8_t>(digit);
  }

  // ... and the six hex letters in both cases.
  for (int offset = 0; offset < 6; ++offset) {
    const auto value = static_cast<uint8_t>(10 + offset);
    table['a' + offset] = value;
    table['A' + offset] = value;
  }

  return table;
}
120+
121+
// Builds a 256-entry table whose entry at index i is (i << 4) truncated to
// 8 bits, i.e. the low nibble of i moved into the high nibble.
constexpr std::array<uint8_t, 256> BuildShl4Table() {
  std::array<uint8_t, 256> table{};
  unsigned index = 0;
  for (auto& entry : table) {
    // Multiplying by 16 is the same as shifting left by four bits.
    entry = static_cast<uint8_t>(index * 16u);
    ++index;
  }
  return table;
}
128+
129+
constexpr auto HEX_TABLE = BuildHexTable();
130+
constexpr auto SHL4_TABLE = BuildShl4Table();
131+
132+
// Parse a UUID string without dashes, e.g. "67e5504410b1426f9247bb680e5fe0c8"
133+
inline Result<std::array<uint8_t, 16>> ParseSimple(std::string_view s) {
134+
ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");
135+
136+
std::array<uint8_t, 16> buf{};
137+
for (size_t i = 0; i < 16; i++) {
138+
uint8_t h1 = HEX_TABLE[static_cast<uint8_t>(s[i * 2])];
139+
uint8_t h2 = HEX_TABLE[static_cast<uint8_t>(s[i * 2 + 1])];
140+
141+
if ((h1 | h2) == 0xff) {
142+
return InvalidArgument("Invalid UUID string: {}", s);
143+
}
144+
145+
buf[i] = static_cast<uint8_t>(SHL4_TABLE[h1] | h2);
146+
}
147+
return buf;
148+
}
149+
150+
// Parse a UUID string with dashes, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
151+
inline Result<std::array<uint8_t, 16>> ParseHyphenated(std::string_view s) {
152+
ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");
153+
154+
// Check that dashes are in the right places
155+
if (!(s[8] == '-' && s[13] == '-' && s[18] == '-' && s[23] == '-')) {
156+
return InvalidArgument("Invalid UUID string: {}", s);
157+
}
158+
159+
constexpr std::array<size_t, 8> positions = {0, 4, 9, 14, 19, 24, 28, 32};
160+
std::array<uint8_t, 16> buf{};
161+
162+
for (size_t j = 0; j < 8; j++) {
163+
size_t i = positions[j];
164+
uint8_t h1 = HEX_TABLE[static_cast<uint8_t>(s[i])];
165+
uint8_t h2 = HEX_TABLE[static_cast<uint8_t>(s[i + 1])];
166+
uint8_t h3 = HEX_TABLE[static_cast<uint8_t>(s[i + 2])];
167+
uint8_t h4 = HEX_TABLE[static_cast<uint8_t>(s[i + 3])];
168+
169+
if ((h1 | h2 | h3 | h4) == 0xff) {
170+
return InvalidArgument("Invalid UUID string: {}", s);
171+
}
172+
173+
buf[j * 2] = static_cast<uint8_t>(SHL4_TABLE[h1] | h2);
174+
buf[j * 2 + 1] = static_cast<uint8_t>(SHL4_TABLE[h3] | h4);
175+
}
176+
177+
return buf;
178+
}
179+
180+
} // namespace
181+
182+
// Parses a textual UUID into its 16 bytes.
//
// The input length selects the parser: 32 characters are treated as the compact
// form without dashes, 36 characters as the hyphenated form. Any other length
// yields an InvalidArgument error.
Result<std::array<uint8_t, 16>> UUIDUtils::FromString(std::string_view str) {
  switch (str.size()) {
    case 32:
      return ParseSimple(str);
    case 36:
      return ParseHyphenated(str);
    default:
      return InvalidArgument("Invalid UUID string: {}", str);
  }
}
191+
192+
// Formats 16 UUID bytes as the 36-character lowercase hyphenated string
// "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx".
//
// The span is expected to hold exactly 16 bytes; this is only verified by a
// debug check.
std::string UUIDUtils::ToString(std::span<uint8_t> uuid) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  ICEBERG_DCHECK(uuid.size() == 16, "uuid must be 16 bytes long");

  std::string text;
  text.reserve(36);

  for (size_t i = 0; i < 16; i++) {
    // A dash precedes bytes 4, 6, 8 and 10, producing the 8-4-4-4-12 grouping.
    if (i == 4 || i == 6 || i == 8 || i == 10) {
      text.push_back('-');
    }
    const uint8_t byte = uuid[i];
    text.push_back(kHexDigits[(byte >> 4) & 0x0F]);
    text.push_back(kHexDigits[byte & 0x0F]);
  }

  return text;
}
206+
207+
} // namespace iceberg

src/iceberg/util/uuid_util.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include <array>
23+
#include <cstdint>
24+
#include <span>
25+
#include <string_view>
26+
27+
#include "iceberg/iceberg_export.h"
28+
#include "iceberg/result.h"
29+
30+
/// \file iceberg/util/uuid_util.h
31+
/// \brief UUID (Universally Unique Identifier) utilities.
32+
33+
namespace iceberg {
34+
35+
class ICEBERG_EXPORT UUIDUtils {
36+
public:
37+
/// \brief Generate a random UUID (version 4).
38+
static std::array<uint8_t, 16> GenerateUuidV4();
39+
40+
/// \brief Generate UUID version 7 per RFC 9562, with the current timestamp.
41+
static std::array<uint8_t, 16> GenerateUuidV7();
42+
43+
/// \brief Generate UUID version 7 per RFC 9562, with the given timestamp.
44+
///
45+
/// UUID version 7 consists of a Unix timestamp in milliseconds (48 bits) and
46+
/// 74 random bits, excluding the required version and variant bits.
47+
///
48+
/// \param unix_ts_ms number of milliseconds since start of the UNIX epoch
49+
///
50+
/// \note unix_ts_ms cannot be negative per RFC.
51+
static std::array<uint8_t, 16> GenerateUuidV7(uint64_t unix_ts_ms);
52+
53+
/// \brief Create a UUID from a string in standard format.
54+
static Result<std::array<uint8_t, 16>> FromString(std::string_view str);
55+
56+
/// \brief Convert a UUID to a string in standard format.
57+
static std::string ToString(std::span<uint8_t> uuid);
58+
};
59+
60+
} // namespace iceberg

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ add_iceberg_test(util_test
9191
endian_test.cc
9292
formatter_test.cc
9393
string_util_test.cc
94+
uuid_util_test.cc
9495
visit_type_test.cc)
9596

9697
add_iceberg_test(roaring_test SOURCES roaring_test.cc)

test/uuid_util_test.cc

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/uuid_util.h"
21+
22+
#include <vector>
23+
24+
#include <gtest/gtest.h>
25+
26+
#include "matchers.h"
27+
28+
namespace iceberg {
29+
30+
// A generated v4 UUID must carry version 4 and the RFC variant bits.
TEST(UUIDUtilTest, GenerateV4) {
  const auto bytes = UUIDUtils::GenerateUuidV4();

  // Smoke check: a value of the expected width came back.
  EXPECT_EQ(bytes.size(), 16);

  // The version lives in the high nibble of the 7th byte.
  const int version = (bytes[6] >> 4) & 0x0F;
  EXPECT_EQ(version, 4);

  // The variant lives in the two most significant bits of the 9th byte and
  // must be binary 10.
  const int variant = (bytes[8] >> 6) & 0x03;
  EXPECT_EQ(variant, 0b10);
}
39+
40+
// A generated v7 UUID must carry version 7 and the RFC variant bits.
TEST(UUIDUtilTest, GenerateV7) {
  const auto bytes = UUIDUtils::GenerateUuidV7();

  // Smoke check: a value of the expected width came back.
  EXPECT_EQ(bytes.size(), 16);

  // The version lives in the high nibble of the 7th byte.
  const int version = (bytes[6] >> 4) & 0x0F;
  EXPECT_EQ(version, 7);

  // The variant lives in the two most significant bits of the 9th byte and
  // must be binary 10.
  const int variant = (bytes[8] >> 6) & 0x03;
  EXPECT_EQ(variant, 0b10);
}
49+
50+
// Valid UUID strings must parse, and formatting the parsed bytes must give the
// canonical lowercase hyphenated form.
TEST(UUIDUtilTest, FromString) {
  // Already-canonical inputs: the round trip must reproduce the input exactly.
  std::vector<std::string> uuid_strings = {
      "123e4567-e89b-12d3-a456-426614174000",
      "550e8400-e29b-41d4-a716-446655440000",
      "f47ac10b-58cc-4372-a567-0e02b2c3d479",
  };

  for (const auto& uuid_str : uuid_strings) {
    auto result = UUIDUtils::FromString(uuid_str);
    // Fatal assertion: value() must not be read from a failed result.
    ASSERT_THAT(result, IsOk());
    auto uuid = result.value();
    EXPECT_EQ(UUIDUtils::ToString(uuid), uuid_str);
  }

  // Dash-less inputs (lower and upper case) paired with their canonical form.
  std::vector<std::pair<std::string, std::string>> uuid_string_pairs = {
      {"123e4567e89b12d3a456426614174000", "123e4567-e89b-12d3-a456-426614174000"},
      {"550E8400E29B41D4A716446655440000", "550e8400-e29b-41d4-a716-446655440000"},
      {"F47AC10B58CC4372A5670E02B2C3D479", "f47ac10b-58cc-4372-a567-0e02b2c3d479"},
  };

  for (const auto& [input_str, expected_str] : uuid_string_pairs) {
    auto result = UUIDUtils::FromString(input_str);
    // Fatal assertion: value() must not be read from a failed result.
    ASSERT_THAT(result, IsOk());
    auto uuid = result.value();
    EXPECT_EQ(UUIDUtils::ToString(uuid), expected_str);
  }
}
77+
78+
// Malformed UUID strings must be rejected with an invalid-argument error whose
// message mentions "Invalid UUID string".
TEST(UUIDUtilTest, FromStringInvalid) {
  const auto expect_rejected = [](const std::string& text) {
    auto parsed = UUIDUtils::FromString(text);
    EXPECT_THAT(parsed, IsError(ErrorKind::kInvalidArgument));
    EXPECT_THAT(parsed, HasErrorMessage("Invalid UUID string"));
  };

  const std::vector<std::string> bad_inputs = {
      "123e4567-e89b-12d3-a456-42661417400",    // too short
      "123e4567-e89b-12d3-a456-4266141740000",  // too long
      "g23e4567-e89b-12d3-a456-426614174000",   // invalid character
      "123e4567e89b12d3a45642661417400",        // too short without dashes
      "123e4567e89b12d3a4564266141740000",      // too long without dashes
      "550e8400-e29b-41d4-a716-44665544000Z",   // invalid character at end
      "550e8400-e29b-41d4-a716-44665544000-",   // invalid character at end
      "550e8400-e29b-41d4-a716-4466554400",     // too short
  };

  for (const auto& text : bad_inputs) {
    expect_rejected(text);
  }
}
96+
97+
} // namespace iceberg

0 commit comments

Comments
 (0)