Skip to content

Commit a89b101

Browse files
authored
feat: add UUID representation (#242)
UUID representation along with utilities such as generators for v4 and v7.
1 parent ab0662f commit a89b101

File tree

5 files changed

+425
-1
lines changed

5 files changed

+425
-1
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ set(ICEBERG_SOURCES
5252
util/decimal.cc
5353
util/murmurhash3_internal.cc
5454
util/timepoint.cc
55-
util/gzip_internal.cc)
55+
util/gzip_internal.cc
56+
util/uuid.cc)
5657

5758
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
5859
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)

src/iceberg/util/uuid.cc

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/uuid.h"
21+
22+
#include <chrono>
23+
#include <cstdint>
24+
#include <cstring>
25+
#include <random>
26+
#include <string>
27+
28+
#include "iceberg/exception.h"
29+
#include "iceberg/result.h"
30+
#include "iceberg/util/formatter.h" // IWYU pragma: keep
31+
#include "iceberg/util/int128.h"
32+
#include "iceberg/util/macros.h"
33+
34+
namespace iceberg {
35+
36+
namespace {
37+
38+
/// \brief Build a 256-entry lookup table mapping a character code to its
/// hexadecimal value.
///
/// Entries for '0'-'9', 'a'-'f' and 'A'-'F' hold the values 0-15; every other
/// entry holds the sentinel 0xFF, which callers use to detect invalid input.
constexpr std::array<uint8_t, 256> BuildHexTable() {
  std::array<uint8_t, 256> table{};

  // Start with every character marked invalid, then overwrite the hex digits.
  for (auto& slot : table) {
    slot = 0xFF;
  }
  for (int32_t digit = 0; digit < 10; ++digit) {
    table['0' + digit] = static_cast<uint8_t>(digit);
  }
  for (int32_t offset = 0; offset < 6; ++offset) {
    const auto value = static_cast<uint8_t>(10 + offset);
    table['a' + offset] = value;
    table['A' + offset] = value;
  }
  return table;
}
53+
54+
/// \brief Build a 256-entry lookup table where entry i holds the low 8 bits
/// of (i << 4).
///
/// Used to move a decoded high nibble into the upper half of a byte.
constexpr std::array<uint8_t, 256> BuildShl4Table() {
  std::array<uint8_t, 256> table{};
  uint32_t index = 0;
  for (auto& slot : table) {
    // The cast keeps only the low byte, so indices >= 16 wrap around.
    slot = static_cast<uint8_t>(index << 4);
    ++index;
  }
  return table;
}
61+
62+
constexpr auto kHexTable = BuildHexTable();
63+
constexpr auto kShl4Table = BuildShl4Table();
64+
65+
// Parse a UUID string without dashes, e.g. "67e5504410b1426f9247bb680e5fe0c8"
66+
inline Result<Uuid> ParseSimple(std::string_view s) {
67+
ICEBERG_DCHECK(s.size() == 32, "s must be 32 characters long");
68+
69+
std::array<uint8_t, 16> uuid{};
70+
for (size_t i = 0; i < 16; i++) {
71+
uint8_t h1 = kHexTable[static_cast<uint8_t>(s[i * 2])];
72+
uint8_t h2 = kHexTable[static_cast<uint8_t>(s[i * 2 + 1])];
73+
74+
if ((h1 | h2) == 0xFF) [[unlikely]] {
75+
return InvalidArgument("Invalid UUID string: {}", s);
76+
}
77+
78+
uuid[i] = static_cast<uint8_t>(kShl4Table[h1] | h2);
79+
}
80+
return Uuid(std::move(uuid));
81+
}
82+
83+
// Parse a UUID string with dashes, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
84+
inline Result<Uuid> ParseHyphenated(std::string_view s) {
85+
ICEBERG_DCHECK(s.size() == 36, "s must be 36 characters long");
86+
87+
// Check that dashes are in the right places
88+
if (!(s[8] == '-' && s[13] == '-' && s[18] == '-' && s[23] == '-')) [[unlikely]] {
89+
return InvalidArgument("Invalid UUID string: {}", s);
90+
}
91+
92+
constexpr std::array<size_t, 8> positions = {0, 4, 9, 14, 19, 24, 28, 32};
93+
std::array<uint8_t, 16> uuid{};
94+
95+
for (size_t j = 0; j < 8; j++) {
96+
size_t i = positions[j];
97+
uint8_t h1 = kHexTable[static_cast<uint8_t>(s[i])];
98+
uint8_t h2 = kHexTable[static_cast<uint8_t>(s[i + 1])];
99+
uint8_t h3 = kHexTable[static_cast<uint8_t>(s[i + 2])];
100+
uint8_t h4 = kHexTable[static_cast<uint8_t>(s[i + 3])];
101+
102+
if ((h1 | h2 | h3 | h4) == 0xFF) [[unlikely]] {
103+
return InvalidArgument("Invalid UUID string: {}", s);
104+
}
105+
106+
uuid[j * 2] = static_cast<uint8_t>(kShl4Table[h1] | h2);
107+
uuid[j * 2 + 1] = static_cast<uint8_t>(kShl4Table[h3] | h4);
108+
}
109+
110+
return Uuid(std::move(uuid));
111+
}
112+
113+
} // namespace
114+
115+
// Store the 16 raw bytes as given; the array is trivially copyable, so a plain
// copy is equivalent to a move.
Uuid::Uuid(std::array<uint8_t, kLength> data) : data_(data) {}
116+
117+
/// Generate a random (version 4) UUID.
///
/// All 128 bits are drawn from a pseudo-random engine, after which the version
/// nibble and the variant bits are overwritten as required by RFC 9562.
Uuid Uuid::GenerateV4() {
  // One engine per thread: the standard engines are not safe for concurrent
  // use, so a shared function-local static would be a data race when several
  // threads generate UUIDs at once. The engine is seeded from several
  // random_device draws rather than a single 32-bit value, so the number of
  // possible output streams is not capped at 2^32.
  thread_local std::mt19937_64 gen = [] {
    std::random_device rd;
    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
    return std::mt19937_64(seed);
  }();
  // A default-constructed distribution covers the full range of uint64_t.
  std::uniform_int_distribution<uint64_t> distrib;

  // Generate two random 64-bit integers
  const uint64_t high_bits = distrib(gen);
  const uint64_t low_bits = distrib(gen);

  // Spread them over the 16 bytes, most significant byte first, so the layout
  // does not depend on host endianness.
  std::array<uint8_t, kLength> uuid{};
  for (size_t i = 0; i < 8; ++i) {
    const size_t shift = 8 * (7 - i);
    uuid[i] = static_cast<uint8_t>(high_bits >> shift);
    uuid[i + 8] = static_cast<uint8_t>(low_bits >> shift);
  }

  // Set magic numbers for a "version 4" (pseudorandom) UUID and variant,
  // see https://datatracker.ietf.org/doc/html/rfc9562#name-uuid-version-4
  uuid[6] = (uuid[6] & 0x0F) | 0x40;
  // Set variant field, top two bits are 1, 0
  uuid[8] = (uuid[8] & 0x3F) | 0x80;

  return Uuid(uuid);
}
142+
143+
/// Generate a version 7 UUID stamped with the current system-clock time.
///
/// Reads std::chrono::system_clock, converts the time since its epoch to
/// whole milliseconds and delegates to GenerateV7(uint64_t).
Uuid Uuid::GenerateV7() {
  using Millis = std::chrono::milliseconds;

  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  const auto elapsed_ms = std::chrono::duration_cast<Millis>(since_epoch);

  return GenerateV7(static_cast<uint64_t>(elapsed_ms.count()));
}
152+
153+
/// Generate a version 7 UUID carrying the given timestamp.
///
/// Layout (RFC 9562): bytes 0-5 hold the low 48 bits of `unix_ts_ms` in
/// big-endian order, bytes 6-15 hold random data, and the version nibble and
/// variant bits are then overwritten.
///
/// \param unix_ts_ms milliseconds since the Unix epoch; only the low 48 bits
///        are stored, any higher bits are silently discarded.
Uuid Uuid::GenerateV7(uint64_t unix_ts_ms) {
  std::array<uint8_t, kLength> uuid = {};

  // Set the timestamp (in milliseconds since Unix epoch), most significant
  // byte first so that UUIDs sort by creation time.
  for (size_t i = 0; i < 6; ++i) {
    uuid[i] = static_cast<uint8_t>((unix_ts_ms >> (8 * (5 - i))) & 0xFF);
  }

  // One engine per thread: the standard engines are not safe for concurrent
  // use, so a shared function-local static would be a data race when several
  // threads generate UUIDs at once. The engine is seeded from several
  // random_device draws rather than a single 32-bit value.
  thread_local std::mt19937 gen = [] {
    std::random_device rd;
    std::seed_seq seed{rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd()};
    return std::mt19937(seed);
  }();
  // Note: uint8_t is invalid for uniform_int_distribution on Windows, so draw
  // 16 bits at a time. A default-constructed distribution covers the full
  // range of uint16_t.
  std::uniform_int_distribution<uint16_t> distrib;

  // Generate random bytes for the remaining fields
  for (size_t i = 6; i < kLength; i += 2) {
    const auto rand = static_cast<uint16_t>(distrib(gen));
    uuid[i] = static_cast<uint8_t>((rand >> 8) & 0xFF);
    uuid[i + 1] = static_cast<uint8_t>(rand & 0xFF);
  }

  // Set magic numbers for a "version 7" (Unix-timestamp based) UUID and
  // variant, see https://www.rfc-editor.org/rfc/rfc9562#name-version-field
  uuid[6] = (uuid[6] & 0x0F) | 0x70;
  // set variant field, top two bits are 1, 0
  uuid[8] = (uuid[8] & 0x3F) | 0x80;

  return Uuid(uuid);
}
185+
186+
/// Parse a UUID from its textual form.
///
/// Two layouts are accepted, selected purely by length: 32 hexadecimal
/// characters without separators, or the 36-character dashed form. Any other
/// length yields an InvalidArgument error.
Result<Uuid> Uuid::FromString(std::string_view str) {
  switch (str.size()) {
    case 32:
      return ParseSimple(str);
    case 36:
      return ParseHyphenated(str);
    default:
      return InvalidArgument("Invalid UUID string: {}", str);
  }
}
195+
196+
/// Build a UUID from a raw byte sequence.
///
/// \param bytes must contain exactly kLength (16) bytes.
/// \return the Uuid holding a copy of the bytes, or an InvalidArgument error
///         if the span has any other size.
Result<Uuid> Uuid::FromBytes(std::span<const uint8_t> bytes) {
  const size_t given = bytes.size();
  if (given != kLength) [[unlikely]] {
    return InvalidArgument("UUID byte array must be exactly {} bytes, was {}", kLength,
                           given);
  }

  // The size has been validated, so copying kLength bytes stays in bounds.
  std::array<uint8_t, kLength> raw;
  std::memcpy(raw.data(), bytes.data(), kLength);
  return Uuid(raw);
}
205+
206+
/// Return the byte at position `index` (0-15).
///
/// The index is validated with ICEBERG_CHECK before the storage is read.
uint8_t Uuid::operator[](size_t index) const {
  ICEBERG_CHECK(index < kLength, "UUID index out of range: {}", index);
  const uint8_t byte = data_[index];
  return byte;
}
210+
211+
std::string Uuid::ToString() const {
212+
return std::format(
213+
"{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}"
214+
"{:02x}{:02x}{:02x}",
215+
data_[0], data_[1], data_[2], data_[3], data_[4], data_[5], data_[6], data_[7],
216+
data_[8], data_[9], data_[10], data_[11], data_[12], data_[13], data_[14],
217+
data_[15]);
218+
}
219+
220+
} // namespace iceberg

src/iceberg/util/uuid.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include <array>
23+
#include <cstdint>
24+
#include <span>
25+
#include <string_view>
26+
27+
#include "iceberg/iceberg_export.h"
28+
#include "iceberg/result.h"
29+
#include "iceberg/util/formattable.h"
30+
31+
/// \file iceberg/util/uuid.h
32+
/// \brief UUID (Universally Unique Identifier) representation.
33+
34+
namespace iceberg {
35+
36+
/// \brief A 128-bit UUID stored as 16 raw bytes.
///
/// Instances are created from raw bytes, parsed from text, or generated as
/// version 4 or version 7 UUIDs. There is no default constructor.
class ICEBERG_EXPORT Uuid : public util::Formattable {
 public:
  /// \brief Number of bytes in a UUID.
  static constexpr size_t kLength = 16;

  Uuid() = delete;

  /// \brief Construct a UUID directly from its 16 raw bytes.
  explicit Uuid(std::array<uint8_t, kLength> data);

  /// \brief Generate a random UUID (version 4).
  static Uuid GenerateV4();

  /// \brief Generate UUID version 7 per RFC 9562, with the current timestamp.
  static Uuid GenerateV7();

  /// \brief Generate UUID version 7 per RFC 9562, with the given timestamp.
  ///
  /// UUID version 7 consists of a Unix timestamp in milliseconds (48 bits) and
  /// 74 random bits, excluding the required version and variant bits.
  ///
  /// \param unix_ts_ms number of milliseconds since start of the UNIX epoch
  ///
  /// \note unix_ts_ms cannot be negative per RFC.
  static Uuid GenerateV7(uint64_t unix_ts_ms);

  /// \brief Create a UUID from a string in standard format.
  /// \param str the textual UUID to parse.
  /// \return the parsed UUID, or an error if the string is not a valid UUID.
  static Result<Uuid> FromString(std::string_view str);

  /// \brief Create a UUID from a 16-byte array.
  /// \param bytes the raw bytes; must contain exactly kLength elements.
  /// \return the UUID, or an error if the size is wrong.
  static Result<Uuid> FromBytes(std::span<const uint8_t> bytes);

  /// \brief Get the raw bytes of the UUID.
  ///
  /// The returned span views this object's storage and must not outlive it.
  std::span<const uint8_t> bytes() const { return data_; }

  /// \brief Access individual bytes of the UUID.
  /// \param index The index of the byte to access (0-15).
  /// \return The byte at the specified index.
  /// \throw IcebergError if index is out of bounds.
  uint8_t operator[](size_t index) const;

  /// \brief Convert the UUID to a string in standard format.
  std::string ToString() const override;

  /// \brief Two UUIDs are equal when all 16 bytes match.
  friend bool operator==(const Uuid& lhs, const Uuid& rhs) {
    return lhs.data_ == rhs.data_;
  }

 private:
  /// Raw UUID bytes, in the order they appear in the textual form.
  std::array<uint8_t, kLength> data_;
};
84+
85+
} // namespace iceberg

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ add_iceberg_test(util_test
9191
endian_test.cc
9292
formatter_test.cc
9393
string_util_test.cc
94+
uuid_test.cc
9495
visit_type_test.cc)
9596

9697
add_iceberg_test(roaring_test SOURCES roaring_test.cc)

0 commit comments

Comments
 (0)