Skip to content

Commit 63fbb67

Browse files
authored
Fixed uuid first bit flipping issue. (#3105)
1 parent aea6df3 commit 63fbb67

File tree

2 files changed

+227
-5
lines changed

2 files changed

+227
-5
lines changed

cpp/deeplake_pg/duckdb_pg_convert.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -315,17 +315,23 @@ Datum duckdb_value_to_pg_datum(
315315
auto* data = FlatVector::GetData<hugeint_t>(vec);
316316
const auto& uuid_val = data[row];
317317

318+
// DuckDB flips the top bit of UUIDs when storing them internally
319+
// to make ORDER BY consistent between UUID and VARCHAR types.
320+
// We must flip it back before converting to string.
321+
// See: duckdb/src/common/types/uuid.cpp:67 (FromString) and :80 (ToString)
322+
uint64_t upper_flipped = static_cast<uint64_t>(uuid_val.upper) ^ (1ULL << 63);
323+
318324
// Convert hugeint to UUID string format
319325
// DuckDB stores UUID as a 128-bit integer
320326
// Format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
321327
char uuid_str[37]; // 36 chars + null terminator
322328
snprintf(uuid_str,
323329
sizeof(uuid_str),
324-
"%08x-%04x-%04x-%04x-%012lx",
325-
(unsigned int)((uuid_val.upper >> 32) & 0xFFFFFFFF),
326-
(unsigned int)((uuid_val.upper >> 16) & 0xFFFF),
327-
(unsigned int)(uuid_val.upper & 0xFFFF),
328-
(unsigned int)((uuid_val.lower >> 48) & 0xFFFF),
330+
"%08lx-%04lx-%04lx-%04lx-%012lx",
331+
(unsigned long)((upper_flipped >> 32) & 0xFFFFFFFFUL),
332+
(unsigned long)((upper_flipped >> 16) & 0xFFFFUL),
333+
(unsigned long)(upper_flipped & 0xFFFFUL),
334+
(unsigned long)((uuid_val.lower >> 48) & 0xFFFFUL),
329335
(unsigned long)(uuid_val.lower & 0xFFFFFFFFFFFFUL));
330336

331337
return DirectFunctionCall1(uuid_in, CStringGetDatum(uuid_str));
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
"""
2+
Test UUID type handling to ensure no data corruption.
3+
4+
Bug Report: UUIDs with leading '00' bytes were being corrupted to '80' when
5+
selected without explicit ::text cast.
6+
7+
Example:
8+
Stored: 0085cc89-607e-4009-98f0-89d48d188e2f
9+
Without cast: 8085cc89-607e-4009-98f0-89d48d188e2f (CORRUPTED!)
10+
With ::text: 0085cc89-607e-4009-98f0-89d48d188e2f (CORRECT)
11+
12+
This test verifies that UUID-to-string conversion works correctly for all
13+
byte values, especially edge cases like 0x00, 0x0F, 0x80, and 0xFF.
14+
"""
15+
import pytest
16+
import asyncpg
17+
from test_utils.assertions import Assertions
18+
19+
20+
@pytest.mark.asyncio
21+
async def test_uuid_no_corruption(db_conn: asyncpg.Connection):
22+
"""
23+
Test that UUID values are not corrupted during conversion to string.
24+
25+
This test specifically checks:
26+
1. UUIDs starting with 0x00 (the reported bug case)
27+
2. UUIDs starting with 0x0F (low nibble non-zero)
28+
3. UUIDs starting with 0x10 (high nibble non-zero)
29+
4. UUIDs starting with 0x80 (high bit set)
30+
5. UUIDs starting with 0xFF (all bits set)
31+
6. NULL UUID values
32+
33+
For each test case, we verify that:
34+
- Direct SELECT matches the original value
35+
- SELECT with ::text cast matches the original value
36+
- Both methods produce identical results
37+
"""
38+
assertions = Assertions(db_conn)
39+
40+
try:
41+
# Create test table with UUID column
42+
await db_conn.execute("""
43+
CREATE TABLE test_uuid_corruption (
44+
id SERIAL PRIMARY KEY,
45+
uuid_val UUID,
46+
description TEXT
47+
) USING deeplake
48+
""")
49+
50+
# Test cases covering edge cases in UUID first byte
51+
test_cases = [
52+
('0085cc89-607e-4009-98f0-89d48d188e2f', 'UUID starting with 0x00 (reported bug)'),
53+
('0f85cc89-607e-4009-98f0-89d48d188e2f', 'UUID starting with 0x0F'),
54+
('1085cc89-607e-4009-98f0-89d48d188e2f', 'UUID starting with 0x10'),
55+
('8085cc89-607e-4009-98f0-89d48d188e2f', 'UUID starting with 0x80'),
56+
('ff85cc89-607e-4009-98f0-89d48d188e2f', 'UUID starting with 0xFF'),
57+
]
58+
59+
# Insert test cases
60+
for uuid_str, description in test_cases:
61+
await db_conn.execute(
62+
"INSERT INTO test_uuid_corruption (uuid_val, description) VALUES ($1, $2)",
63+
uuid_str, description
64+
)
65+
66+
# Insert NULL case
67+
await db_conn.execute(
68+
"INSERT INTO test_uuid_corruption (uuid_val, description) VALUES (NULL, $1)",
69+
'NULL UUID'
70+
)
71+
72+
# Verify row count
73+
await assertions.assert_table_row_count(6, "test_uuid_corruption")
74+
75+
# Test 1: Fetch UUIDs directly and verify they match original values
76+
print("\n=== Test 1: Direct UUID fetch (without explicit cast) ===")
77+
rows = await db_conn.fetch("""
78+
SELECT id, uuid_val, description
79+
FROM test_uuid_corruption
80+
ORDER BY id
81+
""")
82+
83+
for i, row in enumerate(rows[:-1]): # Skip NULL case
84+
uuid_val = str(row['uuid_val']) if row['uuid_val'] else None
85+
expected_uuid = test_cases[i][0]
86+
description = row['description']
87+
88+
assert uuid_val == expected_uuid, (
89+
f"UUID corruption detected!\n"
90+
f" Test case: {description}\n"
91+
f" Expected: {expected_uuid}\n"
92+
f" Got: {uuid_val}\n"
93+
f" This indicates a bug in UUID-to-string conversion."
94+
)
95+
print(f"✓ {description}: {uuid_val}")
96+
97+
# Verify NULL case
98+
assert rows[-1]['uuid_val'] is None, "NULL UUID should remain NULL"
99+
print(f"✓ NULL UUID: None")
100+
101+
# Test 2: Fetch UUIDs with explicit ::text cast
102+
print("\n=== Test 2: UUID fetch with ::text cast ===")
103+
rows_with_cast = await db_conn.fetch("""
104+
SELECT id, uuid_val::text as uuid_str, description
105+
FROM test_uuid_corruption
106+
ORDER BY id
107+
""")
108+
109+
for i, row in enumerate(rows_with_cast[:-1]): # Skip NULL case
110+
uuid_str = row['uuid_str']
111+
expected_uuid = test_cases[i][0]
112+
description = row['description']
113+
114+
assert uuid_str == expected_uuid, (
115+
f"UUID with ::text cast doesn't match!\n"
116+
f" Test case: {description}\n"
117+
f" Expected: {expected_uuid}\n"
118+
f" Got: {uuid_str}"
119+
)
120+
print(f"✓ {description}: {uuid_str}")
121+
122+
# Verify NULL case with cast
123+
assert rows_with_cast[-1]['uuid_str'] is None, "NULL UUID with ::text cast should remain NULL"
124+
print(f"✓ NULL UUID with ::text: None")
125+
126+
# Test 3: Verify direct fetch and ::text cast produce identical results
127+
print("\n=== Test 3: Compare direct fetch vs ::text cast ===")
128+
for i in range(len(test_cases)):
129+
uuid_direct = str(rows[i]['uuid_val']) if rows[i]['uuid_val'] else None
130+
uuid_cast = rows_with_cast[i]['uuid_str']
131+
description = test_cases[i][1]
132+
133+
assert uuid_direct == uuid_cast, (
134+
f"Mismatch between direct and cast results!\n"
135+
f" Test case: {description}\n"
136+
f" Direct: {uuid_direct}\n"
137+
f" Cast: {uuid_cast}\n"
138+
f" These should be identical."
139+
)
140+
print(f"✓ {description}: Both methods match")
141+
142+
# Test 4: Test UUID comparison operations
143+
print("\n=== Test 4: UUID comparison operations ===")
144+
145+
# Test equality with string
146+
count = await db_conn.fetchval("""
147+
SELECT COUNT(*) FROM test_uuid_corruption
148+
WHERE uuid_val = '0085cc89-607e-4009-98f0-89d48d188e2f'::uuid
149+
""")
150+
assert count == 1, f"Expected 1 row matching UUID, got {count}"
151+
print(f"✓ UUID equality comparison works correctly")
152+
153+
# Test UUID ordering
154+
ordered_rows = await db_conn.fetch("""
155+
SELECT uuid_val::text as uuid_str
156+
FROM test_uuid_corruption
157+
WHERE uuid_val IS NOT NULL
158+
ORDER BY uuid_val
159+
""")
160+
161+
# Verify we got all non-NULL UUIDs back
162+
assert len(ordered_rows) == 5, f"Expected 5 non-NULL UUIDs, got {len(ordered_rows)}"
163+
164+
# Verify they are in ascending order
165+
uuid_strs = [row['uuid_str'] for row in ordered_rows]
166+
sorted_uuids = sorted(uuid_strs)
167+
assert uuid_strs == sorted_uuids, (
168+
f"UUIDs not in correct order:\n"
169+
f" Got: {uuid_strs}\n"
170+
f" Expected: {sorted_uuids}"
171+
)
172+
print(f"✓ UUID ordering works correctly")
173+
174+
print("\n=== All UUID tests passed! ===")
175+
176+
finally:
177+
# Cleanup
178+
await db_conn.execute("DROP TABLE IF EXISTS test_uuid_corruption")
179+
180+
181+
@pytest.mark.asyncio
182+
async def test_uuid_edge_cases(db_conn: asyncpg.Connection):
183+
"""
184+
Test additional UUID edge cases and operations.
185+
"""
186+
try:
187+
await db_conn.execute("""
188+
CREATE TABLE test_uuid_edges (
189+
id SERIAL PRIMARY KEY,
190+
uuid_val UUID
191+
) USING deeplake
192+
""")
193+
194+
# Test all-zeros and all-ones UUIDs
195+
await db_conn.execute("""
196+
INSERT INTO test_uuid_edges (uuid_val) VALUES
197+
('00000000-0000-0000-0000-000000000000'),
198+
('ffffffff-ffff-ffff-ffff-ffffffffffff')
199+
""")
200+
201+
# Verify they round-trip correctly
202+
rows = await db_conn.fetch("SELECT uuid_val::text as uuid_str FROM test_uuid_edges ORDER BY id")
203+
204+
assert rows[0]['uuid_str'] == '00000000-0000-0000-0000-000000000000', \
205+
f"All-zeros UUID corrupted: {rows[0]['uuid_str']}"
206+
print(f"✓ All-zeros UUID: {rows[0]['uuid_str']}")
207+
208+
assert rows[1]['uuid_str'] == 'ffffffff-ffff-ffff-ffff-ffffffffffff', \
209+
f"All-ones UUID corrupted: {rows[1]['uuid_str']}"
210+
print(f"✓ All-ones UUID: {rows[1]['uuid_str']}")
211+
212+
print("\n=== All UUID edge case tests passed! ===")
213+
214+
finally:
215+
# Cleanup
216+
await db_conn.execute("DROP TABLE IF EXISTS test_uuid_edges")

0 commit comments

Comments
 (0)