Skip to content

Commit 99ba0d5

Browse files
tanjialiangmeta-codesync[bot]
authored andcommitted
feat: Add fast path to PrefixEncoding when no duplicates (facebookincubator#474)
Summary: Pull Request resolved: facebookincubator#474 X-link: facebookincubator/velox#16321 The change allows PrefixEncoding to use noDuplicate parameter to go for a fast path searching (terminate early without needing to continuing binary search or scanning). Reviewed By: xiaoxmeng Differential Revision: D92676762 fbshipit-source-id: bda5412f54f13779acbfbfa8ec21f188a107d244
1 parent c0dee9b commit 99ba0d5

File tree

16 files changed

+773
-323
lines changed

16 files changed

+773
-323
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SCRIPT_CXX_FLAGS}")
114114

115115
message("FINAL CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
116116

117+
add_compile_options(-ffunction-sections -fdata-sections)
118+
add_link_options(-Wl,--gc-sections)
119+
117120
include(CTest) # include after project() but before add_subdirectory()
118121

119122
find_package(flatbuffers)

dwio/nimble/encodings/Encoding.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,11 @@ class Encoding {
134134
// iterator; if you need to move your row pointer back, reset() and skip().
135135
virtual void skip(uint32_t rowCount) = 0;
136136

137+
/// Seeks to the position at or after the given value.
138+
///
139+
/// @param value Pointer to target value to seek.
140+
/// @return Row index if found, std::nullopt if value is greater than all
141+
/// entries.
137142
virtual std::optional<uint32_t> seekAtOrAfter(const void* value) {
138143
NIMBLE_UNSUPPORTED("seekAtOrAfter is not supported.");
139144
}

dwio/nimble/encodings/PrefixEncoding.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,6 @@ std::optional<uint32_t> PrefixEncoding::seekAtOrAfter(const void* value) {
163163
const auto& targetValue = *static_cast<const std::string_view*>(value);
164164

165165
// Binary search among restart points to find the block containing the target.
166-
// Since keys are unique and sorted, we find the last restart point whose
167-
// value is <= targetValue.
168166
uint32_t left = 0;
169167
uint32_t right = numRestarts_;
170168

@@ -173,12 +171,8 @@ std::optional<uint32_t> PrefixEncoding::seekAtOrAfter(const void* value) {
173171

174172
seekToRestartPoint(mid);
175173
const std::string_view restartValue = decodeEntry();
176-
if (restartValue == targetValue) {
177-
// Exact match found at restart point. Since keys are unique (no
178-
// duplicates), we can return immediately.
179-
return currentRow_ - 1;
180-
}
181-
if (restartValue < targetValue) {
174+
175+
if (restartValue.compare(targetValue) < 0) {
182176
left = mid + 1;
183177
} else {
184178
right = mid;
@@ -199,9 +193,6 @@ std::optional<uint32_t> PrefixEncoding::seekAtOrAfter(const void* value) {
199193
// Linear scan within the block
200194
while (currentRow_ < rowCount_) {
201195
const std::string_view currentValue = decodeEntry();
202-
// Return when we find either:
203-
// 1. Exact match (since keys are unique, no need to scan further), or
204-
// 2. First value greater than target.
205196
if (currentValue >= targetValue) {
206197
return currentRow_ - 1;
207198
}

dwio/nimble/encodings/tests/PrefixEncodingTest.cpp

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,4 +1157,175 @@ TEST_F(PrefixEncodingTest, fuzzerEncode) {
11571157
}
11581158
}
11591159

1160+
// Test seekAtOrAfter with duplicate keys that span across restart intervals.
1161+
// This specifically tests the fix where seeking for a key that matches
1162+
// a restart point value should return the earliest occurrence, which may be
1163+
// in the previous restart interval.
1164+
TEST_F(PrefixEncodingTest, seekExactMatchWithDuplicatesAcrossRestarts) {
1165+
// Test case: duplicates at restart boundary
1166+
// Restart interval is 16, so restart points are at indices 0, 16, 32, etc.
1167+
// We create data where duplicate keys span the restart boundary.
1168+
struct TestCase {
1169+
std::string_view name;
1170+
std::vector<std::string> values;
1171+
std::string seekKey;
1172+
uint32_t expectedPosition;
1173+
};
1174+
1175+
const std::vector<TestCase> testCases = {
1176+
// Duplicate key "key_16" appears at indices 14, 15, 16, 17
1177+
// Restart point at index 16 has value "key_16"
1178+
// Seeking for "key_16" should return 14 (first occurrence in previous
1179+
// interval)
1180+
{"duplicates before restart point (index 16)",
1181+
// Indices 0-13: unique keys, 14-17: "key_16", 18-31: unique keys
1182+
[]() {
1183+
std::vector<std::string> v;
1184+
v.reserve(14);
1185+
for (int i = 0; i < 14; ++i) {
1186+
v.push_back(fmt::format("key_{:02d}", i));
1187+
}
1188+
// Duplicates spanning restart boundary (indices 14, 15, 16, 17)
1189+
for (int i = 0; i < 4; ++i) {
1190+
v.emplace_back("key_16");
1191+
}
1192+
for (int i = 18; i < 32; ++i) {
1193+
v.push_back(fmt::format("key_{:02d}", i));
1194+
}
1195+
return v;
1196+
}(),
1197+
"key_16",
1198+
14},
1199+
1200+
// Duplicate key at restart point with more duplicates before
1201+
// Restart point at index 32, duplicates at indices 28-35
1202+
{"duplicates before restart point (index 32)",
1203+
[]() {
1204+
std::vector<std::string> v;
1205+
v.reserve(28);
1206+
for (int i = 0; i < 28; ++i) {
1207+
v.push_back(fmt::format("key_{:02d}", i));
1208+
}
1209+
// Duplicates spanning restart boundary at index 32 (indices 28-35)
1210+
for (int i = 0; i < 8; ++i) {
1211+
v.emplace_back("key_32");
1212+
}
1213+
for (int i = 36; i < 50; ++i) {
1214+
v.push_back(fmt::format("key_{:02d}", i));
1215+
}
1216+
return v;
1217+
}(),
1218+
"key_32",
1219+
28},
1220+
1221+
// Single duplicate before restart point
1222+
{"single duplicate before restart point",
1223+
[]() {
1224+
std::vector<std::string> v;
1225+
v.reserve(15);
1226+
for (int i = 0; i < 15; ++i) {
1227+
v.push_back(fmt::format("key_{:02d}", i));
1228+
}
1229+
// Index 15 and 16 both have "key_16"
1230+
v.emplace_back("key_16");
1231+
v.emplace_back("key_16");
1232+
for (int i = 17; i < 32; ++i) {
1233+
v.push_back(fmt::format("key_{:02d}", i));
1234+
}
1235+
return v;
1236+
}(),
1237+
"key_16",
1238+
15},
1239+
1240+
// Duplicates spanning multiple restart intervals
1241+
// Restart points at 16, 32; duplicates from index 14 to 34
1242+
{"duplicates spanning multiple restart intervals",
1243+
[]() {
1244+
std::vector<std::string> v;
1245+
v.reserve(14);
1246+
for (int i = 0; i < 14; ++i) {
1247+
v.push_back(fmt::format("key_{:02d}", i));
1248+
}
1249+
// Duplicates from index 14 to 34 (spanning restarts at 16 and 32)
1250+
for (int i = 0; i < 21; ++i) {
1251+
v.emplace_back("key_dup");
1252+
}
1253+
for (int i = 35; i < 50; ++i) {
1254+
v.push_back(fmt::format("key_{:02d}", i));
1255+
}
1256+
return v;
1257+
}(),
1258+
"key_dup",
1259+
14},
1260+
1261+
// Duplicates at restart point but none before (should still work)
1262+
{"duplicates at restart point only",
1263+
[]() {
1264+
std::vector<std::string> v;
1265+
v.reserve(16);
1266+
for (int i = 0; i < 16; ++i) {
1267+
v.push_back(fmt::format("key_{:02d}", i));
1268+
}
1269+
// Duplicates starting exactly at restart point (indices 16-19)
1270+
for (int i = 0; i < 4; ++i) {
1271+
v.emplace_back("key_20");
1272+
}
1273+
for (int i = 20; i < 32; ++i) {
1274+
v.push_back(fmt::format("key_{:02d}", i));
1275+
}
1276+
return v;
1277+
}(),
1278+
"key_20",
1279+
16},
1280+
1281+
// All same values (extreme case)
1282+
{"all same values",
1283+
[]() {
1284+
std::vector<std::string> v;
1285+
v.reserve(50);
1286+
for (int i = 0; i < 50; ++i) {
1287+
v.emplace_back("same_key");
1288+
}
1289+
return v;
1290+
}(),
1291+
"same_key",
1292+
0},
1293+
};
1294+
1295+
for (const auto& testCase : testCases) {
1296+
SCOPED_TRACE(testCase.name);
1297+
1298+
std::vector<std::string_view> values;
1299+
values.reserve(testCase.values.size());
1300+
for (const auto& s : testCase.values) {
1301+
values.push_back(s);
1302+
}
1303+
1304+
Buffer buffer{*pool_};
1305+
auto encoded = EncodingFactory::encode<std::string_view>(
1306+
createSelectionPolicy(), values, buffer);
1307+
stringBuffers_.clear();
1308+
auto encoding =
1309+
EncodingFactory::decode(*pool_, encoded, createStringBufferFactory());
1310+
1311+
std::string_view seekKey = testCase.seekKey;
1312+
auto result = encoding->seekAtOrAfter(&seekKey);
1313+
ASSERT_TRUE(result.has_value())
1314+
<< "Expected to find key: " << testCase.seekKey;
1315+
EXPECT_EQ(result.value(), testCase.expectedPosition)
1316+
<< "Expected position " << testCase.expectedPosition << " for key "
1317+
<< testCase.seekKey << " but got " << result.value();
1318+
1319+
// Also verify that the value at the returned position matches
1320+
encoding->reset();
1321+
if (result.value() > 0) {
1322+
encoding->skip(result.value());
1323+
}
1324+
std::string_view decoded;
1325+
encoding->materialize(1, &decoded);
1326+
EXPECT_EQ(decoded, testCase.seekKey)
1327+
<< "Value at position " << result.value() << " should be "
1328+
<< testCase.seekKey;
1329+
}
1330+
}
11601331
} // namespace facebook::nimble::test

dwio/nimble/encodings/tests/TrivialEncodingTest.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,91 @@ TEST_P(TrivialEncodingTestWithCompression, seekExactMatch) {
156156
}
157157
}
158158

159+
TEST_P(TrivialEncodingTestWithCompression, seekExactMatchWithDuplicateValues) {
160+
const auto values = toVector(
161+
{"apple", "apple", "banana", "banana", "banana", "cherry", "cherry"});
162+
const auto encoding = createEncoding(values, GetParam());
163+
164+
struct {
165+
std::string_view seekValue;
166+
std::optional<uint32_t> expected;
167+
168+
std::string debugString() const {
169+
if (expected.has_value()) {
170+
return fmt::format("seek '{}' -> row {}", seekValue, expected.value());
171+
}
172+
return fmt::format("seek '{}' -> not found", seekValue);
173+
}
174+
} testCases[] = {
175+
// Should return first occurrence of duplicate values
176+
{"apple", 0},
177+
{"banana", 2},
178+
{"cherry", 5},
179+
// Between values (should return first of next value)
180+
{"aaa", 0},
181+
{"app", 0},
182+
{"az", 2},
183+
{"car", 5},
184+
// After all values (not found)
185+
{"date", std::nullopt},
186+
{"zebra", std::nullopt}};
187+
188+
for (const auto& testCase : testCases) {
189+
SCOPED_TRACE(testCase.debugString());
190+
encoding->reset();
191+
auto result = encoding->seekAtOrAfter(&testCase.seekValue);
192+
ASSERT_EQ(result.has_value(), testCase.expected.has_value());
193+
if (testCase.expected.has_value()) {
194+
EXPECT_EQ(result.value(), testCase.expected.value());
195+
}
196+
}
197+
}
198+
199+
TEST_P(TrivialEncodingTestWithCompression, seekAfterValueWithDuplicateValues) {
200+
auto compressionType = GetParam();
201+
// Duplicates at beginning (apple), middle (cherry), and end (kiwi)
202+
auto values = toVector(
203+
{"apple", "apple", "apple", "cherry", "cherry", "grape", "kiwi", "kiwi"});
204+
auto encoding = createEncoding(values, compressionType);
205+
206+
struct {
207+
std::string_view seekValue;
208+
std::optional<uint32_t> expected;
209+
210+
std::string debugString() const {
211+
if (expected.has_value()) {
212+
return fmt::format("seek '{}' -> row {}", seekValue, expected.value());
213+
}
214+
return fmt::format("seek '{}' -> not found", seekValue);
215+
}
216+
} testCases[] = {
217+
// Before first duplicate group
218+
{"", 0},
219+
{"aaa", 0},
220+
// Between apple (end) and cherry (start)
221+
{"banana", 3},
222+
{"car", 3},
223+
// Between cherry (end) and grape
224+
{"date", 5},
225+
{"fig", 5},
226+
// Between grape and kiwi (start of end duplicates)
227+
{"honey", 6},
228+
// After last duplicate group (not found)
229+
{"lemon", std::nullopt},
230+
{"zebra", std::nullopt},
231+
};
232+
233+
for (const auto& testCase : testCases) {
234+
SCOPED_TRACE(testCase.debugString());
235+
encoding->reset();
236+
auto result = encoding->seekAtOrAfter(&testCase.seekValue);
237+
ASSERT_EQ(result.has_value(), testCase.expected.has_value());
238+
if (testCase.expected.has_value()) {
239+
EXPECT_EQ(result.value(), testCase.expected.value());
240+
}
241+
}
242+
}
243+
159244
TEST_P(TrivialEncodingTestWithCompression, seekExactMatchWithEmptyEncoding) {
160245
const auto values = toVector({});
161246
const auto encoding = createEncoding(values, GetParam());

dwio/nimble/index/IndexReader.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,9 @@ IndexReader::IndexReader(
3535
std::unique_ptr<velox::dwio::common::SeekableInputStream> input,
3636
uint32_t stripeIndex,
3737
std::shared_ptr<StripeIndexGroup> indexGroup,
38-
bool noDuplicateKey,
3938
velox::memory::MemoryPool* pool)
4039
: stripeIndex_{stripeIndex},
4140
indexGroup_{std::move(indexGroup)},
42-
noDuplicateKey_{noDuplicateKey},
4341
input_{std::move(input)},
4442
pool_{pool} {
4543
NIMBLE_CHECK_NOT_NULL(input_);
@@ -50,14 +48,9 @@ std::unique_ptr<IndexReader> IndexReader::create(
5048
std::unique_ptr<velox::dwio::common::SeekableInputStream> input,
5149
uint32_t stripeIndex,
5250
std::shared_ptr<StripeIndexGroup> indexGroup,
53-
bool noDuplicateKey,
5451
velox::memory::MemoryPool* pool) {
5552
return std::unique_ptr<IndexReader>(new IndexReader(
56-
std::move(input),
57-
stripeIndex,
58-
std::move(indexGroup),
59-
noDuplicateKey,
60-
pool));
53+
std::move(input), stripeIndex, std::move(indexGroup), pool));
6154
}
6255

6356
std::optional<uint32_t> IndexReader::seekAtOrAfter(

dwio/nimble/index/IndexReader.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ class IndexReader {
4444
std::unique_ptr<velox::dwio::common::SeekableInputStream> input,
4545
uint32_t stripeIndex,
4646
std::shared_ptr<StripeIndexGroup> indexGroup,
47-
bool noDuplicateKey,
4847
velox::memory::MemoryPool* pool);
4948

5049
/// Seek to the position at or after the given encoded key.
@@ -57,7 +56,6 @@ class IndexReader {
5756
std::unique_ptr<velox::dwio::common::SeekableInputStream> input,
5857
uint32_t stripeIndex,
5958
std::shared_ptr<StripeIndexGroup> indexGroup,
60-
bool noDuplicateKey,
6159
velox::memory::MemoryPool* pool);
6260

6361
/// Seek to a key at or after the given encoded key within a chunk.
@@ -80,7 +78,6 @@ class IndexReader {
8078

8179
const uint32_t stripeIndex_;
8280
const std::shared_ptr<StripeIndexGroup> indexGroup_;
83-
const bool noDuplicateKey_;
8481

8582
const std::unique_ptr<velox::dwio::common::SeekableInputStream> input_;
8683
velox::memory::MemoryPool* const pool_;

dwio/nimble/index/TabletIndex.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,6 @@ std::vector<SortOrder> getSortOrders(const serialization::Index* indexRoot) {
8080
return result;
8181
}
8282

83-
bool getNoDuplicateKey(const serialization::Index* indexRoot) {
84-
return indexRoot->no_duplicate_key();
85-
}
86-
8783
} // namespace
8884

8985
std::unique_ptr<TabletIndex> TabletIndex::create(Section indexSection) {
@@ -97,8 +93,7 @@ TabletIndex::TabletIndex(Section indexSection)
9793
numIndexGroups_{getIndexGroupCount(indexRoot_)},
9894
stripeKeys_{getStripeKeys(indexRoot_)},
9995
indexColumns_{getIndexColumns(indexRoot_)},
100-
sortOrders_{getSortOrders(indexRoot_)},
101-
noDuplicateKey_{getNoDuplicateKey(indexRoot_)} {
96+
sortOrders_{getSortOrders(indexRoot_)} {
10297
NIMBLE_CHECK(!indexColumns_.empty());
10398
NIMBLE_CHECK_EQ(numStripes_ + 1, stripeKeys_.size());
10499
NIMBLE_CHECK_EQ(indexColumns_.size(), sortOrders_.size());

0 commit comments

Comments
 (0)