@@ -1157,4 +1157,175 @@ TEST_F(PrefixEncodingTest, fuzzerEncode) {
11571157 }
11581158}
11591159
1160+ // Test seekAtOrAfter with duplicate keys that span across restart intervals.
1161+ // This specifically tests the fix where seeking for a key that matches
1162+ // a restart point value should return the earliest occurrence, which may be
1163+ // in the previous restart interval.
1164+ TEST_F (PrefixEncodingTest, seekExactMatchWithDuplicatesAcrossRestarts) {
1165+ // Test case: duplicates at restart boundary
1166+ // Restart interval is 16, so restart points are at indices 0, 16, 32, etc.
1167+ // We create data where duplicate keys span the restart boundary.
1168+ struct TestCase {
1169+ std::string_view name;
1170+ std::vector<std::string> values;
1171+ std::string seekKey;
1172+ uint32_t expectedPosition;
1173+ };
1174+
1175+ const std::vector<TestCase> testCases = {
1176+ // Duplicate key "key_16" appears at indices 14, 15, 16, 17
1177+ // Restart point at index 16 has value "key_16"
1178+ // Seeking for "key_16" should return 14 (first occurrence in previous
1179+ // interval)
1180+ {" duplicates before restart point (index 16)" ,
1181+ // Indices 0-13: unique keys, 14-17: "key_16", 18-31: unique keys
1182+ []() {
1183+ std::vector<std::string> v;
1184+ v.reserve (14 );
1185+ for (int i = 0 ; i < 14 ; ++i) {
1186+ v.push_back (fmt::format (" key_{:02d}" , i));
1187+ }
1188+ // Duplicates spanning restart boundary (indices 14, 15, 16, 17)
1189+ for (int i = 0 ; i < 4 ; ++i) {
1190+ v.emplace_back (" key_16" );
1191+ }
1192+ for (int i = 18 ; i < 32 ; ++i) {
1193+ v.push_back (fmt::format (" key_{:02d}" , i));
1194+ }
1195+ return v;
1196+ }(),
1197+ " key_16" ,
1198+ 14 },
1199+
1200+ // Duplicate key at restart point with more duplicates before
1201+ // Restart point at index 32, duplicates at indices 28-35
1202+ {" duplicates before restart point (index 32)" ,
1203+ []() {
1204+ std::vector<std::string> v;
1205+ v.reserve (28 );
1206+ for (int i = 0 ; i < 28 ; ++i) {
1207+ v.push_back (fmt::format (" key_{:02d}" , i));
1208+ }
1209+ // Duplicates spanning restart boundary at index 32 (indices 28-35)
1210+ for (int i = 0 ; i < 8 ; ++i) {
1211+ v.emplace_back (" key_32" );
1212+ }
1213+ for (int i = 36 ; i < 50 ; ++i) {
1214+ v.push_back (fmt::format (" key_{:02d}" , i));
1215+ }
1216+ return v;
1217+ }(),
1218+ " key_32" ,
1219+ 28 },
1220+
1221+ // Single duplicate before restart point
1222+ {" single duplicate before restart point" ,
1223+ []() {
1224+ std::vector<std::string> v;
1225+ v.reserve (15 );
1226+ for (int i = 0 ; i < 15 ; ++i) {
1227+ v.push_back (fmt::format (" key_{:02d}" , i));
1228+ }
1229+ // Index 15 and 16 both have "key_16"
1230+ v.emplace_back (" key_16" );
1231+ v.emplace_back (" key_16" );
1232+ for (int i = 17 ; i < 32 ; ++i) {
1233+ v.push_back (fmt::format (" key_{:02d}" , i));
1234+ }
1235+ return v;
1236+ }(),
1237+ " key_16" ,
1238+ 15 },
1239+
1240+ // Duplicates spanning multiple restart intervals
1241+ // Restart points at 16, 32; duplicates from index 14 to 34
1242+ {" duplicates spanning multiple restart intervals" ,
1243+ []() {
1244+ std::vector<std::string> v;
1245+ v.reserve (14 );
1246+ for (int i = 0 ; i < 14 ; ++i) {
1247+ v.push_back (fmt::format (" key_{:02d}" , i));
1248+ }
1249+ // Duplicates from index 14 to 34 (spanning restarts at 16 and 32)
1250+ for (int i = 0 ; i < 21 ; ++i) {
1251+ v.emplace_back (" key_dup" );
1252+ }
1253+ for (int i = 35 ; i < 50 ; ++i) {
1254+ v.push_back (fmt::format (" key_{:02d}" , i));
1255+ }
1256+ return v;
1257+ }(),
1258+ " key_dup" ,
1259+ 14 },
1260+
1261+ // Duplicates at restart point but none before (should still work)
1262+ {" duplicates at restart point only" ,
1263+ []() {
1264+ std::vector<std::string> v;
1265+ v.reserve (16 );
1266+ for (int i = 0 ; i < 16 ; ++i) {
1267+ v.push_back (fmt::format (" key_{:02d}" , i));
1268+ }
1269+ // Duplicates starting exactly at restart point (indices 16-19)
1270+ for (int i = 0 ; i < 4 ; ++i) {
1271+ v.emplace_back (" key_20" );
1272+ }
1273+ for (int i = 20 ; i < 32 ; ++i) {
1274+ v.push_back (fmt::format (" key_{:02d}" , i));
1275+ }
1276+ return v;
1277+ }(),
1278+ " key_20" ,
1279+ 16 },
1280+
1281+ // All same values (extreme case)
1282+ {" all same values" ,
1283+ []() {
1284+ std::vector<std::string> v;
1285+ v.reserve (50 );
1286+ for (int i = 0 ; i < 50 ; ++i) {
1287+ v.emplace_back (" same_key" );
1288+ }
1289+ return v;
1290+ }(),
1291+ " same_key" ,
1292+ 0 },
1293+ };
1294+
1295+ for (const auto & testCase : testCases) {
1296+ SCOPED_TRACE (testCase.name );
1297+
1298+ std::vector<std::string_view> values;
1299+ values.reserve (testCase.values .size ());
1300+ for (const auto & s : testCase.values ) {
1301+ values.push_back (s);
1302+ }
1303+
1304+ Buffer buffer{*pool_};
1305+ auto encoded = EncodingFactory::encode<std::string_view>(
1306+ createSelectionPolicy (), values, buffer);
1307+ stringBuffers_.clear ();
1308+ auto encoding =
1309+ EncodingFactory::decode (*pool_, encoded, createStringBufferFactory ());
1310+
1311+ std::string_view seekKey = testCase.seekKey ;
1312+ auto result = encoding->seekAtOrAfter (&seekKey);
1313+ ASSERT_TRUE (result.has_value ())
1314+ << " Expected to find key: " << testCase.seekKey ;
1315+ EXPECT_EQ (result.value (), testCase.expectedPosition )
1316+ << " Expected position " << testCase.expectedPosition << " for key "
1317+ << testCase.seekKey << " but got " << result.value ();
1318+
1319+ // Also verify that the value at the returned position matches
1320+ encoding->reset ();
1321+ if (result.value () > 0 ) {
1322+ encoding->skip (result.value ());
1323+ }
1324+ std::string_view decoded;
1325+ encoding->materialize (1 , &decoded);
1326+ EXPECT_EQ (decoded, testCase.seekKey )
1327+ << " Value at position " << result.value () << " should be "
1328+ << testCase.seekKey ;
1329+ }
1330+ }
11601331} // namespace facebook::nimble::test
0 commit comments