Skip to content

Commit 499e630

Browse files
authored
RCORE-2170 String compression tests (#7812)
1 parent 5112b13 commit 499e630

File tree

5 files changed

+356
-10
lines changed

5 files changed

+356
-10
lines changed

evergreen/config.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,19 +1863,18 @@ buildvariants:
18631863
- name: finalize_coverage_data
18641864

18651865
- name: macos-array-compression
1866-
display_name: "MacOS 11 arm64 (Compress Arrays)"
1867-
run_on: macos-1100-arm64
1866+
display_name: "MacOS 14 arm64 (Compress Arrays)"
1867+
run_on: macos-14-arm64
18681868
expansions:
1869-
cmake_url: "https://s3.amazonaws.com/static.realm.io/evergreen-assets/cmake-3.26.3-macos-universal.tar.gz"
1870-
cmake_bindir: "./cmake_binaries/CMake.app/Contents/bin"
1869+
cmake_bindir: "/opt/homebrew/bin"
18711870
cmake_toolchain_file: "./tools/cmake/xcode.toolchain.cmake"
1871+
cmake_build_tool_options: "-sdk macosx"
18721872
cmake_generator: Xcode
18731873
max_jobs: $(sysctl -n hw.logicalcpu)
1874-
xcode_developer_dir: /Applications/Xcode13.1.app/Contents/Developer
1874+
xcode_developer_dir: /Applications/Xcode15.2.app/Contents/Developer
18751875
extra_flags: -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_OSX_ARCHITECTURES=arm64
18761876
compress: On
18771877
cmake_build_type: Debug
1878-
coveralls_flag_name: "macos-arm64"
18791878
tasks:
18801879
- name: compile_test
18811880

src/realm/string_interner.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash
223223
if (!node.has_refs()) {
224224
// it's a leaf - default is a list, search starts from index 0.
225225
HashMapIter it(node, hash, hash_size);
226-
if (node.size() > hash_node_min_size) {
226+
if (node.size() >= hash_node_min_size) {
227227
// it is a hash table, so use hash to select index to start searching
228228
// table size must be power of two!
229229
size_t index = hash & (node.size() - 1);
@@ -590,6 +590,7 @@ CompressedStringView& StringInterner::get_compressed(StringID id)
590590
auto index = id - 1; // 0 represents null
591591
auto hi = index >> 8;
592592
auto lo = index & 0xFFUL;
593+
593594
DataLeaf& leaf = m_compressed_leafs[hi];
594595
load_leaf_if_needed(leaf);
595596
REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
@@ -618,8 +619,9 @@ std::optional<StringID> StringInterner::lookup(StringData sd)
618619
int StringInterner::compare(StringID A, StringID B)
619620
{
620621
std::lock_guard lock(m_mutex);
621-
REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
622-
REALM_ASSERT_DEBUG(B < m_decompressed_strings.size());
622+
// 0 is null, the first index starts from 1.
623+
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
624+
REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
623625
// comparisons against null
624626
if (A == B && A == 0)
625627
return 0;
@@ -635,7 +637,7 @@ int StringInterner::compare(StringID A, StringID B)
635637
int StringInterner::compare(StringData s, StringID A)
636638
{
637639
std::lock_guard lock(m_mutex);
638-
REALM_ASSERT_DEBUG(A < m_decompressed_strings.size());
640+
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
639641
// comparisons against null
640642
if (s.data() == nullptr && A == 0)
641643
return 0;

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ set(CORE_TEST_SOURCES
7676
test_shared.cpp
7777
test_status.cpp
7878
test_string_data.cpp
79+
test_string_compression.cpp
7980
test_table_view.cpp
8081
test_thread.cpp
8182
test_transactions.cpp

test/test_group.cpp

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2508,5 +2508,176 @@ TEST(Group_ArrayCompression_Correctness_Random_Input)
25082508
#endif
25092509
}
25102510

2511+
// Writes a group holding a string column plus list/set/dictionary-of-string
// columns to disk, re-opens the file as a second group, and checks that the
// data read back matches what was written.
TEST(Group_ArrayCompression_Strings)
{
    GROUP_TEST_PATH(path);

    // create a bunch of string related properties that are going to be compressed and verify write/read machinery
    // and string correctness.
    Group to_disk;
    TableRef table = to_disk.add_table("test");
    auto col_key_string = table->add_column(type_String, "string");
    auto col_key_list_string = table->add_column_list(type_String, "list_strings");
    auto col_key_set_string = table->add_column_set(type_String, "set_strings");
    auto col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings");
    auto obj = table->create_object();

    obj.set_any(col_key_string, {"Test"});
    auto list_s = obj.get_list<String>(col_key_list_string);
    auto set_s = obj.get_set<String>(col_key_set_string);
    auto dictionary_s = obj.get_dictionary(col_key_dict_string);

    // Ten distinct values per collection: a common prefix followed by the index.
    std::string tmp{"aabbbcccaaaaddfwregfgklnjytojfs"};
    for (size_t i = 0; i < 10; ++i) {
        list_s.add({tmp + std::to_string(i)});
    }
    for (size_t i = 0; i < 10; ++i) {
        set_s.insert({tmp + std::to_string(i)});
    }
    for (size_t i = 0; i < 10; ++i) {
        const auto key_value = tmp + std::to_string(i);
        dictionary_s.insert({key_value}, {key_value});
    }

    CHECK(list_s.size() == 10);
    CHECK(set_s.size() == 10);
    CHECK(dictionary_s.size() == 10);

    // Serialize to disk (compression should happen when the proper leaf array is serialized to disk)
    to_disk.write(path, crypt_key());

#ifdef REALM_DEBUG
    to_disk.verify();
#endif

    // Load the tables
    Group from_disk(path, crypt_key());
    TableRef read_table = from_disk.get_table("test");
    auto obj1 = read_table->get_object(0);

    // The collections must come from the object read back from disk (obj1);
    // fetching them from `obj` would only compare the in-memory data with itself.
    auto list_s1 = obj1.get_list<String>("list_strings");
    auto set_s1 = obj1.get_set<String>("set_strings");
    auto dictionary_s1 = obj1.get_dictionary("dict_strings");

    CHECK(obj1.get_any("string") == obj.get_any("string"));

    CHECK(list_s1.size() == list_s.size());
    CHECK(set_s1.size() == set_s.size());
    CHECK(dictionary_s1.size() == dictionary_s.size());

    CHECK(*read_table == *table);

    for (size_t i = 0; i < list_s1.size(); ++i) {
        CHECK_EQUAL(list_s1.get_any(i), list_s.get_any(i));
    }

    for (size_t i = 0; i < set_s1.size(); ++i) {
        CHECK_EQUAL(set_s1.get_any(i), set_s.get_any(i));
    }

    for (size_t i = 0; i < dictionary_s1.size(); ++i) {
        CHECK_EQUAL(dictionary_s1.get_key(i), dictionary_s.get_key(i));
        CHECK_EQUAL(dictionary_s1.get_any(i), dictionary_s.get_any(i));
    }

#ifdef REALM_DEBUG
    from_disk.verify();
#endif
}
2589+
2590+
// Commits 50 random strings, one per write transaction, into a string column
// and into list/set/dictionary-of-string columns of a single object. After
// each commit it advances a read transaction, verifies it, and checks that the
// new value is visible in every column.
TEST(Test_Commit_Compression_Strings)
{
    // Seed one generator up front and share it between both helpers instead of
    // constructing a random_device and engine on every call.
    std::random_device rd;
    std::mt19937 generator(rd());

    // Returns a random length in [1, 100].
    auto generate_random_str_len = [&generator]() {
        std::uniform_int_distribution<> distribution(1, 100);
        return distribution(generator);
    };

    // Returns a string of `length` characters drawn from [A-Za-z0-9].
    auto generate_random_string = [&generator](size_t length) {
        const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv"
                                     "wxyz0123456789";
        std::uniform_int_distribution<> distribution(0, static_cast<int>(alphabet.size()) - 1);

        std::string random_str;
        random_str.reserve(length);
        for (size_t i = 0; i < length; ++i)
            random_str += alphabet[distribution(generator)];

        return random_str;
    };

    SHARED_GROUP_TEST_PATH(path);
    auto hist = make_in_realm_history();
    DBRef db = DB::create(*hist, path);
    ColKey col_key_string, col_key_list_string, col_key_set_string, col_key_dict_string;
    ObjKey obj_key;
    TableKey table_key;

    auto rt = db->start_read();
    {
        WriteTransaction wt(db);
        auto table = wt.add_table("test");
        table_key = table->get_key();
        col_key_string = table->add_column(type_String, "string");
        col_key_list_string = table->add_column_list(type_String, "list_strings");
        col_key_set_string = table->add_column_set(type_String, "set_strings");
        col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings");
        Obj obj = table->create_object();
        obj_key = obj.get_key();
        wt.commit();
    }
    // advance the read transaction past the schema commit and verify it
    rt->advance_read();
    rt->verify();

    // commit random strings in all the string based columns and verify interner updates

    for (size_t i = 0; i < 50; ++i) {

        // Random payload made unique by an "_<i>" suffix. '_' is not part of the
        // alphabet, so two iterations can never produce the same string. Without
        // this, a collision between two short random strings would leave the set
        // and dictionary one element short and fail the size checks below.
        const auto str = generate_random_string(generate_random_str_len()) + "_" + std::to_string(i);

        rt = db->start_read();
        {
            WriteTransaction wt(db);
            auto table = wt.get_table(table_key);
            auto obj = table->get_object(obj_key);

            obj.set_any(col_key_string, {str});
            auto list_s = obj.get_list<String>(col_key_list_string);
            auto set_s = obj.get_set<String>(col_key_set_string);
            auto dictionary_s = obj.get_dictionary(col_key_dict_string);

            list_s.add({str});
            set_s.insert({str});
            dictionary_s.insert({str}, {str});

            wt.commit();
        }
        rt->advance_read();
        rt->verify();

        auto table = rt->get_table(table_key);
        auto obj = table->get_object(obj_key);
        const auto current_str = obj.get_any(col_key_string).get_string();
        CHECK_EQUAL(current_str, str);

        auto list_s = obj.get_list<String>(col_key_list_string);
        auto set_s = obj.get_set<String>(col_key_set_string);
        auto dictionary_s = obj.get_dictionary(col_key_dict_string);

        // Each iteration adds exactly one new element to every collection.
        CHECK_EQUAL(list_s.size(), i + 1);
        CHECK_EQUAL(set_s.size(), i + 1);
        CHECK_EQUAL(dictionary_s.size(), i + 1);

        CHECK_EQUAL(list_s.get_any(i), str);
        CHECK_NOT_EQUAL(set_s.find_any(str), not_found);
        CHECK_NOT_EQUAL(dictionary_s.find_any(str), not_found);
    }
}
25112682

25122683
#endif // TEST_GROUP

0 commit comments

Comments
 (0)