Skip to content

Commit fc31117

Browse files
authored
RCORE-2162: Add compression of strings in Mixed, Lst<String> and Dictionary (#7804)
1 parent 8f1d472 commit fc31117

24 files changed

+246
-157
lines changed

src/realm/array_backlink.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -225,12 +225,12 @@ void ArrayBacklink::verify() const
225225
REALM_ASSERT(src_obj.get<Mixed>(src_col_key).get_link() == target_link);
226226
}
227227
else if (val.is_type(type_List)) {
228-
DummyParent parent(src_table, val.get_ref());
228+
DummyParent parent(src_table, val.get_ref(), src_col_key);
229229
Lst<Mixed> list(parent, 0);
230230
REALM_ASSERT(list.find_any(target_link) != npos);
231231
}
232232
else if (val.is_type(type_Dictionary)) {
233-
DummyParent parent(src_table, val.get_ref());
233+
DummyParent parent(src_table, val.get_ref(), src_col_key);
234234
Dictionary dict(parent, 0);
235235
REALM_ASSERT(dict.find_any(target_link) != npos);
236236
}

src/realm/array_mixed.cpp

Lines changed: 41 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -360,9 +360,8 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
360360
2. int and pair int arrays are used for storing integers, timestamps, floats, doubles,
361361
decimals, links. In general we can compress them, but we need to be careful: controlling the col_type
362362
should prevent compressing data that we want to leave in the current format.
363-
3. string array is for strings and binary data (no compression for now)
364-
4. ref array is actually storing refs to collections. they can only be BPlusTree<int, Mixed> or
365-
BPlusTree<string, Mixed>.
363+
3. string array is for strings and binary data
364+
4. ref array is actually storing refs to collections. They can only be Lst<Mixed> or Dictionary.
366365
5. key array stores unique identifiers for collections in mixed (integers that can be compressed)
367366
*/
368367
Array composite(alloc);
@@ -372,41 +371,48 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
372371
auto ref = top.get(i);
373372
ref_type new_ref = ref;
374373
if (ref && !(out.only_modified && alloc.is_read_only(ref))) {
375-
if (i < 3) { // int, and pair_int
376-
// integer arrays
377-
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
378-
}
379-
else if (i == 4) { // collection in mixed
380-
ArrayRef arr_ref(alloc);
381-
arr_ref.init_from_ref(ref);
382-
auto ref_sz = arr_ref.size();
383-
TempArray written_ref_leaf(ref_sz);
384-
385-
for (size_t k = 0; k < ref_sz; k++) {
386-
ref_type new_sub_ref = 0;
387-
if (auto sub_ref = arr_ref.get(k)) {
388-
auto header = alloc.translate(sub_ref);
389-
// Now we have to find out if the nested collection is a
390-
// dictionary or a list. If the top array has a size of 2
391-
// and it is not a BplusTree inner node, then it is a dictionary
392-
if (NodeHeader::get_size_from_header(header) == 2 &&
393-
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
394-
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
395-
}
396-
else {
397-
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
374+
switch (i) {
375+
case payload_idx_int:
376+
// integer array
377+
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
378+
break;
379+
case payload_idx_pair:
380+
// integer array
381+
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
382+
break;
383+
case payload_idx_str:
384+
new_ref = ArrayString::typed_write(ref, out, alloc);
385+
break;
386+
case payload_idx_ref: {
387+
// collection in mixed
388+
ArrayRef arr_ref(alloc);
389+
arr_ref.init_from_ref(ref);
390+
auto ref_sz = arr_ref.size();
391+
TempArray written_ref_leaf(ref_sz);
392+
393+
for (size_t k = 0; k < ref_sz; k++) {
394+
ref_type new_sub_ref = 0;
395+
if (auto sub_ref = arr_ref.get(k)) {
396+
auto header = alloc.translate(sub_ref);
397+
// Now we have to find out if the nested collection is a
398+
// dictionary or a list. If the top array has a size of 2
399+
// and it is not a BplusTree inner node, then it is a dictionary
400+
if (NodeHeader::get_size_from_header(header) == 2 &&
401+
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
402+
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
403+
}
404+
else {
405+
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
406+
}
398407
}
408+
written_ref_leaf.set_as_ref(k, new_sub_ref);
399409
}
400-
written_ref_leaf.set_as_ref(k, new_sub_ref);
410+
new_ref = written_ref_leaf.write(out);
411+
break;
401412
}
402-
new_ref = written_ref_leaf.write(out);
403-
}
404-
else if (i == 5) { // unique keys associated to collections in mixed
405-
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
406-
}
407-
else {
408-
// all the rest we don't want to compress it, at least for now (strings will be needed)
409-
new_ref = Array::write(ref, alloc, out, out.only_modified, false);
413+
case payload_idx_key:
414+
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
415+
break;
410416
}
411417
}
412418
written_leaf.set(i, new_ref);

src/realm/array_mixed.hpp

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,15 @@ class ArrayMixed : public ArrayPayload, private Array {
6464
{
6565
Array::set_parent(parent, ndx_in_parent);
6666
}
67+
bool need_string_interner() const override
68+
{
69+
return true;
70+
}
71+
virtual void set_string_interner(StringInterner* interner) const override
72+
{
73+
m_strings.set_string_interner(interner);
74+
}
75+
6776
void init_from_parent()
6877
{
6978
ref_type ref = get_ref_from_parent();

src/realm/array_string.cpp

Lines changed: 34 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@
1818

1919
#include <realm/array_string.hpp>
2020
#include <realm/impl/array_writer.hpp>
21+
#include <realm/table.hpp>
2122
#include <realm/string_interner.hpp>
22-
#include <realm/spec.hpp>
2323
#include <realm/mixed.hpp>
2424

2525
using namespace realm;
@@ -537,17 +537,39 @@ void ArrayString::verify() const
537537
#endif
538538
}
539539

540-
ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner)
540+
template <>
541+
ref_type ArrayString::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc)
541542
{
542-
REALM_ASSERT(interner);
543-
// we have to write out all, modified or not, to match the total cleanup
544-
Array interned(Allocator::get_default());
545-
auto sz = size();
546-
interned.create(NodeHeader::type_Normal, true, sz);
547-
for (size_t i = 0; i < sz; ++i) {
548-
interned.set(i, interner->intern(get(i)));
543+
Array leaf(alloc);
544+
leaf.init_from_ref(ref);
545+
ref_type ret_val;
546+
auto header = leaf.get_header();
547+
if (NodeHeader::get_hasrefs_from_header(header) ||
548+
NodeHeader::get_wtype_from_header(header) == NodeHeader::wtype_Multiply) {
549+
// We're interning these strings
550+
ArrayString as(alloc);
551+
as.init_from_ref(ref);
552+
StringInterner* interner = out.table->get_string_interner(out.col_key);
553+
auto sz = as.size();
554+
Array interned(Allocator::get_default());
555+
interned.create(NodeHeader::type_Normal, true, sz);
556+
for (size_t i = 0; i < sz; ++i) {
557+
interned.set(i, interner->intern(as.get(i)));
558+
}
559+
ret_val = interned.write(out, false, false, out.compress);
560+
interned.destroy();
561+
// in a transactional setting:
562+
// Destroy all sub-arrays if present, in order to release memory in file
563+
// This is contrary to the rest of the handling in this function, but needed
564+
// here since sub-arrays may not have been COW'ed and therefore not freed in file.
565+
// We rely on 'only_modified' to indicate that we're in a transactional setting.
566+
if (out.only_modified)
567+
leaf.destroy_deep(true);
568+
}
569+
else {
570+
// whether it's the old enum strings or the new interned strings,
571+
// just write out the array using integer leaf compression
572+
ret_val = leaf.write(out, false, out.only_modified, out.compress);
549573
}
550-
auto retval = interned.write(out, false, false, out.compress);
551-
interned.destroy();
552-
return retval;
574+
return ret_val;
553575
}

src/realm/array_string.hpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -126,10 +126,8 @@ class ArrayString : public ArrayPayload {
126126
static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept;
127127

128128
void verify() const;
129-
// Write to 'out', if needed using 'interner' to intern any strings.
130-
// An interner of 0 will disable interning. Interned values may be further
131-
// compressed using leaf compression for integer arrays.
132-
ref_type write(_impl::ArrayWriterBase& out, StringInterner* interner);
129+
template <class T>
130+
static ref_type typed_write(ref_type ref, T& out, Allocator& alloc);
133131

134132
private:
135133
static constexpr size_t small_string_max_size = 15; // ArrayStringShort

src/realm/bplustree.hpp

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ namespace realm {
3030

3131
class BPlusTreeBase;
3232
class BPlusTreeInner;
33+
class StringInterner;
3334

3435
/*****************************************************************************/
3536
/* BPlusTreeNode */
@@ -207,6 +208,16 @@ class BPlusTreeBase {
207208
m_root->bp_set_parent(parent, ndx_in_parent);
208209
}
209210

211+
void set_interner(StringInterner* interner)
212+
{
213+
m_interner = interner;
214+
}
215+
216+
StringInterner* get_interner()
217+
{
218+
return m_interner;
219+
}
220+
210221
virtual void erase(size_t) = 0;
211222
virtual void clear() = 0;
212223
virtual void swap(size_t, size_t) = 0;
@@ -234,6 +245,7 @@ class BPlusTreeBase {
234245
std::unique_ptr<BPlusTreeNode> m_root;
235246
Allocator& m_alloc;
236247
ArrayParent* m_parent = nullptr;
248+
StringInterner* m_interner = nullptr;
237249
size_t m_ndx_in_parent = 0;
238250
size_t m_size = 0;
239251
size_t m_cached_leaf_begin;
@@ -300,6 +312,9 @@ class BPlusTree : public BPlusTreeBase {
300312
void init_from_ref(ref_type ref) noexcept override
301313
{
302314
LeafArray::init_from_ref(ref);
315+
if constexpr (realm::is_any_v<T, StringData, Mixed>) {
316+
LeafArray::set_string_interner(m_tree->get_interner());
317+
}
303318
}
304319

305320
ref_type get_ref() const override
@@ -574,19 +589,25 @@ class BPlusTree : public BPlusTreeBase {
574589

575590
std::unique_ptr<BPlusTreeLeaf> create_leaf_node() override
576591
{
577-
std::unique_ptr<BPlusTreeLeaf> leaf = std::make_unique<LeafNode>(this);
578-
static_cast<LeafNode*>(leaf.get())->create();
592+
auto leaf = std::make_unique<LeafNode>(this);
593+
leaf->create();
594+
if constexpr (realm::is_any_v<T, StringData, Mixed>) {
595+
leaf->set_string_interner(m_interner);
596+
}
579597
return leaf;
580598
}
581599
std::unique_ptr<BPlusTreeLeaf> init_leaf_node(ref_type ref) override
582600
{
583-
std::unique_ptr<BPlusTreeLeaf> leaf = std::make_unique<LeafNode>(this);
601+
auto leaf = std::make_unique<LeafNode>(this);
584602
leaf->init_from_ref(ref);
585603
return leaf;
586604
}
587605
BPlusTreeLeaf* cache_leaf(MemRef mem) override
588606
{
589607
m_leaf_cache.init_from_mem(mem);
608+
if constexpr (realm::is_any_v<T, StringData, Mixed>) {
609+
m_leaf_cache.LeafArray::set_string_interner(m_interner);
610+
}
590611
return &m_leaf_cache;
591612
}
592613
void replace_root(std::unique_ptr<BPlusTreeNode> new_root) override

src/realm/cluster.cpp

Lines changed: 10 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -261,6 +261,12 @@ inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const
261261
m_tree_top.set_string_interner(arr, col_key);
262262
}
263263

264+
template <>
265+
inline void Cluster::set_string_interner(ArrayMixed& arr, ColKey col_key) const
266+
{
267+
m_tree_top.set_string_interner(arr, col_key);
268+
}
269+
264270
template <class T>
265271
inline void Cluster::set_spec(T&, ColKey::Idx) const
266272
{
@@ -314,6 +320,7 @@ inline void Cluster::do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_valu
314320
{
315321
ArrayMixed arr(m_alloc);
316322
arr.set_parent(this, col_key.get_index().val + s_first_col_index);
323+
set_string_interner(arr, col_key);
317324
arr.init_from_parent();
318325
arr.insert(ndx, init_value);
319326

@@ -798,6 +805,7 @@ inline void Cluster::do_erase_mixed(size_t ndx, ColKey col_key, ObjKey key, Casc
798805

799806
ArrayMixed values(m_alloc);
800807
values.set_parent(this, col_ndx.val + s_first_col_index);
808+
set_string_interner(values, col_key);
801809
values.init_from_parent();
802810

803811
Mixed value = values.get(ndx);
@@ -1447,6 +1455,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const
14471455
}
14481456
case col_type_Mixed: {
14491457
ArrayMixed arr(m_alloc);
1458+
set_string_interner(arr, col);
14501459
ref_type ref = Array::get_as_ref(j);
14511460
arr.init_from_ref(ref);
14521461
std::cout << ", " << arr.get(i);
@@ -1651,32 +1660,8 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const
16511660
else {
16521661
// Columns
16531662
auto col_key = out.table->m_leaf_ndx2colkey[j - 1];
1663+
out.col_key = col_key;
16541664
auto col_type = col_key.get_type();
1655-
// String columns are interned at this point
1656-
if (out.compress && col_type == col_type_String && !col_key.is_collection()) {
1657-
ArrayRef leaf(m_alloc);
1658-
leaf.init_from_ref(ref);
1659-
auto header = leaf.get_header();
1660-
if (NodeHeader::get_hasrefs_from_header(header) ||
1661-
NodeHeader::get_wtype_from_header(header) == wtype_Multiply) {
1662-
// We're interning these strings
1663-
ArrayString as(m_alloc);
1664-
as.init_from_ref(leaf_rot.get_as_ref());
1665-
written_cluster.set_as_ref(j, as.write(out, out.table->get_string_interner(col_key)));
1666-
// in a transactional setting:
1667-
// Destroy all sub-arrays if present, in order to release memory in file
1668-
// This is contrary to the rest of the handling in this function, but needed
1669-
// here since sub-arrays may not have been COW'ed and therefore not freed in file.
1670-
// We rely on 'only_modified' to indicate that we're in a transactional setting.
1671-
if (only_modified)
1672-
leaf.destroy_deep(true);
1673-
continue;
1674-
}
1675-
// whether it's the old enum strings or the new interned strings,
1676-
// just write out the array using integer leaf compression
1677-
written_cluster.set_as_ref(j, leaf.write(out, false, false, false));
1678-
continue;
1679-
}
16801665
if (col_key.is_collection()) {
16811666
ArrayRef arr_ref(m_alloc);
16821667
arr_ref.init_from_ref(ref);

src/realm/cluster_tree.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1140,7 +1140,7 @@ void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const
11401140
// Check for owner. This function may be called in context of DictionaryClusterTree
11411141
// in which case m_owner is null (and spec never needed).
11421142
if (m_owner) {
1143-
arr.set_string_interner(_impl::TableFriend::get_string_interner(*m_owner, col_key));
1143+
arr.set_string_interner(m_owner->get_string_interner(col_key));
11441144
}
11451145
}
11461146

src/realm/collection.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -155,6 +155,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index)
155155

156156
BPlusTree<StringData> keys(*ctrl.alloc);
157157
keys.set_parent(&top, 0);
158+
keys.set_interner(ctrl.interner);
158159
keys.init_from_parent();
159160
size_t start = 0;
160161
if (size_t finish = keys.size()) {
@@ -177,6 +178,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index)
177178
}
178179
BPlusTree<Mixed> values(*ctrl.alloc);
179180
values.set_parent(&top, 1);
181+
values.set_interner(ctrl.interner);
180182
values.init_from_parent();
181183
for (; start < finish; start++) {
182184
val = values.get(start);
@@ -194,6 +196,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index)
194196
if (!ref)
195197
return;
196198
BPlusTree<Mixed> list(*ctrl.alloc);
199+
list.set_interner(ctrl.interner);
197200
list.init_from_ref(ref);
198201
if (size_t sz = list.size()) {
199202
size_t start = 0;

0 commit comments

Comments
 (0)