Skip to content

Commit 549c707

Browse files
nicola-cabfinnschiermerjedelbo
authored
RCORE-2055 Array Classification for enabling compression (#7564)
* update next-major * specified part of new layout (new width encoding) * new header format for compressed arrays * code review * code review * start of classifying arrays for compression * classification down to column types * first attempt to cut through the BPlusTree madness * [wip] start on 'type driven' write process * all tests passing (but no compression enabled) * enabled compression for signed integer leafs only * removed some dubious constructions in cluster tree * delete tmp array while classifying arrays * enabled compression of links and backlinks (excl collections) * also compress bplustree of integers/links (experimental) * pref for compressing dicts (not working) * wip * wip * finally: compressing collections (incl dicts) * compressing timestamps now * enabled compression on ObjectID, TypedLink and UUID * also compressing Mixed properties (not list/dicts of Mixed) * Array compression with collections in Mixed (#7412) --------- Co-authored-by: Finn Schiermer Andersen <[email protected]> * merge next-major + collection in mixed * enable dynamic choice of compression method * moved typed_write/typed_print for bptree into class * Merge pull request #7432 from realm/fsa/clean_typed_write moved typed_write/typed_print for bptree into class * cleanup unrelated code changes * fix compilation * cleanup * code review * code review * swap byte 3&4 with byte 6&7 for flex formats for storing A and B sizes * Some modifications * Move Encoding definition * Testing * Preserve type information in typed_write (#7598) * call directly Array::destroy() * Fix issue * remove table from typed_print * lint * point fix avoid compressing history array --------- Co-authored-by: Finn Schiermer Andersen <[email protected]> Co-authored-by: Jørgen Edelbo <[email protected]> Co-authored-by: Finn Schiermer Andersen <[email protected]>
1 parent 7623c9f commit 549c707

26 files changed

+883
-69
lines changed

src/realm/array.cpp

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ size_t Array::bit_width(int64_t v)
209209
return uint64_t(v) >> 31 ? 64 : uint64_t(v) >> 15 ? 32 : uint64_t(v) >> 7 ? 16 : 8;
210210
}
211211

212-
213212
void Array::init_from_mem(MemRef mem) noexcept
214213
{
215214
char* header = Node::init_from_mem(mem);
@@ -289,7 +288,7 @@ ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const
289288
}
290289

291290

292-
ref_type Array::do_write_deep(_impl::ArrayWriterBase& out, bool only_if_modified) const
291+
ref_type Array::do_write_deep(_impl::ArrayWriterBase& out, bool only_if_modified, bool compress) const
293292
{
294293
// Temp array for updated refs
295294
Array new_array(Allocator::get_default());
@@ -304,7 +303,7 @@ ref_type Array::do_write_deep(_impl::ArrayWriterBase& out, bool only_if_modified
304303
bool is_ref = (value != 0 && (value & 1) == 0);
305304
if (is_ref) {
306305
ref_type subref = to_ref(value);
307-
ref_type new_subref = write(subref, m_alloc, out, only_if_modified); // Throws
306+
ref_type new_subref = write(subref, m_alloc, out, only_if_modified, compress); // Throws
308307
value = from_ref(new_subref);
309308
}
310309
new_array.add(value); // Throws
@@ -1334,3 +1333,46 @@ bool QueryStateFindAll<IntegerColumn>::match(size_t index) noexcept
13341333

13351334
return (m_limit > m_match_count);
13361335
}
1336+
1337+
void Array::typed_print(std::string prefix) const
1338+
{
1339+
std::cout << "Generic Array " << header_to_string(get_header()) << " @ " << m_ref;
1340+
if (!is_attached()) {
1341+
std::cout << " Unattached";
1342+
return;
1343+
}
1344+
if (size() == 0) {
1345+
std::cout << " Empty" << std::endl;
1346+
return;
1347+
}
1348+
std::cout << " size = " << size() << " {";
1349+
if (has_refs()) {
1350+
std::cout << std::endl;
1351+
for (unsigned n = 0; n < size(); ++n) {
1352+
auto pref = prefix + " " + to_string(n) + ":\t";
1353+
RefOrTagged rot = get_as_ref_or_tagged(n);
1354+
if (rot.is_ref() && rot.get_as_ref()) {
1355+
Array a(m_alloc);
1356+
a.init_from_ref(rot.get_as_ref());
1357+
std::cout << pref;
1358+
a.typed_print(pref);
1359+
}
1360+
else if (rot.is_tagged()) {
1361+
std::cout << pref << rot.get_as_int() << std::endl;
1362+
}
1363+
}
1364+
std::cout << prefix << "}" << std::endl;
1365+
}
1366+
else {
1367+
std::cout << " Leaf of unknown type }" << std::endl;
1368+
}
1369+
}
1370+
1371+
ref_type ArrayPayload::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc)
1372+
{
1373+
Array arr(alloc);
1374+
arr.init_from_ref(ref);
1375+
// By default we are not compressing
1376+
constexpr bool compress = false;
1377+
return arr.write(out, true, out.only_modified, compress);
1378+
}

src/realm/array.hpp

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -379,11 +379,12 @@ class Array : public Node, public ArrayParent {
379379
///
380380
/// \param only_if_modified Set to `false` to always write, or to `true` to
381381
/// only write the array if it has been modified.
382-
ref_type write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified) const;
382+
ref_type write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified, bool compress_in_flight) const;
383383

384384
/// Same as non-static write() with `deep` set to true. This is for the
385385
/// cases where you do not already have an array accessor available.
386-
static ref_type write(ref_type, Allocator&, _impl::ArrayWriterBase&, bool only_if_modified);
386+
static ref_type write(ref_type, Allocator&, _impl::ArrayWriterBase&, bool only_if_modified,
387+
bool compress_in_flight);
387388

388389
size_t find_first(int64_t value, size_t begin = 0, size_t end = size_t(-1)) const;
389390

@@ -459,6 +460,13 @@ class Array : public Node, public ArrayParent {
459460
Array& operator=(const Array&) = delete; // not allowed
460461
Array(const Array&) = delete; // not allowed
461462

463+
/// Takes a 64-bit value and returns the minimum number of bits needed
464+
/// to fit the value. For alignment this is rounded up to nearest
465+
/// log2. Possible results {0, 1, 2, 4, 8, 16, 32, 64}
466+
static size_t bit_width(int64_t value);
467+
468+
void typed_print(std::string prefix) const;
469+
462470
protected:
463471
// This returns the minimum value ("lower bound") of the representable values
464472
// for the given bit width. Valid widths are 0, 1, 2, 4, 8, 16, 32, and 64.
@@ -518,12 +526,6 @@ class Array : public Node, public ArrayParent {
518526
template <size_t w>
519527
int64_t get_universal(const char* const data, const size_t ndx) const;
520528

521-
protected:
522-
/// Takes a 64-bit value and returns the minimum number of bits needed
523-
/// to fit the value. For alignment this is rounded up to nearest
524-
/// log2. Posssible results {0, 1, 2, 4, 8, 16, 32, 64}
525-
static size_t bit_width(int64_t value);
526-
527529
protected:
528530
Getter m_getter = nullptr; // cached to avoid indirection
529531
const VTable* m_vtable = nullptr;
@@ -538,7 +540,7 @@ class Array : public Node, public ArrayParent {
538540

539541
private:
540542
ref_type do_write_shallow(_impl::ArrayWriterBase&) const;
541-
ref_type do_write_deep(_impl::ArrayWriterBase&, bool only_if_modified) const;
543+
ref_type do_write_deep(_impl::ArrayWriterBase&, bool only_if_modified, bool compress) const;
542544

543545
void _mem_usage(size_t& mem) const noexcept;
544546

@@ -552,6 +554,23 @@ class Array : public Node, public ArrayParent {
552554
friend class ArrayWithFind;
553555
};
554556

557+
class TempArray : public Array {
558+
public:
559+
TempArray(size_t sz, Type type = Type::type_HasRefs)
560+
: Array(Allocator::get_default())
561+
{
562+
create(type, false, sz);
563+
}
564+
~TempArray()
565+
{
566+
destroy();
567+
}
568+
ref_type write(_impl::ArrayWriterBase& out)
569+
{
570+
return Array::write(out, false, false, false);
571+
}
572+
};
573+
555574
// Implementation:
556575

557576

@@ -829,7 +848,7 @@ inline void Array::destroy_deep() noexcept
829848
m_data = nullptr;
830849
}
831850

832-
inline ref_type Array::write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified) const
851+
inline ref_type Array::write(_impl::ArrayWriterBase& out, bool deep, bool only_if_modified, bool compress) const
833852
{
834853
REALM_ASSERT(is_attached());
835854

@@ -839,10 +858,11 @@ inline ref_type Array::write(_impl::ArrayWriterBase& out, bool deep, bool only_i
839858
if (!deep || !m_has_refs)
840859
return do_write_shallow(out); // Throws
841860

842-
return do_write_deep(out, only_if_modified); // Throws
861+
return do_write_deep(out, only_if_modified, compress); // Throws
843862
}
844863

845-
inline ref_type Array::write(ref_type ref, Allocator& alloc, _impl::ArrayWriterBase& out, bool only_if_modified)
864+
inline ref_type Array::write(ref_type ref, Allocator& alloc, _impl::ArrayWriterBase& out, bool only_if_modified,
865+
bool compress)
846866
{
847867
if (only_if_modified && alloc.is_read_only(ref))
848868
return ref;
@@ -853,7 +873,7 @@ inline ref_type Array::write(ref_type ref, Allocator& alloc, _impl::ArrayWriterB
853873
if (!array.m_has_refs)
854874
return array.do_write_shallow(out); // Throws
855875

856-
return array.do_write_deep(out, only_if_modified); // Throws
876+
return array.do_write_deep(out, only_if_modified, compress); // Throws
857877
}
858878

859879
inline void Array::add(int_fast64_t value)

src/realm/array_integer.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@ class ArrayInteger : public Array, public ArrayPayload {
7070
}
7171
template <class cond>
7272
bool find(value_type value, size_t start, size_t end, QueryStateBase* state) const;
73+
74+
template <class T>
75+
static ref_type typed_write(ref_type ref, T& out, Allocator& alloc)
76+
{
77+
Array arr(alloc);
78+
arr.init_from_ref(ref);
79+
return arr.write(out, false, out.only_modified, out.compress);
80+
}
7381
};
7482

7583
class ArrayIntNull : public Array, public ArrayPayload {
@@ -139,6 +147,14 @@ class ArrayIntNull : public Array, public ArrayPayload {
139147

140148
size_t find_first(value_type value, size_t begin = 0, size_t end = npos) const;
141149

150+
template <class T>
151+
static ref_type typed_write(ref_type ref, T& out, Allocator& alloc)
152+
{
153+
Array arr(alloc);
154+
arr.init_from_ref(ref);
155+
return arr.write(out, false, out.only_modified, out.compress);
156+
}
157+
142158
protected:
143159
void avoid_null_collision(int64_t value);
144160

src/realm/array_mixed.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818

1919
#include <realm/array_mixed.hpp>
2020
#include <realm/array_basic.hpp>
21+
#include <realm/dictionary.hpp>
22+
#include <realm/impl/array_writer.hpp>
2123

2224
using namespace realm;
2325

@@ -328,6 +330,89 @@ void ArrayMixed::verify() const
328330
// TODO: Implement
329331
}
330332

333+
ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out, Allocator& alloc)
334+
{
335+
if (out.only_modified && alloc.is_read_only(top_ref))
336+
return top_ref;
337+
338+
ArrayRef top(alloc);
339+
top.init_from_ref(top_ref);
340+
size_t sz = top.size();
341+
TempArray written_leaf(sz);
342+
343+
/*
344+
Mixed stores its values using several different arrays. We need to take this into account in order to
345+
understand what we need to compress and what we can instead leave uncompressed.
346+
347+
The main subarrays are:
348+
349+
composite array : index 0
350+
int array : index 1
351+
pair_int array: index 2
352+
string array: index 3
353+
ref array: index 4
354+
key array: index 5
355+
356+
Description of each array:
357+
1. composite array: the data stored here is either a small int (< 32 bits) or an offset to one of
358+
the other arrays where the actual data is.
359+
2. int and pair_int arrays: these are used for storing integers, timestamps, floats, doubles,
360+
decimals and links. In general we can compress them, but we need to be careful: checking the col_type
361+
should prevent compressing data that we want to leave in the current format.
362+
3. string array is for strings and binary data (no compression for now)
363+
4. ref array stores refs to collections. These can only be BPlusTree<int, Mixed> or
364+
BPlusTree<string, Mixed>.
365+
5. key array stores unique identifiers for collections in mixed (integers that can be compressed)
366+
*/
367+
Array composite(alloc);
368+
composite.init_from_ref(top.get_as_ref(0));
369+
written_leaf.set_as_ref(0, composite.write(out, true, out.only_modified, false));
370+
for (size_t i = 1; i < sz; ++i) {
371+
auto ref = top.get(i);
372+
ref_type new_ref = ref;
373+
if (ref && !(out.only_modified && alloc.is_read_only(ref))) {
374+
if (i < 3) { // int, and pair_int
375+
// integer arrays
376+
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
377+
}
378+
else if (i == 4) { // collection in mixed
379+
ArrayRef arr_ref(alloc);
380+
arr_ref.init_from_ref(ref);
381+
auto ref_sz = arr_ref.size();
382+
TempArray written_ref_leaf(ref_sz);
383+
384+
for (size_t k = 0; k < ref_sz; k++) {
385+
ref_type new_sub_ref = 0;
386+
if (auto sub_ref = arr_ref.get(k)) {
387+
auto header = alloc.translate(sub_ref);
388+
// Now we have to find out if the nested collection is a
389+
// dictionary or a list. If the top array has a size of 2
390+
// and it is not a BPlusTree inner node, then it is a dictionary
391+
if (NodeHeader::get_size_from_header(header) == 2 &&
392+
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
393+
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
394+
}
395+
else {
396+
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
397+
}
398+
}
399+
written_ref_leaf.set_as_ref(k, new_sub_ref);
400+
}
401+
new_ref = written_ref_leaf.write(out);
402+
}
403+
else if (i == 5) { // unique keys associated to collections in mixed
404+
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
405+
}
406+
else {
407+
// we don't want to compress any of the rest, at least for now (strings will be needed)
408+
new_ref = Array::write(ref, alloc, out, out.only_modified, false);
409+
}
410+
}
411+
written_leaf.set(i, new_ref);
412+
}
413+
return written_leaf.write(out);
414+
}
415+
331416
void ArrayMixed::ensure_array_accessor(Array& arr, size_t ndx_in_parent) const
332417
{
333418
if (!arr.is_attached()) {

src/realm/array_mixed.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ class ArrayMixed : public ArrayPayload, private Array {
100100
int64_t get_key(size_t ndx) const;
101101

102102
void verify() const;
103+
static ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& outs, Allocator& alloc);
103104

104105
private:
105106
enum {

src/realm/array_timestamp.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <realm/array_timestamp.hpp>
2020
#include <realm/array_integer_tpl.hpp>
21+
#include <realm/impl/array_writer.hpp>
2122

2223
using namespace realm;
2324

@@ -244,4 +245,25 @@ void ArrayTimestamp::verify() const
244245
REALM_ASSERT(m_seconds.size() == m_nanoseconds.size());
245246
#endif
246247
}
248+
249+
ref_type ArrayTimestamp::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc)
250+
{
251+
// timestamps could be compressed, but the formats we support at the moment are not producing
252+
// noticeable gains.
253+
Array top(alloc);
254+
top.init_from_ref(ref);
255+
REALM_ASSERT_DEBUG(top.size() == 2);
256+
257+
TempArray written_top(2);
258+
259+
auto rot0 = top.get_as_ref_or_tagged(0);
260+
auto rot1 = top.get_as_ref_or_tagged(1);
261+
REALM_ASSERT_DEBUG(rot0.is_ref() && rot0.get_as_ref());
262+
REALM_ASSERT_DEBUG(rot1.is_ref() && rot1.get_as_ref());
263+
written_top.set_as_ref(0, Array::write(rot0.get_as_ref(), alloc, out, out.only_modified, false));
264+
written_top.set_as_ref(1, Array::write(rot1.get_as_ref(), alloc, out, out.only_modified, false));
265+
266+
return written_top.write(out);
267+
}
268+
247269
} // namespace realm

src/realm/array_timestamp.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ class ArrayTimestamp : public ArrayPayload, private Array {
108108
size_t find_first(Timestamp value, size_t begin, size_t end) const noexcept;
109109

110110
void verify() const;
111+
static ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc);
111112

112113
private:
113114
ArrayIntNull m_seconds;

0 commit comments

Comments
 (0)