Skip to content

Commit d44b7b5

Browse files
committed
[ntuple] add streaming vector tutorial
1 parent 7791565 commit d44b7b5

File tree

1 file changed

+199
-0
lines changed

1 file changed

+199
-0
lines changed
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/// \file
2+
/// \ingroup tutorial_ntuple
3+
///
4+
/// Example of a streaming vector: a special purpose container that reads large vectors piece-wise.
5+
///
6+
/// \macro_code
7+
///
8+
/// \date November 2024
9+
/// \author Peter van Gemmeren, the ROOT Team
10+
11+
#include <ROOT/RNTupleModel.hxx>
#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleReadOptions.hxx>
#include <ROOT/RNTupleRange.hxx>
#include <ROOT/RNTupleTypes.hxx>
#include <ROOT/RNTupleView.hxx>
#include <ROOT/RNTupleWriter.hxx>

#include <TRandom3.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <limits>
#include <utility>
#include <vector>
25+
26+
// Name of the ROOT file that the tutorial writes and subsequently reads back.
constexpr char const *kFileName = "ntpl016_streaming_vector.root";
// Name of the RNTuple stored inside that file.
constexpr char const *kNTupleName = "ntpl";
// Name of the single, vector-valued field of the RNTuple.
constexpr char const *kFieldName = "LargeVector";
// Number of entries written by CreateRNTuple().
constexpr unsigned int kNEvents = 10;
// Number of elements stored in the vector of every entry.
constexpr unsigned int kNElementsPerVector = 1000000;
31+
32+
// Create an RNTuple with a single vector field. Every entry contains a large vector of random integers.
33+
// The vector should be seen as too large to be held entirely in memory during reading.
34+
void CreateRNTuple()
35+
{
36+
auto model = ROOT::RNTupleModel::Create();
37+
auto ptrLargeVector = model->MakeField<std::vector<std::uint32_t>>(kFieldName);
38+
auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), kNTupleName, kFileName);
39+
40+
auto prng = std::make_unique<TRandom3>();
41+
prng->SetSeed();
42+
43+
for (ROOT::NTupleSize_t i = 0; i < kNEvents; i++) {
44+
ptrLargeVector->clear();
45+
for (std::size_t j = 0; j < kNElementsPerVector; j++)
46+
ptrLargeVector->emplace_back(prng->Integer(-1));
47+
writer->Fill();
48+
}
49+
std::cout << "RNTuple written" << std::endl;
50+
}
51+
52+
/*
53+
* ==================================================================================================
54+
*/
55+
56+
// For comparison, the canonical read function that reads the entire vector for every entry.
57+
void ReadRNTupleSimple()
58+
{
59+
auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName);
60+
61+
const auto nEntries = reader->GetNEntries();
62+
std::cout << "Simple reading, found " << nEntries << " entries" << std::endl;
63+
64+
auto ptrLargeVector = reader->GetModel().GetDefaultEntry().GetPtr<std::vector<std::uint32_t>>(kFieldName);
65+
for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) {
66+
reader->LoadEntry(i);
67+
68+
const auto vectorSize = ptrLargeVector->size();
69+
uint64_t sum = 0;
70+
for (auto val : *ptrLargeVector)
71+
sum += val;
72+
73+
std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl;
74+
}
75+
std::cout << "RNTuple simple read" << std::endl;
76+
}
77+
78+
/*
79+
* ==================================================================================================
80+
*/
81+
82+
// The StreamingVectorView class allows iteration over an RNTuple on-disk vector of element type T.
83+
// Unlike an std::vector, this class does not provide random-access but only allows to iterate the data elements
84+
// from beginning to end.
85+
// Internally, it uses an RNTupleCollection view and an item view to load chunks of the vector elements into memory,
86+
// so that never the entire vector needs to stay in memory.
87+
// Note that we don't need to implement loading chunks of data explicitly. Simply by asking for a single vector element
88+
// at every iteration step, the RNTuple views will take care of keeping only the currently required data pages
89+
// in memory. This results in the minimal possible memory footprint of RNTuple.
90+
// Note that for effective streaming, the cluster cache read option needs to be turned off. This may change in the
91+
// future with more fine-grained control of the data preloading.
92+
template <class T>
93+
class StreamingVectorView {
94+
// For a certain entry, the collection view provides the information about the size of the collection and
95+
// the index range of the item view, which is required to read the values of the collection at hand.
96+
ROOT::RNTupleCollectionView fVectorView;
97+
// The "data view" provides access to the vector elements
98+
ROOT::RNTupleView<T> fItemView;
99+
// Given an entry number, the start end end index in the item view to read the corresponding vector elements
100+
ROOT::RNTupleLocalRange fRange{ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex};
101+
// The index of the entry from which the vector should be read
102+
ROOT::NTupleSize_t fEntry{0};
103+
// The size of the collection in fEntry
104+
ROOT::NTupleSize_t fSize{0};
105+
106+
public:
107+
// A lightweight iterator used in StreamingVectorView::begin() and StreamingVectorView::end().
108+
// Used to iterate over the elements of an RNTuple on-disk vector for a certain entry.
109+
// Dereferencing the iterator returns the corresponding value of the item view.
110+
class Iterator {
111+
ROOT::RNTupleLocalRange::RIterator fRangeItr;
112+
ROOT::RNTupleView<T> &fView;
113+
114+
public:
115+
using iterator = Iterator;
116+
using iterator_category = std::input_iterator_tag;
117+
using value_type = T;
118+
using pointer = const T *;
119+
using reference = const T &;
120+
121+
Iterator(ROOT::RNTupleLocalRange::RIterator rangeItr, ROOT::RNTupleView<T> &view)
122+
: fRangeItr(rangeItr), fView(view)
123+
{
124+
}
125+
126+
iterator operator++(int) /* postfix */
127+
{
128+
auto r = *this;
129+
++(*this);
130+
return r;
131+
}
132+
iterator &operator++() /* prefix */
133+
{
134+
++fRangeItr;
135+
return *this;
136+
}
137+
reference operator*() { return fView.operator()(*fRangeItr); }
138+
pointer operator->() { return &fView.operator()(*fRangeItr); }
139+
bool operator==(const iterator &rh) const { return fRangeItr == rh.fRangeItr; }
140+
bool operator!=(const iterator &rh) const { return fRangeItr != rh.fRangeItr; }
141+
};
142+
143+
explicit StreamingVectorView(ROOT::RNTupleCollectionView vectorView)
144+
: fVectorView(std::move(vectorView)), fItemView(fVectorView.GetView<T>("_0"))
145+
{
146+
}
147+
148+
ROOT::NTupleSize_t size() const { return fSize; }
149+
150+
// The begin() and end() methods enable range-based for loops like `for (auto val : streamingVector)`
151+
Iterator begin() { return Iterator(fRange.begin(), fItemView); }
152+
Iterator end() { return Iterator(fRange.end(), fItemView); }
153+
154+
void LoadEntry(ROOT::NTupleSize_t entry)
155+
{
156+
fEntry = entry;
157+
fRange = fVectorView.GetCollectionRange(fEntry);
158+
fSize = fVectorView.operator()(fEntry);
159+
}
160+
};
161+
162+
// For the streaming vector read, we use a custom class `StreamingVectorView` that implements the piece-wise
163+
// loading of the data during iteration of elements of the on-disk vector. The class has been built such that
164+
// the event loop is almost identical to the simple reading case above.
165+
void ReadRNTupleStreamingVector()
166+
{
167+
ROOT::RNTupleReadOptions options;
168+
// Don't preload data; we want to populate data into memory only as needed
169+
options.SetClusterCache(ROOT::RNTupleReadOptions::EClusterCache::kOff);
170+
auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName, options);
171+
172+
const auto nEntries = reader->GetNEntries();
173+
std::cout << "Streamed reading, found " << nEntries << " entries" << std::endl;
174+
175+
StreamingVectorView<std::uint32_t> streamingVector(reader->GetCollectionView(kFieldName));
176+
177+
for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) {
178+
// Instead of `reader->LoadEntry()`, we tell the streaming vector which entry we want to read.
179+
streamingVector.LoadEntry(i);
180+
181+
// We can ask for the size of the vector without loading the data
182+
const auto vectorSize = streamingVector.size();
183+
184+
// The iteration works exactly as in the simple case
185+
uint64_t sum = 0;
186+
for (auto val : streamingVector)
187+
sum += val;
188+
189+
std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl;
190+
}
191+
std::cout << "RNTuple streaming read" << std::endl;
192+
}
193+
194+
void ntpl016_streaming_vector()
195+
{
196+
CreateRNTuple();
197+
ReadRNTupleSimple();
198+
ReadRNTupleStreamingVector();
199+
}

0 commit comments

Comments
 (0)