|
| 1 | +/// \file |
| 2 | +/// \ingroup tutorial_ntuple |
| 3 | +/// |
| 4 | +/// Example of a streaming vector: a special purpose container that reads large vectors piece-wise. |
| 5 | +/// |
| 6 | +/// \macro_code |
| 7 | +/// |
| 8 | +/// \date November 2024 |
| 9 | +/// \author Peter van Gemmeren, the ROOT Team |
| 10 | + |
| 11 | +#include <ROOT/RNTupleModel.hxx> |
| 12 | +#include <ROOT/RNTupleReader.hxx> |
| 13 | +#include <ROOT/RNTupleReadOptions.hxx> |
| 14 | +#include <ROOT/RNTupleRange.hxx> |
| 15 | +#include <ROOT/RNTupleTypes.hxx> |
| 16 | +#include <ROOT/RNTupleView.hxx> |
| 17 | +#include <ROOT/RNTupleWriter.hxx> |
| 18 | + |
| 19 | +#include <TRandom3.h> |
| 20 | + |
| 21 | +#include <cstdint> |
| 22 | +#include <iostream> |
| 23 | +#include <vector> |
| 24 | +#include <utility> |
| 25 | + |
// Name of the ROOT file written by CreateRNTuple() and read back by the two read functions.
// The prefix matches the number of the macro's entry point, ntpl016_streaming_vector().
constexpr char const *kFileName = "ntpl016_streaming_vector.root";
// Name of the RNTuple stored in that file
constexpr char const *kNTupleName = "ntpl";
// Name of the single vector field of the RNTuple
constexpr char const *kFieldName = "LargeVector";
// Number of entries written to the RNTuple
constexpr unsigned int kNEvents = 10;
// Number of elements stored in the vector of every entry
constexpr unsigned int kVectorSize = 1000000;
| 31 | + |
| 32 | +// Create an RNTuple with a single vector field. Every entry contains a large vector of random integers. |
| 33 | +// The vector should be seen as too large to be held entirely in memory during reading. |
| 34 | +void CreateRNTuple() |
| 35 | +{ |
| 36 | + auto model = ROOT::RNTupleModel::Create(); |
| 37 | + auto ptrLargeVector = model->MakeField<std::vector<std::uint32_t>>(kFieldName); |
| 38 | + auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), kNTupleName, kFileName); |
| 39 | + |
| 40 | + auto prng = std::make_unique<TRandom3>(); |
| 41 | + prng->SetSeed(); |
| 42 | + |
| 43 | + for (ROOT::NTupleSize_t i = 0; i < kNEvents; i++) { |
| 44 | + ptrLargeVector->clear(); |
| 45 | + for (std::size_t j = 0; j < kVectorSize; j++) |
| 46 | + ptrLargeVector->emplace_back(prng->Integer(-1)); |
| 47 | + writer->Fill(); |
| 48 | + } |
| 49 | + std::cout << "RNTuple written" << std::endl; |
| 50 | +} |
| 51 | + |
| 52 | +/* |
| 53 | + * ================================================================================================== |
| 54 | + */ |
| 55 | + |
| 56 | +// For comparison, the canonical read function that reads the entire vector for every entry. |
| 57 | +void ReadRNTupleSimple() |
| 58 | +{ |
| 59 | + auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName); |
| 60 | + |
| 61 | + const auto nEntries = reader->GetNEntries(); |
| 62 | + std::cout << "Simple reading, found " << nEntries << " entries" << std::endl; |
| 63 | + |
| 64 | + auto ptrLargeVector = reader->GetModel().GetDefaultEntry().GetPtr<std::vector<std::uint32_t>>(kFieldName); |
| 65 | + for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) { |
| 66 | + reader->LoadEntry(i); |
| 67 | + |
| 68 | + const auto vectorSize = ptrLargeVector->size(); |
| 69 | + uint64_t sum = 0; |
| 70 | + for (auto val : *ptrLargeVector) |
| 71 | + sum += val; |
| 72 | + |
| 73 | + std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl; |
| 74 | + } |
| 75 | + std::cout << "RNTuple simple read" << std::endl; |
| 76 | +} |
| 77 | + |
| 78 | +/* |
| 79 | + * ================================================================================================== |
| 80 | + */ |
| 81 | + |
| 82 | +// The StreamingVector class allows iteration over an RNTuple on-disk vector of element type T. |
| 83 | +// Unlike an std::vector, this class does not provide random-access but only allows to iterate the data elements |
| 84 | +// from beginning to end. |
| 85 | +// Internally, it uses an RNTupleCollection view and an item view to load chunks of the vector elements into memory, |
| 86 | +// so that never the entire vector needs to stay in memory. |
| 87 | +// Note that we don't need to implement loading chunks of data explicitly. Simply by asking for a single vector element |
| 88 | +// at every iteration step, the RNTuple views will take care of keeping only the currently required data pages |
| 89 | +// in memory. |
| 90 | +template <class T> |
| 91 | +class StreamingVector { |
| 92 | + // For a certain entry, the collection view provides the information about the size of the collection and |
| 93 | + // the index range of the item view, which is required to read the values of the collection at hand. |
| 94 | + ROOT::RNTupleCollectionView fVectorView; |
| 95 | + // The "data view" provides access to the vector elements |
| 96 | + ROOT::RNTupleView<T> fItemView; |
| 97 | + // Given an entry number, the start end end index in the item view to read the corresponding vector elements |
| 98 | + ROOT::RNTupleLocalRange fRange{ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex}; |
| 99 | + // The index of the entry from which the vector should be read |
| 100 | + ROOT::NTupleSize_t fEntry{0}; |
| 101 | + // The size of the collection in fEntry |
| 102 | + ROOT::NTupleSize_t fSize{0}; |
| 103 | + |
| 104 | +public: |
| 105 | + // A lightweight iterator used in StreamingVector::begin() and StreamingVector::end(). Used to iterate over the |
| 106 | + // elements of an RNTuple on-disk vector for a certain entry. Dereferencing the iterator returns the corresponding |
| 107 | + // value of the item view. |
| 108 | + class iterator { |
| 109 | + ROOT::RNTupleLocalRange::RIterator fRangeItr; |
| 110 | + ROOT::RNTupleView<T> &fView; |
| 111 | + |
| 112 | + public: |
| 113 | + iterator(ROOT::RNTupleLocalRange::RIterator rangeItr, ROOT::RNTupleView<T> &view) |
| 114 | + : fRangeItr(rangeItr), fView(view) |
| 115 | + { |
| 116 | + } |
| 117 | + |
| 118 | + iterator operator++(int) /* postfix */ |
| 119 | + { |
| 120 | + auto r = *this; |
| 121 | + ++(*this); |
| 122 | + return r; |
| 123 | + } |
| 124 | + iterator &operator++() /* prefix */ |
| 125 | + { |
| 126 | + ++fRangeItr; |
| 127 | + return *this; |
| 128 | + } |
| 129 | + const T &operator*() { return fView.operator()(*fRangeItr); } |
| 130 | + const T *operator->() { return &fView.operator()(*fRangeItr); } |
| 131 | + bool operator==(const iterator &rh) const { return fRangeItr == rh.fRangeItr; } |
| 132 | + bool operator!=(const iterator &rh) const { return fRangeItr != rh.fRangeItr; } |
| 133 | + }; |
| 134 | + |
| 135 | + explicit StreamingVector(ROOT::RNTupleCollectionView vectorView) |
| 136 | + : fVectorView(std::move(vectorView)), fItemView(fVectorView.GetView<T>("_0")) |
| 137 | + { |
| 138 | + } |
| 139 | + |
| 140 | + //StreamingVector(ROOT::RNTupleReader &reader, std::string_view fieldName) |
| 141 | + // : fVectorView(reader.GetCollectionView(fieldName)), fItemView(fVectorView.GetView<T>("_0")), |
| 142 | + // fRange(ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex) |
| 143 | + //{ |
| 144 | + //} |
| 145 | + |
| 146 | + ROOT::NTupleSize_t size() const { return fSize; } |
| 147 | + |
| 148 | + // The begin() and end() methods enable range-based for loops like `for (auto val : streamingVector)` |
| 149 | + iterator begin() { return iterator(fRange.begin(), fItemView); } |
| 150 | + iterator end() { return iterator(fRange.end(), fItemView); } |
| 151 | + |
| 152 | + void LoadEntry(ROOT::NTupleSize_t entry) |
| 153 | + { |
| 154 | + fEntry = entry; |
| 155 | + fRange = fVectorView.GetCollectionRange(fEntry); |
| 156 | + fSize = fVectorView.operator()(fEntry); |
| 157 | + } |
| 158 | +}; |
| 159 | + |
| 160 | +// For the streaming vector read, we use a custom class `StreamingVector` that implements the piece-wise |
| 161 | +// loading of the data during iteration of elements of the on-disk vector. The class has been built such that |
| 162 | +// the event loop is almost identical to the simple reading case above. |
| 163 | +void ReadRNTupleStreamingVector() |
| 164 | +{ |
| 165 | + ROOT::RNTupleReadOptions options; |
| 166 | + // Don't preload data; we want to populate data into memory only as needed |
| 167 | + options.SetClusterCache(ROOT::RNTupleReadOptions::EClusterCache::kOff); |
| 168 | + auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName, options); |
| 169 | + |
| 170 | + const auto nEntries = reader->GetNEntries(); |
| 171 | + std::cout << "Streamed reading, found " << nEntries << " entries" << std::endl; |
| 172 | + |
| 173 | + StreamingVector<std::uint32_t> streamingVector(reader->GetCollectionView(kFieldName)); |
| 174 | + |
| 175 | + for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) { |
| 176 | + // Instead of `reader->LoadEntry()`, we tell the streaming vector which entry we want to read. |
| 177 | + streamingVector.LoadEntry(i); |
| 178 | + |
| 179 | + // We can ask for the size of the vector without loading the data |
| 180 | + const auto vectorSize = streamingVector.size(); |
| 181 | + |
| 182 | + // The iteration works exactly as in the simple case |
| 183 | + uint64_t sum = 0; |
| 184 | + for (auto val : streamingVector) |
| 185 | + sum += val; |
| 186 | + |
| 187 | + std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl; |
| 188 | + } |
| 189 | + std::cout << "RNTuple streaming read" << std::endl; |
| 190 | +} |
| 191 | + |
| 192 | +void ntpl016_streaming_vector() |
| 193 | +{ |
| 194 | + CreateRNTuple(); |
| 195 | + ReadRNTupleSimple(); |
| 196 | + ReadRNTupleStreamingVector(); |
| 197 | +} |
0 commit comments