diff --git a/tree/ntuple/inc/ROOT/RNTupleRange.hxx b/tree/ntuple/inc/ROOT/RNTupleRange.hxx index d1a67e5e663ba..8b51f949a140c 100644 --- a/tree/ntuple/inc/ROOT/RNTupleRange.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleRange.hxx @@ -80,9 +80,9 @@ public: // clang-format on class RNTupleLocalRange { private: - const ROOT::DescriptorId_t fClusterId; - const ROOT::NTupleSize_t fStart; - const ROOT::NTupleSize_t fEnd; + ROOT::DescriptorId_t fClusterId; + ROOT::NTupleSize_t fStart; + ROOT::NTupleSize_t fEnd; public: class RIterator { diff --git a/tree/ntuple/inc/ROOT/RNTupleView.hxx b/tree/ntuple/inc/ROOT/RNTupleView.hxx index 572420bce5d8b..0e4e59751b715 100644 --- a/tree/ntuple/inc/ROOT/RNTupleView.hxx +++ b/tree/ntuple/inc/ROOT/RNTupleView.hxx @@ -362,9 +362,20 @@ private: public: RNTupleCollectionView(const RNTupleCollectionView &other) = delete; - RNTupleCollectionView(RNTupleCollectionView &&other) = default; RNTupleCollectionView &operator=(const RNTupleCollectionView &other) = delete; - RNTupleCollectionView &operator=(RNTupleCollectionView &&other) = default; + RNTupleCollectionView(RNTupleCollectionView &&other) + : fSource(other.fSource), fField(std::move(other.fField)), fValue(fField.CreateValue()) + { + } + RNTupleCollectionView &operator=(RNTupleCollectionView &&other) + { + if (this == &other) + return *this; + std::swap(fSource, other.fSource); + std::swap(fField, other.fField); + fValue = fField.CreateValue(); + return *this; + } ~RNTupleCollectionView() = default; ROOT::RNTupleLocalRange GetCollectionRange(ROOT::NTupleSize_t globalIndex) diff --git a/tutorials/io/ntuple/ntpl016_streaming_vector.C b/tutorials/io/ntuple/ntpl016_streaming_vector.C new file mode 100644 index 0000000000000..4a5ad4f2499f6 --- /dev/null +++ b/tutorials/io/ntuple/ntpl016_streaming_vector.C @@ -0,0 +1,191 @@ +/// \file +/// \ingroup tutorial_ntuple +/// +/// Example of a streaming vector: a special purpose container that reads large vectors piece-wise. +/// +/// \macro_code +/// +/// \date November 2024 +/// \author Peter van Gemmeren, the ROOT Team + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +constexpr char const *kFileName = "ntpl015_streaming_vector.root"; +constexpr char const *kNTupleName = "ntpl"; +constexpr char const *kFieldName = "LargeVector"; +constexpr unsigned int kNEvents = 10; +constexpr unsigned int kVectorSize = 1000000; + +// Create an RNTuple with a single vector field. Every entry contains a large vector of random integers. +// The vector should be seen as too large to be held entirely in memory during reading. +void CreateRNTuple() +{ + auto model = ROOT::RNTupleModel::Create(); + auto ptrLargeVector = model->MakeField>(kFieldName); + auto writer = ROOT::RNTupleWriter::Recreate(std::move(model), kNTupleName, kFileName); + + auto prng = std::make_unique(); + prng->SetSeed(); + + for (ROOT::NTupleSize_t i = 0; i < kNEvents; i++) { + ptrLargeVector->clear(); + for (std::size_t j = 0; j < kVectorSize; j++) + ptrLargeVector->emplace_back(prng->Integer(-1)); + writer->Fill(); + } + std::cout << "RNTuple written" << std::endl; +} + +/* + * ================================================================================================== + */ + +// For comparison, the canonical read function that reads the entire vector for every entry. +void ReadRNTupleSimple() +{ + auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName); + + const auto nEntries = reader->GetNEntries(); + std::cout << "Simple reading, found " << nEntries << " entries" << std::endl; + + auto ptrLargeVector = reader->GetModel().GetDefaultEntry().GetPtr>(kFieldName); + for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) { + reader->LoadEntry(i); + + const auto vectorSize = ptrLargeVector->size(); + uint64_t sum = 0; + for (auto val : *ptrLargeVector) + sum += val; + + std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl; + } + std::cout << "RNTuple simple read" << std::endl; +} + +/* + * ================================================================================================== + */ + +// The StreamingVectorView class allows iteration over an RNTuple on-disk vector of element type T. +// Unlike an std::vector, this class does not provide random-access but only allows to iterate the data elements +// from beginning to end. +// Internally, it uses an RNTupleCollection view and an item view to load chunks of the vector elements into memory, +// so that never the entire vector needs to stay in memory. +// Note that we don't need to implement loading chunks of data explicitly. Simply by asking for a single vector element +// at every iteration step, the RNTuple views will take care of keeping only the currently required data pages +// in memory. +template +class StreamingVectorView { + // For a certain entry, the collection view provides the information about the size of the collection and + // the index range of the item view, which is required to read the values of the collection at hand. + ROOT::RNTupleCollectionView fVectorView; + // The "data view" provides access to the vector elements + ROOT::RNTupleView fItemView; + // Given an entry number, the start end end index in the item view to read the corresponding vector elements + ROOT::RNTupleLocalRange fRange{ROOT::kInvalidDescriptorId, ROOT::kInvalidNTupleIndex, ROOT::kInvalidNTupleIndex}; + // The index of the entry from which the vector should be read + ROOT::NTupleSize_t fEntry{0}; + // The size of the collection in fEntry + ROOT::NTupleSize_t fSize{0}; + +public: + // A lightweight iterator used in StreamingVectorView::begin() and StreamingVectorView::end(). + // Used to iterate over the elements of an RNTuple on-disk vector for a certain entry. + // Dereferencing the iterator returns the corresponding value of the item view. + class iterator { + ROOT::RNTupleLocalRange::RIterator fRangeItr; + ROOT::RNTupleView &fView; + + public: + iterator(ROOT::RNTupleLocalRange::RIterator rangeItr, ROOT::RNTupleView &view) + : fRangeItr(rangeItr), fView(view) + { + } + + iterator operator++(int) /* postfix */ + { + auto r = *this; + ++(*this); + return r; + } + iterator &operator++() /* prefix */ + { + ++fRangeItr; + return *this; + } + const T &operator*() { return fView.operator()(*fRangeItr); } + const T *operator->() { return &fView.operator()(*fRangeItr); } + bool operator==(const iterator &rh) const { return fRangeItr == rh.fRangeItr; } + bool operator!=(const iterator &rh) const { return fRangeItr != rh.fRangeItr; } + }; + + explicit StreamingVectorView(ROOT::RNTupleCollectionView vectorView) + : fVectorView(std::move(vectorView)), fItemView(fVectorView.GetView("_0")) + { + } + + ROOT::NTupleSize_t size() const { return fSize; } + + // The begin() and end() methods enable range-based for loops like `for (auto val : streamingVector)` + iterator begin() { return iterator(fRange.begin(), fItemView); } + iterator end() { return iterator(fRange.end(), fItemView); } + + void LoadEntry(ROOT::NTupleSize_t entry) + { + fEntry = entry; + fRange = fVectorView.GetCollectionRange(fEntry); + fSize = fVectorView.operator()(fEntry); + } +}; + +// For the streaming vector read, we use a custom class `StreamingVectorView` that implements the piece-wise +// loading of the data during iteration of elements of the on-disk vector. The class has been built such that +// the event loop is almost identical to the simple reading case above. +void ReadRNTupleStreamingVector() +{ + ROOT::RNTupleReadOptions options; + // Don't preload data; we want to populate data into memory only as needed + options.SetClusterCache(ROOT::RNTupleReadOptions::EClusterCache::kOff); + auto reader = ROOT::RNTupleReader::Open(kNTupleName, kFileName, options); + + const auto nEntries = reader->GetNEntries(); + std::cout << "Streamed reading, found " << nEntries << " entries" << std::endl; + + StreamingVectorView streamingVector(reader->GetCollectionView(kFieldName)); + + for (ROOT::NTupleSize_t i = 0; i < nEntries; i++) { + // Instead of `reader->LoadEntry()`, we tell the streaming vector which entry we want to read. + streamingVector.LoadEntry(i); + + // We can ask for the size of the vector without loading the data + const auto vectorSize = streamingVector.size(); + + // The iteration works exactly as in the simple case + uint64_t sum = 0; + for (auto val : streamingVector) + sum += val; + + std::cout << "Size and sum of vector: " << vectorSize << " " << sum << std::endl; + } + std::cout << "RNTuple streaming read" << std::endl; +} + +void ntpl016_streaming_vector() +{ + CreateRNTuple(); + ReadRNTupleSimple(); + ReadRNTupleStreamingVector(); +}