Skip to content

Commit 025fdcb

Browse files
committed
[df] Add data source for TTree processing
A new RDataSource-derived class is implemented for TTree data processing. The class follows almost entirely the same approach used for processing other data sources, with the exception of the multi-threaded (MT) case, which still needs to be partially handled externally by TTreeProcessorMT. Nonetheless, the class and its API can be used throughout the RDF codebase to centralise all TTree-related usage and processing. While implementing the class, a few missing features and limitations of RDataSource were found that required API extensions. For the moment, all the API extensions of RDataSource are kept out of the public interface; whether to make them public will be decided later.
1 parent dc4dd71 commit 025fdcb

File tree

10 files changed

+887
-2
lines changed

10 files changed

+887
-2
lines changed

tree/dataframe/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
4949
ROOT/RRootDS.hxx
5050
ROOT/RSnapshotOptions.hxx
5151
ROOT/RTrivialDS.hxx
52+
ROOT/RTTreeDS.hxx
5253
ROOT/RDF/ActionHelpers.hxx
5354
ROOT/RDF/ColumnReaderUtils.hxx
5455
ROOT/RDF/GraphNode.hxx
@@ -107,6 +108,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
107108
src/RCutFlowReport.cxx
108109
src/RDataFrame.cxx
109110
src/RDatasetSpec.cxx
111+
src/RDataSource.cxx
110112
src/RDFActionHelpers.cxx
111113
src/RDFColumnReaderUtils.cxx
112114
src/RDFColumnRegister.cxx
@@ -129,6 +131,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTDataFrame
129131
src/RSample.cxx
130132
src/RTreeColumnReader.cxx
131133
src/RResultPtr.cxx
134+
src/RTTreeDS.cxx
132135
src/RVariationBase.cxx
133136
src/RVariationReader.cxx
134137
src/RVariationsDescription.cxx

tree/dataframe/inc/ROOT/RDF/RLoopManager.hxx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class RDataSource;
4444
} // ns RDF
4545

4646
namespace Internal {
47+
class RSlotStack;
4748
namespace RDF {
4849
std::vector<std::string> GetBranchNames(TTree &t, bool allowDuplicates = true);
4950

@@ -312,6 +313,13 @@ public:
312313
{
313314
return fSuppressErrorsForMissingBranches;
314315
}
316+
317+
/// The task run by every thread on the input entry range, for the generic RDataSource.
318+
void DataSourceThreadTask(const std::pair<ULong64_t, ULong64_t> &entryRange, ROOT::Internal::RSlotStack &slotStack,
319+
std::atomic<ULong64_t> &entryCount);
320+
/// The task run by every thread on an entry range (known by the input TTreeReader), for the TTree data source.
321+
void
322+
TTreeThreadTask(TTreeReader &treeReader, ROOT::Internal::RSlotStack &slotStack, std::atomic<ULong64_t> &entryCount);
315323
};
316324

317325
/// \brief Create an RLoopManager that reads a TChain.

tree/dataframe/inc/ROOT/RDF/Utils.hxx

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ struct IsVector_t : public std::false_type {};
123123
template <typename T, typename A>
124124
struct IsVector_t<std::vector<T, A>> : public std::true_type {};
125125

126+
std::string GetBranchOrLeafTypeName(TTree &t, const std::string &colName);
127+
126128
const std::type_info &TypeName2TypeID(const std::string &name);
127129

128130
std::string TypeID2TypeName(const std::type_info &id);
@@ -314,6 +316,14 @@ struct CallGuaranteedOrder {
314316
f(std::forward<Args>(args)...);
315317
}
316318
};
319+
320+
template <typename T>
321+
auto MakeAliasedSharedPtr(T *rawPtr)
322+
{
323+
const static std::shared_ptr<T> fgRawPtrCtrlBlock;
324+
return std::shared_ptr<T>(fgRawPtrCtrlBlock, rawPtr);
325+
}
326+
317327
} // end NS RDF
318328
} // end NS Internal
319329
} // end NS ROOT

tree/dataframe/inc/ROOT/RDataSource.hxx

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,28 @@
1818

1919
#include <algorithm> // std::transform
2020
#include <cassert>
21+
#include <optional>
22+
#include <set>
2123
#include <string>
2224
#include <typeinfo>
25+
#include <unordered_map>
26+
#include <variant>
2327
#include <vector>
28+
#include <functional>
29+
30+
// Need to fwd-declare TTreeReader for CreateColumnReader
31+
class TTreeReader;
32+
namespace ROOT::Detail::RDF {
33+
class RLoopManager;
34+
}
2435

2536
namespace ROOT {
2637
namespace RDF {
2738
class RDataSource;
39+
class RSampleInfo;
40+
namespace Experimental {
41+
class RSample;
42+
}
2843
}
2944
}
3045

@@ -71,6 +86,23 @@ public:
7186
};
7287

7388
} // ns TDS
89+
90+
namespace RDF {
91+
std::string GetTypeNameWithOpts(const ROOT::RDF::RDataSource &ds, std::string_view colName, bool vector2RVec);
92+
const std::vector<std::string> &GetTopLevelFieldNames(const ROOT::RDF::RDataSource &ds);
93+
const std::vector<std::string> &GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &ds);
94+
void CallInitializeWithOpts(ROOT::RDF::RDataSource &ds, const std::set<std::string> &suppressErrorsForMissingColumns);
95+
std::string DescribeDataset(ROOT::RDF::RDataSource &ds);
96+
ROOT::RDF::RSampleInfo
97+
CreateSampleInfo(const ROOT::RDF::RDataSource &ds,
98+
const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &sampleMap);
99+
void RunFinalChecks(const ROOT::RDF::RDataSource &ds, bool nodesLeftNotRun);
100+
void ProcessMT(ROOT::RDF::RDataSource &ds, ROOT::Detail::RDF::RLoopManager &lm);
101+
std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
102+
CreateColumnReader(ROOT::RDF::RDataSource &ds, unsigned int slot, std::string_view col, const std::type_info &tid,
103+
TTreeReader *treeReader);
104+
} // namespace RDF
105+
74106
} // ns Internal
75107

76108
namespace RDF {
@@ -117,6 +149,57 @@ protected:
117149

118150
unsigned int fNSlots{};
119151

152+
std::optional<std::pair<ULong64_t, ULong64_t>> fGlobalEntryRange{};
153+
154+
friend std::string ROOT::Internal::RDF::GetTypeNameWithOpts(const RDataSource &, std::string_view, bool);
155+
/// Type name of column \p colName. This default ignores the boolean option and forwards to GetTypeName().
virtual std::string GetTypeNameWithOpts(std::string_view colName, bool /*vector2RVec*/) const
{
   return GetTypeName(colName);
}
156+
157+
friend const std::vector<std::string> &ROOT::Internal::RDF::GetTopLevelFieldNames(const ROOT::RDF::RDataSource &);
158+
/// Names of the top-level fields. This default returns the very same list as GetColumnNames().
virtual const std::vector<std::string> &GetTopLevelFieldNames() const
{
   return GetColumnNames();
}
159+
160+
friend const std::vector<std::string> &
161+
ROOT::Internal::RDF::GetColumnNamesNoDuplicates(const ROOT::RDF::RDataSource &);
162+
/// Column names without duplicates. This default returns the very same list as GetColumnNames().
virtual const std::vector<std::string> &GetColumnNamesNoDuplicates() const
{
   return GetColumnNames();
}
163+
164+
friend void ROOT::Internal::RDF::CallInitializeWithOpts(ROOT::RDF::RDataSource &, const std::set<std::string> &);
165+
/// Initialization entry point that also receives a set of column names.
/// This default discards the set and calls Initialize().
virtual void InitializeWithOpts(const std::set<std::string> & /*suppressErrorsForMissingColumns*/)
{
   Initialize();
}
166+
167+
friend std::string ROOT::Internal::RDF::DescribeDataset(ROOT::RDF::RDataSource &);
168+
/// Textual description of the dataset. This default prepends a fixed prefix to the result of GetLabel().
virtual std::string DescribeDataset()
{
   return "Dataframe from datasource " + GetLabel();
}
169+
170+
friend ROOT::RDF::RSampleInfo
171+
ROOT::Internal::RDF::CreateSampleInfo(const ROOT::RDF::RDataSource &,
172+
const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &);
173+
virtual ROOT::RDF::RSampleInfo
174+
CreateSampleInfo(const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &) const;
175+
176+
friend void ROOT::Internal::RDF::RunFinalChecks(const ROOT::RDF::RDataSource &, bool);
177+
/// Final-checks hook. The default implementation ignores its argument and does nothing.
virtual void RunFinalChecks(bool /*nodesLeftNotRun*/) const {}
178+
179+
friend void ROOT::Internal::RDF::ProcessMT(RDataSource &, ROOT::Detail::RDF::RLoopManager &);
180+
virtual void ProcessMT(ROOT::Detail::RDF::RLoopManager &);
181+
182+
friend std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
183+
ROOT::Internal::RDF::CreateColumnReader(ROOT::RDF::RDataSource &, unsigned int, std::string_view,
184+
const std::type_info &, TTreeReader *);
185+
/**
186+
* \brief Creates a column reader for the requested column
187+
*
188+
* In the general case, this is just a redirect to the right GetColumnReaders overload. The signature notably also
189+
* has a TTreeReader * parameter. This is currently necessary to still allow the TTree-based MT scheduling via
190+
* TTreeProcessorMT. We use the TTreeProcessorMT::Process method to launch the same kernel across all threads. In
191+
* each thread task, TTreeProcessorMT creates a thread-local instance of a TTreeReader which is going to read the
192+
* range of events assigned to that task. That TTreeReader instance is what is passed to this method whenever a
193+
* column reader needs to be created in a thread task. In the future this method might be removed by either allowing
194+
* to request a handle to the thread-local TTreeReader instance programmatically from the TTreeProcessorMT, or
195+
* refactoring the TTreeProcessorMT scheduling into RTTreeDS altogether.
196+
*/
197+
virtual std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
198+
CreateColumnReader(unsigned int slot, std::string_view col, const std::type_info &tid, TTreeReader *)
199+
{
200+
return GetColumnReaders(slot, col, tid);
201+
}
202+
120203
public:
121204
RDataSource() = default;
122205
// Rule of five
@@ -242,6 +325,13 @@ public:
242325
/// Concrete datasources can override the default implementation.
243326
virtual std::string GetLabel() { return "Custom Datasource"; }
244327

328+
/// \brief Restrict processing to a [begin, end) range of entries.
329+
/// \param entryRange The range of entries to process.
330+
virtual void SetGlobalEntryRange(std::pair<ULong64_t, ULong64_t> entryRange)
331+
{
332+
fGlobalEntryRange = std::move(entryRange);
333+
};
334+
245335
protected:
246336
/// type-erased vector of pointers to pointers to column values - one per slot
247337
virtual Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) = 0;
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
/**
2+
\file ROOT/RTTreeDS.hxx
3+
\ingroup dataframe
4+
\author Vincenzo Eduardo Padulano
5+
\date 2024-12
6+
*/
7+
8+
/*************************************************************************
9+
* Copyright (C) 1995-2024, Rene Brun and Fons Rademakers. *
10+
* All rights reserved. *
11+
* *
12+
* For the licensing terms see $ROOTSYS/LICENSE. *
13+
* For the list of contributors see $ROOTSYS/README/CREDITS. *
14+
*************************************************************************/
15+
16+
#ifndef ROOT_INTERNAL_RDF_RTTREEDS
17+
#define ROOT_INTERNAL_RDF_RTTREEDS
18+
19+
#include "ROOT/RDataSource.hxx"
20+
21+
#include <memory>
22+
#include <string>
23+
#include <vector>
24+
#include <stdexcept>
25+
#include <string_view>
26+
27+
// Begin forward decls
28+
29+
namespace ROOT {
30+
class RDataFrame;
31+
}
32+
33+
namespace ROOT::Detail::RDF {
34+
class RLoopManager;
35+
}
36+
37+
namespace ROOT::RDF {
38+
class RSampleInfo;
39+
}
40+
41+
namespace ROOT::RDF::Experimental {
42+
class RSample;
43+
}
44+
45+
namespace ROOT::TreeUtils {
46+
struct RFriendInfo;
47+
}
48+
49+
class TChain;
50+
class TDirectory;
51+
class TTree;
52+
class TTreeReader;
53+
54+
// End forward decls
55+
56+
namespace ROOT::Internal::RDF {
57+
58+
class RTTreeDS final : public ROOT::RDF::RDataSource {
59+
std::vector<std::string> fBranchNamesWithDuplicates{};
60+
std::vector<std::string> fBranchNamesWithoutDuplicates{};
61+
std::vector<std::string> fTopLevelBranchNames{};
62+
63+
std::shared_ptr<TTree> fTree;
64+
65+
std::unique_ptr<TTreeReader> fTreeReader;
66+
67+
std::vector<std::unique_ptr<TChain>> fFriends;
68+
69+
ROOT::RDF::RSampleInfo
70+
CreateSampleInfo(const std::unordered_map<std::string, ROOT::RDF::Experimental::RSample *> &sampleMap) const final;
71+
72+
void RunFinalChecks(bool nodesLeftNotRun) const final;
73+
74+
void Setup(std::shared_ptr<TTree> &&tree, const ROOT::TreeUtils::RFriendInfo *friendInfo = nullptr);
75+
76+
std::vector<std::pair<ULong64_t, ULong64_t>> GetTTreeEntryRange(TTree &tree);
77+
std::vector<std::pair<ULong64_t, ULong64_t>> GetTChainEntryRange(TChain &chain);
78+
79+
public:
80+
RTTreeDS(std::shared_ptr<TTree> tree);
81+
RTTreeDS(std::shared_ptr<TTree> tree, const ROOT::TreeUtils::RFriendInfo &friendInfo);
82+
RTTreeDS(std::string_view treeName, TDirectory *dirPtr);
83+
RTTreeDS(std::string_view treeName, std::string_view fileNameGlob);
84+
RTTreeDS(std::string_view treeName, const std::vector<std::string> &fileNameGlobs);
85+
86+
// Rule of five
87+
RTTreeDS(const RTTreeDS &) = delete;
88+
RTTreeDS &operator=(const RTTreeDS &) = delete;
89+
RTTreeDS(RTTreeDS &&) = delete;
90+
RTTreeDS &operator=(RTTreeDS &&) = delete;
91+
~RTTreeDS() final; // Define destructor where data member types are defined
92+
93+
void Initialize() final;
94+
95+
void Finalize() final;
96+
97+
std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
98+
99+
const std::vector<std::string> &GetColumnNames() const final { return fBranchNamesWithDuplicates; }
100+
101+
bool HasColumn(std::string_view colName) const final
102+
{
103+
return std::find(fBranchNamesWithDuplicates.begin(), fBranchNamesWithDuplicates.end(), colName) !=
104+
fBranchNamesWithDuplicates.end();
105+
}
106+
107+
std::string GetTypeName(std::string_view colName) const final;
108+
109+
std::string GetTypeNameWithOpts(std::string_view colName, bool vector2RVec) const final;
110+
111+
bool SetEntry(unsigned int, ULong64_t entry) final;
112+
113+
Record_t GetColumnReadersImpl(std::string_view /* name */, const std::type_info & /* ti */) final
114+
{
115+
// This datasource uses the newer GetColumnReaders() API
116+
return {};
117+
}
118+
119+
std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
120+
GetColumnReaders(unsigned int, std::string_view, const std::type_info &) final
121+
{
122+
// This data source creates column readers via CreateColumnReader
123+
throw std::runtime_error("GetColumnReaders should not be called on this data source, something wrong happened!");
124+
}
125+
126+
std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase> CreateColumnReader(unsigned int slot, std::string_view col,
127+
const std::type_info &tid,
128+
TTreeReader *treeReader) final;
129+
130+
std::string GetLabel() final { return "TTreeDS"; }
131+
132+
TTree *GetTree();
133+
134+
const std::vector<std::string> &GetTopLevelFieldNames() const final { return fTopLevelBranchNames; }
135+
136+
const std::vector<std::string> &GetColumnNamesNoDuplicates() const final { return fBranchNamesWithoutDuplicates; }
137+
138+
void InitializeWithOpts(const std::set<std::string> &suppressErrorsForMissingBranches) final;
139+
140+
std::string DescribeDataset() final;
141+
142+
std::string AsString() final { return "TTree data source"; }
143+
144+
std::size_t GetNFiles() const final;
145+
146+
void ProcessMT(ROOT::Detail::RDF::RLoopManager &lm) final;
147+
};
148+
149+
ROOT::RDataFrame FromTTree(std::string_view treeName, std::string_view fileNameGlob);
150+
ROOT::RDataFrame FromTTree(std::string_view treeName, const std::vector<std::string> &fileNameGlobs);
151+
152+
} // namespace ROOT::Internal::RDF
153+
154+
#endif

0 commit comments

Comments
 (0)