Skip to content

Commit e271749

Browse files
authored
Store the collection information in a struct instead of a tuple (#711)
* Make tuple a proper struct * Rename struct to better match naming convention * Keep the ROOTReader functional for older versions * Add name to CollectionWriteInfo Makes it possible to drop the collection id table storage * Make RNTuple output the same as TTree based one * Cleanup and minor refactoring * Add storage type to meta info * Update documentation
1 parent d8e4d94 commit e271749

File tree

13 files changed

+154
-128
lines changed

13 files changed

+154
-128
lines changed

doc/storage_details.md

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,23 +62,29 @@ there will be the following branches per supported type
6262

6363
The podio related metadata, stored in the `podio_metadata` `TTree` (or
6464
`RNTupleModel`) contains the following general information once per file
65-
6665
- The version of podio that has been used to write this file
6766
- The complete datamodel definitions for each datamodel that was encountered
6867
when writing the file
6968

70-
And the following information once per category
71-
- The mapping of collection names to collection IDs
72-
- The types of all the stored collections
73-
- The schema version of all stored collections
74-
- Which collections are stored as subset collections
75-
76-
Here the `TTree` based and `RNTuple` based backends differ slightly in the way
77-
these data are stored exactly. The `TTree` based backend stores the data in a
78-
slightly more structured way, taking advantage of ROOTs capabilities to stream
79-
out more complex object, e.g. the `podio::CollectionIDTable` is streamed as a
80-
whole. The `RNTuple` based backend on the other hand, destructures the
81-
information into separate fields that run in parallel.
69+
And the following information once per category for each collection in that category
70+
- The collection ID
71+
- The collection name
72+
- The collection type
73+
- Whether the collection is a subset collection
74+
- The collection schema version
75+
- The collection storage type (which is different from the collection type and
76+
describes the format in which the data is actually stored rather than how it
77+
can be accessed in memory)
78+
79+
From a technical point of view this information is stored as a
80+
`std::vector<podio::root_utils::CollectionWriteInfo>`.
81+
82+
```{note}
83+
The exact details of how this information is stored in podio files have changed
84+
several times. The readers provided by podio handle these changes transparently,
85+
but other readers might have to adapt to these changes. **Notable changes
86+
happened before v01-00, and v01-03.**
87+
```
8288

8389
## SIO
8490

include/podio/RNTupleReader.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "podio/SchemaEvolution.h"
66
#include "podio/podioVersion.h"
77
#include "podio/utilities/DatamodelRegistryIOHelpers.h"
8+
#include "podio/utilities/RootHelpers.h"
89

910
#include <string>
1011
#include <string_view>
@@ -171,15 +172,8 @@ class RNTupleReader {
171172
std::unordered_map<std::string, std::vector<unsigned>> m_readerEntries{};
172173
std::unordered_map<std::string, unsigned> m_totalEntries{};
173174

174-
struct CollectionInfo {
175-
std::vector<unsigned int> id{};
176-
std::vector<std::string> name{};
177-
std::vector<std::string> type{};
178-
std::vector<short> isSubsetCollection{};
179-
std::vector<SchemaVersionT> schemaVersion{};
180-
};
181-
182-
std::unordered_map<std::string, CollectionInfo> m_collectionInfo{};
175+
/// Map each category to the collections that have been written and are available
176+
std::unordered_map<std::string, std::vector<podio::root_utils::CollectionWriteInfo>> m_collectionInfo{};
183177

184178
std::vector<std::string> m_availableCategories{};
185179

include/podio/RNTupleWriter.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,9 @@ class RNTupleWriter {
117117
struct CategoryInfo {
118118
std::unique_ptr<root_compat::RNTupleWriter> writer{nullptr}; ///< The RNTupleWriter for this category
119119

120-
// The following are assumed to run in parallel!
121-
std::vector<uint32_t> ids{}; ///< The ids of all collections
122-
std::vector<std::string> names{}; ///< The names of all collections
123-
std::vector<std::string> types{}; ///< The types of all collections
124-
std::vector<short> subsetCollections{}; ///< The flags identifying the subcollections
125-
std::vector<SchemaVersionT> schemaVersions{}; ///< The schema versions of all collections
120+
/// Collection info for this category
121+
std::vector<root_utils::CollectionWriteInfo> collInfo{};
122+
std::vector<std::string> names{}; ///< The names of all collections to write
126123

127124
// Storage for the keys & values of all the parameters of this category
128125
// (resp. at least the current entry)

include/podio/ROOTLegacyReader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class ROOTLegacyReader {
116116
private:
117117
std::pair<TTree*, unsigned> getLocalTreeAndEntry(const std::string& treename);
118118

119-
void createCollectionBranches(const std::vector<root_utils::CollectionWriteInfoT>& collInfo);
119+
void createCollectionBranches(const std::vector<root_utils::CollectionWriteInfo>& collInfo);
120120

121121
podio::GenericParameters readEventMetaData();
122122

include/podio/ROOTWriter.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,10 @@ class ROOTWriter {
103103
/// Helper struct to group together all necessary state to write / process a
104104
/// given category. Created during the first writing of a category
105105
struct CategoryInfo {
106-
TTree* tree{nullptr}; ///< The TTree to which this category is written
107-
std::vector<root_utils::CollectionBranches> branches{}; ///< The branches for this category
108-
std::vector<root_utils::CollectionWriteInfoT> collInfo{}; ///< Collection info for this category
109-
podio::CollectionIDTable idTable{}; ///< The collection id table for this category
110-
std::vector<std::string> collsToWrite{}; ///< The collections to write for this category
106+
TTree* tree{nullptr}; ///< The TTree to which this category is written
107+
std::vector<root_utils::CollectionBranches> branches{}; ///< The branches for this category
108+
std::vector<root_utils::CollectionWriteInfo> collInfo{}; ///< Collection info for this category
109+
std::vector<std::string> collsToWrite{}; ///< The collections to write for this category
111110

112111
// Storage for the keys & values of all the parameters of this category
113112
// (resp. at least the current entry)

include/podio/utilities/RootHelpers.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,17 @@ namespace root_utils {
1818
// A collection of additional information that describes the collection: the
1919
// collectionID, the (data) type, whether it is a subset collection, its
2020
// schema version, its name, and the type in which the data is actually stored
21+
struct CollectionWriteInfo {
22+
uint32_t collectionID{static_cast<uint32_t>(-1)}; ///< collection id
23+
std::string dataType{}; ///< The fully qualified data type of the collection
24+
bool isSubset{false}; ///< Whether this collection is a subset collection or not
25+
unsigned int schemaVersion{0}; ///< The schema version of the collection type
26+
std::string name{}; ///< The name of the collection
27+
std::string storageType{}; ///< The type in which the data is actually stored
28+
};
29+
// The format used until version 1.2
2130
using CollectionWriteInfoT = std::tuple<uint32_t, std::string, bool, unsigned int>;
31+
2232
// for backwards compatibility
2333
using CollectionInfoWithoutSchemaT = std::tuple<int, std::string, bool>;
2434

src/RNTupleReader.cc

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
#include "podio/CollectionIDTable.h"
55
#include "podio/DatamodelRegistry.h"
66
#include "podio/GenericParameters.h"
7+
#include "podio/utilities/RootHelpers.h"
78
#include "rootUtils.h"
89

910
#include <ROOT/RError.hxx>
1011

1112
#include <algorithm>
13+
#include <cstdint>
1214
#include <memory>
1315

1416
// Adjust for the move of this out of ROOT v7 in
@@ -48,27 +50,11 @@ bool RNTupleReader::initCategory(const std::string& category) {
4850
// Assume that the metadata is the same in all files
4951
auto filename = m_filenames[0];
5052

51-
auto& collInfo = m_collectionInfo[category];
53+
auto collInfo = m_metadata_readers[filename]->GetView<std::vector<root_utils::CollectionWriteInfo>>(
54+
{root_utils::collInfoName(category)});
5255

53-
auto id = m_metadata_readers[filename]->GetView<std::vector<unsigned int>>(root_utils::idTableName(category));
54-
collInfo.id = id(0);
55-
56-
auto collectionName =
57-
m_metadata_readers[filename]->GetView<std::vector<std::string>>(root_utils::collectionName(category));
58-
collInfo.name = collectionName(0);
59-
60-
auto collectionType =
61-
m_metadata_readers[filename]->GetView<std::vector<std::string>>(root_utils::collInfoName(category));
62-
collInfo.type = collectionType(0);
63-
64-
auto subsetCollection =
65-
m_metadata_readers[filename]->GetView<std::vector<short>>(root_utils::subsetCollection(category));
66-
collInfo.isSubsetCollection = subsetCollection(0);
67-
68-
auto schemaVersion = m_metadata_readers[filename]->GetView<std::vector<SchemaVersionT>>("schemaVersion_" + category);
69-
collInfo.schemaVersion = schemaVersion(0);
70-
71-
m_idTables[category] = std::make_shared<CollectionIDTable>(collInfo.id, collInfo.name);
56+
m_collectionInfo[category] = collInfo(0);
57+
m_idTables[category] = root_utils::makeCollIdTable(collInfo(0));
7258

7359
return true;
7460
}
@@ -162,7 +148,7 @@ std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& categ
162148
// Make sure to not silently ignore non-existent but requested collections
163149
if (!collsToRead.empty()) {
164150
for (const auto& name : collsToRead) {
165-
if (std::ranges::find(collInfo.name, name) == collInfo.name.end()) {
151+
if (std::ranges::find(collInfo, name, &root_utils::CollectionWriteInfo::name) == collInfo.end()) {
166152
throw std::invalid_argument(name + " is not available from Frame");
167153
}
168154
}
@@ -184,47 +170,46 @@ std::unique_ptr<ROOTFrameData> RNTupleReader::readEntry(const std::string& categ
184170
// we set all the fields there in any case.
185171
auto dentry = m_readers[category][readerIndex]->GetModel().CreateEntry();
186172

187-
for (size_t i = 0; i < collInfo.id.size(); ++i) {
188-
if (!collsToRead.empty() && std::ranges::find(collsToRead, collInfo.name[i]) == collsToRead.end()) {
173+
for (const auto& coll : collInfo) {
174+
if (!collsToRead.empty() && std::ranges::find(collsToRead, coll.name) == collsToRead.end()) {
189175
continue;
190176
}
191-
const auto& collType = collInfo.type[i];
177+
const auto& collType = coll.dataType;
192178
const auto& bufferFactory = podio::CollectionBufferFactory::instance();
193-
auto maybeBuffers =
194-
bufferFactory.createBuffers(collType, collInfo.schemaVersion[i], collInfo.isSubsetCollection[i]);
179+
auto maybeBuffers = bufferFactory.createBuffers(collType, coll.schemaVersion, coll.isSubset);
195180
auto collBuffers = maybeBuffers.value_or(podio::CollectionReadBuffers{});
196181

197182
if (!maybeBuffers) {
198-
std::cout << "WARNING: Buffers couldn't be created for collection " << collInfo.name[i] << " of type "
199-
<< collInfo.type[i] << " and schema version " << collInfo.schemaVersion[i] << std::endl;
183+
std::cout << "WARNING: Buffers couldn't be created for collection " << coll.name << " of type " << coll.dataType
184+
<< " and schema version " << coll.schemaVersion << std::endl;
200185
return nullptr;
201186
}
202187

203-
if (collInfo.isSubsetCollection[i]) {
204-
auto brName = root_utils::subsetBranch(collInfo.name[i]);
188+
if (coll.isSubset) {
189+
auto brName = root_utils::subsetBranch(coll.name);
205190
auto vec = new std::vector<podio::ObjectID>;
206191
dentry->BindRawPtr(brName, vec);
207192
collBuffers.references->at(0) = std::unique_ptr<std::vector<podio::ObjectID>>(vec);
208193
} else {
209-
dentry->BindRawPtr(collInfo.name[i], collBuffers.data);
194+
dentry->BindRawPtr(coll.name, collBuffers.data);
210195

211196
const auto relVecNames = podio::DatamodelRegistry::instance().getRelationNames(collType);
212197
for (size_t j = 0; j < relVecNames.relations.size(); ++j) {
213198
const auto relName = relVecNames.relations[j];
214199
auto vec = new std::vector<podio::ObjectID>;
215-
const auto brName = root_utils::refBranch(collInfo.name[i], relName);
200+
const auto brName = root_utils::refBranch(coll.name, relName);
216201
dentry->BindRawPtr(brName, vec);
217202
collBuffers.references->at(j) = std::unique_ptr<std::vector<podio::ObjectID>>(vec);
218203
}
219204

220205
for (size_t j = 0; j < relVecNames.vectorMembers.size(); ++j) {
221206
const auto vecName = relVecNames.vectorMembers[j];
222-
const auto brName = root_utils::vecBranch(collInfo.name[i], vecName);
207+
const auto brName = root_utils::vecBranch(coll.name, vecName);
223208
dentry->BindRawPtr(brName, collBuffers.vectorMembers->at(j).second);
224209
}
225210
}
226211

227-
buffers.emplace(collInfo.name[i], std::move(collBuffers));
212+
buffers.emplace(coll.name, std::move(collBuffers));
228213
}
229214

230215
m_readers[category][readerIndex]->LoadEntry(localEntry, *dentry);

src/RNTupleWriter.cc

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "podio/DatamodelRegistry.h"
33
#include "podio/SchemaEvolution.h"
44
#include "podio/podioVersion.h"
5+
#include "podio/utilities/RootHelpers.h"
56
#include "rootUtils.h"
67

78
#include "TFile.h"
@@ -92,14 +93,13 @@ void RNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& cat
9293
auto model = createModels(collections);
9394
catInfo.writer = root_compat::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
9495

96+
catInfo.collInfo.reserve(collections.size());
9597
for (const auto& [name, coll] : collections) {
96-
catInfo.ids.emplace_back(coll->getID());
97-
catInfo.types.emplace_back(coll->getTypeName());
98-
catInfo.subsetCollections.emplace_back(coll->isSubsetCollection());
99-
catInfo.schemaVersions.emplace_back(coll->getSchemaVersion());
98+
catInfo.collInfo.emplace_back(coll->getID(), std::string(coll->getTypeName()), coll->isSubsetCollection(),
99+
coll->getSchemaVersion(), name, root_utils::getStorageTypeName(coll));
100100
}
101101
} else {
102-
if (!root_utils::checkConsistentColls(catInfo.names, collsToWrite)) {
102+
if (!root_utils::checkConsistentColls(catInfo.collInfo, collsToWrite)) {
103103
throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
104104
root_utils::getInconsistentCollsMsg(catInfo.names, collsToWrite));
105105
}
@@ -260,16 +260,9 @@ void RNTupleWriter::finish() {
260260
}
261261

262262
for (auto& [category, collInfo] : m_categories) {
263-
auto idField = metadata->MakeField<std::vector<unsigned int>>({root_utils::idTableName(category)});
264-
*idField = collInfo.ids;
265-
auto collectionNameField = metadata->MakeField<std::vector<std::string>>({root_utils::collectionName(category)});
266-
*collectionNameField = collInfo.names;
267-
auto collectionTypeField = metadata->MakeField<std::vector<std::string>>({root_utils::collInfoName(category)});
268-
*collectionTypeField = collInfo.types;
269-
auto subsetCollectionField = metadata->MakeField<std::vector<short>>({root_utils::subsetCollection(category)});
270-
*subsetCollectionField = collInfo.subsetCollections;
271-
auto schemaVersionField = metadata->MakeField<std::vector<SchemaVersionT>>({"schemaVersion_" + category});
272-
*schemaVersionField = collInfo.schemaVersions;
263+
auto collInfoField =
264+
metadata->MakeField<std::vector<root_utils::CollectionWriteInfo>>({root_utils::collInfoName(category)});
265+
*collInfoField = collInfo.collInfo;
273266
}
274267

275268
metadata->Freeze();

src/ROOTLegacyReader.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,12 @@ void ROOTLegacyReader::openFiles(const std::vector<std::string>& filenames) {
152152
collInfoBranch->SetAddress(&collectionInfo);
153153
collInfoBranch->GetEntry(0);
154154
}
155-
createCollectionBranches(*collectionInfo);
155+
std::vector<root_utils::CollectionWriteInfo> collInfo;
156+
collInfo.reserve(collectionInfo->size());
157+
for (auto& [id, typeName, isSubsetColl, schemaVersion] : *collectionInfo) {
158+
collInfo.emplace_back(id, std::move(typeName), isSubsetColl, schemaVersion);
159+
}
160+
createCollectionBranches(collInfo);
156161
delete collectionInfo;
157162
} else {
158163
std::cout << "PODIO: Reconstructing CollectionTypeInfo branch from other sources in file: \'"
@@ -170,10 +175,10 @@ unsigned ROOTLegacyReader::getEntries(const std::string& name) const {
170175
return m_chain->GetEntries();
171176
}
172177

173-
void ROOTLegacyReader::createCollectionBranches(const std::vector<root_utils::CollectionWriteInfoT>& collInfo) {
178+
void ROOTLegacyReader::createCollectionBranches(const std::vector<root_utils::CollectionWriteInfo>& collInfo) {
174179
size_t collectionIndex{0};
175180

176-
for (const auto& [collID, collType, isSubsetColl, collSchemaVersion] : collInfo) {
181+
for (const auto& [collID, collType, isSubsetColl, collSchemaVersion, _, __] : collInfo) {
177182
// We only write collections that are in the collectionIDTable, so no need
178183
// to check here
179184
const auto name = m_table->name(collID).value();

0 commit comments

Comments
 (0)