Skip to content

Commit fd9ee5f

Browse files
committed
All RNTupleTemp related files
Names have changed but still using TTree for now.
1 parent 678f32b commit fd9ee5f

File tree

115 files changed

+12137
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

115 files changed

+12137
-0
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<use name="DataFormats/Common"/>
2+
<use name="DataFormats/Provenance"/>
3+
<use name="FWCore/Catalog"/>
4+
<use name="FWCore/Framework"/>
5+
<use name="FWCore/MessageLogger"/>
6+
<use name="FWCore/ParameterSet"/>
7+
<use name="FWCore/ServiceRegistry"/>
8+
<use name="FWCore/Sources"/>
9+
<use name="FWCore/Utilities"/>
10+
<use name="IOPool/Common"/>
11+
<use name="Utilities/StorageFactory"/>
12+
<use name="clhep"/>
13+
<use name="rootcore"/>
14+
<use name="rootntuple"/>
15+
<flags EDM_PLUGIN="1"/>
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
2+
#include "FWIO/RNTupleTempInput/src/DuplicateChecker.h"
3+
#include "FWCore/ParameterSet/interface/ParameterSet.h"
4+
#include "FWCore/Utilities/interface/Exception.h"
5+
#include "FWCore/MessageLogger/interface/MessageLogger.h"
6+
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
7+
8+
#include <cassert>
9+
#include <algorithm>
10+
11+
namespace edm::rntuple_temp {
12+
13+
// Reads the untracked "duplicateCheckMode" string from the configuration and
// translates it into the internal DuplicateCheckMode value.
//
// About the default passed to getUntrackedParameter: it is ignored once the
// ParameterSet has been validated and the parameter is non-optional in the
// description, which is the case when RNTupleTempSource is the primary input
// source. Modules that use RNTupleTempSource as a SecSource have not defined
// fillDescriptions yet, so their ParameterSet is not validated. Once they all
// have descriptions, the default in the getUntrackedParameter call can and
// should be removed.
//
// Throws cms::Exception("Configuration") when the string is not one of the
// four recognised mode names.
DuplicateChecker::DuplicateChecker(ParameterSet const& pset)
    : dataType_(unknown), itIsKnownTheFileHasNoDuplicates_(false), disabled_(false) {
  std::string const requestedMode =
      pset.getUntrackedParameter<std::string>("duplicateCheckMode", std::string("checkAllFilesOpened"));

  if (requestedMode == "noDuplicateCheck") {
    duplicateCheckMode_ = noDuplicateCheck;
    return;
  }
  if (requestedMode == "checkEachFile") {
    duplicateCheckMode_ = checkEachFile;
    return;
  }
  if (requestedMode == "checkEachRealDataFile") {
    duplicateCheckMode_ = checkEachRealDataFile;
    return;
  }
  if (requestedMode == "checkAllFilesOpened") {
    duplicateCheckMode_ = checkAllFilesOpened;
    return;
  }

  // Anything else is a configuration error.
  throw cms::Exception("Configuration")
      << "Illegal configuration parameter value passed to RNTupleTempSource for\n"
      << "the \"duplicateCheckMode\" parameter, legal values are:\n"
      << "\"noDuplicateCheck\", \"checkEachFile\", \"checkEachRealDataFile\", \"checkAllFilesOpened\"\n";
}
40+
41+
// Turns duplicate checking off and discards all per-file state: the remembered
// events, the "no duplicates" shortcut flag and the data type.
void DuplicateChecker::disable() {
  relevantPreviousEvents_.clear();
  itIsKnownTheFileHasNoDuplicates_ = false;
  dataType_ = unknown;
  disabled_ = true;
}
47+
48+
void DuplicateChecker::inputFileOpened(bool realData,
49+
IndexIntoFile const& indexIntoFile,
50+
std::vector<std::shared_ptr<IndexIntoFile> > const& indexesIntoFiles,
51+
std::vector<std::shared_ptr<IndexIntoFile> >::size_type currentIndexIntoFile) {
52+
dataType_ = realData ? isRealData : isSimulation;
53+
if (checkDisabled())
54+
return;
55+
56+
relevantPreviousEvents_.clear();
57+
itIsKnownTheFileHasNoDuplicates_ = false;
58+
59+
if (duplicateCheckMode_ == checkAllFilesOpened) {
60+
// Compares the current IndexIntoFile to all the previous ones and saves any duplicates.
61+
// One unintended thing, it also saves the duplicate runs and lumis.
62+
for (std::vector<std::shared_ptr<IndexIntoFile> >::size_type i = 0; i < currentIndexIntoFile; ++i) {
63+
if (indexesIntoFiles[i].get() != nullptr) {
64+
indexIntoFile.set_intersection(*indexesIntoFiles[i], relevantPreviousEvents_);
65+
}
66+
}
67+
}
68+
if (relevantPreviousEvents_.empty()) {
69+
if (!indexIntoFile.containsDuplicateEvents()) {
70+
itIsKnownTheFileHasNoDuplicates_ = true;
71+
}
72+
}
73+
}
74+
75+
// Drops all state tied to the file that was just closed. Unlike disable(),
// the disabled_ flag is left untouched.
void DuplicateChecker::inputFileClosed() {
  relevantPreviousEvents_.clear();
  itIsKnownTheFileHasNoDuplicates_ = false;
  dataType_ = unknown;
}
80+
81+
bool DuplicateChecker::isDuplicateAndCheckActive(
82+
int index, RunNumber_t run, LuminosityBlockNumber_t lumi, EventNumber_t event, std::string const& fileName) {
83+
if (itIsKnownTheFileHasNoDuplicates_)
84+
return false;
85+
if (checkDisabled())
86+
return false;
87+
88+
IndexIntoFile::IndexRunLumiEventKey newEvent(index, run, lumi, event);
89+
bool duplicate = !relevantPreviousEvents_.insert(newEvent).second;
90+
91+
if (duplicate) {
92+
if (duplicateCheckMode_ == checkAllFilesOpened) {
93+
LogWarning("DuplicateEvent") << "Duplicate Events found in entire set of input files.\n"
94+
<< "Both events were from run " << run << " and luminosity block " << lumi
95+
<< " with event number " << event << ".\n"
96+
<< "The duplicate was from file " << fileName << ".\n"
97+
<< "The duplicate will be skipped.\n";
98+
} else {
99+
LogWarning("DuplicateEvent") << "Duplicate Events found in file " << fileName << ".\n"
100+
<< "Both events were from run " << run << " and luminosity block " << lumi
101+
<< " with event number " << event << ".\n"
102+
<< "The duplicate will be skipped.\n";
103+
}
104+
return true;
105+
}
106+
return false;
107+
}
108+
109+
// Adds the untracked "duplicateCheckMode" string parameter (default
// "checkAllFilesOpened") to `desc`, together with a comment listing the
// accepted values.
void DuplicateChecker::fillDescription(ParameterSetDescription& desc) {
  auto* modeParameter = desc.addUntracked<std::string>("duplicateCheckMode", std::string("checkAllFilesOpened"));
  modeParameter->setComment(
      "'checkAllFilesOpened': check across all input files\n"
      "'checkEachFile': check each input file independently\n"
      "'checkEachRealDataFile': check each real data input file independently\n"
      "'noDuplicateCheck': no duplicate checking\n");
}
118+
} // namespace edm::rntuple_temp
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// Include guard named after this header's own location
// (FWIO/RNTupleTempInput/src/DuplicateChecker.h) so that it cannot clash with
// a guard macro used by a header in another package.
#ifndef FWIO_RNTupleTempInput_DuplicateChecker_h
#define FWIO_RNTupleTempInput_DuplicateChecker_h
3+
4+
/*----------------------------------------------------------------------
5+
6+
FWIO/RNTupleTempInput/src/DuplicateChecker.h
7+
8+
Used by RNTupleTempSource to detect events with
9+
the same process history, run, lumi, and event number.
10+
It is configurable whether it checks for duplicates
11+
within the scope of each single input file or all input
12+
files or does not check for duplicates at all.
13+
14+
----------------------------------------------------------------------*/
15+
16+
#include "DataFormats/Provenance/interface/EventID.h"
17+
#include "DataFormats/Provenance/interface/RunID.h"
18+
#include "DataFormats/Provenance/interface/IndexIntoFile.h"
19+
20+
#include <memory>
21+
#include <set>
22+
#include <string>
23+
#include <vector>
24+
25+
namespace edm {
26+
class ParameterSet;
27+
class ParameterSetDescription;
28+
} // namespace edm
29+
30+
namespace edm::rntuple_temp {
31+
32+
class DuplicateChecker {
33+
public:
34+
DuplicateChecker(ParameterSet const& pset);
35+
36+
void disable();
37+
38+
void inputFileOpened(bool realData,
39+
IndexIntoFile const& indexIntoFile,
40+
std::vector<std::shared_ptr<IndexIntoFile> > const& indexesIntoFiles,
41+
std::vector<std::shared_ptr<IndexIntoFile> >::size_type currentIndexIntoFile);
42+
43+
void inputFileClosed();
44+
45+
bool noDuplicatesInFile() const { return itIsKnownTheFileHasNoDuplicates_; }
46+
47+
bool checkDisabled() const {
48+
return duplicateCheckMode_ == noDuplicateCheck ||
49+
(duplicateCheckMode_ == checkEachRealDataFile && dataType_ == isSimulation) || disabled_;
50+
}
51+
52+
// Note that all references to the ProcessHistoryID in this class are to
53+
// the "reduced" process history, including the index argument to this function.
54+
bool isDuplicateAndCheckActive(
55+
int index, RunNumber_t run, LuminosityBlockNumber_t lumi, EventNumber_t event, std::string const& fileName);
56+
57+
bool checkingAllFiles() const { return checkAllFilesOpened == duplicateCheckMode_; }
58+
59+
static void fillDescription(ParameterSetDescription& desc);
60+
61+
private:
62+
enum DuplicateCheckMode { noDuplicateCheck, checkEachFile, checkEachRealDataFile, checkAllFilesOpened };
63+
64+
DuplicateCheckMode duplicateCheckMode_;
65+
66+
enum DataType { isRealData, isSimulation, unknown };
67+
68+
DataType dataType_;
69+
70+
// If checking the entire input for duplicates, then this holds
71+
// events from previous files that duplicate events in the
72+
// the current file. Plus it holds events that have been already
73+
// processed in the current file. It is not used if there are
74+
// no duplicates or duplicate checking has been disabled.
75+
std::set<IndexIntoFile::IndexRunLumiEventKey> relevantPreviousEvents_;
76+
77+
bool itIsKnownTheFileHasNoDuplicates_;
78+
79+
bool disabled_;
80+
};
81+
} // namespace edm::rntuple_temp
82+
#endif
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*----------------------------------------------------------------------
2+
----------------------------------------------------------------------*/
3+
#include "EmbeddedRNTupleTempSource.h"
4+
#include "InputFile.h"
5+
#include "RootEmbeddedFileSequence.h"
6+
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
7+
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
8+
#include "FWCore/Sources/interface/VectorInputSourceDescription.h"
9+
#include "FWCore/Sources/interface/InputSourceRunHelper.h"
10+
11+
namespace edm {
12+
13+
class EventID;
14+
class EventPrincipal;
15+
} // namespace edm
16+
namespace edm::rntuple_temp {
17+
18+
// Builds the embedded source from its configuration.
//
// About the defaults passed to getUntrackedParameter below: they are ignored
// once the ParameterSet has been validated and the parameters are
// non-optional in the description, which is the case when RNTupleTempSource
// is the primary input source. Modules that use RNTupleTempSource as a
// SecSource have not defined fillDescriptions yet, so their ParameterSet is
// not validated. Once they all have descriptions, the defaults in these
// getUntrackedParameter calls can and should be removed.
EmbeddedRNTupleTempSource::EmbeddedRNTupleTempSource(ParameterSet const& pset,
                                                     VectorInputSourceDescription const& desc)
    : VectorInputSource(pset, desc),
      rootServiceChecker_(),
      nStreams_(desc.allocations_->numberOfStreams()),
      skipBadFiles_(pset.getUntrackedParameter<bool>("skipBadFiles", false)),
      bypassVersionCheck_(pset.getUntrackedParameter<bool>("bypassVersionCheck", false)),
      treeMaxVirtualSize_(pset.getUntrackedParameter<int>("treeMaxVirtualSize", -1)),
      productSelectorRules_(pset, "inputCommands", "InputSource"),
      runHelper_(new DefaultInputSourceRunHelper()),
      catalog_(pset.getUntrackedParameter<std::vector<std::string>>("fileNames"),
               pset.getUntrackedParameter<std::string>("overrideCatalog", std::string())),
      // fileSequence_ has to come last: it is handed *this and catalog_ and
      // relies on the members initialised above.
      // NOTE(review): the real initialisation order follows the member
      // declaration order in the class, which is not visible here - confirm
      // fileSequence_ is declared last.
      fileSequence_(new RootEmbeddedFileSequence(pset, *this, catalog_)) {}
41+
42+
// Nothing to release by hand; members clean up after themselves.
EmbeddedRNTupleTempSource::~EmbeddedRNTupleTempSource() = default;
43+
44+
// Start-of-job hook; the body is empty, so this source does nothing here.
void EmbeddedRNTupleTempSource::beginJob() {}
45+
46+
// End-of-job hook: lets the file sequence finish first, then asks InputFile
// to report the branches that were read.
void EmbeddedRNTupleTempSource::endJob() {
  fileSequence_->endJob();

  InputFile::reportReadBranches();
}
50+
51+
// Forwards the close request to the file sequence.
void EmbeddedRNTupleTempSource::closeFile_() { fileSequence_->closeFile(); }
52+
53+
// Delegates to the file sequence's readOneEvent, passing every argument
// through unchanged, and returns its result.
bool EmbeddedRNTupleTempSource::readOneEvent(EventPrincipal& cache,
                                             size_t& fileNameHash,
                                             CLHEP::HepRandomEngine* engine,
                                             EventID const* id,
                                             bool recycleFiles) {
  bool const result = fileSequence_->readOneEvent(cache, fileNameHash, engine, id, recycleFiles);
  return result;
}
60+
61+
// Delegates to the file sequence's readOneSpecified, passing every argument
// through unchanged.
void EmbeddedRNTupleTempSource::readOneSpecified(EventPrincipal& cache,
                                                 size_t& fileNameHash,
                                                 SecondaryEventIDAndFileInfo const& id) {
  auto& sequence = *fileSequence_;
  sequence.readOneSpecified(cache, fileNameHash, id);
}
66+
67+
void EmbeddedRNTupleTempSource::dropUnwantedBranches_(std::vector<std::string> const& wantedBranches) {
68+
std::vector<std::string> rules;
69+
rules.reserve(wantedBranches.size() + 1);
70+
rules.emplace_back("drop *");
71+
for (std::string const& branch : wantedBranches) {
72+
rules.push_back("keep " + branch + "_*");
73+
}
74+
ParameterSet pset;
75+
pset.addUntrackedParameter("inputCommands", rules);
76+
productSelectorRules_ = ProductSelectorRules(pset, "inputCommands", "InputSource");
77+
}
78+
79+
// Registers the configuration description of this source under the label
// "source".
//
// Declares the untracked parameters read by the constructor (fileNames,
// overrideCatalog, skipBadFiles, bypassVersionCheck, treeMaxVirtualSize) and
// then lets ProductSelectorRules and RootEmbeddedFileSequence append their
// own parameters to the same description.
void EmbeddedRNTupleTempSource::fillDescriptions(ConfigurationDescriptions& descriptions) {
  ParameterSetDescription desc;

  desc.setComment("Reads EDM/Root files for mixing.");
  // "fileNames" has no default value here.
  desc.addUntracked<std::vector<std::string> >("fileNames")->setComment("Names of files to be processed.");
  desc.addUntracked<std::string>("overrideCatalog", std::string());
  desc.addUntracked<bool>("skipBadFiles", false)
      ->setComment(
          "True: Ignore any missing or unopenable input file.\n"
          "False: Throw exception if missing or unopenable input file.");
  desc.addUntracked<bool>("bypassVersionCheck", false)
      ->setComment(
          "True: Bypass release version check.\n"
          "False: Throw exception if reading file in a release prior to the release in which the file was written.");
  desc.addUntracked<int>("treeMaxVirtualSize", -1)
      ->setComment("Size of ROOT TTree TBasket cache. Affects performance.");

  // Parameters owned by helper classes are described by those classes.
  ProductSelectorRules::fillDescription(desc, "inputCommands");
  RootEmbeddedFileSequence::fillDescription(desc);

  descriptions.add("source", desc);
}
102+
} // namespace edm::rntuple_temp

0 commit comments

Comments
 (0)