@@ -38,46 +38,71 @@ def _has_new_structure(fs, base_dir, user, years):
38
38
39
39
return False
40
40
41
- def _get_sample_from_subsample (subsample_name ):
41
+ def _get_sample_from_subsample (subsample_name , is_data ):
42
42
"""
43
43
Determine the sample name from the subsample name using the SAMPLES dictionary.
44
+ Source:
45
+ - https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_datasets.py
46
+ - https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_mc.py
44
47
"""
45
48
# If no match found, try to infer from common patterns
46
- if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name :
47
- if "VBF" in subsample_name :
48
- return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
49
+ if is_data :
50
+ # Data
51
+ if "JetHT" in subsample_name or "JetMET" in subsample_name :
52
+ return "JetMET"
53
+ elif "EGamma" in subsample_name :
54
+ return "EGamma"
55
+ elif "Muon" in subsample_name :
56
+ return "Muon"
57
+ elif "Tau" in subsample_name :
58
+ return "Tau"
59
+ elif "BTagMu" in subsample_name :
60
+ return "BTagMu"
61
+ elif "MuonEG" in subsample_name :
62
+ return "MuonEG"
63
+ elif "ParkingVBF" in subsample_name :
64
+ return "ParkingVBF"
65
+ elif "ParkingSingleMuon" in subsample_name :
66
+ return "ParkingSingleMuon"
49
67
else :
50
- return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
51
- elif "Hto2B" in subsample_name :
52
- return "Hbb"
53
- elif "Hto2C" in subsample_name :
54
- return "Hcc"
55
- elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name :
56
- return "Htautau"
57
- elif "QCD-4Jets_HT" in subsample_name :
58
- return "QCD-4Jets_HT"
59
- elif "QCD_PT" in subsample_name :
60
- return "QCD_PT"
61
- elif "TTto" in subsample_name :
62
- return "TT"
63
- elif any (x in subsample_name for x in ["TbarWplus" , "TWminus" , "TbarBQ" , "TBbarQ" ]):
64
- return "SingleTop"
65
- elif "DYto2L-4Jets" in subsample_name :
66
- return "DYJetsLO"
67
- elif "DYto2L-2Jets" in subsample_name :
68
- return "DYJetsNLO"
69
- elif any (x in subsample_name for x in ["Wto2Q-3Jets" , "WtoLNu-4Jets" , "Zto2Q-4Jets" ]):
70
- return "VJetsLO"
71
- elif any (x in subsample_name for x in ["Wto2Q-2Jets" , "WtoLNu-2Jets" , "Zto2Q-2Jets" ]):
72
- return "VJetsNLO"
73
- elif any (x in subsample_name for x in ["WW_" , "WZ_" , "ZZ_" , "WWto4Q" , "WWtoLNu2Q" , "WZto3LNu" , "WZto4Q" , "ZZto2L2Q" , "ZZto4L" ]):
74
- return "Diboson"
75
- elif any (x in subsample_name for x in ["VBFZto2Q" , "VBFWto2Q" , "VBFto2L" , "VBFto2Nu" , "VBFtoLNu" ]):
76
- return "EWKV"
77
- elif any (x in subsample_name for x in ["WGtoLNuG" , "WGto2QG" , "ZGto2NuG" , "ZGto2QG" ]):
78
- return "VGamma"
68
+ raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
69
+ else :
70
+ # MC
71
+ if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name :
72
+ if "VBF" in subsample_name :
73
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
74
+ else :
75
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
76
+ elif "Hto2B" in subsample_name :
77
+ return "Hbb"
78
+ elif "Hto2C" in subsample_name :
79
+ return "Hcc"
80
+ elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name :
81
+ return "Htautau"
82
+ elif "QCD-4Jets_HT" in subsample_name :
83
+ return "QCD-4Jets_HT"
84
+ elif "QCD_PT" in subsample_name :
85
+ return "QCD_PT"
86
+ elif "TTto" in subsample_name :
87
+ return "TT"
88
+ elif any (x in subsample_name for x in ["TbarWplus" , "TWminus" , "TbarBQ" , "TBbarQ" ]):
89
+ return "SingleTop"
90
+ elif "DYto2L-4Jets" in subsample_name :
91
+ return "DYJetsLO"
92
+ elif "DYto2L-2Jets" in subsample_name :
93
+ return "DYJetsNLO"
94
+ elif any (x in subsample_name for x in ["Wto2Q-3Jets" , "WtoLNu-4Jets" , "Zto2Q-4Jets" ]):
95
+ return "VJetsLO"
96
+ elif any (x in subsample_name for x in ["Wto2Q-2Jets" , "WtoLNu-2Jets" , "Zto2Q-2Jets" ]):
97
+ return "VJetsNLO"
98
+ elif any (x in subsample_name for x in ["WW_" , "WZ_" , "ZZ_" , "WWto4Q" , "WWtoLNu2Q" , "WZto3LNu" , "WZto4Q" , "ZZto2L2Q" , "ZZto4L" ]):
99
+ return "Diboson"
100
+ elif any (x in subsample_name for x in ["VBFZto2Q" , "VBFWto2Q" , "VBFto2L" , "VBFto2Nu" , "VBFtoLNu" ]):
101
+ return "EWKV"
102
+ elif any (x in subsample_name for x in ["WGtoLNuG" , "WGto2QG" , "ZGto2NuG" , "ZGto2QG" ]):
103
+ return "VGamma"
79
104
80
- raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
105
+ raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
81
106
82
107
83
108
def xrootd_index_private_nano (
@@ -119,14 +144,18 @@ def xrootd_index_private_nano(
119
144
120
145
if files is None :
121
146
files = {}
147
+
148
+ # Check version
149
+ if len (users ) > 0 :
150
+ use_new_structure = _has_new_structure (fs , base_dir , users [0 ], years )
151
+ print (f"Using { 'new' if use_new_structure else 'old' } directory structure" )
152
+ else :
153
+ # no users to search for
154
+ return {}
122
155
123
156
for user in users :
124
157
print (f"\t { user } " )
125
158
126
- # Check version
127
- use_new_structure = _has_new_structure (fs , base_dir , user , years )
128
- print (f"\t \t Using { 'new' if use_new_structure else 'old' } directory structure" )
129
-
130
159
for year in years :
131
160
print (f"\t \t { year } " )
132
161
if year not in files :
@@ -144,13 +173,7 @@ def xrootd_index_private_nano(
144
173
145
174
for subsample in tsubsamples :
146
175
print (f"\t \t \t Processing { subsample } " )
147
- # For new structure, infer sample name from subsample
148
- if is_data :
149
- # For data, the subsample IS the sample (e.g., "Tau", "JetMET")
150
- sample = subsample
151
- else :
152
- # For MC, infer sample from subsample name
153
- sample = _get_sample_from_subsample (subsample )
176
+ sample = _get_sample_from_subsample (subsample , is_data )
154
177
155
178
# Filter by samples if specified
156
179
if samples is not None and sample not in samples :
@@ -181,12 +204,6 @@ def xrootd_index_private_nano(
181
204
tfiles = []
182
205
try :
183
206
for f1 in _dirlist (fs , spath ): # dataset directory
184
- # For Data files, f1 is the subsample name
185
- if is_data :
186
- if f1 in files [year ][sample ]:
187
- warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
188
- print (f"\t \t \t \t { f1 } " )
189
-
190
207
f1path = spath / f1
191
208
for f2 in _dirlist (fs , f1path ): # timestamp directory
192
209
f2path = f1path / f2
@@ -198,8 +215,12 @@ def xrootd_index_private_nano(
198
215
tfiles += [f"{ redirector } { f3path !s} /{ f } " for f in root_files ]
199
216
200
217
if is_data :
201
- files [year ][sample ][f1 ] = tfiles
202
- print (f"\t \t \t \t \t { len (tfiles )} files" )
218
+ # For data, concatenate files from related subsamples
219
+ # e.g. EGamma0 and EGamma1 should be combined
220
+ if f1 not in files [year ][sample ]:
221
+ files [year ][sample ][f1 ] = []
222
+ files [year ][sample ][f1 ].extend (tfiles )
223
+ print (f"\t \t \t \t \t { len (tfiles )} files added" )
203
224
204
225
if not is_data :
205
226
files [year ][sample ][subsample_name ] = tfiles
@@ -208,10 +229,6 @@ def xrootd_index_private_nano(
208
229
except FileNotFoundError :
209
230
print (f"\t \t \t \t Warning: Could not access { spath } " )
210
231
continue
211
-
212
- except FileNotFoundError :
213
- print (f"\t \t \t \t Warning: Could not access { spath } " )
214
- continue
215
232
216
233
else :
217
234
# Old structure: single year directory
0 commit comments