@@ -23,6 +23,87 @@ def _dirlist(fs, path) -> list:
23
23
24
24
return [f .name for f in listing ]
25
25
26
def _has_new_structure(fs, base_dir, user, years) -> bool:
    """Report whether a user's area uses the new directory layout.

    The new layout keeps data and MC in separate ``data_{year}`` and
    ``mc_{year}`` directories directly under ``base_dir / user``; the old
    layout has a single ``{year}`` directory instead.

    Args:
        fs: File-system handle, forwarded unchanged to ``_dirlist``.
        base_dir: Base directory; must support ``/`` joining with ``user``
            (e.g. a ``pathlib`` path).
        user: Name of the user directory under ``base_dir``.
        years: Iterable of year labels to look for.

    Returns:
        True if a ``data_{year}`` or ``mc_{year}`` entry is listed for at
        least one of ``years``. False otherwise, including when listing the
        user directory raises ``FileNotFoundError``.
    """
    try:
        entries = set(_dirlist(fs, base_dir / user))
    except FileNotFoundError:
        # A missing user directory cannot be in the new layout.
        return False

    # Directory names must match exactly, so the f-strings carry no padding.
    return any(
        f"data_{year}" in entries or f"mc_{year}" in entries for year in years
    )
40
+
41
+ def _get_sample_from_subsample (subsample_name , is_data ):
42
+ """
43
+ Determine the sample name from the subsample name using the SAMPLES dictionary.
44
+ Source:
45
+ - https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_datasets.py
46
+ - https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_mc.py
47
+ """
48
+ # If no match found, try to infer from common patterns
49
+ if is_data :
50
+ # Data
51
+ if "JetHT" in subsample_name or "JetMET" in subsample_name :
52
+ return "JetMET"
53
+ elif "EGamma" in subsample_name :
54
+ return "EGamma"
55
+ elif "Muon" in subsample_name :
56
+ return "Muon"
57
+ elif "Tau" in subsample_name :
58
+ return "Tau"
59
+ elif "BTagMu" in subsample_name :
60
+ return "BTagMu"
61
+ elif "MuonEG" in subsample_name :
62
+ return "MuonEG"
63
+ elif "ParkingVBF" in subsample_name :
64
+ return "ParkingVBF"
65
+ elif "ParkingSingleMuon" in subsample_name :
66
+ return "ParkingSingleMuon"
67
+ else :
68
+ raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
69
+ else :
70
+ # MC
71
+ if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name :
72
+ if "VBF" in subsample_name :
73
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
74
+ else :
75
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
76
+ elif "Hto2B" in subsample_name :
77
+ return "Hbb"
78
+ elif "Hto2C" in subsample_name :
79
+ return "Hcc"
80
+ elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name :
81
+ return "Htautau"
82
+ elif "QCD-4Jets_HT" in subsample_name :
83
+ return "QCD"
84
+ elif "QCD_PT" in subsample_name :
85
+ return "QCD_PT"
86
+ elif "TTto" in subsample_name :
87
+ return "TT"
88
+ elif any (x in subsample_name for x in ["TbarWplus" , "TWminus" , "TbarBQ" , "TBbarQ" ]):
89
+ return "SingleTop"
90
+ elif "DYto2L-4Jets" in subsample_name :
91
+ return "DYJetsLO"
92
+ elif "DYto2L-2Jets" in subsample_name :
93
+ return "DYJetsNLO"
94
+ elif any (x in subsample_name for x in ["Wto2Q-3Jets" , "WtoLNu-4Jets" , "Zto2Q-4Jets" ]):
95
+ return "VJetsLO"
96
+ elif any (x in subsample_name for x in ["Wto2Q-2Jets" , "WtoLNu-2Jets" , "Zto2Q-2Jets" ]):
97
+ return "VJetsNLO"
98
+ elif any (x in subsample_name for x in ["WW_" , "WZ_" , "ZZ_" , "WWto4Q" , "WWtoLNu2Q" , "WZto3LNu" , "WZto4Q" , "ZZto2L2Q" , "ZZto4L" ]):
99
+ return "Diboson"
100
+ elif any (x in subsample_name for x in ["VBFZto2Q" , "VBFWto2Q" , "VBFto2L" , "VBFto2Nu" , "VBFtoLNu" ]):
101
+ return "EWKV"
102
+ elif any (x in subsample_name for x in ["WGtoLNuG" , "WGto2QG" , "ZGto2NuG" , "ZGto2QG" ]):
103
+ return "VGamma"
104
+
105
+ raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
106
+
26
107
27
108
def xrootd_index_private_nano (
28
109
base_dir : str ,
@@ -39,21 +120,21 @@ def xrootd_index_private_nano(
39
120
Can specify specific users, years, samples, and subsamples to search for;
40
121
otherwise, it will search for all by default.
41
122
42
- Files are organized as :
123
+ Supports both old and new directory structures :
43
124
125
+ Old structure:
44
126
MC:
45
127
......redirector.......|...............base_dir....................|..user.|year|sample|
46
128
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/HHbbtt/
47
- ....................................subsample.......................................|
48
- GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_LHEweights_TuneCP5_13p6TeV_powheg-pythia8/
49
- .............................f1...........................|.....f2......|.f3.|......
50
- GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV/241028_235514/000*/*.root
129
+
130
+ New structure:
131
+ MC:
132
+ ......redirector.......|...............base_dir....................|..user.|year|sample|
133
+ root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/mc_2022/HHbbtt/
51
134
52
135
Data:
53
136
......redirector.......|...............base_dir....................|..user.|year|sample|
54
- root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/Tau/
55
- .f1|..subsample.|.....f2......|.f3.|......
56
- Tau/Tau_Run2022D/241114_222843/000*/*.root
137
+ root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/data_2022/Tau/
57
138
"""
58
139
fs = client .FileSystem (redirector )
59
140
base_dir = Path (base_dir )
@@ -63,71 +144,167 @@ def xrootd_index_private_nano(
63
144
64
145
if files is None :
65
146
files = {}
147
+
148
+ # Check version
149
+ if len (users ) > 0 :
150
+ use_new_structure = _has_new_structure (fs , base_dir , users [0 ], years )
151
+ print (f"Using { 'new' if use_new_structure else 'old' } directory structure" )
152
+ else :
153
+ # no users to search for
154
+ return {}
66
155
67
156
for user in users :
68
157
print (f"\t { user } " )
158
+
69
159
for year in years :
70
160
print (f"\t \t { year } " )
71
161
if year not in files :
72
162
files [year ] = {}
73
163
74
- ypath = base_dir / user / year
75
- tsamples = _dirlist (fs , ypath ) if samples is None else samples
76
- for sample in tsamples :
77
- if sample not in files [year ]:
78
- files [year ][sample ] = {}
79
- elif overwrite_sample :
80
- warnings .warn (f"Overwriting existing sample { sample } " , stacklevel = 2 )
81
- files [year ][sample ] = {}
82
-
83
- print (f"\t \t \t { sample } " )
84
- spath = ypath / sample
85
-
86
- is_data = sample in hh_vars .DATA_SAMPLES
87
-
88
- tsubsamples = _dirlist (fs , spath ) if subsamples is None else subsamples
89
- for subsample in tsubsamples :
90
- subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
91
- if not is_data :
92
- if subsample_name in files [year ][sample ]:
93
- warnings .warn (
94
- f"Duplicate subsample found! { subsample_name } " , stacklevel = 2
95
- )
96
-
164
+ if use_new_structure :
165
+ # New structure: separate data_{year} and mc_{year} directories
166
+ for is_data in (True , False ):
167
+ if is_data :
168
+ ypath = base_dir / user / f"data_{ year } "
169
+ else :
170
+ ypath = base_dir / user / f"mc_{ year } "
171
+
172
+ tsubsamples = _dirlist (fs , ypath ) if subsamples is None else subsamples
173
+
174
+ for subsample in tsubsamples :
175
+ print (f"\t \t \t Processing { subsample } " )
176
+ sample = _get_sample_from_subsample (subsample , is_data )
177
+
178
+ # Filter by samples if specified
179
+ if samples is not None and sample not in samples :
180
+ continue
181
+
182
+ if sample not in files [year ]:
183
+ files [year ][sample ] = {}
184
+ elif overwrite_sample :
185
+ warnings .warn (f"Overwriting existing sample { sample } " , stacklevel = 2 )
186
+ files [year ][sample ] = {}
187
+
188
+ print (f"\t \t \t { sample } " )
189
+ spath = ypath / subsample
190
+
191
+ # Clean subsample name
192
+ subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
97
193
print (f"\t \t \t \t { subsample_name } " )
98
-
99
- sspath = spath / subsample
100
- for f1 in _dirlist (fs , sspath ):
101
- # For Data files, f1 is the subsample name
102
- if is_data :
103
- if f1 in files [year ][sample ]:
104
- warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
105
-
106
- print (f"\t \t \t \t { f1 } " )
107
-
108
- f1path = sspath / f1
109
- for f2 in _dirlist (fs , f1path ):
110
- f2path = f1path / f2
111
- tfiles = []
112
- for f3 in _dirlist (fs , f2path ):
113
- f3path = f2path / f3
114
- tfiles += [
115
- f"{ redirector } { f3path !s} /{ f } "
116
- for f in _dirlist (fs , f3path )
117
- if f .endswith (".root" )
118
- ]
119
-
120
- if is_data :
121
- files [year ][sample ][f1 ] = tfiles
122
- print (f"\t \t \t \t \t { len (tfiles )} files" )
123
-
124
- if not is_data :
125
- files [year ][sample ][subsample_name ] = tfiles
126
- print (f"\t \t \t \t \t { len (tfiles )} files" )
194
+
195
+ if not is_data :
196
+ if subsample_name in files [year ][sample ]:
197
+ warnings .warn (
198
+ f"Duplicate subsample found! { subsample = } ({ subsample_name = } ) for { year = } " ,
199
+ stacklevel = 2
200
+ )
201
+ print (f"\t \t \t \t { subsample_name } " )
202
+
203
+ # Navigate through the directory structure (4 levels for new structure)
204
+ try :
205
+ for f1 in _dirlist (fs , spath ): # dataset directory
206
+ f1path = spath / f1
207
+ tfiles = [] # Reset for each dataset directory
208
+
209
+ for f2 in _dirlist (fs , f1path ): # timestamp directory
210
+ f2path = f1path / f2
211
+ for f3 in _dirlist (fs , f2path ): # chunk directory (0000, 0001, etc.)
212
+ f3path = f2path / f3
213
+ f3_contents = _dirlist (fs , f3path )
214
+ root_files = [f for f in f3_contents if f .endswith (".root" )]
215
+ if root_files :
216
+ tfiles += [f"{ redirector } { f3path !s} /{ f } " for f in root_files ]
217
+
218
+ # Process files for this specific dataset directory
219
+ if is_data :
220
+ run_info = f1 .replace ("_DAZSLE_PFNano" , "" )
221
+ subsample_key = f"{ sample } _{ run_info } "
222
+
223
+ if subsample_key not in files [year ][sample ]:
224
+ files [year ][sample ][subsample_key ] = []
225
+ files [year ][sample ][subsample_key ].extend (tfiles )
226
+ print (f"\t \t \t \t \t { len (tfiles )} files added" )
227
+
228
+ # Handle MC case outside the f1 loop since it processes all files together
229
+ if not is_data :
230
+ files [year ][sample ][subsample_name ] = tfiles
231
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
232
+
233
+ except FileNotFoundError :
234
+ print (f"\t \t \t \t Warning: Could not access { spath } " )
235
+ continue
236
+
237
+ else :
238
+ # Old structure: single year directory
239
+ ypath = base_dir / user / year
240
+ try :
241
+ tsamples = _dirlist (fs , ypath ) if samples is None else samples
242
+ except FileNotFoundError :
243
+ continue
244
+
245
+ for sample in tsamples :
246
+ if sample not in files [year ]:
247
+ files [year ][sample ] = {}
248
+ elif overwrite_sample :
249
+ warnings .warn (f"Overwriting existing sample { sample } " , stacklevel = 2 )
250
+ files [year ][sample ] = {}
251
+
252
+ print (f"\t \t \t { sample } " )
253
+ spath = ypath / sample
254
+
255
+ is_data = sample in hh_vars .DATA_SAMPLES
256
+
257
+ try :
258
+ tsubsamples = _dirlist (fs , spath ) if subsamples is None else subsamples
259
+ except FileNotFoundError :
260
+ continue
261
+
262
+ for subsample in tsubsamples :
263
+ subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
264
+ if not is_data :
265
+ if subsample_name in files [year ][sample ]:
266
+ warnings .warn (
267
+ f"Duplicate subsample found! { subsample_name } " , stacklevel = 2
268
+ )
269
+
270
+ print (f"\t \t \t \t { subsample_name } " )
271
+
272
+ sspath = spath / subsample
273
+ try :
274
+ for f1 in _dirlist (fs , sspath ):
275
+ # For Data files, f1 is the subsample name
276
+ if is_data :
277
+ if f1 in files [year ][sample ]:
278
+ warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
279
+
280
+ print (f"\t \t \t \t { f1 } " )
281
+
282
+ f1path = sspath / f1
283
+ for f2 in _dirlist (fs , f1path ):
284
+ f2path = f1path / f2
285
+ tfiles = []
286
+ for f3 in _dirlist (fs , f2path ):
287
+ f3path = f2path / f3
288
+ tfiles += [
289
+ f"{ redirector } { f3path !s} /{ f } "
290
+ for f in _dirlist (fs , f3path )
291
+ if f .endswith (".root" )
292
+ ]
293
+
294
+ if is_data :
295
+ files [year ][sample ][f1 ] = tfiles
296
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
297
+
298
+ if not is_data :
299
+ files [year ][sample ][subsample_name ] = tfiles
300
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
301
+
302
+ except FileNotFoundError :
303
+ print (f"\t \t \t \t Warning: Could not access { sspath } " )
304
+ continue
127
305
128
306
return files
129
307
130
-
131
308
def main ():
132
309
# Set up argument parser
133
310
parser = argparse .ArgumentParser ()
0 commit comments