@@ -38,6 +38,47 @@ def _has_new_structure(fs, base_dir, user, years):
38
38
39
39
return False
40
40
41
+ def _get_sample_from_subsample (subsample_name ):
42
+ """
43
+ Determine the sample name from the subsample name using the SAMPLES dictionary.
44
+ """
45
+ # If no match found, try to infer from common patterns
46
+ if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name :
47
+ if "VBF" in subsample_name :
48
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
49
+ else :
50
+ return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
51
+ elif "Hto2B" in subsample_name :
52
+ return "Hbb"
53
+ elif "Hto2C" in subsample_name :
54
+ return "Hcc"
55
+ elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name :
56
+ return "Htautau"
57
+ elif "QCD-4Jets_HT" in subsample_name :
58
+ return "QCD-4Jets_HT"
59
+ elif "QCD_PT" in subsample_name :
60
+ return "QCD_PT"
61
+ elif "TTto" in subsample_name :
62
+ return "TT"
63
+ elif any (x in subsample_name for x in ["TbarWplus" , "TWminus" , "TbarBQ" , "TBbarQ" ]):
64
+ return "SingleTop"
65
+ elif "DYto2L-4Jets" in subsample_name :
66
+ return "DYJetsLO"
67
+ elif "DYto2L-2Jets" in subsample_name :
68
+ return "DYJetsNLO"
69
+ elif any (x in subsample_name for x in ["Wto2Q-3Jets" , "WtoLNu-4Jets" , "Zto2Q-4Jets" ]):
70
+ return "VJetsLO"
71
+ elif any (x in subsample_name for x in ["Wto2Q-2Jets" , "WtoLNu-2Jets" , "Zto2Q-2Jets" ]):
72
+ return "VJetsNLO"
73
+ elif any (x in subsample_name for x in ["WW_" , "WZ_" , "ZZ_" , "WWto4Q" , "WWtoLNu2Q" , "WZto3LNu" , "WZto4Q" , "ZZto2L2Q" , "ZZto4L" ]):
74
+ return "Diboson"
75
+ elif any (x in subsample_name for x in ["VBFZto2Q" , "VBFWto2Q" , "VBFto2L" , "VBFto2Nu" , "VBFtoLNu" ]):
76
+ return "EWKV"
77
+ elif any (x in subsample_name for x in ["WGtoLNuG" , "WGto2QG" , "ZGto2NuG" , "ZGto2QG" ]):
78
+ return "VGamma"
79
+
80
+ raise ValueError (f"Could not determine sample from subsample name: { subsample_name } . Please check the naming conventions." )
81
+
41
82
42
83
def xrootd_index_private_nano (
43
84
base_dir : str ,
@@ -99,57 +140,87 @@ def xrootd_index_private_nano(
99
140
else :
100
141
ypath = base_dir / user / f"mc_{ year } "
101
142
102
- tsamples = _dirlist (fs , ypath ) if samples is None else samples
143
+ tsubsamples = _dirlist (fs , ypath ) if subsamples is None else subsamples
144
+
145
+ for subsample in tsubsamples :
146
+ print (f"\t \t \t Processing { subsample } " )
147
+ # For new structure, infer sample name from subsample
148
+ if is_data :
149
+ # For data, the subsample IS the sample (e.g., "Tau", "JetMET")
150
+ sample = subsample
151
+ else :
152
+ # For MC, infer sample from subsample name
153
+ sample = _get_sample_from_subsample (subsample )
103
154
104
- for sample in tsamples :
155
+ # Filter by samples if specified
156
+ if samples is not None and sample not in samples :
157
+ continue
158
+
105
159
if sample not in files [year ]:
106
160
files [year ][sample ] = {}
107
161
elif overwrite_sample :
108
162
warnings .warn (f"Overwriting existing sample { sample } " , stacklevel = 2 )
109
163
files [year ][sample ] = {}
110
164
111
165
print (f"\t \t \t { sample } " )
112
- spath = ypath / sample
166
+ spath = ypath / subsample
113
167
114
- tsubsamples = _dirlist (fs , spath ) if subsamples is None else subsamples
115
- for subsample in tsubsamples :
116
- subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
117
- if not is_data :
118
- if subsample_name in files [year ][sample ]:
119
- warnings .warn (
120
- f"Duplicate subsample found! { subsample_name } " , stacklevel = 2
121
- )
122
-
123
- print (f"\t \t \t \t { subsample_name } " )
168
+ # Clean subsample name
169
+ subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
170
+ print (f"\t \t \t \t { subsample_name } " )
171
+
172
+ if not is_data :
173
+ if subsample_name in files [year ][sample ]:
174
+ warnings .warn (
175
+ f"Duplicate subsample found! { subsample = } ({ subsample_name = } ) for { year = } " ,
176
+ stacklevel = 2
177
+ )
178
+ print (f"\t \t \t \t { subsample_name } " )
124
179
125
- sspath = spath / subsample
126
- for f1 in _dirlist (fs , sspath ):
180
+ # Navigate through the directory structure (4 levels for new structure)
181
+ tfiles = []
182
+ try :
183
+ for f1 in _dirlist (fs , spath ): # dataset directory
127
184
# For Data files, f1 is the subsample name
128
185
if is_data :
129
186
if f1 in files [year ][sample ]:
130
187
warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
131
188
print (f"\t \t \t \t { f1 } " )
132
189
133
- f1path = sspath / f1
134
- for f2 in _dirlist (fs , f1path ):
190
+ f1path = spath / f1
191
+ for f2 in _dirlist (fs , f1path ): # timestamp directory
135
192
f2path = f1path / f2
136
- tfiles = []
137
- f2_contents = _dirlist (fs , f2path )
138
- root_files = [f for f in f2_contents if f .endswith (".root" )]
139
- if root_files :
140
- tfiles += [f"{ redirector } { f2path !s} /{ f } " for f in root_files ]
193
+ for f3 in _dirlist (fs , f2path ): # chunk directory (0000, 0001, etc.)
194
+ f3path = f2path / f3
195
+ f3_contents = _dirlist (fs , f3path )
196
+ root_files = [f for f in f3_contents if f .endswith (".root" )]
197
+ if root_files :
198
+ tfiles += [f"{ redirector } { f3path !s} /{ f } " for f in root_files ]
141
199
142
- if is_data :
143
- files [year ][sample ][f1 ] = tfiles
144
- print (f"\t \t \t \t \t { len (tfiles )} files" )
200
+ if is_data :
201
+ files [year ][sample ][f1 ] = tfiles
202
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
145
203
146
204
if not is_data :
147
205
files [year ][sample ][subsample_name ] = tfiles
148
206
print (f"\t \t \t \t \t { len (tfiles )} files" )
207
+
208
+ except FileNotFoundError :
209
+ print (f"\t \t \t \t Warning: Could not access { spath } " )
210
+ continue
211
+
212
+ except FileNotFoundError :
213
+ print (f"\t \t \t \t Warning: Could not access { spath } " )
214
+ continue
215
+
149
216
else :
150
217
# Old structure: single year directory
151
218
ypath = base_dir / user / year
152
- tsamples = _dirlist (fs , ypath ) if samples is None else samples
219
+ try :
220
+ tsamples = _dirlist (fs , ypath ) if samples is None else samples
221
+ except FileNotFoundError :
222
+ continue
223
+
153
224
for sample in tsamples :
154
225
if sample not in files [year ]:
155
226
files [year ][sample ] = {}
@@ -162,7 +233,11 @@ def xrootd_index_private_nano(
162
233
163
234
is_data = sample in hh_vars .DATA_SAMPLES
164
235
165
- tsubsamples = _dirlist (fs , spath ) if subsamples is None else subsamples
236
+ try :
237
+ tsubsamples = _dirlist (fs , spath ) if subsamples is None else subsamples
238
+ except FileNotFoundError :
239
+ continue
240
+
166
241
for subsample in tsubsamples :
167
242
subsample_name = subsample .split ("_TuneCP5" )[0 ].split ("_LHEweights" )[0 ]
168
243
if not is_data :
@@ -174,36 +249,40 @@ def xrootd_index_private_nano(
174
249
print (f"\t \t \t \t { subsample_name } " )
175
250
176
251
sspath = spath / subsample
177
- for f1 in _dirlist (fs , sspath ):
178
- # For Data files, f1 is the subsample name
179
- if is_data :
180
- if f1 in files [year ][sample ]:
181
- warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
182
-
183
- print (f"\t \t \t \t { f1 } " )
184
-
185
- f1path = sspath / f1
186
- for f2 in _dirlist (fs , f1path ):
187
- f2path = f1path / f2
188
- tfiles = []
189
- for f3 in _dirlist (fs , f2path ):
190
- f3path = f2path / f3
191
- tfiles += [
192
- f"{ redirector } { f3path !s} /{ f } "
193
- for f in _dirlist (fs , f3path )
194
- if f .endswith (".root" )
195
- ]
196
-
197
- if is_data :
198
- files [year ][sample ][f1 ] = tfiles
199
- print (f"\t \t \t \t \t { len (tfiles )} files" )
252
+ try :
253
+ for f1 in _dirlist (fs , sspath ):
254
+ # For Data files, f1 is the subsample name
255
+ if is_data :
256
+ if f1 in files [year ][sample ]:
257
+ warnings .warn (f"Duplicate subsample found! { f1 } " , stacklevel = 2 )
200
258
201
- if not is_data :
202
- files [year ][sample ][subsample_name ] = tfiles
203
- print (f"\t \t \t \t \t { len (tfiles )} files" )
259
+ print (f"\t \t \t \t { f1 } " )
204
260
205
- return files
261
+ f1path = sspath / f1
262
+ for f2 in _dirlist (fs , f1path ):
263
+ f2path = f1path / f2
264
+ tfiles = []
265
+ for f3 in _dirlist (fs , f2path ):
266
+ f3path = f2path / f3
267
+ tfiles += [
268
+ f"{ redirector } { f3path !s} /{ f } "
269
+ for f in _dirlist (fs , f3path )
270
+ if f .endswith (".root" )
271
+ ]
272
+
273
+ if is_data :
274
+ files [year ][sample ][f1 ] = tfiles
275
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
206
276
277
+ if not is_data :
278
+ files [year ][sample ][subsample_name ] = tfiles
279
+ print (f"\t \t \t \t \t { len (tfiles )} files" )
280
+
281
+ except FileNotFoundError :
282
+ print (f"\t \t \t \t Warning: Could not access { sspath } " )
283
+ continue
284
+
285
+ return files
207
286
208
287
def main ():
209
288
# Set up argument parser
0 commit comments