Skip to content

Commit 18bf01e

Browse files
committed
Categorize subsamples
1 parent c8e393a commit 18bf01e

File tree

1 file changed

+133
-54
lines changed

1 file changed

+133
-54
lines changed

data/index_private_nano.py

Lines changed: 133 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,47 @@ def _has_new_structure(fs, base_dir, user, years):
3838

3939
return False
4040

41+
def _get_sample_from_subsample(subsample_name):
    """
    Infer the sample (process category) for an MC subsample from its name.

    The name is checked against hard-coded substring patterns in a fixed
    priority order; the first rule that matches decides the result.

    Args:
        subsample_name: Subsample name to classify, e.g. "TTto4Q".

    Returns:
        The sample name as a string, e.g. "TT", "Hbb" or "Diboson".

    Raises:
        ValueError: If none of the known patterns occurs in ``subsample_name``.
    """
    # Di-Higgs is tested first: "HHto2B2Tau" also contains "Hto2B", so it must
    # not fall through to the single-Higgs rules below. VBF and non-VBF
    # production map to the same sample, so no production-mode check is needed.
    if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name:
        return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"

    # Ordered (substring patterns, sample) rules, evaluated top to bottom.
    rules = (
        (("Hto2B",), "Hbb"),
        (("Hto2C",), "Hcc"),
        (("Hto2Tau", "HTo2Tau"), "Htautau"),
        (("QCD-4Jets_HT",), "QCD-4Jets_HT"),
        (("QCD_PT",), "QCD_PT"),
        (("TTto",), "TT"),
        (("TbarWplus", "TWminus", "TbarBQ", "TBbarQ"), "SingleTop"),
        (("DYto2L-4Jets",), "DYJetsLO"),
        (("DYto2L-2Jets",), "DYJetsNLO"),
        (("Wto2Q-3Jets", "WtoLNu-4Jets", "Zto2Q-4Jets"), "VJetsLO"),
        (("Wto2Q-2Jets", "WtoLNu-2Jets", "Zto2Q-2Jets"), "VJetsNLO"),
        (
            (
                "WW_",
                "WZ_",
                "ZZ_",
                "WWto4Q",
                "WWtoLNu2Q",
                "WZto3LNu",
                "WZto4Q",
                "ZZto2L2Q",
                "ZZto4L",
            ),
            "Diboson",
        ),
        (("VBFZto2Q", "VBFWto2Q", "VBFto2L", "VBFto2Nu", "VBFtoLNu"), "EWKV"),
        (("WGtoLNuG", "WGto2QG", "ZGto2NuG", "ZGto2QG"), "VGamma"),
    )
    for patterns, sample in rules:
        if any(pattern in subsample_name for pattern in patterns):
            return sample

    raise ValueError(
        f"Could not determine sample from subsample name: {subsample_name}. "
        "Please check the naming conventions."
    )
81+
4182

4283
def xrootd_index_private_nano(
4384
base_dir: str,
@@ -99,57 +140,87 @@ def xrootd_index_private_nano(
99140
else:
100141
ypath = base_dir / user / f"mc_{year}"
101142

102-
tsamples = _dirlist(fs, ypath) if samples is None else samples
143+
tsubsamples = _dirlist(fs, ypath) if subsamples is None else subsamples
144+
145+
for subsample in tsubsamples:
146+
print(f"\t\t\tProcessing {subsample}")
147+
# For new structure, infer sample name from subsample
148+
if is_data:
149+
# For data, the subsample IS the sample (e.g., "Tau", "JetMET")
150+
sample = subsample
151+
else:
152+
# For MC, infer sample from subsample name
153+
sample = _get_sample_from_subsample(subsample)
103154

104-
for sample in tsamples:
155+
# Filter by samples if specified
156+
if samples is not None and sample not in samples:
157+
continue
158+
105159
if sample not in files[year]:
106160
files[year][sample] = {}
107161
elif overwrite_sample:
108162
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
109163
files[year][sample] = {}
110164

111165
print(f"\t\t\t{sample}")
112-
spath = ypath / sample
166+
spath = ypath / subsample
113167

114-
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
115-
for subsample in tsubsamples:
116-
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
117-
if not is_data:
118-
if subsample_name in files[year][sample]:
119-
warnings.warn(
120-
f"Duplicate subsample found! {subsample_name}", stacklevel=2
121-
)
122-
123-
print(f"\t\t\t\t{subsample_name}")
168+
# Clean subsample name
169+
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
170+
print(f"\t\t\t\t{subsample_name}")
171+
172+
if not is_data:
173+
if subsample_name in files[year][sample]:
174+
warnings.warn(
175+
f"Duplicate subsample found! {subsample=} ({subsample_name=}) for {year=}",
176+
stacklevel=2
177+
)
178+
print(f"\t\t\t\t{subsample_name}")
124179

125-
sspath = spath / subsample
126-
for f1 in _dirlist(fs, sspath):
180+
# Navigate through the directory structure (4 levels for new structure)
181+
tfiles = []
182+
try:
183+
for f1 in _dirlist(fs, spath): # dataset directory
127184
# For Data files, f1 is the subsample name
128185
if is_data:
129186
if f1 in files[year][sample]:
130187
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
131188
print(f"\t\t\t\t{f1}")
132189

133-
f1path = sspath / f1
134-
for f2 in _dirlist(fs, f1path):
190+
f1path = spath / f1
191+
for f2 in _dirlist(fs, f1path): # timestamp directory
135192
f2path = f1path / f2
136-
tfiles = []
137-
f2_contents = _dirlist(fs, f2path)
138-
root_files = [f for f in f2_contents if f.endswith(".root")]
139-
if root_files:
140-
tfiles += [f"{redirector}{f2path!s}/{f}" for f in root_files]
193+
for f3 in _dirlist(fs, f2path): # chunk directory (0000, 0001, etc.)
194+
f3path = f2path / f3
195+
f3_contents = _dirlist(fs, f3path)
196+
root_files = [f for f in f3_contents if f.endswith(".root")]
197+
if root_files:
198+
tfiles += [f"{redirector}{f3path!s}/{f}" for f in root_files]
141199

142-
if is_data:
143-
files[year][sample][f1] = tfiles
144-
print(f"\t\t\t\t\t{len(tfiles)} files")
200+
if is_data:
201+
files[year][sample][f1] = tfiles
202+
print(f"\t\t\t\t\t{len(tfiles)} files")
145203

146204
if not is_data:
147205
files[year][sample][subsample_name] = tfiles
148206
print(f"\t\t\t\t\t{len(tfiles)} files")
207+
208+
except FileNotFoundError:
209+
print(f"\t\t\t\tWarning: Could not access {spath}")
210+
continue
211+
212+
except FileNotFoundError:
213+
print(f"\t\t\t\tWarning: Could not access {spath}")
214+
continue
215+
149216
else:
150217
# Old structure: single year directory
151218
ypath = base_dir / user / year
152-
tsamples = _dirlist(fs, ypath) if samples is None else samples
219+
try:
220+
tsamples = _dirlist(fs, ypath) if samples is None else samples
221+
except FileNotFoundError:
222+
continue
223+
153224
for sample in tsamples:
154225
if sample not in files[year]:
155226
files[year][sample] = {}
@@ -162,7 +233,11 @@ def xrootd_index_private_nano(
162233

163234
is_data = sample in hh_vars.DATA_SAMPLES
164235

165-
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
236+
try:
237+
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
238+
except FileNotFoundError:
239+
continue
240+
166241
for subsample in tsubsamples:
167242
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
168243
if not is_data:
@@ -174,36 +249,40 @@ def xrootd_index_private_nano(
174249
print(f"\t\t\t\t{subsample_name}")
175250

176251
sspath = spath / subsample
177-
for f1 in _dirlist(fs, sspath):
178-
# For Data files, f1 is the subsample name
179-
if is_data:
180-
if f1 in files[year][sample]:
181-
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
182-
183-
print(f"\t\t\t\t{f1}")
184-
185-
f1path = sspath / f1
186-
for f2 in _dirlist(fs, f1path):
187-
f2path = f1path / f2
188-
tfiles = []
189-
for f3 in _dirlist(fs, f2path):
190-
f3path = f2path / f3
191-
tfiles += [
192-
f"{redirector}{f3path!s}/{f}"
193-
for f in _dirlist(fs, f3path)
194-
if f.endswith(".root")
195-
]
196-
197-
if is_data:
198-
files[year][sample][f1] = tfiles
199-
print(f"\t\t\t\t\t{len(tfiles)} files")
252+
try:
253+
for f1 in _dirlist(fs, sspath):
254+
# For Data files, f1 is the subsample name
255+
if is_data:
256+
if f1 in files[year][sample]:
257+
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
200258

201-
if not is_data:
202-
files[year][sample][subsample_name] = tfiles
203-
print(f"\t\t\t\t\t{len(tfiles)} files")
259+
print(f"\t\t\t\t{f1}")
204260

205-
return files
261+
f1path = sspath / f1
262+
for f2 in _dirlist(fs, f1path):
263+
f2path = f1path / f2
264+
tfiles = []
265+
for f3 in _dirlist(fs, f2path):
266+
f3path = f2path / f3
267+
tfiles += [
268+
f"{redirector}{f3path!s}/{f}"
269+
for f in _dirlist(fs, f3path)
270+
if f.endswith(".root")
271+
]
272+
273+
if is_data:
274+
files[year][sample][f1] = tfiles
275+
print(f"\t\t\t\t\t{len(tfiles)} files")
206276

277+
if not is_data:
278+
files[year][sample][subsample_name] = tfiles
279+
print(f"\t\t\t\t\t{len(tfiles)} files")
280+
281+
except FileNotFoundError:
282+
print(f"\t\t\t\tWarning: Could not access {sspath}")
283+
continue
284+
285+
return files
207286

208287
def main():
209288
# Set up argument parser

0 commit comments

Comments
 (0)