Skip to content

Commit 954a9b4

Browse files
authored
Merge pull request #14 from zichunhao/main
Add backward-compatible support for new directory structures in nano indexing
2 parents 489570d + 8e3bc0b commit 954a9b4

File tree

1 file changed

+238
-61
lines changed

1 file changed

+238
-61
lines changed

data/index_private_nano.py

Lines changed: 238 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,87 @@ def _dirlist(fs, path) -> list:
2323

2424
return [f.name for f in listing]
2525

26+
def _has_new_structure(fs, base_dir, user, years):
    """Tell whether ``user``'s area follows the new ``data_{year}`` / ``mc_{year}`` layout.

    The entries directly under ``base_dir / user`` are listed with ``_dirlist``;
    the layout counts as "new" as soon as one of the requested ``years`` has a
    ``data_{year}`` or ``mc_{year}`` entry there.

    Returns ``False`` when the listing raises ``FileNotFoundError`` or when no
    such entry exists (old ``{year}`` layout assumed by the caller).
    """
    try:
        entries = _dirlist(fs, base_dir / user)
    except FileNotFoundError:
        # Nothing to inspect for this user: treat as "not new".
        return False

    return any(
        f"{prefix}_{year}" in entries
        for year in years
        for prefix in ("data", "mc")
    )
40+
41+
def _get_sample_from_subsample(subsample_name, is_data):
42+
"""
43+
Determine the sample name from the subsample name using the SAMPLES dictionary.
44+
Source:
45+
- https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_datasets.py
46+
- https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_mc.py
47+
"""
48+
# If no match found, try to infer from common patterns
49+
if is_data:
50+
# Data
51+
if "JetHT" in subsample_name or "JetMET" in subsample_name:
52+
return "JetMET"
53+
elif "EGamma" in subsample_name:
54+
return "EGamma"
55+
elif "Muon" in subsample_name:
56+
return "Muon"
57+
elif "Tau" in subsample_name:
58+
return "Tau"
59+
elif "BTagMu" in subsample_name:
60+
return "BTagMu"
61+
elif "MuonEG" in subsample_name:
62+
return "MuonEG"
63+
elif "ParkingVBF" in subsample_name:
64+
return "ParkingVBF"
65+
elif "ParkingSingleMuon" in subsample_name:
66+
return "ParkingSingleMuon"
67+
else:
68+
raise ValueError(f"Could not determine sample from subsample name: {subsample_name}. Please check the naming conventions.")
69+
else:
70+
# MC
71+
if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name:
72+
if "VBF" in subsample_name:
73+
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
74+
else:
75+
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
76+
elif "Hto2B" in subsample_name:
77+
return "Hbb"
78+
elif "Hto2C" in subsample_name:
79+
return "Hcc"
80+
elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name:
81+
return "Htautau"
82+
elif "QCD-4Jets_HT" in subsample_name:
83+
return "QCD"
84+
elif "QCD_PT" in subsample_name:
85+
return "QCD_PT"
86+
elif "TTto" in subsample_name:
87+
return "TT"
88+
elif any(x in subsample_name for x in ["TbarWplus", "TWminus", "TbarBQ", "TBbarQ"]):
89+
return "SingleTop"
90+
elif "DYto2L-4Jets" in subsample_name:
91+
return "DYJetsLO"
92+
elif "DYto2L-2Jets" in subsample_name:
93+
return "DYJetsNLO"
94+
elif any(x in subsample_name for x in ["Wto2Q-3Jets", "WtoLNu-4Jets", "Zto2Q-4Jets"]):
95+
return "VJetsLO"
96+
elif any(x in subsample_name for x in ["Wto2Q-2Jets", "WtoLNu-2Jets", "Zto2Q-2Jets"]):
97+
return "VJetsNLO"
98+
elif any(x in subsample_name for x in ["WW_", "WZ_", "ZZ_", "WWto4Q", "WWtoLNu2Q", "WZto3LNu", "WZto4Q", "ZZto2L2Q", "ZZto4L"]):
99+
return "Diboson"
100+
elif any(x in subsample_name for x in ["VBFZto2Q", "VBFWto2Q", "VBFto2L", "VBFto2Nu", "VBFtoLNu"]):
101+
return "EWKV"
102+
elif any(x in subsample_name for x in ["WGtoLNuG", "WGto2QG", "ZGto2NuG", "ZGto2QG"]):
103+
return "VGamma"
104+
105+
raise ValueError(f"Could not determine sample from subsample name: {subsample_name}. Please check the naming conventions.")
106+
26107

27108
def xrootd_index_private_nano(
28109
base_dir: str,
@@ -39,21 +120,21 @@ def xrootd_index_private_nano(
39120
Can specify specific users, years, samples, and subsamples to search for;
40121
otherwise, it will search for all by default.
41122
42-
Files are organized as:
123+
Supports both old and new directory structures:
43124
125+
Old structure:
44126
MC:
45127
......redirector.......|...............base_dir....................|..user.|year|sample|
46128
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/HHbbtt/
47-
....................................subsample.......................................|
48-
GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_LHEweights_TuneCP5_13p6TeV_powheg-pythia8/
49-
.............................f1...........................|.....f2......|.f3.|......
50-
GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV/241028_235514/000*/*.root
129+
130+
New structure:
131+
MC:
132+
......redirector.......|...............base_dir....................|..user.|year|sample|
133+
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/mc_2022/HHbbtt/
51134
52135
Data:
53136
......redirector.......|...............base_dir....................|..user.|year|sample|
54-
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/Tau/
55-
.f1|..subsample.|.....f2......|.f3.|......
56-
Tau/Tau_Run2022D/241114_222843/000*/*.root
137+
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/data_2022/Tau/
57138
"""
58139
fs = client.FileSystem(redirector)
59140
base_dir = Path(base_dir)
@@ -63,71 +144,167 @@ def xrootd_index_private_nano(
63144

64145
if files is None:
65146
files = {}
147+
148+
# Check version
149+
if len(users) > 0:
150+
use_new_structure = _has_new_structure(fs, base_dir, users[0], years)
151+
print(f"Using {'new' if use_new_structure else 'old'} directory structure")
152+
else:
153+
# no users to search for
154+
return {}
66155

67156
for user in users:
68157
print(f"\t{user}")
158+
69159
for year in years:
70160
print(f"\t\t{year}")
71161
if year not in files:
72162
files[year] = {}
73163

74-
ypath = base_dir / user / year
75-
tsamples = _dirlist(fs, ypath) if samples is None else samples
76-
for sample in tsamples:
77-
if sample not in files[year]:
78-
files[year][sample] = {}
79-
elif overwrite_sample:
80-
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
81-
files[year][sample] = {}
82-
83-
print(f"\t\t\t{sample}")
84-
spath = ypath / sample
85-
86-
is_data = sample in hh_vars.DATA_SAMPLES
87-
88-
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
89-
for subsample in tsubsamples:
90-
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
91-
if not is_data:
92-
if subsample_name in files[year][sample]:
93-
warnings.warn(
94-
f"Duplicate subsample found! {subsample_name}", stacklevel=2
95-
)
96-
164+
if use_new_structure:
165+
# New structure: separate data_{year} and mc_{year} directories
166+
for is_data in (True, False):
167+
if is_data:
168+
ypath = base_dir / user / f"data_{year}"
169+
else:
170+
ypath = base_dir / user / f"mc_{year}"
171+
172+
tsubsamples = _dirlist(fs, ypath) if subsamples is None else subsamples
173+
174+
for subsample in tsubsamples:
175+
print(f"\t\t\tProcessing {subsample}")
176+
sample = _get_sample_from_subsample(subsample, is_data)
177+
178+
# Filter by samples if specified
179+
if samples is not None and sample not in samples:
180+
continue
181+
182+
if sample not in files[year]:
183+
files[year][sample] = {}
184+
elif overwrite_sample:
185+
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
186+
files[year][sample] = {}
187+
188+
print(f"\t\t\t{sample}")
189+
spath = ypath / subsample
190+
191+
# Clean subsample name
192+
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
97193
print(f"\t\t\t\t{subsample_name}")
98-
99-
sspath = spath / subsample
100-
for f1 in _dirlist(fs, sspath):
101-
# For Data files, f1 is the subsample name
102-
if is_data:
103-
if f1 in files[year][sample]:
104-
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
105-
106-
print(f"\t\t\t\t{f1}")
107-
108-
f1path = sspath / f1
109-
for f2 in _dirlist(fs, f1path):
110-
f2path = f1path / f2
111-
tfiles = []
112-
for f3 in _dirlist(fs, f2path):
113-
f3path = f2path / f3
114-
tfiles += [
115-
f"{redirector}{f3path!s}/{f}"
116-
for f in _dirlist(fs, f3path)
117-
if f.endswith(".root")
118-
]
119-
120-
if is_data:
121-
files[year][sample][f1] = tfiles
122-
print(f"\t\t\t\t\t{len(tfiles)} files")
123-
124-
if not is_data:
125-
files[year][sample][subsample_name] = tfiles
126-
print(f"\t\t\t\t\t{len(tfiles)} files")
194+
195+
if not is_data:
196+
if subsample_name in files[year][sample]:
197+
warnings.warn(
198+
f"Duplicate subsample found! {subsample=} ({subsample_name=}) for {year=}",
199+
stacklevel=2
200+
)
201+
print(f"\t\t\t\t{subsample_name}")
202+
203+
# Navigate through the directory structure (4 levels for new structure)
204+
try:
205+
for f1 in _dirlist(fs, spath): # dataset directory
206+
f1path = spath / f1
207+
tfiles = [] # Reset for each dataset directory
208+
209+
for f2 in _dirlist(fs, f1path): # timestamp directory
210+
f2path = f1path / f2
211+
for f3 in _dirlist(fs, f2path): # chunk directory (0000, 0001, etc.)
212+
f3path = f2path / f3
213+
f3_contents = _dirlist(fs, f3path)
214+
root_files = [f for f in f3_contents if f.endswith(".root")]
215+
if root_files:
216+
tfiles += [f"{redirector}{f3path!s}/{f}" for f in root_files]
217+
218+
# Process files for this specific dataset directory
219+
if is_data:
220+
run_info = f1.replace("_DAZSLE_PFNano", "")
221+
subsample_key = f"{sample}_{run_info}"
222+
223+
if subsample_key not in files[year][sample]:
224+
files[year][sample][subsample_key] = []
225+
files[year][sample][subsample_key].extend(tfiles)
226+
print(f"\t\t\t\t\t{len(tfiles)} files added")
227+
228+
# Handle MC case outside the f1 loop since it processes all files together
229+
if not is_data:
230+
files[year][sample][subsample_name] = tfiles
231+
print(f"\t\t\t\t\t{len(tfiles)} files")
232+
233+
except FileNotFoundError:
234+
print(f"\t\t\t\tWarning: Could not access {spath}")
235+
continue
236+
237+
else:
238+
# Old structure: single year directory
239+
ypath = base_dir / user / year
240+
try:
241+
tsamples = _dirlist(fs, ypath) if samples is None else samples
242+
except FileNotFoundError:
243+
continue
244+
245+
for sample in tsamples:
246+
if sample not in files[year]:
247+
files[year][sample] = {}
248+
elif overwrite_sample:
249+
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
250+
files[year][sample] = {}
251+
252+
print(f"\t\t\t{sample}")
253+
spath = ypath / sample
254+
255+
is_data = sample in hh_vars.DATA_SAMPLES
256+
257+
try:
258+
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
259+
except FileNotFoundError:
260+
continue
261+
262+
for subsample in tsubsamples:
263+
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
264+
if not is_data:
265+
if subsample_name in files[year][sample]:
266+
warnings.warn(
267+
f"Duplicate subsample found! {subsample_name}", stacklevel=2
268+
)
269+
270+
print(f"\t\t\t\t{subsample_name}")
271+
272+
sspath = spath / subsample
273+
try:
274+
for f1 in _dirlist(fs, sspath):
275+
# For Data files, f1 is the subsample name
276+
if is_data:
277+
if f1 in files[year][sample]:
278+
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
279+
280+
print(f"\t\t\t\t{f1}")
281+
282+
f1path = sspath / f1
283+
for f2 in _dirlist(fs, f1path):
284+
f2path = f1path / f2
285+
tfiles = []
286+
for f3 in _dirlist(fs, f2path):
287+
f3path = f2path / f3
288+
tfiles += [
289+
f"{redirector}{f3path!s}/{f}"
290+
for f in _dirlist(fs, f3path)
291+
if f.endswith(".root")
292+
]
293+
294+
if is_data:
295+
files[year][sample][f1] = tfiles
296+
print(f"\t\t\t\t\t{len(tfiles)} files")
297+
298+
if not is_data:
299+
files[year][sample][subsample_name] = tfiles
300+
print(f"\t\t\t\t\t{len(tfiles)} files")
301+
302+
except FileNotFoundError:
303+
print(f"\t\t\t\tWarning: Could not access {sspath}")
304+
continue
127305

128306
return files
129307

130-
131308
def main():
132309
# Set up argument parser
133310
parser = argparse.ArgumentParser()

0 commit comments

Comments
 (0)