Skip to content

Commit c8e393a

Browse files
committed
Update xrootd_index_private_nano to support new directory structures
1 parent 190318e commit c8e393a

File tree

1 file changed

+137
-60
lines changed

1 file changed

+137
-60
lines changed

data/index_private_nano.py

Lines changed: 137 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,21 @@ def _dirlist(fs, path) -> list:
2323

2424
return [f.name for f in listing]
2525

26+
def _has_new_structure(fs, base_dir, user, years):
    """Report whether a user's area follows the new per-type directory layout.

    The new layout keeps ``data_{year}`` and ``mc_{year}`` directories directly
    under ``base_dir / user``; the old layout uses plain ``{year}`` directories.

    Returns ``True`` as soon as one such directory is found for any entry of
    ``years``. Returns ``False`` when none is found, or when listing the user
    directory raises ``FileNotFoundError``.
    """
    user_dir = base_dir / user
    try:
        entries = _dirlist(fs, user_dir)
    except FileNotFoundError:
        return False

    # One data_/mc_ directory for any requested year is enough to decide.
    return any(
        f"{prefix}_{year}" in entries
        for year in years
        for prefix in ("data", "mc")
    )
40+
2641

2742
def xrootd_index_private_nano(
2843
base_dir: str,
@@ -39,21 +54,21 @@ def xrootd_index_private_nano(
3954
Can specify specific users, years, samples, and subsamples to search for;
4055
otherwise, it will search for all by default.
4156
42-
Files are organized as:
57+
Supports both old and new directory structures:
4358
59+
Old structure:
4460
MC:
4561
......redirector.......|...............base_dir....................|..user.|year|sample|
4662
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/HHbbtt/
47-
....................................subsample.......................................|
48-
GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_LHEweights_TuneCP5_13p6TeV_powheg-pythia8/
49-
.............................f1...........................|.....f2......|.f3.|......
50-
GluGlutoHHto2B2Tau_kl-1p00_kt-1p00_c2-0p00_TuneCP5_13p6TeV/241028_235514/000*/*.root
63+
64+
New structure:
65+
MC:
66+
......redirector.......|...............base_dir....................|..user.|year|sample|
67+
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/mc_2022/HHbbtt/
5168
5269
Data:
5370
......redirector.......|...............base_dir....................|..user.|year|sample|
54-
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/2022/Tau/
55-
.f1|..subsample.|.....f2......|.f3.|......
56-
Tau/Tau_Run2022D/241114_222843/000*/*.root
71+
root://cmseos.fnal.gov//store/user/lpcdihiggsboost/NanoAOD_v12_ParT/rkansal/data_2022/Tau/
5772
"""
5873
fs = client.FileSystem(redirector)
5974
base_dir = Path(base_dir)
@@ -66,65 +81,127 @@ def xrootd_index_private_nano(
6681

6782
for user in users:
6883
print(f"\t{user}")
84+
85+
# Detect whether this user's area uses the new (data_{year}/mc_{year}) or old ({year}) directory layout
86+
use_new_structure = _has_new_structure(fs, base_dir, user, years)
87+
print(f"\t\tUsing {'new' if use_new_structure else 'old'} directory structure")
88+
6989
for year in years:
7090
print(f"\t\t{year}")
7191
if year not in files:
7292
files[year] = {}
7393

74-
ypath = base_dir / user / year
75-
tsamples = _dirlist(fs, ypath) if samples is None else samples
76-
for sample in tsamples:
77-
if sample not in files[year]:
78-
files[year][sample] = {}
79-
elif overwrite_sample:
80-
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
81-
files[year][sample] = {}
82-
83-
print(f"\t\t\t{sample}")
84-
spath = ypath / sample
85-
86-
is_data = sample in hh_vars.DATA_SAMPLES
87-
88-
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
89-
for subsample in tsubsamples:
90-
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
91-
if not is_data:
92-
if subsample_name in files[year][sample]:
93-
warnings.warn(
94-
f"Duplicate subsample found! {subsample_name}", stacklevel=2
95-
)
96-
97-
print(f"\t\t\t\t{subsample_name}")
98-
99-
sspath = spath / subsample
100-
for f1 in _dirlist(fs, sspath):
101-
# For Data files, f1 is the subsample name
102-
if is_data:
103-
if f1 in files[year][sample]:
104-
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
105-
106-
print(f"\t\t\t\t{f1}")
107-
108-
f1path = sspath / f1
109-
for f2 in _dirlist(fs, f1path):
110-
f2path = f1path / f2
111-
tfiles = []
112-
for f3 in _dirlist(fs, f2path):
113-
f3path = f2path / f3
114-
tfiles += [
115-
f"{redirector}{f3path!s}/{f}"
116-
for f in _dirlist(fs, f3path)
117-
if f.endswith(".root")
118-
]
119-
120-
if is_data:
121-
files[year][sample][f1] = tfiles
94+
if use_new_structure:
95+
# New structure: separate data_{year} and mc_{year} directories
96+
for is_data in (True, False):
97+
if is_data:
98+
ypath = base_dir / user / f"data_{year}"
99+
else:
100+
ypath = base_dir / user / f"mc_{year}"
101+
102+
tsamples = _dirlist(fs, ypath) if samples is None else samples
103+
104+
for sample in tsamples:
105+
if sample not in files[year]:
106+
files[year][sample] = {}
107+
elif overwrite_sample:
108+
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
109+
files[year][sample] = {}
110+
111+
print(f"\t\t\t{sample}")
112+
spath = ypath / sample
113+
114+
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
115+
for subsample in tsubsamples:
116+
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
117+
if not is_data:
118+
if subsample_name in files[year][sample]:
119+
warnings.warn(
120+
f"Duplicate subsample found! {subsample_name}", stacklevel=2
121+
)
122+
123+
print(f"\t\t\t\t{subsample_name}")
124+
125+
sspath = spath / subsample
126+
for f1 in _dirlist(fs, sspath):
127+
# For Data files, f1 is the subsample name
128+
if is_data:
129+
if f1 in files[year][sample]:
130+
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
131+
print(f"\t\t\t\t{f1}")
132+
133+
f1path = sspath / f1
134+
for f2 in _dirlist(fs, f1path):
135+
f2path = f1path / f2
136+
tfiles = []
137+
f2_contents = _dirlist(fs, f2path)
138+
root_files = [f for f in f2_contents if f.endswith(".root")]
139+
if root_files:
140+
tfiles += [f"{redirector}{f2path!s}/{f}" for f in root_files]
141+
142+
if is_data:
143+
files[year][sample][f1] = tfiles
144+
print(f"\t\t\t\t\t{len(tfiles)} files")
145+
146+
if not is_data:
147+
files[year][sample][subsample_name] = tfiles
148+
print(f"\t\t\t\t\t{len(tfiles)} files")
149+
else:
150+
# Old structure: single year directory
151+
ypath = base_dir / user / year
152+
tsamples = _dirlist(fs, ypath) if samples is None else samples
153+
for sample in tsamples:
154+
if sample not in files[year]:
155+
files[year][sample] = {}
156+
elif overwrite_sample:
157+
warnings.warn(f"Overwriting existing sample {sample}", stacklevel=2)
158+
files[year][sample] = {}
159+
160+
print(f"\t\t\t{sample}")
161+
spath = ypath / sample
162+
163+
is_data = sample in hh_vars.DATA_SAMPLES
164+
165+
tsubsamples = _dirlist(fs, spath) if subsamples is None else subsamples
166+
for subsample in tsubsamples:
167+
subsample_name = subsample.split("_TuneCP5")[0].split("_LHEweights")[0]
168+
if not is_data:
169+
if subsample_name in files[year][sample]:
170+
warnings.warn(
171+
f"Duplicate subsample found! {subsample_name}", stacklevel=2
172+
)
173+
174+
print(f"\t\t\t\t{subsample_name}")
175+
176+
sspath = spath / subsample
177+
for f1 in _dirlist(fs, sspath):
178+
# For Data files, f1 is the subsample name
179+
if is_data:
180+
if f1 in files[year][sample]:
181+
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
182+
183+
print(f"\t\t\t\t{f1}")
184+
185+
f1path = sspath / f1
186+
for f2 in _dirlist(fs, f1path):
187+
f2path = f1path / f2
188+
tfiles = []
189+
for f3 in _dirlist(fs, f2path):
190+
f3path = f2path / f3
191+
tfiles += [
192+
f"{redirector}{f3path!s}/{f}"
193+
for f in _dirlist(fs, f3path)
194+
if f.endswith(".root")
195+
]
196+
197+
if is_data:
198+
files[year][sample][f1] = tfiles
199+
print(f"\t\t\t\t\t{len(tfiles)} files")
200+
201+
if not is_data:
202+
files[year][sample][subsample_name] = tfiles
122203
print(f"\t\t\t\t\t{len(tfiles)} files")
123204

124-
if not is_data:
125-
files[year][sample][subsample_name] = tfiles
126-
print(f"\t\t\t\t\t{len(tfiles)} files")
127-
128205
return files
129206

130207

0 commit comments

Comments
 (0)