Skip to content

Commit 238ccce

Browse files
committed
Categorize data samples
1 parent 18bf01e commit 238ccce

File tree

1 file changed

+74
-57
lines changed

1 file changed

+74
-57
lines changed

data/index_private_nano.py

Lines changed: 74 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -38,46 +38,71 @@ def _has_new_structure(fs, base_dir, user, years):
3838

3939
return False
4040

41-
def _get_sample_from_subsample(subsample_name):
41+
def _get_sample_from_subsample(subsample_name, is_data):
4242
"""
4343
Determine the sample name from the subsample name using the SAMPLES dictionary.
44+
Source:
45+
- https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_datasets.py
46+
- https://github.com/rkansal47/Run3_nano_submission/blob/40a74eeffd5d0b935629567dc291a32c9c43abb7/datasets/get_mc.py
4447
"""
4548
# If no match found, try to infer from common patterns
46-
if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name:
47-
if "VBF" in subsample_name:
48-
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
49+
if is_data:
50+
# Data
51+
if "JetHT" in subsample_name or "JetMET" in subsample_name:
52+
return "JetMET"
53+
elif "EGamma" in subsample_name:
54+
return "EGamma"
55+
elif "Muon" in subsample_name:
56+
return "Muon"
57+
elif "Tau" in subsample_name:
58+
return "Tau"
59+
elif "BTagMu" in subsample_name:
60+
return "BTagMu"
61+
elif "MuonEG" in subsample_name:
62+
return "MuonEG"
63+
elif "ParkingVBF" in subsample_name:
64+
return "ParkingVBF"
65+
elif "ParkingSingleMuon" in subsample_name:
66+
return "ParkingSingleMuon"
4967
else:
50-
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
51-
elif "Hto2B" in subsample_name:
52-
return "Hbb"
53-
elif "Hto2C" in subsample_name:
54-
return "Hcc"
55-
elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name:
56-
return "Htautau"
57-
elif "QCD-4Jets_HT" in subsample_name:
58-
return "QCD-4Jets_HT"
59-
elif "QCD_PT" in subsample_name:
60-
return "QCD_PT"
61-
elif "TTto" in subsample_name:
62-
return "TT"
63-
elif any(x in subsample_name for x in ["TbarWplus", "TWminus", "TbarBQ", "TBbarQ"]):
64-
return "SingleTop"
65-
elif "DYto2L-4Jets" in subsample_name:
66-
return "DYJetsLO"
67-
elif "DYto2L-2Jets" in subsample_name:
68-
return "DYJetsNLO"
69-
elif any(x in subsample_name for x in ["Wto2Q-3Jets", "WtoLNu-4Jets", "Zto2Q-4Jets"]):
70-
return "VJetsLO"
71-
elif any(x in subsample_name for x in ["Wto2Q-2Jets", "WtoLNu-2Jets", "Zto2Q-2Jets"]):
72-
return "VJetsNLO"
73-
elif any(x in subsample_name for x in ["WW_", "WZ_", "ZZ_", "WWto4Q", "WWtoLNu2Q", "WZto3LNu", "WZto4Q", "ZZto2L2Q", "ZZto4L"]):
74-
return "Diboson"
75-
elif any(x in subsample_name for x in ["VBFZto2Q", "VBFWto2Q", "VBFto2L", "VBFto2Nu", "VBFtoLNu"]):
76-
return "EWKV"
77-
elif any(x in subsample_name for x in ["WGtoLNuG", "WGto2QG", "ZGto2NuG", "ZGto2QG"]):
78-
return "VGamma"
68+
raise ValueError(f"Could not determine sample from subsample name: {subsample_name}. Please check the naming conventions.")
69+
else:
70+
# MC
71+
if "HHto4B" in subsample_name or "HHto2B2Tau" in subsample_name:
72+
if "VBF" in subsample_name:
73+
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
74+
else:
75+
return "HHbbtt" if "2B2Tau" in subsample_name else "HH4b"
76+
elif "Hto2B" in subsample_name:
77+
return "Hbb"
78+
elif "Hto2C" in subsample_name:
79+
return "Hcc"
80+
elif "Hto2Tau" in subsample_name or "HTo2Tau" in subsample_name:
81+
return "Htautau"
82+
elif "QCD-4Jets_HT" in subsample_name:
83+
return "QCD-4Jets_HT"
84+
elif "QCD_PT" in subsample_name:
85+
return "QCD_PT"
86+
elif "TTto" in subsample_name:
87+
return "TT"
88+
elif any(x in subsample_name for x in ["TbarWplus", "TWminus", "TbarBQ", "TBbarQ"]):
89+
return "SingleTop"
90+
elif "DYto2L-4Jets" in subsample_name:
91+
return "DYJetsLO"
92+
elif "DYto2L-2Jets" in subsample_name:
93+
return "DYJetsNLO"
94+
elif any(x in subsample_name for x in ["Wto2Q-3Jets", "WtoLNu-4Jets", "Zto2Q-4Jets"]):
95+
return "VJetsLO"
96+
elif any(x in subsample_name for x in ["Wto2Q-2Jets", "WtoLNu-2Jets", "Zto2Q-2Jets"]):
97+
return "VJetsNLO"
98+
elif any(x in subsample_name for x in ["WW_", "WZ_", "ZZ_", "WWto4Q", "WWtoLNu2Q", "WZto3LNu", "WZto4Q", "ZZto2L2Q", "ZZto4L"]):
99+
return "Diboson"
100+
elif any(x in subsample_name for x in ["VBFZto2Q", "VBFWto2Q", "VBFto2L", "VBFto2Nu", "VBFtoLNu"]):
101+
return "EWKV"
102+
elif any(x in subsample_name for x in ["WGtoLNuG", "WGto2QG", "ZGto2NuG", "ZGto2QG"]):
103+
return "VGamma"
79104

80-
raise ValueError(f"Could not determine sample from subsample name: {subsample_name}. Please check the naming conventions.")
105+
raise ValueError(f"Could not determine sample from subsample name: {subsample_name}. Please check the naming conventions.")
81106

82107

83108
def xrootd_index_private_nano(
@@ -119,14 +144,18 @@ def xrootd_index_private_nano(
119144

120145
if files is None:
121146
files = {}
147+
148+
# Check version
149+
if len(users) > 0:
150+
use_new_structure = _has_new_structure(fs, base_dir, users[0], years)
151+
print(f"Using {'new' if use_new_structure else 'old'} directory structure")
152+
else:
153+
# no users to search for
154+
return {}
122155

123156
for user in users:
124157
print(f"\t{user}")
125158

126-
# Check version
127-
use_new_structure = _has_new_structure(fs, base_dir, user, years)
128-
print(f"\t\tUsing {'new' if use_new_structure else 'old'} directory structure")
129-
130159
for year in years:
131160
print(f"\t\t{year}")
132161
if year not in files:
@@ -144,13 +173,7 @@ def xrootd_index_private_nano(
144173

145174
for subsample in tsubsamples:
146175
print(f"\t\t\tProcessing {subsample}")
147-
# For new structure, infer sample name from subsample
148-
if is_data:
149-
# For data, the subsample IS the sample (e.g., "Tau", "JetMET")
150-
sample = subsample
151-
else:
152-
# For MC, infer sample from subsample name
153-
sample = _get_sample_from_subsample(subsample)
176+
sample = _get_sample_from_subsample(subsample, is_data)
154177

155178
# Filter by samples if specified
156179
if samples is not None and sample not in samples:
@@ -181,12 +204,6 @@ def xrootd_index_private_nano(
181204
tfiles = []
182205
try:
183206
for f1 in _dirlist(fs, spath): # dataset directory
184-
# For Data files, f1 is the subsample name
185-
if is_data:
186-
if f1 in files[year][sample]:
187-
warnings.warn(f"Duplicate subsample found! {f1}", stacklevel=2)
188-
print(f"\t\t\t\t{f1}")
189-
190207
f1path = spath / f1
191208
for f2 in _dirlist(fs, f1path): # timestamp directory
192209
f2path = f1path / f2
@@ -198,8 +215,12 @@ def xrootd_index_private_nano(
198215
tfiles += [f"{redirector}{f3path!s}/{f}" for f in root_files]
199216

200217
if is_data:
201-
files[year][sample][f1] = tfiles
202-
print(f"\t\t\t\t\t{len(tfiles)} files")
218+
# For data, concatenate files from related subsamples
219+
# e.g. EGamma0 and EGamma1 should be combined
220+
if f1 not in files[year][sample]:
221+
files[year][sample][f1] = []
222+
files[year][sample][f1].extend(tfiles)
223+
print(f"\t\t\t\t\t{len(tfiles)} files added")
203224

204225
if not is_data:
205226
files[year][sample][subsample_name] = tfiles
@@ -208,10 +229,6 @@ def xrootd_index_private_nano(
208229
except FileNotFoundError:
209230
print(f"\t\t\t\tWarning: Could not access {spath}")
210231
continue
211-
212-
except FileNotFoundError:
213-
print(f"\t\t\t\tWarning: Could not access {spath}")
214-
continue
215232

216233
else:
217234
# Old structure: single year directory

0 commit comments

Comments
 (0)