
Commit a7589ef

Merge pull request #47755 from AdrianoDee/nogolden_and_run_das
Allowing No Golden and Run Based Skim Option of `das-up-to-nevents.py`
2 parents 6ffd83e + 115f34e commit a7589ef
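For illustration only (not part of the commit itself): the two new options would typically be combined on the command line roughly as `das-up-to-nevents.py -d /ZeroBias/Run2024J-v1/RAW --run 380115 380120 --nogolden`, where the run numbers are placeholders. `--run` restricts the DAS queries to the listed runs, and `--nogolden` skips the crosscheck against a Golden JSON for data certification.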


1 file changed (+125, -101 lines)

Configuration/PyReleaseValidation/scripts/das-up-to-nevents.py

Lines changed: 125 additions & 101 deletions
@@ -40,15 +40,18 @@ def das_do_command(cmd):
     out = subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode('utf8')
     return out.split("\n")
 
+def das_key(dataset):
+    return 'dataset='+dataset if "#" not in dataset else 'block='+dataset
+
 def das_file_site(dataset, site):
-    cmd = "dasgoclient --query='file dataset=%s site=%s'"%(dataset,site)
+    cmd = "dasgoclient --query='file %s site=%s'"%(das_key(dataset),site)
     out = das_do_command(cmd)
     df = pd.DataFrame(out,columns=["file"])
 
     return df
 
 def das_file_data(dataset,opt=""):
-    cmd = "dasgoclient --query='file dataset=%s %s| grep file.name, file.nevents'"%(dataset,opt)
+    cmd = "dasgoclient --query='file %s %s| grep file.name, file.nevents'"%(das_key(dataset),opt)
     out = das_do_command(cmd)
     out = [np.array(r.split(" "))[[0,3]] for r in out if len(r) > 0]
 
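As a minimal sketch (not part of the diff itself): the new `das_key` helper above selects the DAS query key depending on whether the input names a dataset or a block; the block hash below is the one quoted in the updated `--dataset` help string.

# Sketch: das_key chooses between a 'dataset=' and a 'block=' DAS selector.
def das_key(dataset):
    return 'dataset='+dataset if "#" not in dataset else 'block='+dataset

print(das_key("/DisplacedJet/Run2024C-v1/RAW"))
# -> dataset=/DisplacedJet/Run2024C-v1/RAW
print(das_key("/ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af"))
# -> block=/ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af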
@@ -58,7 +61,8 @@ def das_file_data(dataset,opt=""):
     return df
 
 def das_lumi_data(dataset,opt=""):
-    cmd = "dasgoclient --query='file,lumi,run dataset=%s %s'"%(dataset,opt)
+
+    cmd = "dasgoclient --query='file,lumi,run %s %s'"%(das_key(dataset),opt)
 
     out = das_do_command(cmd)
     out = [r.split(" ") for r in out if len(r)>0]
@@ -68,7 +72,7 @@ def das_lumi_data(dataset,opt=""):
     return df
 
 def das_run_events_data(dataset,run,opt=""):
-    cmd = "dasgoclient --query='file dataset=%s run=%s %s | sum(file.nevents) '"%(dataset,run,opt)
+    cmd = "dasgoclient --query='file %s run=%s %s | sum(file.nevents) '"%(das_key(dataset),run,opt)
     out = das_do_command(cmd)[0]
 
     out = [o for o in out.split(" ") if "sum" not in o]
@@ -77,7 +81,7 @@ def das_run_events_data(dataset,run,opt=""):
     return out
 
 def das_run_data(dataset,opt=""):
-    cmd = "dasgoclient --query='run dataset=%s %s '"%(dataset,opt)
+    cmd = "dasgoclient --query='run %s %s '"%(das_key(dataset),opt)
     out = das_do_command(cmd)
 
     return out
@@ -92,7 +96,7 @@ def no_intersection():
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser()
-    parser.add_argument('--dataset','-d', default=None, help="Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW' )",type=str,required=True)
+    parser.add_argument('--dataset','-d', default=None, help="Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW', may also be a block (e.g /ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af)",type=str,required=True)
     parser.add_argument('--threshold','-t', help ="Event threshold per file",type=int,default=-1)
     parser.add_argument('--events','-e', help ="Tot number of events targeted",type=int,default=-1)
     parser.add_argument('--outfile','-o', help='Dump results to file', type=str, default=None)
@@ -101,6 +105,8 @@ def no_intersection():
     parser.add_argument('--site','-s', help='Only data at specific site', type=str, default=None)
     parser.add_argument('--lumis','-l', help='Output file for lumi ranges for the selected files (if black no lumiranges calculated)', type=str, default=None)
     parser.add_argument('--precheck','-pc', action='store_true', help='Check run per run before building the dataframes, to avoid huge caching.')
+    parser.add_argument('--nogolden','-ng', action='store_true', help='Do not crosscheck the dataset run and lumis with a Golden json for data certification')
+    parser.add_argument('--run','-r', help ="Target a specific run",type=int,default=None,nargs="+")
     args = parser.parse_args()
 
     if args.proxy is not None:
@@ -117,108 +123,126 @@ def no_intersection():
     outfile = args.outfile
     site = args.site
     lumis = args.lumis
-
-    ## get the greatest golden json
-    year = dataset.split("Run")[1][2:4] # from 20XX to XX
-    PD = dataset.split("/")[1]
-    cert_type = "Collisions" + str(year)
-    if "Cosmics" in dataset:
-        cert_type = "Cosmics" + str(year)
-    elif "Commisioning" in dataset:
-        cert_type = "Commisioning2020"
-    elif "HI" in PD:
-        cert_type = "Collisions" + str(year) + "HI"
-
-    cert_path = base_cert_path + cert_type + "/"
-    web_fallback = False
-
-    ## if we have access to eos we get from there ...
-    if os.path.isdir(cert_path):
-        json_list = os.listdir(cert_path)
-        if len(json_list) == 0:
-            web_fallback == True
-        json_list = [c for c in json_list if "Golden" in c and "era" not in c]
-        json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
-    else:
-        web_fallback = True
-    ## ... if not we go to the website
-    if web_fallback:
-        cert_url = base_cert_url + cert_type + "/"
-        json_list = get_url_clean(cert_url).split("\n")
-        json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c]
-        json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]
-
-    # the larger the better, assuming file naming schema
-    # Cert_X_RunStart_RunFinish_Type.json
-    run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
-    latest_json = np.array(json_list[np.argmax(run_ranges)]).reshape(1,-1)[0].astype(str)
-    best_json = str(latest_json[0])
-    if not web_fallback:
-        with open(cert_path + "/" + best_json) as js:
-            golden = json.load(js)
-    else:
-        golden = get_url_clean(cert_url + best_json)
-        golden = ast.literal_eval(golden) #converts string to dict
-
-    # golden json with all the lumisections
-    golden_flat = {}
-    for k in golden:
-        R = []
-        for r in golden[k]:
-            R = R + [f for f in range(r[0],r[1]+1)]
-        golden_flat[k] = R
-
-    # let's just check there's an intersection between the
-    # dataset and the json
-    data_runs = das_run_data(dataset)
-    golden_data_runs = [r for r in data_runs if r in golden_flat]
-
-    if (len(golden_data_runs)==0):
-        no_intersection()
-
-    # building the dataframe, cleaning for bad lumis
-    golden_data_runs_tocheck = golden_data_runs
+    runs = args.run
     das_opt = ""
-    if testing or args.precheck:
-        golden_data_runs_tocheck = []
-        # Here we check run per run.
-        # This implies more dasgoclient queries, but smaller outputs
-        # useful when running the IB/PR tests not to have huge
-        # query results that have to be cached.
-
-        sum_events = 0
-
-        for r in golden_data_runs:
-            sum_events = sum_events + int(das_run_events_data(dataset,r))
-            golden_data_runs_tocheck.append(r)
-            if events > 0 and sum_events > events:
-                break
-
-        das_opt = "run in %s"%(str([int(g) for g in golden_data_runs_tocheck]))
-
+
+    if runs is not None:
+        das_opt = "run in %s"%(str([int(r) for r in runs]))
+
+    if not args.nogolden:
+
+        ## get the greatest golden json
+        year = dataset.split("Run")[1][2:4] # from 20XX to XX
+        PD = dataset.split("/")[1]
+        cert_type = "Collisions" + str(year)
+        if "Cosmics" in dataset:
+            cert_type = "Cosmics" + str(year)
+        elif "Commisioning" in dataset:
+            cert_type = "Commisioning2020"
+        elif "HI" in PD:
+            cert_type = "Collisions" + str(year) + "HI"
+
+        cert_path = base_cert_path + cert_type + "/"
+        web_fallback = False
+
+        ## if we have access to eos we get from there ...
+        if os.path.isdir(cert_path):
+            json_list = os.listdir(cert_path)
+            if len(json_list) == 0:
+                web_fallback == True
+            json_list = [c for c in json_list if "Golden" in c and "era" not in c]
+            json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
+        else:
+            web_fallback = True
+        ## ... if not we go to the website
+        if web_fallback:
+            cert_url = base_cert_url + cert_type + "/"
+            json_list = get_url_clean(cert_url).split("\n")
+            json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c]
+            json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]
+
+        # the larger the better, assuming file naming schema
+        # Cert_X_RunStart_RunFinish_Type.json
+        # TODO if args.run keep golden only with right range
+
+        run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
+        latest_json = np.array(json_list[np.argmax(run_ranges)]).reshape(1,-1)[0].astype(str)
+        best_json = str(latest_json[0])
+        if not web_fallback:
+            with open(cert_path + "/" + best_json) as js:
+                golden = json.load(js)
+        else:
+            golden = get_url_clean(cert_url + best_json)
+            golden = ast.literal_eval(golden) #converts string to dict
+
+        # skim for runs in input
+        if runs is not None:
+            for k in golden:
+                if k not in args.run:
+                    golden.pop(k)
+
+        # golden json with all the lumisections
+        golden_flat = {}
+        for k in golden:
+            R = []
+            for r in golden[k]:
+                R = R + [f for f in range(r[0],r[1]+1)]
+            golden_flat[k] = R
+
+        # let's just check there's an intersection between the
+        # dataset and the json
+        data_runs = das_run_data(dataset)
+        golden_data_runs = [r for r in data_runs if r in golden_flat]
+
+        if (len(golden_data_runs)==0):
+            no_intersection()
+
+        # building the dataframe, cleaning for bad lumis
+        golden_data_runs_tocheck = golden_data_runs
+
+        if testing or args.precheck:
+            golden_data_runs_tocheck = []
+            # Here we check run per run.
+            # This implies more dasgoclient queries, but smaller outputs
+            # useful when running the IB/PR tests not to have huge
+            # query results that have to be cached.
+
+            sum_events = 0
+
+            for r in golden_data_runs:
+                sum_events = sum_events + int(das_run_events_data(dataset,r))
+                golden_data_runs_tocheck.append(r)
+                if events > 0 and sum_events > events:
+                    break
+
+            das_opt = "run in %s"%(str([int(g) for g in golden_data_runs_tocheck]))
+
     df = das_lumi_data(dataset,opt=das_opt).merge(das_file_data(dataset,opt=das_opt),on="file",how="inner") # merge file informations with run and lumis
-
     df["lumis"] = [[int(ff) for ff in f.replace("[","").replace("]","").split(",")] for f in df.lumis.values]
-    df_rs = []
-    for r in golden_data_runs_tocheck:
-        cut = (df["run"] == r)
-        if not any(cut):
-            continue
+
+    if not args.nogolden:
+
+        df_rs = []
+        for r in golden_data_runs_tocheck:
+            cut = (df["run"] == r)
+            if not any(cut):
+                continue
 
-        df_r = df[cut]
+            df_r = df[cut]
 
-        # jumping low event content runs
-        if df_r["events"].sum() < threshold:
-            continue
+            # jumping low event content runs
+            if df_r["events"].sum() < threshold:
+                continue
 
-        good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
-        n_lumis = np.array([len(l) for l in df_r.lumis])
-        df_rs.append(df_r[good_lumis==n_lumis])
+            good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
+            n_lumis = np.array([len(l) for l in df_r.lumis])
+            df_rs.append(df_r[good_lumis==n_lumis])
 
-    if (len(df_rs)==0):
-        no_intersection()
+        if (len(df_rs)==0):
+            no_intersection()
+
+        df = pd.concat(df_rs)
 
-    df = pd.concat(df_rs)
     df.loc[:,"min_lumi"] = [min(f) for f in df.lumis]
     df.loc[:,"max_lumi"] = [max(f) for f in df.lumis]
     df = df.sort_values(["run","min_lumi","max_lumi"])
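As an aside, a small self-contained sketch (with made-up run numbers and lumi ranges, not part of the diff) of what the run skim and the lumi flattening in this hunk are meant to do:

# Sketch with placeholder data: keep only requested runs, then flatten
# each run's lumi ranges into an explicit list of good lumisections.
golden = {"380115": [[1, 5], [8, 10]], "380120": [[1, 3]]}
runs = [380115]  # as if passed via --run 380115

# keep only the requested runs (same intent as the "skim for runs in input" block)
golden = {k: v for k, v in golden.items() if int(k) in runs}

# flatten each run's lumi ranges
golden_flat = {}
for k in golden:
    R = []
    for r in golden[k]:
        R = R + [f for f in range(r[0], r[1] + 1)]
    golden_flat[k] = R

print(golden_flat)  # {'380115': [1, 2, 3, 4, 5, 8, 9, 10]}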
@@ -251,4 +275,4 @@ def no_intersection():
 
     sys.exit(0)
 
-
+