@@ -40,15 +40,18 @@ def das_do_command(cmd):
4040 out = subprocess .check_output (cmd , shell = True , executable = "/bin/bash" ).decode ('utf8' )
4141 return out .split ("\n " )
4242
def das_key(dataset):
    """Build the DAS query selector for *dataset*.

    A name containing '#' denotes a block (dataset#block-uuid), which DAS
    must be queried with 'block='; otherwise the plain 'dataset=' key is used.
    """
    if "#" in dataset:
        return "block=" + dataset
    return "dataset=" + dataset
def das_file_site(dataset, site):
    """Query DAS for the files of *dataset* (or block) available at *site*.

    Returns a single-column DataFrame ("file"), one row per line of
    dasgoclient output (the trailing empty line from the split included,
    matching the raw command output).
    """
    query = "dasgoclient --query='file %s site=%s'" % (das_key(dataset), site)
    files = das_do_command(query)
    return pd.DataFrame(files, columns=["file"])
4952
5053def das_file_data (dataset ,opt = "" ):
51- cmd = "dasgoclient --query='file dataset= %s %s| grep file.name, file.nevents'" % (dataset ,opt )
54+ cmd = "dasgoclient --query='file %s %s| grep file.name, file.nevents'" % (das_key ( dataset ) ,opt )
5255 out = das_do_command (cmd )
5356 out = [np .array (r .split (" " ))[[0 ,3 ]] for r in out if len (r ) > 0 ]
5457
@@ -58,7 +61,8 @@ def das_file_data(dataset,opt=""):
5861 return df
5962
6063def das_lumi_data (dataset ,opt = "" ):
61- cmd = "dasgoclient --query='file,lumi,run dataset=%s %s'" % (dataset ,opt )
64+
65+ cmd = "dasgoclient --query='file,lumi,run %s %s'" % (das_key (dataset ),opt )
6266
6367 out = das_do_command (cmd )
6468 out = [r .split (" " ) for r in out if len (r )> 0 ]
@@ -68,7 +72,7 @@ def das_lumi_data(dataset,opt=""):
6872 return df
6973
7074def das_run_events_data (dataset ,run ,opt = "" ):
71- cmd = "dasgoclient --query='file dataset= %s run=%s %s | sum(file.nevents) '" % (dataset ,run ,opt )
75+ cmd = "dasgoclient --query='file %s run=%s %s | sum(file.nevents) '" % (das_key ( dataset ) ,run ,opt )
7276 out = das_do_command (cmd )[0 ]
7377
7478 out = [o for o in out .split (" " ) if "sum" not in o ]
@@ -77,7 +81,7 @@ def das_run_events_data(dataset,run,opt=""):
7781 return out
7882
def das_run_data(dataset, opt=""):
    """Return the raw dasgoclient output lines listing the runs of *dataset*.

    *opt* is appended verbatim to the DAS query (e.g. a 'run in [...]' filter).
    """
    query = "dasgoclient --query='run %s %s '" % (das_key(dataset), opt)
    return das_do_command(query)
@@ -92,7 +96,7 @@ def no_intersection():
9296if __name__ == '__main__' :
9397
9498 parser = argparse .ArgumentParser ()
95- parser .add_argument ('--dataset' ,'-d' , default = None , help = "Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW' )" ,type = str ,required = True )
99+ parser .add_argument ('--dataset' ,'-d' , default = None , help = "Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW', may also be a block (e.g /ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af )" ,type = str ,required = True )
96100 parser .add_argument ('--threshold' ,'-t' , help = "Event threshold per file" ,type = int ,default = - 1 )
97101 parser .add_argument ('--events' ,'-e' , help = "Tot number of events targeted" ,type = int ,default = - 1 )
98102 parser .add_argument ('--outfile' ,'-o' , help = 'Dump results to file' , type = str , default = None )
@@ -101,6 +105,8 @@ def no_intersection():
101105 parser .add_argument ('--site' ,'-s' , help = 'Only data at specific site' , type = str , default = None )
102106 parser .add_argument ('--lumis' ,'-l' , help = 'Output file for lumi ranges for the selected files (if black no lumiranges calculated)' , type = str , default = None )
103107 parser .add_argument ('--precheck' ,'-pc' , action = 'store_true' , help = 'Check run per run before building the dataframes, to avoid huge caching.' )
108+ parser .add_argument ('--nogolden' ,'-ng' , action = 'store_true' , help = 'Do not crosscheck the dataset run and lumis with a Golden json for data certification' )
109+ parser .add_argument ('--run' ,'-r' , help = "Target a specific run" ,type = int ,default = None ,nargs = "+" )
104110 args = parser .parse_args ()
105111
106112 if args .proxy is not None :
@@ -117,108 +123,126 @@ def no_intersection():
117123 outfile = args .outfile
118124 site = args .site
119125 lumis = args .lumis
120-
121- ## get the greatest golden json
122- year = dataset .split ("Run" )[1 ][2 :4 ] # from 20XX to XX
123- PD = dataset .split ("/" )[1 ]
124- cert_type = "Collisions" + str (year )
125- if "Cosmics" in dataset :
126- cert_type = "Cosmics" + str (year )
127- elif "Commisioning" in dataset :
128- cert_type = "Commisioning2020"
129- elif "HI" in PD :
130- cert_type = "Collisions" + str (year ) + "HI"
131-
132- cert_path = base_cert_path + cert_type + "/"
133- web_fallback = False
134-
135- ## if we have access to eos we get from there ...
136- if os .path .isdir (cert_path ):
137- json_list = os .listdir (cert_path )
138- if len (json_list ) == 0 :
139- web_fallback == True
140- json_list = [c for c in json_list if "Golden" in c and "era" not in c ]
141- json_list = [c for c in json_list if c .startswith ("Cert_C" ) and c .endswith ("json" )]
142- else :
143- web_fallback = True
144- ## ... if not we go to the website
145- if web_fallback :
146- cert_url = base_cert_url + cert_type + "/"
147- json_list = get_url_clean (cert_url ).split ("\n " )
148- json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c ]
149- json_list = [[cc for cc in c .split (" " ) if cc .startswith ("Cert_C" ) and cc .endswith ("json" )][0 ] for c in json_list ]
150-
151- # the larger the better, assuming file naming schema
152- # Cert_X_RunStart_RunFinish_Type.json
153- run_ranges = [int (c .split ("_" )[3 ]) - int (c .split ("_" )[2 ]) for c in json_list ]
154- latest_json = np .array (json_list [np .argmax (run_ranges )]).reshape (1 ,- 1 )[0 ].astype (str )
155- best_json = str (latest_json [0 ])
156- if not web_fallback :
157- with open (cert_path + "/" + best_json ) as js :
158- golden = json .load (js )
159- else :
160- golden = get_url_clean (cert_url + best_json )
161- golden = ast .literal_eval (golden ) #converts string to dict
162-
163- # golden json with all the lumisections
164- golden_flat = {}
165- for k in golden :
166- R = []
167- for r in golden [k ]:
168- R = R + [f for f in range (r [0 ],r [1 ]+ 1 )]
169- golden_flat [k ] = R
170-
171- # let's just check there's an intersection between the
172- # dataset and the json
173- data_runs = das_run_data (dataset )
174- golden_data_runs = [r for r in data_runs if r in golden_flat ]
175-
176- if (len (golden_data_runs )== 0 ):
177- no_intersection ()
178-
179- # building the dataframe, cleaning for bad lumis
180- golden_data_runs_tocheck = golden_data_runs
126+ runs = args .run
181127 das_opt = ""
182- if testing or args .precheck :
183- golden_data_runs_tocheck = []
184- # Here we check run per run.
185- # This implies more dasgoclient queries, but smaller outputs
186- # useful when running the IB/PR tests not to have huge
187- # query results that have to be cached.
188-
189- sum_events = 0
190-
191- for r in golden_data_runs :
192- sum_events = sum_events + int (das_run_events_data (dataset ,r ))
193- golden_data_runs_tocheck .append (r )
194- if events > 0 and sum_events > events :
195- break
196-
197- das_opt = "run in %s" % (str ([int (g ) for g in golden_data_runs_tocheck ]))
198-
128+
129+ if runs is not None :
130+ das_opt = "run in %s" % (str ([int (r ) for r in runs ]))
131+
132+ if not args .nogolden :
133+
134+ ## get the greatest golden json
135+ year = dataset .split ("Run" )[1 ][2 :4 ] # from 20XX to XX
136+ PD = dataset .split ("/" )[1 ]
137+ cert_type = "Collisions" + str (year )
138+ if "Cosmics" in dataset :
139+ cert_type = "Cosmics" + str (year )
140+ elif "Commisioning" in dataset :
141+ cert_type = "Commisioning2020"
142+ elif "HI" in PD :
143+ cert_type = "Collisions" + str (year ) + "HI"
144+
145+ cert_path = base_cert_path + cert_type + "/"
146+ web_fallback = False
147+
148+ ## if we have access to eos we get from there ...
149+ if os .path .isdir (cert_path ):
150+ json_list = os .listdir (cert_path )
151+ if len (json_list ) == 0 :
152+            web_fallback = True  # was '==': a no-op comparison, so the web fallback never triggered on an empty EOS dir
153+ json_list = [c for c in json_list if "Golden" in c and "era" not in c ]
154+ json_list = [c for c in json_list if c .startswith ("Cert_C" ) and c .endswith ("json" )]
155+ else :
156+ web_fallback = True
157+ ## ... if not we go to the website
158+ if web_fallback :
159+ cert_url = base_cert_url + cert_type + "/"
160+ json_list = get_url_clean (cert_url ).split ("\n " )
161+ json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c ]
162+ json_list = [[cc for cc in c .split (" " ) if cc .startswith ("Cert_C" ) and cc .endswith ("json" )][0 ] for c in json_list ]
163+
164+ # the larger the better, assuming file naming schema
165+ # Cert_X_RunStart_RunFinish_Type.json
166+ # TODO if args.run keep golden only with right range
167+
168+ run_ranges = [int (c .split ("_" )[3 ]) - int (c .split ("_" )[2 ]) for c in json_list ]
169+ latest_json = np .array (json_list [np .argmax (run_ranges )]).reshape (1 ,- 1 )[0 ].astype (str )
170+ best_json = str (latest_json [0 ])
171+ if not web_fallback :
172+ with open (cert_path + "/" + best_json ) as js :
173+ golden = json .load (js )
174+ else :
175+ golden = get_url_clean (cert_url + best_json )
176+ golden = ast .literal_eval (golden ) #converts string to dict
177+
178+ # skim for runs in input
179+        if runs is not None:
180+            for k in list(golden):  # iterate a copy: popping from a dict while iterating it raises RuntimeError
181+                if int(k) not in runs:  # golden json keys are run-number strings; args.run holds ints, so compare as int
182+                    golden.pop(k)
183+
184+ # golden json with all the lumisections
185+ golden_flat = {}
186+ for k in golden :
187+ R = []
188+ for r in golden [k ]:
189+ R = R + [f for f in range (r [0 ],r [1 ]+ 1 )]
190+ golden_flat [k ] = R
191+
192+ # let's just check there's an intersection between the
193+ # dataset and the json
194+ data_runs = das_run_data (dataset )
195+ golden_data_runs = [r for r in data_runs if r in golden_flat ]
196+
197+ if (len (golden_data_runs )== 0 ):
198+ no_intersection ()
199+
200+ # building the dataframe, cleaning for bad lumis
201+ golden_data_runs_tocheck = golden_data_runs
202+
203+ if testing or args .precheck :
204+ golden_data_runs_tocheck = []
205+ # Here we check run per run.
206+ # This implies more dasgoclient queries, but smaller outputs
207+ # useful when running the IB/PR tests not to have huge
208+ # query results that have to be cached.
209+
210+ sum_events = 0
211+
212+ for r in golden_data_runs :
213+ sum_events = sum_events + int (das_run_events_data (dataset ,r ))
214+ golden_data_runs_tocheck .append (r )
215+ if events > 0 and sum_events > events :
216+ break
217+
218+ das_opt = "run in %s" % (str ([int (g ) for g in golden_data_runs_tocheck ]))
219+
199220 df = das_lumi_data (dataset ,opt = das_opt ).merge (das_file_data (dataset ,opt = das_opt ),on = "file" ,how = "inner" ) # merge file informations with run and lumis
200-
201221 df ["lumis" ] = [[int (ff ) for ff in f .replace ("[" ,"" ).replace ("]" ,"" ).split ("," )] for f in df .lumis .values ]
202- df_rs = []
203- for r in golden_data_runs_tocheck :
204- cut = (df ["run" ] == r )
205- if not any (cut ):
206- continue
222+
223+ if not args .nogolden :
224+
225+ df_rs = []
226+ for r in golden_data_runs_tocheck :
227+ cut = (df ["run" ] == r )
228+ if not any (cut ):
229+ continue
207230
208- df_r = df [cut ]
231+ df_r = df [cut ]
209232
210- # jumping low event content runs
211- if df_r ["events" ].sum () < threshold :
212- continue
233+ # jumping low event content runs
234+ if df_r ["events" ].sum () < threshold :
235+ continue
213236
214- good_lumis = np .array ([len ([ll for ll in l if ll in golden_flat [r ]]) for l in df_r .lumis ])
215- n_lumis = np .array ([len (l ) for l in df_r .lumis ])
216- df_rs .append (df_r [good_lumis == n_lumis ])
237+ good_lumis = np .array ([len ([ll for ll in l if ll in golden_flat [r ]]) for l in df_r .lumis ])
238+ n_lumis = np .array ([len (l ) for l in df_r .lumis ])
239+ df_rs .append (df_r [good_lumis == n_lumis ])
217240
218- if (len (df_rs )== 0 ):
219- no_intersection ()
241+ if (len (df_rs )== 0 ):
242+ no_intersection ()
243+
244+ df = pd .concat (df_rs )
220245
221- df = pd .concat (df_rs )
222246 df .loc [:,"min_lumi" ] = [min (f ) for f in df .lumis ]
223247 df .loc [:,"max_lumi" ] = [max (f ) for f in df .lumis ]
224248 df = df .sort_values (["run" ,"min_lumi" ,"max_lumi" ])
@@ -251,4 +275,4 @@ def no_intersection():
251275
252276 sys .exit (0 )
253277
254-
278+
0 commit comments