1212import xmltodict
1313import numpy as np
1414import pandas as pd
15+ from joblib .parallel import Parallel , delayed
1516
1617import openml
1718import openml .utils
@@ -54,6 +55,7 @@ def run_model_on_task(
5455 upload_flow : bool = False ,
5556 return_flow : bool = False ,
5657 dataset_format : str = "dataframe" ,
58+ n_jobs : Optional [int ] = None ,
5759) -> Union [OpenMLRun , Tuple [OpenMLRun , OpenMLFlow ]]:
5860 """Run the model on the dataset defined by the task.
5961
@@ -84,6 +86,10 @@ def run_model_on_task(
8486 dataset_format : str (default='dataframe')
8587 If 'array', the dataset is passed to the model as a numpy array.
8688 If 'dataframe', the dataset is passed to the model as a pandas dataframe.
89+ n_jobs : int (default=None)
90+ The number of processes/threads to distribute the evaluation asynchronously.
91+ If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
92+ If `-1`, then the job uses as many cores as are available.
8793
8894 Returns
8995 -------
@@ -131,6 +137,7 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas
131137 add_local_measures = add_local_measures ,
132138 upload_flow = upload_flow ,
133139 dataset_format = dataset_format ,
140+ n_jobs = n_jobs ,
134141 )
135142 if return_flow :
136143 return run , flow
@@ -146,6 +153,7 @@ def run_flow_on_task(
146153 add_local_measures : bool = True ,
147154 upload_flow : bool = False ,
148155 dataset_format : str = "dataframe" ,
156+ n_jobs : Optional [int ] = None ,
149157) -> OpenMLRun :
150158
151159 """Run the model provided by the flow on the dataset defined by task.
@@ -181,6 +189,10 @@ def run_flow_on_task(
181189 dataset_format : str (default='dataframe')
182190 If 'array', the dataset is passed to the model as a numpy array.
183191 If 'dataframe', the dataset is passed to the model as a pandas dataframe.
192+ n_jobs : int (default=None)
193+ The number of processes/threads to distribute the evaluation asynchronously.
194+ If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
195+ If `-1`, then the job uses as many cores as are available.
184196
185197 Returns
186198 -------
@@ -265,6 +277,7 @@ def run_flow_on_task(
265277 extension = flow .extension ,
266278 add_local_measures = add_local_measures ,
267279 dataset_format = dataset_format ,
280+ n_jobs = n_jobs ,
268281 )
269282
270283 data_content , trace , fold_evaluations , sample_evaluations = res
@@ -425,6 +438,7 @@ def _run_task_get_arffcontent(
425438 extension : "Extension" ,
426439 add_local_measures : bool ,
427440 dataset_format : str ,
441+ n_jobs : int = None ,
428442) -> Tuple [
429443 List [List ],
430444 Optional [OpenMLRunTrace ],
@@ -447,55 +461,37 @@ def _run_task_get_arffcontent(
447461 # methods, less maintenance, less confusion)
448462 num_reps , num_folds , num_samples = task .get_split_dimensions ()
449463
464+ jobs = []
450465 for n_fit , (rep_no , fold_no , sample_no ) in enumerate (
451466 itertools .product (range (num_reps ), range (num_folds ), range (num_samples ),), start = 1
452467 ):
453-
454- train_indices , test_indices = task .get_train_test_split_indices (
455- repeat = rep_no , fold = fold_no , sample = sample_no
456- )
457- if isinstance (task , OpenMLSupervisedTask ):
458- x , y = task .get_X_and_y (dataset_format = dataset_format )
459- if dataset_format == "dataframe" :
460- train_x = x .iloc [train_indices ]
461- train_y = y .iloc [train_indices ]
462- test_x = x .iloc [test_indices ]
463- test_y = y .iloc [test_indices ]
464- else :
465- train_x = x [train_indices ]
466- train_y = y [train_indices ]
467- test_x = x [test_indices ]
468- test_y = y [test_indices ]
469- elif isinstance (task , OpenMLClusteringTask ):
470- x = task .get_X (dataset_format = dataset_format )
471- if dataset_format == "dataframe" :
472- train_x = x .iloc [train_indices ]
473- else :
474- train_x = x [train_indices ]
475- train_y = None
476- test_x = None
477- test_y = None
478- else :
479- raise NotImplementedError (task .task_type )
480-
481- config .logger .info (
482- "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d." ,
483- flow .name ,
484- task .task_id ,
485- rep_no ,
486- fold_no ,
487- sample_no ,
488- )
489-
490- pred_y , proba_y , user_defined_measures_fold , trace = extension ._run_model_on_fold (
468+ jobs .append ((n_fit , rep_no , fold_no , sample_no ))
469+
470+ # The forked child process may not copy the configuration state of OpenML from the parent.
471+ # Current configuration setup needs to be copied and passed to the child processes.
472+ _config = config .get_config_as_dict ()
473+ # Execute runs in parallel
474+ # assuming the same number of tasks as workers (n_jobs), the total compute time for this
475+ # statement will be similar to the slowest run
476+ job_rvals = Parallel (verbose = 0 , n_jobs = n_jobs )(
477+ delayed (_run_task_get_arffcontent_parallel_helper )(
478+ extension = extension ,
479+ flow = flow ,
480+ fold_no = fold_no ,
491481 model = model ,
492- task = task ,
493- X_train = train_x ,
494- y_train = train_y ,
495482 rep_no = rep_no ,
496- fold_no = fold_no ,
497- X_test = test_x ,
483+ sample_no = sample_no ,
484+ task = task ,
485+ dataset_format = dataset_format ,
486+ configuration = _config ,
498487 )
488+ for n_fit , rep_no , fold_no , sample_no in jobs
489+ ) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs`
490+
491+ for n_fit , rep_no , fold_no , sample_no in jobs :
492+ pred_y , proba_y , test_indices , test_y , trace , user_defined_measures_fold = job_rvals [
493+ n_fit - 1
494+ ]
499495 if trace is not None :
500496 traces .append (trace )
501497
@@ -615,6 +611,75 @@ def _calculate_local_measure(sklearn_fn, openml_name):
615611 )
616612
617613
def _run_task_get_arffcontent_parallel_helper(
    extension: "Extension",
    flow: OpenMLFlow,
    fold_no: int,
    model: Any,
    rep_no: int,
    sample_no: int,
    task: OpenMLTask,
    dataset_format: str,
    configuration: Optional[Dict] = None,
) -> Tuple[
    np.ndarray,
    Optional[pd.DataFrame],
    np.ndarray,
    Optional[pd.DataFrame],
    Optional[OpenMLRunTrace],
    "OrderedDict[str, float]",
]:
    """Run a single (repeat, fold, sample) of a task inside a joblib worker.

    Intended to be dispatched via ``joblib.delayed``; each call is independent
    so the calls can execute in separate processes or threads.

    Parameters
    ----------
    extension : Extension
        The extension used to train/evaluate ``model`` on one fold.
    flow : OpenMLFlow
        Flow being executed (used only for logging here).
    fold_no : int
        Fold index of the split to run.
    model : Any
        The (unfitted) model to evaluate; ``extension._run_model_on_fold``
        is responsible for cloning/fitting it.
    rep_no : int
        Repeat index of the split to run.
    sample_no : int
        Sample index of the split to run.
    task : OpenMLTask
        Task providing the data and the train/test split.
    dataset_format : str
        'dataframe' or 'array'; controls how the data is indexed below.
    configuration : Dict, optional
        Parent-process OpenML configuration. A forked child may not inherit
        the parent's configuration state, so it is re-applied here; if None,
        the defaults are loaded.

    Returns
    -------
    tuple
        (pred_y, proba_y, test_indices, test_y, trace,
        user_defined_measures_fold) for this fold.
    """
    # Re-apply the parent's OpenML configuration in the (possibly forked)
    # child process; with configuration=None the defaults are loaded.
    config._setup(configuration)

    train_indices, test_indices = task.get_train_test_split_indices(
        repeat=rep_no, fold=fold_no, sample=sample_no
    )

    if isinstance(task, OpenMLSupervisedTask):
        x, y = task.get_X_and_y(dataset_format=dataset_format)
        # DataFrames require positional indexing via .iloc; arrays index directly.
        if dataset_format == "dataframe":
            train_x = x.iloc[train_indices]
            train_y = y.iloc[train_indices]
            test_x = x.iloc[test_indices]
            test_y = y.iloc[test_indices]
        else:
            train_x = x[train_indices]
            train_y = y[train_indices]
            test_x = x[test_indices]
            test_y = y[test_indices]
    elif isinstance(task, OpenMLClusteringTask):
        # Clustering has no targets and no held-out test portion.
        x = task.get_X(dataset_format=dataset_format)
        if dataset_format == "dataframe":
            train_x = x.iloc[train_indices]
        else:
            train_x = x[train_indices]
        train_y = None
        test_x = None
        test_y = None
    else:
        raise NotImplementedError(task.task_type)
    config.logger.info(
        "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
        flow.name,
        task.task_id,
        rep_no,
        fold_no,
        sample_no,
    )
    pred_y, proba_y, user_defined_measures_fold, trace = extension._run_model_on_fold(
        model=model,
        task=task,
        X_train=train_x,
        y_train=train_y,
        rep_no=rep_no,
        fold_no=fold_no,
        X_test=test_x,
    )
    return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold
681+
682+
618683def get_runs (run_ids ):
619684 """Gets all runs in run_ids list.
620685
0 commit comments