|
1 | 1 | # -*- encoding: utf-8 -*- |
| 2 | +import math |
2 | 3 | import numbers |
3 | 4 | import multiprocessing |
4 | 5 | import glob |
@@ -71,7 +72,13 @@ def __init__( |
71 | 72 | If performance_range_threshold > 0, might return less models |
72 | 73 | max_models_on_disc: int |
73 | 74 | Defines the maximum number of models that are kept in the disc. |
74 | | - If int, it must be greater or equal than 1. If None, feature is disabled. |
| 75 | + If int, it must be greater than or equal to 1, and dictates the max number of
| 76 | + models to keep. |
| 77 | + If float, it will be interpreted as the max megabytes allowed of disc space. That |
| 78 | + is, if the number of ensemble candidates require more disc space than this float |
| 79 | + value, the worst models will be deleted to keep within this budget. |
| 80 | + Both the models and the predictions of the worst-performing models will be deleted.
| 81 | + If None, the feature is disabled. |
75 | 82 | It defines an upper bound on the models that can be used in the ensemble. |
76 | 83 | performance_range_threshold: float |
77 | 84 | Keep only models that are better than: |
@@ -118,11 +125,17 @@ def __init__( |
118 | 125 |
|
119 | 126 | self.ensemble_nbest = ensemble_nbest |
120 | 127 |
|
121 | | - if max_models_on_disc is not None and max_models_on_disc < 1: |
| 128 | + # max_models_on_disc can be a float, in such case we need to |
| 129 | + # remember the user specified Megabytes and translate this to |
| 130 | + # max number of ensemble models. max_resident_models keeps the |
| 131 | + # maximum number of models in disc |
| 132 | + if max_models_on_disc is not None and max_models_on_disc < 0: |
122 | 133 | raise ValueError( |
123 | 134 | "max_models_on_disc has to be a positive number or None" |
124 | 135 | ) |
125 | 136 | self.max_models_on_disc = max_models_on_disc |
| 137 | + self.max_resident_models = None |
| 138 | + |
126 | 139 | self.seed = seed |
127 | 140 | self.shared_mode = shared_mode # pSMAC? |
128 | 141 | self.max_iterations = max_iterations |
@@ -319,6 +332,38 @@ def main(self, return_pred=False): |
319 | 332 | if return_pred: |
320 | 333 | return valid_pred, test_pred |
321 | 334 |
|
def get_disk_consumption(self, pred_path):
    """Return the disc space, in megabytes, consumed by one model.

    The cost of a model is the combined size of its model file, its
    ensemble-prediction file, and — when they exist on disc — its
    validation- and test-prediction files.

    Parameters
    ----------
    pred_path : str
        Path to the ensemble-prediction file; its name must match
        ``self.model_fn_re`` so the seed, num_run and budget can be
        extracted.

    Returns
    -------
    float
        Total size of the model's files in MB, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If ``pred_path`` does not match the expected filename pattern.
    """
    match = self.model_fn_re.search(pred_path)
    if match is None:
        raise ValueError("Invalid path format %s" % pred_path)
    full_name = match.group(0)
    seed, num_run, budget = match.group(1), match.group(2), match.group(3)

    # The model file and the ensemble predictions are always accounted for.
    model_path = os.path.join(
        self.dir_models, '%s.%s.%s.model' % (seed, num_run, budget)
    )
    tracked_files = [model_path, pred_path]

    # Validation/test predictions are optional — only add them when present.
    for directory, prefix in (
        (self.dir_valid, 'predictions_valid'),
        (self.dir_test, 'predictions_test'),
    ):
        optional_path = os.path.join(directory, prefix + full_name)
        if os.path.exists(optional_path):
            tracked_files.append(optional_path)

    total_bytes = sum(os.path.getsize(path) for path in tracked_files)

    # Convert bytes to megabytes.
    return round(total_bytes / math.pow(1024, 2), 2)
322 | 367 | def score_ensemble_preds(self): |
323 | 368 | """ |
324 | 369 | score predictions on ensemble building data set; |
@@ -395,6 +440,7 @@ def score_ensemble_preds(self): |
395 | 440 | "seed": _seed, |
396 | 441 | "num_run": _num_run, |
397 | 442 | "budget": _budget, |
| 443 | + "disc_space_cost_mb": None, |
398 | 444 | Y_ENSEMBLE: None, |
399 | 445 | Y_VALID: None, |
400 | 446 | Y_TEST: None, |
@@ -438,6 +484,9 @@ def score_ensemble_preds(self): |
438 | 484 | y_ens_fn |
439 | 485 | ) |
440 | 486 | self.read_preds[y_ens_fn]["loaded"] = 2 |
| 487 | + self.read_preds[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( |
| 488 | + y_ens_fn |
| 489 | + ) |
441 | 490 |
|
442 | 491 | n_read_files += 1 |
443 | 492 |
|
@@ -514,14 +563,51 @@ def get_n_best_preds(self): |
514 | 563 | self.logger.debug("Library Pruning: using for ensemble only " |
515 | 564 | " %d (out of %d) models" % (keep_nbest, len(sorted_keys))) |
516 | 565 |
|
| 566 | + # If max_models_on_disc is None, do nothing |
517 | 567 | # One can only read at most max_models_on_disc models |
518 | | - if self.max_models_on_disc is not None and keep_nbest > self.max_models_on_disc: |
| 568 | + if self.max_models_on_disc is not None: |
| 569 | + if not isinstance(self.max_models_on_disc, numbers.Integral): |
| 570 | + consumption = [ |
| 571 | + [ |
| 572 | + v["ens_score"], |
| 573 | + v["disc_space_cost_mb"], |
| 574 | + ] for v in self.read_preds.values() if v["disc_space_cost_mb"] is not None |
| 575 | + ] |
| 576 | + max_consumption = max(i[1] for i in consumption) |
| 577 | + |
| 578 | + # We are pessimistic with the consumption limit indicated by |
| 579 | + # max_models_on_disc by 1 model. Such model is assumed to spend |
| 580 | + # max_consumption megabytes |
| 581 | + if (sum(i[1] for i in consumption) + max_consumption) > self.max_models_on_disc: |
| 582 | + |
| 583 | + # just leave the best -- higher is better! |
| 584 | + # This list is in descending order, to preserve the best models |
| 585 | + sorted_cum_consumption = np.cumsum([ |
| 586 | + i[1] for i in list(reversed(sorted(consumption))) |
| 587 | + ]) |
| 588 | + max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) |
| 589 | + |
| 590 | + # Make sure that at least 1 model survives |
| 591 | + self.max_resident_models = max(1, max_models) |
| 592 | + self.logger.warning( |
| 593 | + "Limiting num of models via float max_models_on_disc={}" |
| 594 | + " as accumulated={} worst={} num_models={}".format( |
| 595 | + self.max_models_on_disc, |
| 596 | + (sum(i[1] for i in consumption) + max_consumption), |
| 597 | + max_consumption, |
| 598 | + self.max_resident_models |
| 599 | + ) |
| 600 | + ) |
| 601 | + else: |
| 602 | + self.max_resident_models = self.max_models_on_disc |
| 603 | + |
| 604 | + if self.max_resident_models is not None and keep_nbest > self.max_resident_models: |
519 | 605 | self.logger.debug( |
520 | 606 | "Restricting the number of models to %d instead of %d due to argument " |
521 | 607 | "max_models_on_disc", |
522 | | - self.max_models_on_disc, keep_nbest, |
| 608 | + self.max_resident_models, keep_nbest, |
523 | 609 | ) |
524 | | - keep_nbest = self.max_models_on_disc |
| 610 | + keep_nbest = self.max_resident_models |
525 | 611 |
|
526 | 612 | # consider performance_range_threshold |
527 | 613 | if self.performance_range_threshold > 0: |
@@ -692,7 +778,7 @@ def fit_ensemble(self, selected_keys: list): |
692 | 778 | ) |
693 | 779 |
|
694 | 780 | # Delete files of non-candidate models |
695 | | - if self.max_models_on_disc is not None: |
| 781 | + if self.max_resident_models is not None: |
696 | 782 | self._delete_excess_models() |
697 | 783 |
|
698 | 784 | return None |
@@ -734,7 +820,7 @@ def fit_ensemble(self, selected_keys: list): |
734 | 820 | return None |
735 | 821 |
|
736 | 822 | # Delete files of non-candidate models |
737 | | - if self.max_models_on_disc is not None: |
| 823 | + if self.max_resident_models is not None: |
738 | 824 | self._delete_excess_models() |
739 | 825 |
|
740 | 826 | return ensemble |
@@ -839,14 +925,14 @@ def _delete_excess_models(self): |
839 | 925 | sorted_keys = self._get_list_of_sorted_preds() |
840 | 926 | sorted_keys = list(map(lambda x: x[0], sorted_keys)) |
841 | 927 |
|
842 | | - if len(sorted_keys) <= self.max_models_on_disc: |
| 928 | + if len(sorted_keys) <= self.max_resident_models: |
843 | 929 | # Don't waste time if not enough models to delete |
844 | 930 | return |
845 | 931 |
|
846 | | - # The top self.max_models_on_disc models would be the candidates |
| 932 | + # The top self.max_resident_models models would be the candidates |
847 | 933 | # Any other low performance model will be deleted |
848 | 934 | # The list is in ascending order of score |
849 | | - candidates = sorted_keys[:self.max_models_on_disc] |
| 935 | + candidates = sorted_keys[:self.max_resident_models] |
850 | 936 |
|
851 | 937 | # Loop through the files currently in the directory |
852 | 938 | for pred_path in self.y_ens_files: |
|
0 commit comments