Skip to content

Commit acb85bb

Browse files
authored
Backporting fix for preemption_watcher. (#11045)
1 parent 9fb4a4f commit acb85bb

File tree

3 files changed

+3
-3
lines changed

3 files changed

+3
-3
lines changed

official/core/config_definitions.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -272,7 +272,7 @@ class TrainerConfig(base_config.Config):
272 272
recovery_max_trials: int = 0
273 273
validation_summary_subdir: str = "validation"
274 274
# Preemption on-demand checkpoint.
275 -
preemption_on_demand_checkpoint: bool = True
275 +
preemption_on_demand_checkpoint: bool = False
276 276

277 277

278 278
@dataclasses.dataclass

official/nlp/train.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def _run_experiment_with_preemption_recovery(params, model_dir):
53 53
**params.runtime.model_parallelism())
54 54
with distribution_strategy.scope():
55 55
task = task_factory.get_task(params.task, logging_dir=model_dir)
56 -
preemption_watcher = tf.distribute.experimental.PreemptionWatcher()
56 +
preemption_watcher = None
57 57

58 58
train_lib.run_experiment(
59 59
distribution_strategy=distribution_strategy,

official/vision/train.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def _run_experiment_with_preemption_recovery(params, model_dir):
46 46
tpu_address=params.runtime.tpu)
47 47
with distribution_strategy.scope():
48 48
task = task_factory.get_task(params.task, logging_dir=model_dir)
49 -
preemption_watcher = tf.distribute.experimental.PreemptionWatcher()
49 +
preemption_watcher = None
50 50

51 51
train_lib.run_experiment(
52 52
distribution_strategy=distribution_strategy,

0 commit comments

Comments
 (0)