
Commit 286bcf6

Merge branch 'tensorflow:master' into yolov4_tiny_pr
2 parents 5eba85b + 8bcb4a0

341 files changed (+24519 / -2519 lines)


.github/bot_config.yml

Lines changed: 0 additions & 2 deletions
@@ -21,6 +21,4 @@
 
 # A list of assignees
 assignees:
-  - sushreebarsa
   - laxmareddyp
-  - sineeli

docs/vision/instance_segmentation.ipynb

Lines changed: 4 additions & 4 deletions
@@ -745,7 +745,7 @@
 },
 "outputs": [],
 "source": [
-"def show_batch(raw_records, num_of_examples):\n",
+"def show_batch(raw_records):\n",
 " plt.figure(figsize=(20, 20))\n",
 " use_normalized_coordinates=True\n",
 " min_score_thresh = 0.30\n",
@@ -802,7 +802,7 @@
 "\n",
 "train_tfrecords = tf.io.gfile.glob(exp_config.task.train_data.input_path)\n",
 "raw_records = tf.data.TFRecordDataset(train_tfrecords).shuffle(buffer_size=buffer_size).take(num_of_examples)\n",
-"show_batch(raw_records, num_of_examples)"
+"show_batch(raw_records)"
 ]
 },
 {
@@ -962,7 +962,7 @@
 "\n",
 "test_tfrecords = tf.io.gfile.glob('./lvis_tfrecords/val*')\n",
 "test_ds = tf.data.TFRecordDataset(test_tfrecords).take(num_of_examples)\n",
-"show_batch(test_ds, num_of_examples)"
+"show_batch(test_ds)"
 ]
 },
 {
@@ -1095,7 +1095,7 @@
 " detection_masks = tf.convert_to_tensor(result['detection_masks'][0])\n",
 " detection_boxes = tf.convert_to_tensor(result['detection_boxes'][0])\n",
 " detection_masks_reframed = reframe_box_masks_to_image_masks(\n",
-" detection_masks, detection_boxes/255.0,\n",
+" detection_masks, detection_boxes/256.0,\n",
 " image_np.shape[0], image_np.shape[1])\n",
 " detection_masks_reframed = tf.cast(\n",
 " detection_masks_reframed \u003e min_score_thresh,\n",

official/core/base_task.py

Lines changed: 3 additions & 1 deletion
@@ -57,7 +57,9 @@ def __init__(self,
     """
     super().__init__(name=name)
     self._task_config = params
-    self._logging_dir = logging_dir
+    self._logging_dir = (
+        logging_dir or ""
+    )  # Empty directory hints current working dir.
 
   @property
   def task_config(self):
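The fallback maps a None logging_dir to the empty string, which path handling resolves against the current working directory. A tiny illustration of that convention:

import os

logging_dir = None                # hypothetical unset value
resolved = logging_dir or ""      # "" instead of None
print(os.path.abspath(resolved))  # the empty path resolves to the cwd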

official/core/base_trainer.py

Lines changed: 27 additions & 6 deletions
@@ -47,18 +47,29 @@ def init_async(self):
         tf.distribute.experimental.coordinator.ClusterCoordinator(
             self._strategy))
 
+  def coordinator_for_async(
+      self,
+  ) -> tf.distribute.experimental.coordinator.ClusterCoordinator:
+    if not self._coordinator:
+      raise ValueError(
+          "Coordinator uninitialized for async run. Call init_async() first."
+      )
+    return self._coordinator
+
   def join(self):
     """Join all async steps. Only useful in aysnc training."""
     if getattr(self, "_is_async", False):
-      self._coordinator.join()
+      self.coordinator_for_async().join()
 
   def create_train_loop_fn(self):
     """Creates a eval loop from the given step function and options."""
     train_loop_fn = super().create_train_loop_fn()
     if getattr(self, "_is_async", False):
 
       def _async_loop_fn(iterator, num_steps):
-        self._coordinator.schedule(train_loop_fn, args=(iterator, num_steps))
+        self.coordinator_for_async().schedule(
+            train_loop_fn, args=(iterator, num_steps)
+        )
 
       return _async_loop_fn
     else:
@@ -76,7 +87,9 @@ def create_eval_loop_fn(self, has_state: bool):
       def _async_loop_fn(iterator, num_steps, state=None, reduce_fn=None):
         assert state is None
         assert reduce_fn is None
-        self._coordinator.schedule(eval_loop_fn, args=(iterator, num_steps))
+        self.coordinator_for_async().schedule(
+            eval_loop_fn, args=(iterator, num_steps)
+        )
 
       return _async_loop_fn
     else:
@@ -102,7 +115,9 @@ def distribute_dataset(self, dataset_or_fn, *args, **kwargs):
           *args, **kwargs)
       per_worker_dataset_fn = tf.function(per_worker_dataset_fn)
 
-      return self._coordinator.create_per_worker_dataset(per_worker_dataset_fn)
+      return self.coordinator_for_async().create_per_worker_dataset(
+          per_worker_dataset_fn
+      )
     else:
       return orbit.utils.make_distributed_dataset(self._strategy, dataset_or_fn,
                                                   *args, **kwargs)
@@ -352,7 +367,10 @@ def next_train_inputs(self, iterator):
     This method provides a way to control how to fetch the next model input, and
     what data to send to the model.
 
-    This function runs in eager mode.
+    Note: This function runs on the host side when accelerators are used.
+
+    Note: Depending on the training setup this may or may not run in eager mode.
+    In most cases it will be run in graph mode.
 
     Args:
       iterator: Dataset iterator to generate the next inputs from.
@@ -399,7 +417,10 @@ def next_eval_inputs(self, iterator):
     processed later in `aggregate_logs`. This is useful for sending extra logs
     downstream that are not compatible with the accelerators.
 
-    This function runs in eager mode.
+    Note: This function runs on the host side when accelerators are used.
+
+    Note: Depending on the training setup this may or may not run in eager mode.
+    In most cases it will be run in graph mode.
 
     Args:
       iterator: Dataset iterator to generate the next inputs from.
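The recurring edit in this file replaces raw self._coordinator accesses with coordinator_for_async(), which fails fast with an actionable error when init_async() was never called. A minimal sketch of the guarded-accessor pattern, stripped of the TensorFlow specifics (the AsyncRunner class below is a toy stand-in, not the Model Garden trainer):

class AsyncRunner:
  """Toy stand-in for the trainer, to show the guarded accessor only."""

  def __init__(self):
    self._coordinator = None  # populated later by init_async()

  def init_async(self):
    self._coordinator = object()  # stand-in for a ClusterCoordinator

  def coordinator_for_async(self):
    if not self._coordinator:
      raise ValueError(
          "Coordinator uninitialized for async run. Call init_async() first.")
    return self._coordinator


runner = AsyncRunner()
try:
  runner.coordinator_for_async()  # raises: coordinator not yet initialized
except ValueError as e:
  print(e)
runner.init_async()
assert runner.coordinator_for_async() is not None  # now safe to schedule work

Compared with letting self._coordinator.join() fail on None, the accessor turns a cryptic AttributeError into a message that names the missing call.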

official/core/config_definitions.py

Lines changed: 10 additions & 7 deletions
@@ -214,6 +214,7 @@ class TrainerConfig(base_config.Config):
     train_tf_while_loop: whether or not to use tf while loop.
     train_tf_function: whether or not to use tf_function for training loop.
     eval_tf_function: whether or not to use tf_function for eval.
+    eval_tf_while_loop: whether or not to use tf while loop for eval.
     allow_tpu_summary: Whether to allow summary happen inside the XLA program
       runs on TPU through automatic outside compilation.
     steps_per_loop: number of steps per loop to report training metrics. This
@@ -244,7 +245,9 @@
     preemption_on_demand_checkpoint: whether or not to save on-demand
       checkpoints after a preemption.
   """
-  optimizer_config: OptimizationConfig = OptimizationConfig()
+  optimizer_config: OptimizationConfig = dataclasses.field(
+      default_factory=OptimizationConfig
+  )
   # Orbit settings.
   train_tf_while_loop: bool = True
   train_tf_function: bool = True
@@ -276,16 +279,16 @@
   recovery_max_trials: int = 0
   validation_summary_subdir: str = "validation"
   # Preemption on-demand checkpoint.
-  preemption_on_demand_checkpoint: bool = True
+  preemption_on_demand_checkpoint: bool = True  # copybara-replace
 
 
 @dataclasses.dataclass
 class TaskConfig(base_config.Config):
   """Config passed to task."""
   init_checkpoint: str = ""
   model: Optional[base_config.Config] = None
-  train_data: DataConfig = DataConfig()
-  validation_data: DataConfig = DataConfig()
+  train_data: DataConfig = dataclasses.field(default_factory=DataConfig)
+  validation_data: DataConfig = dataclasses.field(default_factory=DataConfig)
   name: Optional[str] = None
   # Configs for differential privacy
   # These configs are only effective if you use create_optimizer in
@@ -301,6 +304,6 @@ class TaskConfig(base_config.Config):
 @dataclasses.dataclass
 class ExperimentConfig(base_config.Config):
   """Top-level configuration."""
-  task: TaskConfig = TaskConfig()
-  trainer: TrainerConfig = TrainerConfig()
-  runtime: RuntimeConfig = RuntimeConfig()
+  task: TaskConfig = dataclasses.field(default_factory=TaskConfig)
+  trainer: TrainerConfig = dataclasses.field(default_factory=TrainerConfig)
+  runtime: RuntimeConfig = dataclasses.field(default_factory=RuntimeConfig)
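The default_factory migration matters for two reasons: a class-level default instance is shared by every config object, and Python 3.11 rejects unhashable instance defaults in dataclasses outright. A small self-contained illustration of both points (the DataConfig/TaskConfig names mirror the real ones, but the bodies are reduced to toys):

import dataclasses


@dataclasses.dataclass
class DataConfig:
  input_path: str = ""


@dataclasses.dataclass
class TaskConfig:
  # With `train_data: DataConfig = DataConfig()`, Python 3.11+ raises a
  # ValueError along the lines of "mutable default ... is not allowed: use
  # default_factory"; older versions silently share one DataConfig across
  # every TaskConfig instance.
  train_data: DataConfig = dataclasses.field(default_factory=DataConfig)


a, b = TaskConfig(), TaskConfig()
assert a.train_data is not b.train_data  # each instance gets a fresh default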

official/core/savedmodel_checkpoint_manager.py

Lines changed: 16 additions & 3 deletions
@@ -72,13 +72,19 @@ def save(self,
 
     # Save the models for the checkpoint that just got written.
     saved_modules_directory = make_saved_modules_directory_name(checkpoint_path)
+    # Atomic export of SavedModel. Write into a temporary directory and then
+    # rename it as the final directory after finishing the writing.
+    # This can avoid trying to read an unfinished savedmodel.
+    saved_modules_directory_tmp = saved_modules_directory + '_temp'
     for model_name, model in self._modules_to_export.items():
       signatures = getattr(model, 'saved_model_signatures', None)
       if signatures is not None:
         tf.saved_model.save(
             obj=model,
-            export_dir=os.path.join(saved_modules_directory, model_name),
+            export_dir=os.path.join(saved_modules_directory_tmp, model_name),
             signatures=signatures)
+    if tf.io.gfile.exists(saved_modules_directory_tmp):
+      tf.io.gfile.rename(saved_modules_directory_tmp, saved_modules_directory)
 
     saved_modules_directories_to_keep = [
         make_saved_modules_directory_name(ckpt) for ckpt in self.checkpoints
@@ -105,7 +111,14 @@ def get_existing_savedmodels(self) -> List[str]:
     """
     saved_modules_glob = make_saved_modules_directory_name(
         self._checkpoint_prefix + '-*')
-    return tf.io.gfile.glob(saved_modules_glob)
+    savedmodels = tf.io.gfile.glob(saved_modules_glob)
+    # Filter out temporary savedmodels.
+    savedmodels = [
+        savedmodel
+        for savedmodel in savedmodels
+        if savedmodel.endswith(SAVED_MODULES_PATH_SUFFIX)
+    ]
+    return savedmodels
 
   @property
   def latest_savedmodel(self) -> Union[str, None]:
@@ -214,7 +227,7 @@ def wait_for_new_savedmodel(
     logging.info('Waiting for new savedmodel at %s', self._directory)
     stop_time = time.time() + timeout if timeout is not None else None
 
-    last_savedmodel_number = 0
+    last_savedmodel_number = -1
     if last_savedmodel:
       last_savedmodel_number = self.get_savedmodel_number_from_path(
          last_savedmodel)
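The export is made atomic by writing into a _temp sibling and renaming it into place once the write completes, so a poller never observes a half-written SavedModel; get_existing_savedmodels additionally filters on the final suffix so in-flight _temp directories are ignored. A minimal sketch of the write-then-rename idea (atomic_export is a hypothetical helper, and the rename is only atomic when both paths sit on the same filesystem):

import os


def atomic_export(final_dir: str, write_fn) -> None:
  """Hypothetical helper: write into a temp dir, then publish via rename."""
  tmp_dir = final_dir + "_temp"
  os.makedirs(tmp_dir, exist_ok=True)
  write_fn(tmp_dir)  # slow export; readers never see this directory as final
  os.rename(tmp_dir, final_dir)  # one rename publishes the finished result

The last_savedmodel_number = -1 change is related bookkeeping: starting from -1 instead of 0 lets a first export numbered 0 be recognized as new when no previous savedmodel was supplied.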

official/core/train_lib.py

Lines changed: 5 additions & 2 deletions
@@ -137,7 +137,7 @@ def trainer(self) -> base_trainer.Trainer:
     return self._trainer
 
   @property
-  def checkpoint_manager(self) -> tf.train.CheckpointManager:
+  def checkpoint_manager(self) -> Optional[tf.train.CheckpointManager]:
     """The CheckpointManager that stores the checkpoints in a train job."""
     return self._checkpoint_manager
 
@@ -205,11 +205,14 @@ def _build_controller(
     """Builds a Orbit controler."""
     train_actions = [] if not train_actions else train_actions
     if trainer:
+      checkpoint_manager = self.checkpoint_manager
+      assert checkpoint_manager, 'Checkpoint manager required but undefined.'
       train_actions += actions.get_train_actions(
          self.params,
          trainer,
          self.model_dir,
-          checkpoint_manager=self.checkpoint_manager)
+          checkpoint_manager=checkpoint_manager,
+      )
 
     eval_actions = [] if not eval_actions else eval_actions
     if evaluator:
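Widening the property's return type to Optional is honest about jobs that never build a checkpoint manager, and the added assert both enforces the invariant when a trainer exists and narrows the type for static checkers. A minimal sketch of the narrowing (the names below are illustrative, not the Model Garden API):

from typing import Optional


class CheckpointManager:  # toy stand-in for tf.train.CheckpointManager
  def save(self) -> None:
    print("saved")


def build_controller(manager: Optional[CheckpointManager]) -> None:
  assert manager, "Checkpoint manager required but undefined."
  # After the assert, type checkers treat `manager` as CheckpointManager,
  # not Optional[CheckpointManager], so this call type-checks cleanly.
  manager.save()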

official/core/train_utils.py

Lines changed: 44 additions & 0 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """Training utils."""
+
 import dataclasses
 import inspect
 import json
@@ -22,10 +23,12 @@
 
 from absl import logging
 import gin
+import numpy as np
 import orbit
 import tensorflow as tf
 
 # pylint: disable=g-direct-tensorflow-import
+from tensorflow.python.framework import ops
 from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2_as_graph
 # pylint: enable=g-direct-tensorflow-import
 from official.core import base_task
@@ -564,3 +567,44 @@ def try_count_flops(model: Union[tf.Module, tf.keras.Model],
           'reached before this run.', e)
       return None
   return None
+
+
+@ops.RegisterStatistics('Einsum', 'flops')
+def _einsum_flops(graph, node):
+  """Calculates the compute resources needed for Einsum."""
+  assert len(node.input) == 2
+  x_shape = tf.compat.v1.graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[0])
+  y_shape = tf.compat.v1.graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
+  x_shape.assert_is_fully_defined()
+  y_shape.assert_is_fully_defined()
+  x_shape = x_shape.as_list()
+  y_shape = y_shape.as_list()
+  equation = str(node.attr['equation'])
+  equation = (
+      equation.replace('s:', '')
+      .replace('"', '')
+      .replace(' ', '')
+      .replace('\n', '')
+  )
+  x_str = equation.split(',')[0]
+  y_r_str = equation.split(',')[1]
+  y_str = y_r_str.split('->')[0]
+  r_str = y_r_str.split('->')[1]
+  shape_dic = {}
+  contracted = set()
+  for indice in x_str + y_str:
+    if indice in x_str:
+      indice_dim = x_shape[x_str.find(indice)]
+    elif indice in y_str:
+      indice_dim = y_shape[y_str.find(indice)]
+    else:
+      raise ValueError('indice {} not found in inputs'.format(indice))
+    shape_dic[indice] = indice_dim
+    if indice not in r_str:
+      contracted.add(indice)
+  madds = np.prod([shape_dic[indice] for indice in r_str]) * (
+      np.prod([shape_dic[indice] for indice in contracted]))
+  flops = 2 * madds
+  return ops.OpStats('flops', flops)
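The registered statistic counts, for a two-operand einsum, the product of the output dimensions times the product of the contracted dimensions as multiply-adds, then doubles that for FLOPs. For ij,jk->ik with shapes (m, n) and (n, p) this is 2*m*p*n. A hand check of the same arithmetic outside the graph machinery:

import numpy as np

x_str, y_str, r_str = 'ij', 'jk', 'ik'  # equation: ij,jk->ik
x_shape, y_shape = [8, 16], [16, 32]    # (m, n) and (n, p)

shape_dic, contracted = {}, set()
for indice in x_str + y_str:
  shape_dic[indice] = (
      x_shape[x_str.find(indice)] if indice in x_str
      else y_shape[y_str.find(indice)])
  if indice not in r_str:
    contracted.add(indice)             # 'j' is summed away

madds = np.prod([shape_dic[i] for i in r_str]) * np.prod(
    [shape_dic[i] for i in contracted])
assert 2 * madds == 2 * 8 * 32 * 16    # flops = 2 * m * p * n = 8192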

official/legacy/image_classification/configs/base_configs.py

Lines changed: 9 additions & 3 deletions
@@ -112,10 +112,16 @@ class TrainConfig(hyperparams.Config):
   resume_checkpoint: bool = None
   epochs: int = None
   steps: int = None
-  callbacks: CallbacksConfig = CallbacksConfig()
+  callbacks: CallbacksConfig = dataclasses.field(
+      default_factory=CallbacksConfig
+  )
   metrics: MetricsConfig = None
-  tensorboard: TensorBoardConfig = TensorBoardConfig()
-  time_history: TimeHistoryConfig = TimeHistoryConfig()
+  tensorboard: TensorBoardConfig = dataclasses.field(
+      default_factory=TensorBoardConfig
+  )
+  time_history: TimeHistoryConfig = dataclasses.field(
+      default_factory=TimeHistoryConfig
+  )
   set_epoch_loop: bool = False
 
 