@@ -58,25 +58,6 @@ class Entry(object):
     The class to encapsulate all operations.
     """

-    def _check(self):
-        """
-        Check the validation of parameters.
-        """
-        supported_types = [
-            "softmax",
-            "arcface",
-            "dist_softmax",
-            "dist_arcface",
-        ]
-        assert self.loss_type in supported_types, \
-            "All supported types are {}, but given {}.".format(
-                supported_types, self.loss_type)
-
-        if self.loss_type in ["dist_softmax", "dist_arcface"]:
-            assert self.num_trainers > 1, \
-                "At least 2 trainers are required for distributed fc-layer. " \
-                "You can start your job using paddle.distributed.launch module."
-
     def __init__(self):
         self.config = config.config
         super(Entry, self).__init__()
@@ -118,8 +99,12 @@ def __init__(self):
         self.val_targets = self.config.val_targets
         self.dataset_dir = self.config.dataset_dir
         self.num_classes = self.config.num_classes
+        self.sample_ratio = self.config.sample_ratio
+        self.model_parallel = self.config.model_parallel
         self.loss_type = self.config.loss_type
-        self.margin = self.config.margin
+        self.margin1 = self.config.margin1
+        self.margin2 = self.config.margin2
+        self.margin3 = self.config.margin3
         self.scale = self.config.scale
         self.lr = self.config.lr
         self.lr_steps = self.config.lr_steps
@@ -128,12 +113,16 @@ def __init__(self):
         self.model_name = self.config.model_name
         self.emb_dim = self.config.emb_dim
         self.train_epochs = self.config.train_epochs
+        self.train_steps = self.config.train_steps
         self.checkpoint_dir = self.config.checkpoint_dir
         self.with_test = self.config.with_test
         self.model_save_dir = self.config.model_save_dir
         self.warmup_epochs = self.config.warmup_epochs
         self.calc_train_acc = False

+        assert not (self.train_epochs and self.train_steps), \
+            'Only one of train_epochs and train_steps can be set.'
+
         self.max_last_checkpoint_num = 5
         if self.checkpoint_dir:
             self.checkpoint_dir = os.path.abspath(self.checkpoint_dir)
@@ -166,6 +155,8 @@ def __init__(self):
             logger.info('\t ' + str(key) + ": " + str(self.config[key]))
         logger.info('trainer_id: {}, num_trainers: {}'.format(trainer_id,
                                                               num_trainers))
+        logger.info('global_train_batch_size: {}'.format(
+            self.global_train_batch_size))
         logger.info('default lr_decay_factor: {}'.format(self.lr_decay_factor))
         logger.info('default log period: {}'.format(self.log_period))
         logger.info('default test period: {}'.format(self.test_period))
@@ -327,6 +318,23 @@ def set_class_num(self, num):
         self.num_classes = num
         logger.info("Set num_classes to {}.".format(num))

+    def set_model_parallel(self, flag):
+        """
+        Set the flag for model parallel training.
+        """
+        self.model_parallel = flag
+        if flag:
+            assert self.num_trainers > 1, "The number of GPUs must be " \
+                "greater than 1 when using model parallel training."
+        logger.info("Set model_parallel to {}.".format(flag))
+
+    def set_sample_ratio(self, sample_ratio):
+        """
+        Set the sample ratio of Partial FC.
+        """
+        self.sample_ratio = sample_ratio
+        logger.info("Set sample_ratio to {}.".format(sample_ratio))
+
     def set_emb_size(self, size):
         """
         Set the size of the last hidden layer before the distributed fc-layer.
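
A minimal usage sketch of the two switches added above (the 'from plsc import Entry' entry point is assumed from the project's README and is not part of this diff):

    from plsc import Entry

    ins = Entry()
    ins.set_model_parallel(True)  # model-parallel FC; requires num_trainers > 1
    ins.set_sample_ratio(0.1)     # Partial FC: sample 10% of class centers per step
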
@@ -348,9 +356,18 @@ def set_train_epochs(self, num):
348356 """
349357 Set the number of epochs to train.
350358 """
359+ self .train_steps = None
351360 self .train_epochs = num
352361 logger .info ("Set train_epochs to {}." .format (num ))
353362
363+ def set_train_steps (self , num ):
364+ """
365+ Set the number of steps to train.
366+ """
367+ self .train_epochs = None
368+ self .train_steps = num
369+ logger .info ("Set train_steps to {}." .format (num ))
370+
354371 def set_checkpoint_dir (self , directory ):
355372 """
356373 Set the directory for checkpoint loaded before training/testing.
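
For illustration, the two setters above are mutually exclusive by construction: each clears the other, so only the most recent call takes effect (continuing the assumed Entry instance from the earlier sketch):

    ins.set_train_epochs(20)    # clears train_steps; train for 20 epochs
    ins.set_train_steps(40000)  # clears train_epochs; train for 40000 steps,
                                # epochs are derived later in _get_optimizer
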
@@ -371,15 +388,39 @@ def set_warmup_epochs(self, num):
         self.warmup_epochs = num
         logger.info("Set warmup_epochs to {}.".format(num))

-    def set_loss_type(self, loss_type):
-        supported_types = [
-            "dist_softmax", "dist_arcface", "softmax", "arcface"
-        ]
-        if loss_type not in supported_types:
-            raise ValueError("All supported loss types: {}".format(
-                supported_types))
+    def set_loss_type(self,
+                      loss_type,
+                      margin1=None,
+                      margin2=None,
+                      margin3=None):
396+ """
397+ Set the loss type. Supported arcface, cosface, sphereface loss type.
398+ You also can set combined margin loss by yourself via marign1, margin2, maring3.
399+ """
         self.loss_type = loss_type
-        logger.info("Set loss_type to {}.".format(loss_type))
+        if "arcface" == loss_type:
+            self.margin1 = 1.0 if margin1 is None else margin1
+            self.margin2 = 0.5 if margin2 is None else margin2
+            self.margin3 = 0.0 if margin3 is None else margin3
+        elif "cosface" == loss_type:
+            self.margin1 = 1.0 if margin1 is None else margin1
+            self.margin2 = 0.0 if margin2 is None else margin2
+            self.margin3 = 0.35 if margin3 is None else margin3
+        elif "sphereface" == loss_type:
+            self.margin1 = 1.35 if margin1 is None else margin1
+            self.margin2 = 0.0 if margin2 is None else margin2
+            self.margin3 = 0.0 if margin3 is None else margin3
+        else:
+            self.margin1 = margin1
+            self.margin2 = margin2
+            self.margin3 = margin3
+            assert self.margin1 is not None, "margin1 must be set"
+            assert self.margin2 is not None, "margin2 must be set"
+            assert self.margin3 is not None, "margin3 must be set"
+
+        logger.info(
+            "Set loss_type to {}, margin1 = {}, margin2 = {}, margin3 = {}.".
+            format(loss_type, self.margin1, self.margin2, self.margin3))

     def set_optimizer(self, optimizer):
         if not isinstance(optimizer, Optimizer):
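
These defaults follow the InsightFace-style combined margin, roughly cos(margin1 * theta + margin2) - margin3, so arcface, cosface and sphereface are just preset triples. A hedged sketch of calling the new setter (same assumed Entry instance; the custom type name below is hypothetical):

    ins.set_loss_type('arcface')     # margin1=1.0,  margin2=0.5, margin3=0.0
    ins.set_loss_type('cosface')     # margin1=1.0,  margin2=0.0, margin3=0.35
    ins.set_loss_type('sphereface')  # margin1=1.35, margin2=0.0, margin3=0.0
    ins.set_loss_type('combined',    # any other name: all three margins required
                      margin1=1.0, margin2=0.3, margin3=0.2)
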
@@ -421,6 +462,8 @@ def _get_optimizer(self):
         steps_per_pass = int(
             math.ceil(images_per_trainer * 1.0 / self.train_batch_size))
         logger.info("Steps per epoch: %d" % steps_per_pass)
+        if self.train_epochs is None:
+            self.train_epochs = self.train_steps // steps_per_pass + 1
         warmup_steps = steps_per_pass * self.warmup_epochs
         batch_denom = 1024
         base_lr = start_lr * global_batch_size / batch_denom
@@ -445,12 +488,11 @@ def _get_optimizer(self):
                 weight_decay=paddle.regularizer.L2Decay(5e-4))
         self.optimizer = optimizer

-        if self.loss_type in ["dist_softmax", "dist_arcface"]:
+        if self.model_parallel:
             self.optimizer = DistributedClassificationOptimizer(
                 self.optimizer,
                 self.train_batch_size,
                 use_fp16=self.use_fp16,
-                loss_type=self.loss_type,
                 fp16_user_dict=self.fp16_user_dict)
         elif self.use_fp16:
             self.optimizer = paddle.static.amp.decorate(
@@ -486,23 +528,32 @@ def build_program(self, is_train=True, use_parallel_test=False):
                 input_field.build()
                 self.input_field = input_field

+                if self.model_parallel:
+                    msg = 'Using model parallelism for training.'
+                    logger.info(msg)
+                if self.sample_ratio < 1.0:
+                    msg = 'Using Partial FC and sample ratio = %.2f.' % self.sample_ratio
+                    logger.info(msg)
                 emb, loss, prob = model.get_output(
                     input=input_field,
                     num_classes=self.num_classes,
                     num_ranks=num_trainers,
                     rank_id=trainer_id,
+                    model_parallel=self.model_parallel,
                     is_train=is_train,
                     param_attr=self.param_attr,
                     bias_attr=self.bias_attr,
-                    loss_type=self.loss_type,
-                    margin=self.margin,
-                    scale=self.scale)
+                    margin1=self.margin1,
+                    margin2=self.margin2,
+                    margin3=self.margin3,
+                    scale=self.scale,
+                    sample_ratio=self.sample_ratio)

                 acc1 = None
                 acc5 = None

-                if self.loss_type in ["dist_softmax", "dist_arcface"]:
-                    if self.calc_train_acc:
+                if self.calc_train_acc:
+                    if self.model_parallel:
                         shard_prob = loss._get_info("shard_prob")

                         prob_list = []
@@ -520,8 +571,7 @@ def build_program(self, is_train=True, use_parallel_test=False):
                             input=prob,
                             label=paddle.reshape(label_all, (-1, 1)),
                             k=5)
-                else:
-                    if self.calc_train_acc:
+                    else:
                         acc1 = paddle.static.accuracy(
                             input=prob,
                             label=paddle.reshape(input_field.label, (-1, 1)),
@@ -540,7 +590,7 @@ def build_program(self, is_train=True, use_parallel_test=False):
                         dist_optimizer.minimize(loss)
                     else:  # single card training
                         optimizer.minimize(loss)
-                    if "dist" in self.loss_type or self.use_fp16:
+                    if self.model_parallel or self.use_fp16:
                         optimizer = optimizer._optimizer
                 elif use_parallel_test:
                     emb_list = []
@@ -714,9 +764,7 @@ def load(self, program, for_train=True):
             else:
                 state_dict[name] = tensor

-        distributed = self.loss_type in ["dist_softmax", "dist_arcface"]
-
-        if for_train or distributed:
+        if for_train or self.model_parallel:
             meta_file = os.path.join(checkpoint_dir, 'meta.json')
             if not os.path.exists(meta_file):
                 logger.error(
@@ -729,7 +777,7 @@ def load(self, program, for_train=True):
                 config = json.load(handle)

         # Preprocess distributed parameters.
-        if distributed:
+        if self.model_parallel:
             pretrain_nranks = config['pretrain_nranks']
             assert pretrain_nranks > 0
             emb_dim = config['emb_dim']
@@ -899,8 +947,6 @@ def _run_test(self, exe, test_list, test_name_list, feeder, fetch_list):
             sys.stdout.flush()

     def test(self):
-        self._check()
-
         trainer_id = self.trainer_id
         num_trainers = self.num_trainers

@@ -979,7 +1025,6 @@ def test(self):
         logger.info("test time: {:.4f}".format(test_end - test_start))

     def train(self):
-        self._check()
         self.has_run_train = True

         trainer_id = self.trainer_id