InfiniTensor
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
Lines changed: 58 additions & 65 deletions
@@ -67,7 +67,6 @@ DEFINE_bool(sequence_parallel, false, "Whether to enable Sequence Parallel");
 DEFINE_uint32(
     pipeline_parallel, 1,
     "Pipeline Parallel world size, will always use device=cuda and use all cuda visible devices when set to true");
-DEFINE_uint32(num_microbatches, 4, "the num of microbatches in pipeline parallelism");
 
 // precision
 DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
@@ -148,14 +147,16 @@ void Train(const nn::parallel::Rank &rank) {
             pp_pg = ProcessGroupFactory::Instance()->GetOrCreate(
                 GetPipelineParallelProcessGroupName(rank.thread_rank()), GetPipelineParallelGroupRanks(pp_world_size));
             pp_rank = pp_pg->GetGroupRank(rank.thread_rank());
+
+            nn::parallel::pp_rank = pp_rank;
         }
     } else {
         device = FLAGS_device == kDeviceCPU ? DeviceManager::Instance()->GetDefaultDevice()
                                             : DeviceManager::Instance()->GetDevice(DeviceType::kCUDA, 0);
     }
 
     // calculate gradient accumulation from the desired total batch size and the current run configuration
-    const auto tokens_per_fwdbwd = FLAGS_batch_size * FLAGS_sequence_length * (ddp_world_size * pp_world_size);
+    const auto tokens_per_fwdbwd = FLAGS_batch_size * FLAGS_sequence_length * ddp_world_size;
     CHECK_EQ(FLAGS_total_batch_size % tokens_per_fwdbwd, 0);
     const auto grad_accum_steps = FLAGS_total_batch_size / tokens_per_fwdbwd;
     LOG(INFO) << "total desired batch size: " << FLAGS_total_batch_size
@@ -197,16 +198,9 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<DistributedDataParallel>(model, rank.thread_rank());
     }
 
-    std::unique_ptr<DataLoader> train_loader;
-    if (pp_world_size > 1) {
-        train_loader = std::make_unique<DataLoader>(
-            std::make_shared<TinyShakespeareDataset>(FLAGS_input_bin, FLAGS_sequence_length),
-            FLAGS_batch_size * pp_world_size);
-    } else {
-        train_loader = std::make_unique<DistributedDataLoader>(
-            std::make_shared<TinyShakespeareDataset>(FLAGS_input_bin, FLAGS_sequence_length), FLAGS_batch_size,
-            ddp_rank, ddp_world_size);
-    }
+    auto num_microbatches = FLAGS_total_batch_size / (FLAGS_batch_size * FLAGS_sequence_length * ddp_world_size);
+    DistributedDataLoader train_loader(std::make_shared<TinyShakespeareDataset>(FLAGS_input_bin, FLAGS_sequence_length),
+                                       FLAGS_batch_size * num_microbatches, ddp_rank, ddp_world_size);
 
     std::optional<DistributedDataLoader> val_loader = std::nullopt;
     if (!FLAGS_input_val_bin.empty()) {
@@ -231,7 +225,7 @@ void Train(const nn::parallel::Rank &rank) {
     };
     auto optimizer = optimizer_factory(model->Parameters());
 
-    auto train_iter = train_loader->begin();
+    auto train_iter = train_loader.begin();
     std::shared_ptr<nn::Module> loss_fn
         = (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>(
               std::make_shared<VocabParallelCrossEntropyLoss>(model_config.original_vocab_size))
@@ -240,13 +234,9 @@ void Train(const nn::parallel::Rank &rank) {
     LOG(INFO) << "Rank " << rank.thread_rank() << ": start training";
 
     if (pp_world_size > 1) {
-        CHECK_EQ((FLAGS_batch_size * pp_world_size) % FLAGS_num_microbatches, 0)
-            << "FLAGS_batch_size (" << (FLAGS_batch_size * pp_world_size)
-            << ") must be divisible by FLAGS_num_microbatches (" << FLAGS_num_microbatches << ")";
-        auto shapes = std::vector<std::vector<int64_t>>{{(FLAGS_batch_size * pp_world_size) / FLAGS_num_microbatches,
-                                                         FLAGS_sequence_length, model->GetConfig()["n_embd"]}};
+        auto shapes = std::vector<std::vector<int64_t>>{{FLAGS_batch_size, FLAGS_sequence_length, model_config.n_embd}};
 
-        model = std::make_shared<nn::parallel::PipelineParallel>(model, pp_world_size, FLAGS_num_microbatches, shapes,
+        model = std::make_shared<nn::parallel::PipelineParallel>(model, pp_world_size, num_microbatches, shapes,
                                                                  pp_rank, optimizer_factory);
     }
 
@@ -274,65 +264,68 @@ void Train(const nn::parallel::Rank &rank) {
             break;
         }
 
+        float lossf = 0.0f;
         // model->Train();
         if (pp_world_size == 1) {
             optimizer->ZeroGrad();
-        }
-        // if we are trying to overfit a single batch, we reset the loader here
-        if (FLAGS_overfit_single_batch) {
-            // train_loader.Reset();
-        }
-        float lossf = 0.0f;
+
+            // if we are trying to overfit a single batch, we reset the loader here
+            if (FLAGS_overfit_single_batch) {
+                // train_loader.Reset();
+            }
+
 #ifdef PROFILE_MODE
-        Profiler::Instance().SetTag("Step_" + std::to_string(step));
+            Profiler::Instance().SetTag("Step_" + std::to_string(step));
 #endif
-        for (int micro_step = 0; micro_step < grad_accum_steps; ++micro_step) {
-            // enable autocast for the current step
-            infini_train::AutocastGuard autocast_guard(device->Type(), dtype);
+            for (int micro_step = 0; micro_step < grad_accum_steps; ++micro_step) {
+                // enable autocast for the current step
+                infini_train::AutocastGuard autocast_guard(device->Type(), dtype);
+
+                // (bs, seq_len), (bs, seq_len)
+                auto [x, y] = *train_iter;
+                // to overfit a single batch, comment out the increment below so the iterator keeps yielding the same batch
+                // TODO(dcj): support dataloader.reset() later
+                ++train_iter;
+                x = std::make_shared<Tensor>(x->To(device));
+                y = std::make_shared<Tensor>(y->To(device));
+
+                LOG(INFO) << "Rank " << rank.thread_rank() << ": start forward";
+                // (bs, seq_len, vocab_size)
+                auto logits = model->Forward({x, y})[0];
+                LOG(INFO) << "Rank " << rank.thread_rank() << ": finish model forward, start loss forward";
+                auto loss = loss_fn->Forward({logits, y})[0];
+                loss = loss / grad_accum_steps;
+
+                // disable autocast for the current step (backward is not under autocast)
+                autocast_guard.Disable();
+
+                LOG(INFO) << "Rank " << rank.thread_rank() << ": finish loss forward";
+                if (ddp_world_size > 1) {
+                    function::AllReduce(loss, function::ReduceOpType::kAvg);
+                }
+                auto loss_cpu = loss->To(DeviceManager::Instance()->GetDefaultDevice());
+                lossf += static_cast<const float *>(loss_cpu.DataPtr())[0];
+                LOG(INFO) << "Rank " << rank.thread_rank() << ": start backward";
+                loss->Backward();
+                LOG(INFO) << "Rank " << rank.thread_rank() << ": finish backward";
+            }
 
-            // (bs, seq_len), (bs, seq_len)
+            optimizer->Step();
+        } else {
             auto [x, y] = *train_iter;
             // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
             // TODO(dcj): support dataloader.reset() later
             ++train_iter;
             x = std::make_shared<Tensor>(x->To(device));
             y = std::make_shared<Tensor>(y->To(device));
 
-            if (pp_world_size > 1) {
-                lossf = model->TrainStep({x}, {y}, loss_fn);
-
-                auto loss_tensor = std::make_shared<Tensor>(std::vector<int64_t>{}, DataType::kFLOAT32);
-                static_cast<float *>(loss_tensor->DataPtr())[0] = lossf;
-                auto loss_device_ptr = std::make_shared<Tensor>(loss_tensor->To(device));
-                function::AllReduce(loss_device_ptr, function::ReduceOpType::kMax);
-                auto loss_copy = loss_device_ptr->To(DeviceManager::Instance()->GetDefaultDevice());
-                lossf = static_cast<const float *>(loss_copy.DataPtr())[0];
-                continue;
-            }
-
-            LOG(INFO) << "Rank " << rank.thread_rank() << ": start forward";
-            // (bs, seq_len, vocab_size)
-            auto logits = model->Forward({x, y})[0];
-            LOG(INFO) << "Rank " << rank.thread_rank() << ": finish model forward, start loss forward";
-            auto loss = loss_fn->Forward({logits, y})[0];
-            loss = loss / grad_accum_steps;
-
-            // disable autocast for the current step (backward is not under autocast)
-            autocast_guard.Disable();
-
-            LOG(INFO) << "Rank " << rank.thread_rank() << ": finish loss forward";
-            if (ddp_world_size > 1) {
-                function::AllReduce(loss, function::ReduceOpType::kAvg);
-            }
-            auto loss_cpu = loss->To(DeviceManager::Instance()->GetDefaultDevice());
-            lossf += static_cast<const float *>(loss_cpu.DataPtr())[0];
-            LOG(INFO) << "Rank " << rank.thread_rank() << ": start backward";
-            loss->Backward();
-            LOG(INFO) << "Rank " << rank.thread_rank() << ": finish backward";
-        }
-
-        if (pp_world_size == 1) {
-            optimizer->Step();
+            lossf = model->TrainStep({x}, {y}, loss_fn);
+            auto loss_tensor = std::make_shared<Tensor>(std::vector<int64_t>{}, DataType::kFLOAT32);
+            static_cast<float *>(loss_tensor->DataPtr())[0] = lossf;
+            auto loss_device_ptr = std::make_shared<Tensor>(loss_tensor->To(device));
+            function::AllReduce(loss_device_ptr, function::ReduceOpType::kMax);
+            auto loss_copy = loss_device_ptr->To(DeviceManager::Instance()->GetDefaultDevice());
+            lossf = static_cast<const float *>(loss_copy.DataPtr())[0];
         }
         const auto iter_end = std::chrono::high_resolution_clock::now();
         const double duration_us = std::chrono::duration<double, std::micro>(iter_end - iter_start).count();