
Commit 5eb5d5a

fix position bias in tensor parallel (#1714)
* fix position bias in tensor parallel
* add symbol ncclCommFinalize
1 parent 3b248f1 commit 5eb5d5a

File tree

3 files changed, 16 insertions(+), 5 deletions(-)


src/cuda/nccl_stub.cc

Lines changed: 2 additions & 2 deletions
@@ -69,9 +69,9 @@ extern "C" {
     return func(comm);
   }
 
-  ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  ncclResult_t ncclCommFinalize(ncclComm_t comm) {
     using Signature = ncclResult_t(*)(ncclComm_t comm);
-    static auto func = ctranslate2::load_symbol<Signature>("ncclCommAbort");
+    static auto func = ctranslate2::load_symbol<Signature>("ncclCommFinalize");
     return func(comm);
   }
 
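The stub resolves each NCCL symbol lazily at runtime instead of linking against the library at build time; this change adds ncclCommFinalize to that table. A minimal sketch of the underlying pattern, assuming a POSIX dlopen/dlsym loader (the library path and error handling here are illustrative assumptions, not CTranslate2's actual implementation):

// Sketch of lazy symbol resolution via dlsym (POSIX; link with -ldl).
#include <dlfcn.h>
#include <stdexcept>
#include <string>

template <typename Signature>
Signature load_symbol(const char* name) {
  // Open (or reuse) the NCCL shared library once per process.
  static void* handle = dlopen("libnccl.so.2", RTLD_LAZY | RTLD_GLOBAL);
  if (!handle)
    throw std::runtime_error("Cannot open libnccl.so.2");
  void* symbol = dlsym(handle, name);
  if (!symbol)
    throw std::runtime_error("Cannot load symbol " + std::string(name));
  return reinterpret_cast<Signature>(symbol);
}

// Usage, mirroring the stub above:
//   static auto func =
//       load_symbol<ncclResult_t(*)(ncclComm_t)>("ncclCommFinalize");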

src/devices.cc

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ namespace ctranslate2 {
     for (auto* comm : _nccl_comms) {
       //finalizing NCCL
       if (*comm) {
-        NCCL_CHECK(ncclCommAbort(*comm));
+        NCCL_CHECK(ncclCommFinalize(*comm));
         NCCL_CHECK(ncclCommDestroy(*comm));
       }
     }
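ncclCommAbort terminates a communicator immediately and is intended for error recovery, while ncclCommFinalize (available since NCCL 2.14) waits for outstanding operations to complete before ncclCommDestroy releases the resources. A standalone sketch of the graceful shutdown order this change adopts (the NCCL_CHECK macro below is a simplified stand-in for the project's own):

// Graceful NCCL communicator teardown: finalize, then destroy.
#include <nccl.h>
#include <cstdio>
#include <cstdlib>

#define NCCL_CHECK(call)                                       \
  do {                                                         \
    ncclResult_t status = (call);                              \
    if (status != ncclSuccess) {                               \
      std::fprintf(stderr, "NCCL error: %s\n",                 \
                   ncclGetErrorString(status));                \
      std::exit(EXIT_FAILURE);                                 \
    }                                                          \
  } while (0)

void shutdown_comm(ncclComm_t comm) {
  if (comm) {
    NCCL_CHECK(ncclCommFinalize(comm));  // flush pending NCCL work
    NCCL_CHECK(ncclCommDestroy(comm));   // then free resources
  }
}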

src/layers/attention.cc

Lines changed: 13 additions & 2 deletions
@@ -1,5 +1,7 @@
 #include "ctranslate2/layers/attention.h"
 #include "ctranslate2/ops/split.h"
+#include "ctranslate2/utils.h"
+
 
 #include <algorithm>
 #include <cmath>
@@ -210,11 +212,20 @@ namespace ctranslate2 {
                                is_decoder,
                                with_cache ? key_length - 1 : 0);
       }
+      StorageView* position_bias_per_gpu = position_bias;
+      StorageView position_bias_tmp(position_bias->dtype(), position_bias->device());
+      if (ScopedMPISetter::getCurRank() != 0) {
+        const dim_t num_head_per_gpu = SAFE_DIVIDE(position_bias->dim(0), ScopedMPISetter::getNRanks());
+        ops::Slide slide_ops(0, num_head_per_gpu * ScopedMPISetter::getCurRank(),
+                             num_head_per_gpu, true);
+        slide_ops(*position_bias, position_bias_tmp);
+        position_bias_per_gpu = &position_bias_tmp;
+      }
 
       DEVICE_AND_TYPE_DISPATCH(output.device(), output.dtype(),
-                               primitives<D>::add_batch_broadcast(position_bias->data<T>(),
+                               primitives<D>::add_batch_broadcast(position_bias_per_gpu->data<T>(),
                                                                   output.data<T>(),
-                                                                  position_bias->size(),
+                                                                  position_bias_per_gpu->size(),
                                                                   output.size()));
     }
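Under tensor parallelism the attention heads are partitioned across ranks, but the relative position bias is computed over all heads, so each rank past rank 0 must slide out the contiguous block of heads it owns before adding the bias to its attention scores. A self-contained sketch of that slicing on a flat [num_heads, length, length] tensor (std::vector stands in for StorageView; the helper name is hypothetical, not CTranslate2's API):

// Copy the contiguous block of heads owned by `rank` out of a full
// position-bias tensor laid out as [num_heads, length, length].
#include <cstddef>
#include <stdexcept>
#include <vector>

std::vector<float> slice_bias_for_rank(const std::vector<float>& bias,
                                       std::size_t num_heads,
                                       std::size_t length,
                                       std::size_t rank,
                                       std::size_t world_size) {
  if (world_size == 0 || num_heads % world_size != 0)
    throw std::invalid_argument("num_heads must divide evenly across ranks");
  const std::size_t heads_per_rank = num_heads / world_size;  // like SAFE_DIVIDE
  const std::size_t per_head = length * length;
  const std::size_t offset = rank * heads_per_rank * per_head;
  return std::vector<float>(bias.begin() + offset,
                            bias.begin() + offset + heads_per_rank * per_head);
}

// e.g. with 16 heads on 4 GPUs, rank 1 receives heads 4..7, matching the
// Slide over dimension 0 in the diff above.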
