
Commit c9ae191

Author: Chen Xi (committed)
add tensor parallel support
Signed-off-by: Chen Xi <[email protected]>
1 parent cb8507b commit c9ae191

File tree

6 files changed: +44 -28 lines changed


ggml/include/ggml-sycl.h

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API int ggml_backend_sycl_rank(void);
+GGML_API int ggml_backend_sycl_world_size(void);
 GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
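The two declarations added above expose the process's tensor-parallel coordinates outside the SYCL backend. A minimal caller-side sketch of how they could be used to size a per-rank shard follows; the helper name, the even-divisibility assumption, and the printout are illustrative, not part of this commit.

    // Hypothetical caller-side sketch (not from this commit): report the slice
    // of a row-split weight owned by this process, using the new query functions.
    #include <cstdio>
    #include <cstdint>
    #include "ggml-sycl.h"

    static void report_local_share(int64_t n_rows) {
        const int rank       = ggml_backend_sycl_rank();        // this process's index
        const int world_size = ggml_backend_sycl_world_size();  // number of participating processes
        const int64_t rows_per_rank = n_rows / world_size;      // assumes n_rows divides evenly
        std::printf("rank %d/%d owns rows [%lld, %lld)\n", rank, world_size,
                    (long long) (rank * rows_per_rank),
                    (long long) ((rank + 1) * rows_per_rank));
    }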

ggml/include/ggml.h

Lines changed: 4 additions & 4 deletions
@@ -599,8 +599,8 @@ extern "C" {
         TENSOR_NO_CHANGE,
         TENSOR_SPLIT_BY_ROW,
         TENSOR_SPLIT_BY_COLUMN,
-        TENSOR_KEEPED_ON_MASTER,
-    }
+        TENSOR_KEEPED_ON_MASTER
+    };

     // n-dimensional tensor
     struct ggml_tensor {
@@ -637,9 +637,9 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        enum tensor_parallel_mode split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
+        enum tensor_parallel_mode split_mode; // {tensor_parallel_mode::TENSOR_NO_CHANGE};

-        // char padding[4];
+        char padding[12];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
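The new split_mode field lets a loader mark how each weight should be distributed before the backend uploads it, and the enlarged padding presumably keeps sizeof(struct ggml_tensor) at the value downstream code expects after the 4-byte enum is added. A hedged sketch of how a loader might tag a tensor (the context, shape, and helper name are assumptions, not values from this commit):

    // Illustrative only (not from this commit): tag a freshly created weight so
    // the SYCL backend shards it by rows when the buffer is populated.
    #include "ggml.h"

    static struct ggml_tensor * make_row_split_weight(struct ggml_context * ctx) {
        // 4096x4096 F16 is an assumed example shape.
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
        w->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;  // shard across ranks
        return w;  // tensors left at TENSOR_NO_CHANGE are uploaded whole, as before
    }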

ggml/src/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
@@ -566,6 +566,17 @@ if (GGML_SYCL)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
     endif()

+    set(oneCCL_DIR "/opt/intel/oneapi/ccl/latest/lib/cmake/oneCCL")
+    set(MPI_INCLUDE_PATH "/opt/intel/oneapi/mpi/latest/include")
+    set(MPI_LIBRARY_PATH "/opt/intel/oneapi/mpi/latest/lib/")
+    set(ONECCL_INCLUDE_PATH "/opt/intel/oneapi/ccl/latest/include")
+    set(ONECCL_LIBRARY_PATH "/opt/intel/oneapi/ccl/latest/lib/")
+    include_directories(${MPI_INCLUDE_PATH} ${ONECCL_INCLUDE_PATH})
+    find_library(MPI_LIBRARY mpi HINTS ${MPI_LIBRARY_PATH})
+    find_library(ONECCL_LIBRARY ccl HINTS ${ONECCL_LIBRARY_PATH})
+    # find_package(oneCCL REQUIRED)
+    message("-- oneCCL found")
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${MPI_LIBRARY_PATH} ${ONECCL_LIBRARY_PATH})
     if (WIN32)
         find_package(IntelSYCL REQUIRED)
         find_package(MKL REQUIRED)

ggml/src/ggml-sycl.cpp

Lines changed: 8 additions & 8 deletions
@@ -1747,12 +1747,12 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)

 int ggml_backend_sycl_rank() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_rank();
+    return dpct::dev_mgr::instance().get_rank();
 }

 int ggml_backend_sycl_world_size() {
     // use ccl rank as main gpu
-    return dpct::dev_mgr::instance().get_ccl_world_size();
+    return dpct::dev_mgr::instance().get_world_size();
 }

 void ggml_backend_sycl_print_sycl_devices() {
@@ -4237,9 +4237,9 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-static bool split_tensor(const struct ggml_tensor * src, void* dst, void* data, int split_mode) {
-    int rank = ggml_backend_sycl_rank()
-    int world_size = ggml_backend_sycl_world_size()
+static bool split_tensor(const struct ggml_tensor * src, void* dst, const void* data, enum tensor_parallel_mode split_mode) {
+    int rank = ggml_backend_sycl_rank();
+    int world_size = ggml_backend_sycl_world_size();
     auto type_traits = ggml_internal_get_type_traits(src->type);
     size_t element_size = type_traits.type_size / type_traits.blck_size;
     const int64_t dst_size = ggml_nelements(src) * element_size / world_size;
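The hunk above shows only the head of split_tensor; its body is not part of this listing. As a rough illustration of the kind of per-rank slicing such a routine performs, here is a self-contained sketch, an assumption-laden simplification rather than the commit's code: a row split copies one contiguous block per rank, a column split gathers one strided chunk per row.

    // Hedged illustration (not the commit's implementation) of slicing a
    // row-major 2D weight. Quantized block handling is ignored and even
    // divisibility by world_size is assumed.
    #include <cstring>
    #include <cstdint>
    #include <cstddef>

    static void slice_rows(const uint8_t * src, uint8_t * dst,
                           int64_t n_rows, size_t row_bytes,
                           int rank, int world_size) {
        const int64_t rows_per_rank = n_rows / world_size;        // assume even split
        std::memcpy(dst, src + rank * rows_per_rank * row_bytes,  // contiguous block per rank
                    rows_per_rank * row_bytes);
    }

    static void slice_cols(const uint8_t * src, uint8_t * dst,
                           int64_t n_rows, int64_t n_cols, size_t elem_bytes,
                           int rank, int world_size) {
        const int64_t cols_per_rank = n_cols / world_size;        // assume even split
        for (int64_t r = 0; r < n_rows; ++r) {                    // one strided chunk per row
            std::memcpy(dst + r * cols_per_rank * elem_bytes,
                        src + (r * n_cols + rank * cols_per_rank) * elem_bytes,
                        cols_per_rank * elem_bytes);
        }
    }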
@@ -4288,7 +4288,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     if (tensor->split_mode == tensor_parallel_mode::TENSOR_NO_CHANGE) {
         memcpy(host_buf, data, size);
     } else {
-        if (!split_tensor(tensor, host_buf, data, size, tensor->split_mode)) {
+        if (!split_tensor(tensor, ((void*)host_buf), data, tensor->split_mode)) {
             std::cerr << "split tensor failed!" << std::endl;
         }
     }
@@ -4505,8 +4505,8 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;

     if (!ggml_backend_sycl_buffer_type_initialized) {
-        if (dpct::dev_mgr::instance().world_size() > 1) {
-            auto rank = dpct::dev_mgr::instance().get_rank();
+        if (ggml_backend_sycl_world_size() > 1) {
+            auto rank = ggml_backend_sycl_rank();
             auto & device_tp = dpct::dev_mgr::instance().get_device(rank);
             queue_ptr stream = &(device_tp.default_queue());
             // TODO(xi): buffer_types always use 0 to avoid changes on public code

ggml/src/ggml-sycl/dpct/helper.hpp

Lines changed: 11 additions & 6 deletions
@@ -13,11 +13,13 @@
 #ifndef GGML_SYCL_DPCT_HELPER_HPP
 #define GGML_SYCL_DPCT_HELPER_HPP

+#include <stdlib.h>
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
 #include <oneapi/ccl.hpp>
 #include <oneapi/mkl.hpp>
 #include <map>
+#include <mpi.h>

 #include "ggml.h"

@@ -480,8 +482,6 @@ namespace dpct
         int _max_nd_range_size_i[3];
         uint32_t _device_id;
         std::array<unsigned char, 16> _uuid;
-        uint32_t _rank;
-        uint32_t _world_size;
     };

     static int get_major_version(const sycl::device &dev)
@@ -873,8 +873,8 @@ namespace dpct
         }
         return -1;
     }
-    inline int get_ccl_rank() { return _rank; }
-    inline int get_ccl_world_size() { return _world_size; }
+    inline int get_rank() { return _rank; }
+    inline int get_world_size() { return _world_size; }
     inline ccl::communicator create_ccl_communicator(ccl::device dev, ccl::context ctx) {
         return ccl::create_communicator(_world_size, _rank, dev, ctx, _kvs);
     }
@@ -1002,7 +1002,13 @@ namespace dpct
         return convert_backend_index(backend1) < convert_backend_index(backend2);
     }

-    static void init_ccl() {
+    static void mpi_finalize() {
+        static int is_finalized = 0;
+        MPI_Finalized(&is_finalized);
+        if (!is_finalized) MPI_Finalize();
+    }
+
+    void init_ccl() {
         ccl::init();
         MPI_Init(NULL, NULL);
         MPI_Comm_size(MPI_COMM_WORLD, &_world_size);
@@ -1018,7 +1024,6 @@ namespace dpct
         MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
         _kvs = ccl::create_kvs(main_addr);
     }
-
 }

 dev_mgr()
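The init_ccl changes follow the standard oneCCL-over-MPI bootstrap: rank 0 creates the main key-value store and broadcasts its address, the other ranks attach to it, and <stdlib.h> is presumably pulled in so that mpi_finalize can be registered via atexit. A self-contained sketch of that pattern, modelled on oneCCL's samples rather than copied from this commit:

    // Hedged sketch of the usual MPI + oneCCL bootstrap (not this commit's code).
    #include <stdlib.h>
    #include <mpi.h>
    #include <oneapi/ccl.hpp>

    static void mpi_finalize_once() {
        int is_finalized = 0;
        MPI_Finalized(&is_finalized);
        if (!is_finalized) MPI_Finalize();
    }

    static ccl::shared_ptr_class<ccl::kvs> bootstrap(int & rank, int & world_size) {
        ccl::init();
        MPI_Init(NULL, NULL);
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        atexit(mpi_finalize_once);                       // tear MPI down at process exit

        ccl::shared_ptr_class<ccl::kvs> kvs;
        ccl::kvs::address_type main_addr;
        if (rank == 0) {
            kvs = ccl::create_main_kvs();                // rank 0 owns the main KVS
            main_addr = kvs->get_address();
            MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
        } else {
            MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
            kvs = ccl::create_kvs(main_addr);            // other ranks attach by address
        }
        return kvs;
    }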

src/llama.cpp

Lines changed: 8 additions & 10 deletions
@@ -4368,8 +4368,6 @@ struct llama_model_loader {
     int n_created = 0;
     // For tensor parallelism
     int world_size = 1;
-    int rank = 0;
-    bool enable_tp = false;

     int64_t n_elements = 0;
     size_t n_bytes = 0;
@@ -4630,7 +4628,6 @@ struct llama_model_loader {
         this->use_mmap = use_mmap;
         this->check_tensors = check_tensors;
         world_size = ggml_backend_get_world_size();
-        rank = ggml_backend_get_rank();
     }

     ~llama_model_loader() {
@@ -4859,12 +4856,12 @@ struct llama_model_loader {
             ggml_set_name(tensor, ggml_get_name(cur));
             if (flags == TENSOR_SPLIT_BY_ROW) {
                 tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_ROW;
-            }
-            if (flags == TENSOR_SPLIT_BY_COLUMN) {
+            } else if (flags == TENSOR_SPLIT_BY_COLUMN) {
                 tensor->split_mode = tensor_parallel_mode::TENSOR_SPLIT_BY_COLUMN;
-            }
-            if (flags == TENSOR_KEEPED_ON_MASTER) {
+            } else if (flags == TENSOR_KEEPED_ON_MASTER) {
                 tensor->split_mode = tensor_parallel_mode::TENSOR_KEEPED_ON_MASTER;
+            } else {
+                tensor->split_mode = tensor_parallel_mode::TENSOR_NO_CHANGE;
             }

             if (flags == TENSOR_DUPLICATED) {
@@ -7023,8 +7020,9 @@ static bool llm_load_tensors(
     if (n_expert > 0 && hparams.n_expert_used == 0) {
         throw std::runtime_error("model has expert layers but no expert layers are used");
     }
-
+    bool enable_tp = false;
    if (split_mode == LLAMA_SPLIT_MODE_TENSOR) {
+        int world_size = ggml_backend_get_world_size();
         if (world_size > 1) {
             enable_tp = true;
             // need to change the size before load tensor
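What "change the size before load tensor" involves is not shown in this hunk; the usual approach is to divide the sharded dimensions by world_size before tensors are created, so each rank allocates only its slice. A purely illustrative sketch (the struct, field names, and choice of which dimensions shrink are assumptions, not taken from this commit):

    // Illustrative only (not this commit's code): shrink the dimensions that
    // will be sharded so each rank allocates just its slice of the weights.
    #include <cstdint>

    struct tp_dims { int64_t n_ff; int64_t n_head; };

    static tp_dims shard_dims(tp_dims full, int world_size, bool enable_tp) {
        if (!enable_tp || world_size <= 1) return full;  // unchanged on a single rank
        return { full.n_ff / world_size,                 // FFN width per rank
                 full.n_head / world_size };             // attention heads per rank
    }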
@@ -7078,7 +7076,7 @@ static bool llm_load_tensors(
         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, llama_model_loader::TENSOR_SPLIT_BY_COLUMN);

         // optional bias tensors
-        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, bias_split_mode);
         layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, bias_split_mode);
         layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, bias_split_mode);
@@ -7109,7 +7107,7 @@ static bool llm_load_tensors(
         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_SPLIT_BY_ROW);

         // optional MLP bias
-        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN
+        auto bias_split_mode = llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_SPLIT_BY_COLUMN;
         layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, bias_split_mode);
         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED | llama_model_loader::TENSOR_KEEPED_ON_MASTER);
         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, bias_split_mode);
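The split directions chosen here follow the usual Megatron-style pairing: the first matmul of a pair is sharded along its output dimension and the second along its input dimension, so each pair needs only one all-reduce, and the second matmul's bias (FFN down, tagged TENSOR_KEEPED_ON_MASTER) is added once rather than split. A small shape sketch under assumed dimensions (4096/11008 and 2 ranks are illustrative numbers, not from this commit):

    // Illustrative per-rank shapes for a 2-way split; all dimensions are assumptions.
    #include <cstdio>

    int main() {
        const int n_embd = 4096, n_ff = 11008, world_size = 2;
        // up/gate are split along n_ff (their output dim), down along n_ff (its input dim),
        // so the up->down pair needs a single all-reduce of the [n_embd] partial sums.
        std::printf("ffn_up per rank  : %d x %d\n", n_embd, n_ff / world_size);
        std::printf("ffn_down per rank: %d x %d\n", n_ff / world_size, n_embd);
        // The down-projection bias is applied once, after the all-reduce, which is
        // why it is kept whole on the master rank instead of being sharded.
        return 0;
    }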
