Skip to content

Commit aed827c

Browse files
[INTEL_HPU] runtime fix and add fake gpu kernels for LlamaInferenceModel (#1501)
1 parent fd1c961 commit aed827c

File tree

3 files changed

+253
-3
lines changed

3 files changed

+253
-3
lines changed

backends/intel_hpu/custom_ops/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
CppExtension(
2929
sources=[
3030
"./src/index_copy.cc",
31+
"./src/fake_gpu_kernels.cc",
3132
],
3233
include_dirs=[
3334
"../",
Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <algorithm>
16+
#include <chrono>
17+
#include <cstdlib>
18+
#include <iostream>
19+
#include <string>
20+
#include <vector>
21+
22+
#include "paddle/extension.h"
23+
24+
// Fake (no-op) kernel for the `ngram_match` speculative-decoding op.
// Registered so models exported with this GPU op still load on Intel HPU;
// the body intentionally performs no computation.
// NOTE(review): assumes the HPU path does not rely on ngram draft-token
// matching — confirm against the inference pipeline.
void NgramMatch(const paddle::Tensor& input_ids,
                const paddle::Tensor& input_ids_len,
                const paddle::Tensor& pre_ids,
                const paddle::Tensor& step_idx,
                const paddle::Tensor& draft_token_num,
                const paddle::Tensor& draft_tokens,
                const paddle::Tensor& seq_lens_this_time,
                const paddle::Tensor& seq_lens_encoder,
                const paddle::Tensor& seq_lens_decoder,
                const int real_batch_size,
                const int max_ngram_size,
                const int max_draft_tokens) {
  // Intentionally empty: fake kernel, inputs are left untouched.
}
36+
37+
// Register the no-op `ngram_match` op. `draft_tokens` and
// `seq_lens_this_time` are declared inplace, so with an empty kernel the
// outputs simply alias the unmodified inputs.
PD_BUILD_OP(ngram_match)
    .Inputs({"input_ids",
             "input_ids_len",
             "pre_ids",
             "step_idx",
             "draft_token_num",
             "draft_tokens",
             "seq_lens_this_time",
             "seq_lens_encoder",
             "seq_lens_decoder"})
    .Attrs({"real_batch_size: int",
            "max_ngram_size: int",
            "max_draft_tokens: int"})
    .Outputs({"draft_tokens_out", "seq_lens_this_time_out"})
    .SetKernelFn(PD_KERNEL(NgramMatch))
    .SetInplaceMap({{"draft_tokens", "draft_tokens_out"},
                    {"seq_lens_this_time", "seq_lens_this_time_out"}});
54+
55+
std::vector<paddle::Tensor> TopPCandidates(
56+
const paddle::Tensor& probs,
57+
const paddle::Tensor& top_p,
58+
const paddle::Tensor& output_padding_offset,
59+
int candidates_len,
60+
int max_seq_len) {
61+
return {top_p};
62+
}
63+
64+
// Shape inference for `top_p_candidates`.
//
// Outputs (in order): verify_scores [token_num, max_candidates_len],
// verify_tokens [token_num, max_candidates_len],
// actual_candidate_lens [token_num].
//
// Fix vs. original: the token count was stored in a plain `int`, silently
// narrowing the 64-bit dimension; it also indexed probs_shape[0] without a
// size check (UB on an empty shape). An empty/unknown shape now yields the
// conventional dynamic dimension -1.
std::vector<std::vector<int64_t>> TopPCandidatesInferShape(
    const std::vector<int64_t>& probs_shape,
    const std::vector<int64_t>& top_p_shape,
    const std::vector<int64_t>& output_padding_offset_shape,
    int max_candidates_len) {
  const int64_t token_num = probs_shape.empty() ? -1 : probs_shape[0];
  return {{token_num, max_candidates_len},
          {token_num, max_candidates_len},
          {token_num}};
}
74+
75+
std::vector<paddle::DataType> TopPCandidatesInferDtype(
76+
const paddle::DataType& probs_dtype,
77+
const paddle::DataType& top_p_dtype,
78+
const paddle::DataType& output_padding_offset_dtype) {
79+
return {probs_dtype, paddle::DataType::INT64, paddle::DataType::INT32};
80+
}
81+
82+
// Register the fake `top_p_candidates` op with shape/dtype inference so the
// framework can still build graphs containing it on this backend.
PD_BUILD_OP(top_p_candidates)
    .Inputs({"probs", "top_p", "output_padding_offset"})
    .Outputs({"verify_scores", "verify_tokens", "actual_candidate_lens"})
    .Attrs({"candidates_len: int", "max_seq_len: int"})
    .SetKernelFn(PD_KERNEL(TopPCandidates))
    .SetInferShapeFn(PD_INFER_SHAPE(TopPCandidatesInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(TopPCandidatesInferDtype));
89+
90+
// Fake (no-op) kernel for `speculate_verify_and_update`.
// All mutation-style outputs are wired through the inplace map at
// registration time, so an empty body leaves every tensor unchanged.
void SpeculateVerifyAndUpdate(const paddle::Tensor& accept_tokens,
                              const paddle::Tensor& accept_num,
                              const paddle::Tensor& step_idx,
                              const paddle::Tensor& seq_lens_encoder,
                              const paddle::Tensor& seq_lens_decoder,
                              const paddle::Tensor& stop_flags,
                              const paddle::Tensor& not_need_stop,
                              const paddle::Tensor& draft_tokens,
                              const paddle::Tensor& seq_lens_this_time,
                              const paddle::Tensor& verify_tokens,
                              const paddle::Tensor& verify_scores,
                              const paddle::Tensor& max_dec_len,
                              const paddle::Tensor& end_tokens,
                              const paddle::Tensor& is_block_step,
                              const paddle::Tensor& output_cum_offsets,
                              const paddle::Tensor& actual_candidate_len,
                              const paddle::Tensor& actual_draft_token_nums,
                              const paddle::Tensor& topp,
                              int max_seq_len,
                              int verify_window,
                              bool enable_topp) {
  // Intentionally empty: fake kernel for Intel HPU builds.
}
111+
112+
// Register the no-op `speculate_verify_and_update` op. Every declared output
// aliases its input through the inplace map, so the empty kernel leaves all
// tensors untouched.
PD_BUILD_OP(speculate_verify_and_update)
    .Inputs({"accept_tokens",
             "accept_num",
             "step_idx",
             "seq_lens_encoder",
             "seq_lens_decoder",
             "stop_flags",
             "not_need_stop",
             "draft_tokens",
             "seq_lens_this_time",
             "verify_tokens",
             "verify_scores",
             "max_dec_len",
             "end_tokens",
             "is_block_step",
             "output_cum_offsets",
             "actual_candidate_len",
             "actual_draft_token_nums",
             "topp"})
    .Outputs({"accept_tokens_out",
              "accept_num_out",
              "step_idx_out",
              "seq_lens_encoder_out",
              "seq_lens_decoder_out",
              "stop_flags_out",
              "not_need_stop_out",
              "draft_tokens_out"})
    .Attrs({"max_seq_len: int", "verify_window: int", "enable_topp: bool"})
    .SetInplaceMap({{"accept_tokens", "accept_tokens_out"},
                    {"accept_num", "accept_num_out"},
                    {"step_idx", "step_idx_out"},
                    {"seq_lens_encoder", "seq_lens_encoder_out"},
                    {"seq_lens_decoder", "seq_lens_decoder_out"},
                    {"stop_flags", "stop_flags_out"},
                    {"not_need_stop", "not_need_stop_out"},
                    {"draft_tokens", "draft_tokens_out"}})
    .SetKernelFn(PD_KERNEL(SpeculateVerifyAndUpdate));
149+
150+
// Fake (no-op) kernel for `speculate_set_value_by_flags_and_idx`.
// `pre_ids_all` is inplace-mapped at registration, so the empty body
// forwards it unchanged.
void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor& pre_ids_all,
                                    const paddle::Tensor& accept_tokens,
                                    const paddle::Tensor& accept_num,
                                    const paddle::Tensor& stop_flags,
                                    const paddle::Tensor& seq_lens_this_time,
                                    const paddle::Tensor& seq_lens_encoder,
                                    const paddle::Tensor& seq_lens_decoder,
                                    const paddle::Tensor& step_idx) {
  // Intentionally empty: fake kernel for Intel HPU builds.
}
158+
159+
// Register the no-op `speculate_set_value_by_flags_and_idx` op;
// `pre_ids_all_out` aliases `pre_ids_all` via the inplace map.
PD_BUILD_OP(speculate_set_value_by_flags_and_idx)
    .Inputs({"pre_ids_all",
             "accept_tokens",
             "accept_num",
             "stop_flags",
             "seq_lens_this_time",
             "seq_lens_encoder",
             "seq_lens_decoder",
             "step_idx"})
    .Outputs({"pre_ids_all_out"})
    .SetInplaceMap({{"pre_ids_all", "pre_ids_all_out"}})
    .SetKernelFn(PD_KERNEL(SpeculateSetValueByFlagsAndIdx));
171+
172+
std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
173+
const paddle::Tensor& seq_lens_this_time,
174+
const paddle::Tensor& seq_lens_encoder,
175+
const paddle::Tensor& seq_lens_decoder) {
176+
return {seq_lens_this_time};
177+
}
178+
179+
// Shape inference for `speculate_get_seq_lens_output`: one int tensor with
// a single entry per batch element.
std::vector<std::vector<int64_t>> SpeculateGetSeqLensOutputInferShape(
    const std::vector<int64_t>& seq_lens_this_time_shape,
    const std::vector<int64_t>& seq_lens_encoder_shape,
    const std::vector<int64_t>& seq_lens_decoder_shape) {
  const int64_t batch_size = seq_lens_this_time_shape[0];
  return {{batch_size}};
}
186+
187+
std::vector<paddle::DataType> SpeculateGetSeqLensOutputInferDtype(
188+
const paddle::DataType& seq_lens_this_time_dtype,
189+
const paddle::DataType& seq_lens_encoder_dtype,
190+
const paddle::DataType& seq_lens_decoder_dtype) {
191+
return {seq_lens_this_time_dtype};
192+
}
193+
194+
// Register the fake `speculate_get_seq_lens_output` op with shape/dtype
// inference.
PD_BUILD_OP(speculate_get_seq_lens_output)
    .Inputs({"seq_lens_this_time", "seq_lens_encoder", "seq_lens_decoder"})
    .Outputs({"seq_lens_output"})
    .SetKernelFn(PD_KERNEL(SpeculateGetSeqLensOutput))
    .SetInferShapeFn(PD_INFER_SHAPE(SpeculateGetSeqLensOutputInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(SpeculateGetSeqLensOutputInferDtype));
200+
201+
std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
202+
const paddle::Tensor& output_cum_offsets_tmp,
203+
const paddle::Tensor& out_token_num,
204+
const paddle::Tensor& seq_lens_output,
205+
const int max_seq_len) {
206+
return {output_cum_offsets_tmp};
207+
}
208+
209+
// Shape inference for `speculate_get_output_padding_offset`:
// output_padding_offset has a dynamic first dimension (-1);
// output_cum_offsets has one entry per batch element.
std::vector<std::vector<int64_t>> SpeculateGetOutputPaddingOffsetInferShape(
    const std::vector<int64_t>& output_cum_offsets_tmp_shape,
    const std::vector<int64_t>& out_token_num_shape,
    const std::vector<int64_t>& seq_lens_output_shape) {
  const int64_t batch_size = output_cum_offsets_tmp_shape[0];
  return {{-1}, {batch_size}};
}
216+
217+
std::vector<paddle::DataType> SpeculateGetOutputPaddingOffsetInferDtype(
218+
const paddle::DataType& output_cum_offsets_tmp_dtype,
219+
const paddle::DataType& out_token_num_dtype,
220+
const paddle::DataType& seq_lens_output_dtype) {
221+
return {output_cum_offsets_tmp_dtype, output_cum_offsets_tmp_dtype};
222+
}
223+
224+
// Register the fake `speculate_get_output_padding_offset` op with
// shape/dtype inference.
PD_BUILD_OP(speculate_get_output_padding_offset)
    .Inputs({"output_cum_offsets_tmp", "out_token_num", "seq_lens_output"})
    .Outputs({"output_padding_offset", "output_cum_offsets"})
    .Attrs({"max_seq_len: int"})
    .SetKernelFn(PD_KERNEL(SpeculateGetOutputPaddingOffset))
    .SetInferShapeFn(PD_INFER_SHAPE(SpeculateGetOutputPaddingOffsetInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(SpeculateGetOutputPaddingOffsetInferDtype));
231+
232+
// Fake (no-op) kernel for the `save_output` op (SaveOutMsg).
// NOTE(review): assumes generated tokens are delivered through another path
// on this backend, so dropping the save here is intentional — confirm.
void SaveOutMsg(const paddle::Tensor& x,
                const paddle::Tensor& not_need_stop,
                const paddle::optional<paddle::Tensor>& accept_num,
                int64_t rank_id) {
  // Intentionally empty: fake kernel for Intel HPU builds.
}
236+
237+
// Register the no-op `save_output` op; `accept_num` is optional and `x_out`
// aliases `x` through the inplace map.
PD_BUILD_OP(save_output)
    .Inputs({"x", "not_need_stop", paddle::Optional("accept_num")})
    .Attrs({"rank_id: int64_t"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"x", "x_out"}})
    .SetKernelFn(PD_KERNEL(SaveOutMsg));

backends/intel_hpu/runtime/runtime.cc

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,9 @@ class RuntimeManager {
240240
PD_CHECK(status == synSuccess,
241241
"[RUNTIME] synStreamSynchronize(stream_h2d) failed = %d",
242242
status);
243+
status = synHostUnmap(device->id, src);
244+
LOG_IF(ERROR, status != synSuccess)
245+
<< "[RUNTIME] synHostUnmap() failed = " << status;
243246

244247
} else if (flag == 1) {
245248
if (stream_d2h == nullptr) {
@@ -264,6 +267,9 @@ class RuntimeManager {
264267
PD_CHECK(status == synSuccess,
265268
"[RUNTIME] synStreamSynchronize() failed = %d",
266269
status);
270+
status = synHostUnmap(device->id, dst);
271+
LOG_IF(ERROR, status != synSuccess)
272+
<< "[RUNTIME] synHostUnmap() failed = " << status;
267273

268274
} else if (flag == 2) {
269275
if (stream_d2d == nullptr) {
@@ -392,8 +398,9 @@ class RuntimeManager {
392398
// not found, map and cache
393399
status = synHostMap(device->id, size, ptr);
394400
LOG_IF(ERROR, status != synSuccess)
395-
<< "[RUNTIME] synHostMap() failed = " << status;
396-
hostMappedAddress[ptr] = size;
401+
<< "[RUNTIME] synHostMap() failed = " << status << " ptr=" << ptr
402+
<< " size=" << size;
403+
// hostMappedAddress[ptr] = size;
397404
} else {
398405
if (it->second != size) {
399406
// found but size not equal
@@ -405,7 +412,7 @@ class RuntimeManager {
405412
status = synHostMap(device->id, size, ptr);
406413
LOG_IF(ERROR, status != synSuccess)
407414
<< "[RUNTIME] synHostMap() failed = " << status;
408-
hostMappedAddress[ptr] = size;
415+
// hostMappedAddress[ptr] = size;
409416
}
410417
}
411418
}

0 commit comments

Comments
 (0)