Skip to content

Commit 715118b

Browse files
committed
Single kernel tests
1 parent 9ee1328 commit 715118b

File tree

3 files changed

+294
-0
lines changed

3 files changed

+294
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#include <sycl/sycl.hpp>
2+
#include <ext/oneapi/bfloat16.hpp>
3+
#include <iostream>
4+
#include <vector>
5+
#include <cmath>
6+
7+
using namespace sycl;
8+
9+
void dequantize_q8_0_bf16_kernel(const int8_t *data,
10+
sycl::ext::oneapi::bfloat16 *output,
11+
const int blk_size, const int ele_per_blk,
12+
const int num_blocks,
13+
const sycl::nd_item<3> &item_ct1) {
14+
long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
15+
item_ct1.get_local_id(2);
16+
for (long long block_id = global_idx; block_id < num_blocks;
17+
block_id +=
18+
item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
19+
sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
20+
(sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
21+
const int8_t* cur_block = data + block_id * blk_size;
22+
float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
23+
.convert<float, sycl::rounding_mode::automatic>()[0];
24+
cur_block += 2;
25+
for (int i = 0; i < ele_per_blk; i++) {
26+
output_blk[i] = sycl::ext::oneapi::bfloat16(scale * cur_block[i]);
27+
}
28+
}
29+
}
30+
31+
int main() {
32+
// Define the parameters
33+
const int blk_size = 10;
34+
const int ele_per_blk = 8;
35+
const int num_blocks = 2;
36+
37+
// Initialize input data
38+
std::vector<int8_t> data(blk_size * num_blocks);
39+
std::vector<sycl::ext::oneapi::bfloat16> output(ele_per_blk * num_blocks, 0.0f);
40+
41+
// Fill the data with some values
42+
for (int i = 0; i < num_blocks; ++i) {
43+
sycl::half scale = 0.5f;
44+
std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
45+
for (int j = 2; j < blk_size; ++j) {
46+
data[i * blk_size + j] = j - 2;
47+
}
48+
}
49+
50+
// Create a SYCL queue
51+
queue q;
52+
53+
// Allocate device memory
54+
int8_t* d_data = malloc_device<int8_t>(data.size(), q);
55+
sycl::ext::oneapi::bfloat16* d_output = malloc_device<sycl::ext::oneapi::bfloat16>(output.size(), q);
56+
57+
// Copy data to device
58+
q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
59+
q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
60+
61+
// Define the kernel execution configuration
62+
range<3> global_work_size(1, 1, num_blocks);
63+
range<3> local_work_size(1, 1, 1);
64+
65+
// Launch the kernel
66+
q.submit([&](handler& h) {
67+
h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
68+
dequantize_q8_0_bf16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
69+
});
70+
}).wait();
71+
72+
// Copy the result back to host
73+
q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
74+
75+
// Free device memory
76+
free(d_data, q);
77+
free(d_output, q);
78+
79+
// Check the results
80+
bool success = true;
81+
for (int i = 0; i < num_blocks; ++i) {
82+
sycl::half scale = 0.5f;
83+
for (int j = 0; j < ele_per_blk; ++j) {
84+
float expected = scale * (j);
85+
if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - expected) > 1e-3) {
86+
success = false;
87+
std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
88+
}
89+
}
90+
}
91+
92+
if (success) {
93+
std::cout << "Test passed!" << std::endl;
94+
} else {
95+
std::cout << "Test failed!" << std::endl;
96+
}
97+
98+
return 0;
99+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#include <sycl/sycl.hpp>
2+
#include <iostream>
3+
#include <vector>
4+
#include <cmath>
5+
6+
using namespace sycl;
7+
8+
void dequantize_q8_0_fp16_kernel(const int8_t *data, sycl::half *output,
9+
const int blk_size, const int ele_per_blk,
10+
const int num_blocks,
11+
const sycl::nd_item<3> &item_ct1) {
12+
long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
13+
item_ct1.get_local_id(2);
14+
for (long long block_id = global_idx; block_id < num_blocks;
15+
block_id +=
16+
item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
17+
sycl::half *__restrict__ output_blk =
18+
(sycl::half *)(output + block_id * ele_per_blk);
19+
const int8_t* cur_block = data + block_id * blk_size;
20+
float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
21+
.convert<float, sycl::rounding_mode::automatic>()[0];
22+
cur_block += 2;
23+
for (int i = 0; i < ele_per_blk; i++) {
24+
output_blk[i] =
25+
sycl::vec<float, 1>(scale * cur_block[i])
26+
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
27+
}
28+
}
29+
}
30+
31+
int main() {
32+
// Define the parameters
33+
const int blk_size = 10;
34+
const int ele_per_blk = 8;
35+
const int num_blocks = 2;
36+
37+
// Initialize input data
38+
std::vector<int8_t> data(blk_size * num_blocks);
39+
std::vector<sycl::half> output(ele_per_blk * num_blocks, 0.0f);
40+
41+
// Fill the data with some values
42+
for (int i = 0; i < num_blocks; ++i) {
43+
sycl::half scale = 0.5f;
44+
std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
45+
for (int j = 2; j < blk_size; ++j) {
46+
data[i * blk_size + j] = j - 2;
47+
}
48+
}
49+
50+
// Create a SYCL queue
51+
queue q;
52+
53+
// Allocate device memory
54+
int8_t* d_data = malloc_device<int8_t>(data.size(), q);
55+
sycl::half* d_output = malloc_device<sycl::half>(output.size(), q);
56+
57+
// Copy data to device
58+
q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
59+
q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::half)).wait();
60+
61+
// Define the kernel execution configuration
62+
range<3> global_work_size(1, 1, num_blocks);
63+
range<3> local_work_size(1, 1, 1);
64+
65+
// Launch the kernel
66+
q.submit([&](handler& h) {
67+
h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
68+
dequantize_q8_0_fp16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
69+
});
70+
}).wait();
71+
72+
// Copy the result back to host
73+
q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::half)).wait();
74+
75+
// Free device memory
76+
free(d_data, q);
77+
free(d_output, q);
78+
79+
// Check the results
80+
bool success = true;
81+
for (int i = 0; i < num_blocks; ++i) {
82+
sycl::half scale = 0.5f;
83+
for (int j = 0; j < ele_per_blk; ++j) {
84+
sycl::half expected = sycl::vec<float, 1>(scale * (j)).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
85+
if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected)) > 1e-3) {
86+
success = false;
87+
std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
88+
}
89+
}
90+
}
91+
92+
if (success) {
93+
std::cout << "Test passed!" << std::endl;
94+
} else {
95+
std::cout << "Test failed!" << std::endl;
96+
}
97+
98+
return 0;
99+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include <sycl/sycl.hpp>
2+
#include <iostream>
3+
#include <vector>
4+
#include <cmath>
5+
6+
using namespace sycl;
7+
8+
void dequantize_q8_0_fp32_kernel(const int8_t* data, float* output, const int blk_size, const int ele_per_blk, const int num_blocks,
9+
const sycl::nd_item<3> &item_ct1) {
10+
long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
11+
item_ct1.get_local_id(2);
12+
for (long long block_id = global_idx; block_id < num_blocks;
13+
block_id +=
14+
item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
15+
float* __restrict__ output_blk = (float*)(output + block_id * ele_per_blk);
16+
const int8_t* cur_block = data + block_id * blk_size;
17+
float scale = sycl::vec<sycl::half, 1>(*((sycl::half *)cur_block))
18+
.convert<float, sycl::rounding_mode::automatic>()[0];
19+
cur_block += 2;
20+
for (int i = 0; i < ele_per_blk; i++){
21+
output_blk[i] = scale * cur_block[i];
22+
}
23+
}
24+
}
25+
26+
int main() {
27+
// Define the parameters
28+
const int blk_size = 10;
29+
const int ele_per_blk = 8;
30+
const int num_blocks = 2;
31+
32+
// Initialize input data
33+
std::vector<int8_t> data(blk_size * num_blocks);
34+
std::vector<float> output(ele_per_blk * num_blocks, 0.0f);
35+
36+
// Fill the data with some values
37+
for (int i = 0; i < num_blocks; ++i) {
38+
sycl::half scale = 0.5f;
39+
std::memcpy(data.data() + i * blk_size, &scale, sizeof(sycl::half));
40+
for (int j = 2; j < blk_size; ++j) {
41+
data[i * blk_size + j] = j - 2;
42+
}
43+
}
44+
45+
// Create a SYCL queue
46+
queue q;
47+
auto dev = q.get_device();
48+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
49+
50+
// Allocate device memory
51+
int8_t* d_data = malloc_device<int8_t>(data.size(), q);
52+
float* d_output = malloc_device<float>(output.size(), q);
53+
54+
// Copy data to device
55+
q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
56+
q.memcpy(d_output, output.data(), output.size() * sizeof(float)).wait();
57+
58+
// Define the kernel execution configuration
59+
range<3> global_work_size(1, 1, num_blocks);
60+
range<3> local_work_size(1, 1, 1);
61+
62+
// Launch the kernel
63+
q.submit([&](handler& h) {
64+
h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
65+
dequantize_q8_0_fp32_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
66+
});
67+
}).wait();
68+
69+
// Copy the result back to host
70+
q.memcpy(output.data(), d_output, output.size() * sizeof(float)).wait();
71+
72+
// Free device memory
73+
free(d_data, q);
74+
free(d_output, q);
75+
76+
// Check the results
77+
bool success = true;
78+
for (int i = 0; i < num_blocks; ++i) {
79+
sycl::half scale = 0.5f;
80+
for (int j = 0; j < ele_per_blk; ++j) {
81+
float expected = scale * (j);
82+
if (std::fabs(output[i * ele_per_blk + j] - expected) > 1e-5) {
83+
success = false;
84+
std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << expected << ", got " << output[i * ele_per_blk + j] << std::endl;
85+
}
86+
}
87+
}
88+
89+
if (success) {
90+
std::cout << "Test passed!" << std::endl;
91+
} else {
92+
std::cout << "Test failed!" << std::endl;
93+
}
94+
95+
return 0;
96+
}

0 commit comments

Comments
 (0)