Skip to content

Commit 91b8bd3

Browse files
committed
Single kernel tests need debug
1 parent 3dca94f commit 91b8bd3

File tree

4 files changed

+589
-0
lines changed

4 files changed

+589
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#include <sycl/sycl.hpp>
2+
#include <iostream>
3+
#include <vector>
4+
#include <cmath>
5+
6+
using namespace sycl;
7+
8+
void dequantize_q2_k_bf16_kernel(const int8_t *data,
9+
sycl::ext::oneapi::bfloat16 *output,
10+
const int blk_size, const int ele_per_blk,
11+
const int num_blocks,
12+
const sycl::nd_item<3> &item_ct1) {
13+
long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
14+
item_ct1.get_local_id(2);
15+
for (long long block_id = global_idx; block_id < num_blocks;
16+
block_id +=
17+
item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
18+
sycl::ext::oneapi::bfloat16 *__restrict__ output_blk =
19+
(sycl::ext::oneapi::bfloat16 *)(output + block_id * ele_per_blk);
20+
21+
const float d =
22+
sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
23+
data + block_id * blk_size + 80)))
24+
.convert<float, sycl::rounding_mode::automatic>()[0];
25+
const float min =
26+
sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
27+
data + block_id * blk_size + 82)))
28+
.convert<float, sycl::rounding_mode::automatic>()[0];
29+
30+
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
31+
32+
int is = 0;
33+
float dl, ml;
34+
35+
for (int n = 0; n < 256; n += 128) {
36+
int shift = 0;
37+
for (int j = 0; j < 4; ++j) {
38+
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
39+
uint8_t sc = *scales;
40+
dl = d * (sc & 0xF); ml = min * (sc >> 4);
41+
for (int l = 0; l < 16; ++l) *output_blk++ =
42+
sycl::ext::oneapi::bfloat16(
43+
dl * ((int8_t)((q[l] >> shift) & 3)) - ml);
44+
45+
scales = (uint8_t*)(data + block_id * blk_size + (is++));
46+
sc = *scales;
47+
48+
dl = d * (sc & 0xF); ml = min * (sc >> 4);
49+
for (int l = 0; l < 16; ++l) *output_blk++ =
50+
sycl::ext::oneapi::bfloat16(
51+
dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml);
52+
53+
shift += 2;
54+
}
55+
q += 32;
56+
}
57+
}
58+
}
59+
60+
int main() {
61+
// Define the parameters
62+
const int blk_size = 128 + 16 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
63+
const int ele_per_blk = 256;
64+
const int num_blocks = 2;
65+
66+
// Initialize input data
67+
std::vector<int8_t> data(blk_size * num_blocks);
68+
std::vector<sycl::ext::oneapi::bfloat16> output(ele_per_blk * num_blocks, 0.0f);
69+
70+
// Fill the data with some values
71+
for (int i = 0; i < num_blocks; ++i) {
72+
sycl::half d = 0.5f;
73+
sycl::half min = 0.1f;
74+
std::memcpy(data.data() + i * blk_size + 80, &d, sizeof(sycl::half));
75+
std::memcpy(data.data() + i * blk_size + 82, &min, sizeof(sycl::half));
76+
for (int j = 0; j < 16; ++j) {
77+
data[i * blk_size + j] = j;
78+
}
79+
for (int j = 16; j < 128 + 16; ++j) {
80+
data[i * blk_size + j] = (j - 16) % 256;
81+
}
82+
}
83+
84+
// Create a SYCL queue
85+
queue q;
86+
87+
// Allocate device memory
88+
int8_t* d_data = malloc_device<int8_t>(data.size(), q);
89+
sycl::ext::oneapi::bfloat16* d_output = malloc_device<sycl::ext::oneapi::bfloat16>(output.size(), q);
90+
91+
// Copy data to device
92+
q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
93+
q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
94+
95+
// Define the kernel execution configuration
96+
range<3> global_work_size(1, 1, num_blocks);
97+
range<3> local_work_size(1, 1, 1);
98+
99+
// Launch the kernel
100+
q.submit([&](handler& h) {
101+
h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
102+
dequantize_q2_k_bf16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
103+
});
104+
}).wait();
105+
106+
// Copy the result back to host
107+
q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::ext::oneapi::bfloat16)).wait();
108+
109+
// Free device memory
110+
free(d_data, q);
111+
free(d_output, q);
112+
113+
// Check the results
114+
bool success = true;
115+
for (int i = 0; i < num_blocks; ++i) {
116+
sycl::half d = 0.5f;
117+
sycl::half min = 0.1f;
118+
for (int j = 0; j < ele_per_blk; ++j) {
119+
// Calculate expected value
120+
int block_offset = i * blk_size;
121+
int q_offset = block_offset + 16 + (j / 128) * 32;
122+
int scale_offset = block_offset + (j / 64) * 2;
123+
uint8_t sc = data[scale_offset];
124+
float dl = d * (sc & 0xF);
125+
float ml = min * (sc >> 4);
126+
int q_idx = (j % 64) / 16;
127+
int shift = (j % 16) * 2;
128+
int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
129+
float expected = dl * q_val - ml;
130+
sycl::ext::oneapi::bfloat16 expected_bf16 = sycl::ext::oneapi::bfloat16(expected);
131+
132+
if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected_bf16)) > 1e-3) {
133+
success = false;
134+
std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected_bf16) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
135+
}
136+
}
137+
}
138+
139+
if (success) {
140+
std::cout << "Test passed!" << std::endl;
141+
} else {
142+
std::cout << "Test failed!" << std::endl;
143+
}
144+
145+
return 0;
146+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#include <sycl/sycl.hpp>
2+
#include <iostream>
3+
#include <vector>
4+
#include <cmath>
5+
6+
using namespace sycl;
7+
8+
void dequantize_q2_k_fp16_kernel(const int8_t *data, sycl::half *output,
9+
const int blk_size, const int ele_per_blk,
10+
const int num_blocks,
11+
const sycl::nd_item<3> &item_ct1) {
12+
long long global_idx = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
13+
item_ct1.get_local_id(2);
14+
for (long long block_id = global_idx; block_id < num_blocks;
15+
block_id +=
16+
item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
17+
sycl::half *__restrict__ output_blk =
18+
(sycl::half *)(output + block_id * ele_per_blk);
19+
20+
const float d =
21+
sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
22+
data + block_id * blk_size + 80)))
23+
.convert<float, sycl::rounding_mode::automatic>()[0];
24+
const float min =
25+
sycl::vec<sycl::half, 1>(*(reinterpret_cast<const sycl::half *>(
26+
data + block_id * blk_size + 82)))
27+
.convert<float, sycl::rounding_mode::automatic>()[0];
28+
29+
const uint8_t * __restrict__ q = (uint8_t*)(data + block_id * blk_size + 16);
30+
31+
int is = 0;
32+
float dl, ml;
33+
34+
for (int n = 0; n < 256; n += 128) {
35+
int shift = 0;
36+
for (int j = 0; j < 4; ++j) {
37+
uint8_t* scales = (uint8_t*)(data + block_id * blk_size + (is++));
38+
uint8_t sc = *scales;
39+
dl = d * (sc & 0xF); ml = min * (sc >> 4);
40+
for (int l = 0; l < 16; ++l) *output_blk++ =
41+
sycl::vec<float, 1>(dl * ((int8_t)((q[l] >> shift) & 3)) -
42+
ml)
43+
.convert<sycl::half,
44+
sycl::rounding_mode::automatic>()[0];
45+
46+
scales = (uint8_t*)(data + block_id * blk_size + (is++));
47+
sc = *scales;
48+
49+
dl = d * (sc & 0xF); ml = min * (sc >> 4);
50+
for (int l = 0; l < 16; ++l) *output_blk++ =
51+
sycl::vec<float, 1>(
52+
dl * ((int8_t)((q[l + 16] >> shift) & 3)) - ml)
53+
.convert<sycl::half,
54+
sycl::rounding_mode::automatic>()[0];
55+
56+
shift += 2;
57+
}
58+
q += 32;
59+
}
60+
}
61+
}
62+
63+
int main() {
64+
// Define the parameters
65+
const int blk_size = 128 + 16 + 2 * sizeof(sycl::half); // Adjusted to match the kernel's data layout
66+
const int ele_per_blk = 256;
67+
const int num_blocks = 2;
68+
69+
// Initialize input data
70+
std::vector<int8_t> data(blk_size * num_blocks);
71+
std::vector<sycl::half> output(ele_per_blk * num_blocks, 0.0f);
72+
73+
// Fill the data with some values
74+
for (int i = 0; i < num_blocks; ++i) {
75+
sycl::half d = 0.5f;
76+
sycl::half min = 0.1f;
77+
std::memcpy(data.data() + i * blk_size + 80, &d, sizeof(sycl::half));
78+
std::memcpy(data.data() + i * blk_size + 82, &min, sizeof(sycl::half));
79+
for (int j = 0; j < 16; ++j) {
80+
data[i * blk_size + j] = j;
81+
}
82+
for (int j = 16; j < 128 + 16; ++j) {
83+
data[i * blk_size + j] = (j - 16) % 256;
84+
}
85+
}
86+
87+
// Create a SYCL queue
88+
queue q;
89+
90+
// Allocate device memory
91+
int8_t* d_data = malloc_device<int8_t>(data.size(), q);
92+
sycl::half* d_output = malloc_device<sycl::half>(output.size(), q);
93+
94+
// Copy data to device
95+
q.memcpy(d_data, data.data(), data.size() * sizeof(int8_t)).wait();
96+
q.memcpy(d_output, output.data(), output.size() * sizeof(sycl::half)).wait();
97+
98+
// Define the kernel execution configuration
99+
range<3> global_work_size(1, 1, num_blocks);
100+
range<3> local_work_size(1, 1, 1);
101+
102+
// Launch the kernel
103+
q.submit([&](handler& h) {
104+
h.parallel_for(nd_range<3>(global_work_size, local_work_size), [=](nd_item<3> item_ct1) {
105+
dequantize_q2_k_fp16_kernel(d_data, d_output, blk_size, ele_per_blk, num_blocks, item_ct1);
106+
});
107+
}).wait();
108+
109+
// Copy the result back to host
110+
q.memcpy(output.data(), d_output, output.size() * sizeof(sycl::half)).wait();
111+
112+
// Free device memory
113+
free(d_data, q);
114+
free(d_output, q);
115+
116+
// Check the results
117+
bool success = true;
118+
for (int i = 0; i < num_blocks; ++i) {
119+
sycl::half d = 0.5f;
120+
sycl::half min = 0.1f;
121+
for (int j = 0; j < ele_per_blk; ++j) {
122+
// Calculate expected value
123+
int block_offset = i * blk_size;
124+
int q_offset = block_offset + 16 + (j / 128) * 32;
125+
int scale_offset = block_offset + (j / 64) * 2;
126+
uint8_t sc = data[scale_offset];
127+
float dl = d * (sc & 0xF);
128+
float ml = min * (sc >> 4);
129+
int q_idx = (j % 64) / 16;
130+
int shift = (j % 16) * 2;
131+
int8_t q_val = (data[q_offset + q_idx] >> shift) & 3;
132+
float expected = dl * q_val - ml;
133+
sycl::half expected_half = sycl::vec<float, 1>(expected).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
134+
135+
if (std::fabs(static_cast<float>(output[i * ele_per_blk + j]) - static_cast<float>(expected_half)) > 1e-3) {
136+
success = false;
137+
std::cout << "Mismatch at block " << i << ", element " << j << ": expected " << static_cast<float>(expected_half) << ", got " << static_cast<float>(output[i * ele_per_blk + j]) << std::endl;
138+
}
139+
}
140+
}
141+
142+
if (success) {
143+
std::cout << "Test passed!" << std::endl;
144+
} else {
145+
std::cout << "Test failed!" << std::endl;
146+
}
147+
148+
return 0;
149+
}

0 commit comments

Comments
 (0)