#include <dpct/dnnl_utils.hpp>
#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#include <cstdio>
#include <cstdlib>
#include <dpct/blas_utils.hpp>

#include <iostream>
#include <stdexcept>
#include <vector>
#include <cmath>

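// dpct-migrated smoke test (note the DPCT* comments left by the migration
// tool): cublasCheck exercises the cuBLAS iamax path via oneMKL, and
// cudnnCheck exercises the cuDNN batch-normalization inference path via
// the dpct oneDNN wrappers.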
using data_type = double;

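// Element-wise comparison against a reference vector; reports the first
// mismatch that exceeds the given tolerance.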
template <typename T>
bool check(std::vector<T> &expect, std::vector<T> &actual, int num,
           float precision) {
  for (int i = 0; i < num; i++) {
    if (std::abs(expect[i] - actual[i]) > precision) {
      std::cout << "test failed" << std::endl;
      std::cout << "expect:" << expect[i] << std::endl;
      std::cout << "actual:" << actual[i] << std::endl;
      return false;
    }
  }
  return true;
}

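// Runs oneMKL iamax (the migrated cublasIdamax) on {1, 2, 3, 4} and checks
// that the returned 1-based index of the largest |element| is 4.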
bool cublasCheck() {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.in_order_queue();
  dpct::queue_ptr handle = NULL;
  dpct::queue_ptr stream = &q_ct1;

  const std::vector<data_type> A = {1.0, 2.0, 3.0, 4.0};
  const int incx = 1;

  int result = 0;

  data_type *d_A = nullptr;

  handle = &q_ct1;

  /*
  DPCT1025:0: The SYCL queue is created ignoring the flag and priority options.
  */
  stream = dev_ct1.create_queue();
  handle = stream;

  d_A = (data_type *)sycl::malloc_device(sizeof(data_type) * A.size(), q_ct1);

  stream->memcpy(d_A, A.data(), sizeof(data_type) * A.size());

  int64_t *res_temp_ptr_ct1 = sycl::malloc_shared<int64_t>(1, q_ct1);
  oneapi::mkl::blas::column_major::iamax(*handle, A.size(), d_A, incx,
                                         res_temp_ptr_ct1,
                                         oneapi::mkl::index_base::one)
      .wait();
  int res_temp_host_ct2 = (int)*res_temp_ptr_ct1;
  dpct::dpct_memcpy(&result, &res_temp_host_ct2, sizeof(int));
  sycl::free(res_temp_ptr_ct1, q_ct1);

  stream->wait();

  sycl::free(d_A, q_ct1);

  handle = nullptr;

  dev_ct1.destroy_queue(stream);

  dev_ct1.reset();
  // 4.0 has the largest magnitude, at 1-based index 4.
  return result == 4;
}

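// Naive direct 2D convolution over an NCHW batch (kernel flipped, zero
// padding at the borders). This kernel is part of the migrated test source
// but is never launched from main().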
template <typename T>
void conv2d(int batch, int color, int rows, int cols, int kCols, int kRows,
            T *matrix, float *kernel, T *result,
            const sycl::nd_item<3> &item_ct1) {
  int tid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
            item_ct1.get_local_id(2);
  int kCenterX = kCols / 2;
  int kCenterY = kRows / 2;

  for (int b = 0; b < batch; b++) {
    for (int c = 0; c < color; c++) {
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          for (int m = 0; m < kRows; m++) {
            int mm = kRows - 1 - m; // row index of the flipped kernel
            for (int n = 0; n < kCols; n++) {
              int nn = kCols - 1 - n; // column index of the flipped kernel

              int ii = i + (kCenterY - mm);
              int jj = j + (kCenterX - nn);

              if (ii >= 0 && ii < rows && jj >= 0 && jj < cols) {
                // NCHW linear indexing for both the input and the output.
                result[b * color * rows * cols + c * rows * cols + i * cols +
                       j] +=
                    matrix[b * color * rows * cols + c * rows * cols +
                           ii * cols + jj] *
                    kernel[mm * kCols + nn];
                result[tid] = result[b * color * rows * cols + c * rows * cols +
                                     i * cols + j];
              }
            }
          }
        }
      }
    }
  }
}

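// Exercises the migrated cuDNN path: batch-normalization forward inference
// (per-activation mode) through the dpct oneDNN wrappers, compared against
// precomputed reference values.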
bool cudnnCheck() {
  dpct::device_ext &dev_ct1 = dpct::get_current_device();
  sycl::queue &q_ct1 = dev_ct1.in_order_queue();
  dpct::dnnl::engine_ext handle;
  dpct::dnnl::memory_desc_ext dataTensor, outTensor, scalebiasTensor;
  handle.create_engine();

  /*
  DPCT1026:1: The call to cudnnCreateTensorDescriptor was removed because this
  call is redundant in SYCL.
  */
  /*
  DPCT1026:2: The call to cudnnCreateTensorDescriptor was removed because this
  call is redundant in SYCL.
  */
  /*
  DPCT1026:3: The call to cudnnCreateTensorDescriptor was removed because this
  call is redundant in SYCL.
  */

  int in = 2, ic = 4, ih = 5, iw = 5;
  int on = 2, oc = 4, oh = 5, ow = 5;
  int sbn = 1, sbc = 4, sbh = 5, sbw = 5;
  int ele_num = in * ic * ih * iw;
  int oele_num = on * oc * oh * ow;
  int sele_num = sbn * sbc * sbh * sbw;
  dataTensor.set(dpct::dnnl::memory_format_tag::nchw,
                 dpct::library_data_t::real_float, in, ic, ih, iw);
  outTensor.set(dpct::dnnl::memory_format_tag::nchw,
                dpct::library_data_t::real_float, on, oc, oh, ow);
  scalebiasTensor.set(dpct::dnnl::memory_format_tag::nchw,
                      dpct::library_data_t::real_float, sbn, sbc, sbh, sbw);

  int save = 1;
  float *data, *out, *scale, *bias, *rmean, *rvar, *smean, *svar, *z;
  std::vector<float> host_data(ele_num, 1.0f);
  std::vector<float> host_z(oele_num, 1.0f);
  std::vector<float> host_out(oele_num, 0.0f);
  std::vector<float> host_scale(sele_num, 1.0f);
  std::vector<float> host_bias(sele_num, 0.0f);
  std::vector<float> host_rmean(sele_num, 0.0f);
  std::vector<float> host_rvar(sele_num, 0.0f);
  std::vector<float> host_smean(save * sele_num, 0.0f);
  std::vector<float> host_svar(save * sele_num, 0.0f);

  for (int i = 0; i < ele_num; i++) {
    host_data[i] = i + 4.f;
    host_out[i] = 1.f;
    host_z[i] = 10;
  }
  for (int i = 0; i < sele_num; i++) {
    host_scale[i] = i;
    host_bias[i] = i;
    host_rmean[i] = i;
    host_rvar[i] = i;
    host_smean[i] = i;
    host_svar[i] = i;
  }

  data = sycl::malloc_device<float>(ele_num, q_ct1);
  z = sycl::malloc_device<float>(oele_num, q_ct1);
  out = sycl::malloc_device<float>(oele_num, q_ct1);
  scale = sycl::malloc_device<float>(sele_num, q_ct1);
  bias = sycl::malloc_device<float>(sele_num, q_ct1);
  rmean = sycl::malloc_device<float>(sele_num, q_ct1);
  rvar = sycl::malloc_device<float>(sele_num, q_ct1);
  smean = sycl::malloc_device<float>(save * sele_num, q_ct1);
  svar = sycl::malloc_device<float>(save * sele_num, q_ct1);

  q_ct1.memcpy(data, host_data.data(), sizeof(float) * ele_num);
  q_ct1.memcpy(z, host_z.data(), sizeof(float) * oele_num);
  q_ct1.memcpy(out, host_out.data(), sizeof(float) * oele_num);
  q_ct1.memcpy(scale, host_scale.data(), sizeof(float) * sele_num);
  q_ct1.memcpy(bias, host_bias.data(), sizeof(float) * sele_num);
  q_ct1.memcpy(rmean, host_rmean.data(), sizeof(float) * sele_num);
  q_ct1.memcpy(rvar, host_rvar.data(), sizeof(float) * sele_num);
  q_ct1.memcpy(smean, host_smean.data(), sizeof(float) * save * sele_num);
  q_ct1.memcpy(svar, host_svar.data(), sizeof(float) * save * sele_num).wait();

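  // Inference-mode batch norm computes, per activation k of each sample:
  //   out = alpha * (scale[k] * (x - smean[k]) / sqrt(svar[k] + eps) + bias[k])
  //         + beta * out_prev
  // with out_prev = 1.0f from the host_out initialization above.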
  float alpha = 2.5f, beta = 1.5f, eps = 1.f;
  double factor = 0.5f; // unused in the inference-only path below
  dpct::dnnl::activation_desc ActivationDesc;
  /*
  DPCT1026:4: The call to cudnnCreateActivationDescriptor was removed because
  this call is redundant in SYCL.
  */
  /*
  DPCT1007:5: Migration of Nan numbers propagation option is not supported.
  */
  ActivationDesc.set(dnnl::algorithm::eltwise_relu_use_dst_for_bwd, 0.0f);

  auto status =
      DPCT_CHECK_ERROR(handle.async_batch_normalization_forward_inference(
          dpct::dnnl::batch_normalization_mode::per_activation,
          dpct::dnnl::batch_normalization_ops::none, ActivationDesc, eps, alpha,
          dataTensor, data, beta, outTensor, out, dataTensor, z,
          scalebiasTensor, scale, bias, scalebiasTensor, smean, svar));

  dev_ct1.queues_wait_and_throw();
  q_ct1.memcpy(host_out.data(), out, sizeof(float) * oele_num).wait();
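  // Reference values precomputed from the formula above with
  // x = k + 4 (first sample) and x = k + 104 (second sample), and
  // smean[k] = svar[k] = scale[k] = bias[k] = k.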
  std::vector<float> expect = {
      1.5,     11.0711, 18.047,  24,      29.3885, 34.4124, 39.1779,
      43.7487, 48.1667, 52.4605, 56.6511, 60.7543, 64.782,  68.744,
      72.6478, 76.5,    80.3057, 84.0694, 87.7948, 91.4853, 95.1436,
      98.7721, 102.373, 105.949, 109.5,

      113.029, 116.537, 120.025, 123.495, 126.947, 130.382, 133.801,
      137.205, 140.595, 143.97,  147.333, 150.684, 154.022, 157.349,
      160.664, 163.969, 167.264, 170.549, 173.825, 177.091, 180.349,
      183.598, 186.839, 190.071, 193.296,

      196.514, 199.724, 202.927, 206.124, 209.314, 212.497, 215.674,
      218.845, 222.01,  225.169, 228.322, 231.47,  234.613, 237.75,
      240.882, 244.009, 247.132, 250.249, 253.362, 256.471, 259.575,
      262.674, 265.77,  268.861, 271.948,

      275.031, 278.11,  281.185, 284.257, 287.325, 290.389, 293.45,
      296.507, 299.56,  302.611, 305.658, 308.702, 311.742, 314.78,
      317.814, 320.846, 323.874, 326.9,   329.922, 332.942, 335.959,
      338.973, 341.985, 344.994, 348,

      1.5,     187.848, 306.722, 399,     476.602, 544.723, 606.125,
      662.467, 714.833, 763.973, 810.43,  854.611, 896.832, 937.343,
      976.344, 1014,    1050.45, 1085.8,  1120.17, 1153.62, 1186.23,
      1218.08, 1249.2,  1279.66, 1309.5,

      1338.75, 1367.46, 1395.66, 1423.36, 1450.61, 1477.42, 1503.82,
      1529.83, 1555.46, 1580.73, 1605.67, 1630.27, 1654.57, 1678.57,
      1702.27, 1725.71, 1748.87, 1771.78, 1794.45, 1816.87, 1839.07,
      1861.05, 1882.81, 1904.36, 1925.71,

      1946.86, 1967.83, 1988.61, 2009.22, 2029.65, 2049.92, 2070.02,
      2089.96, 2109.75, 2129.39, 2148.88, 2168.22, 2187.43, 2206.5,
      2225.44, 2244.25, 2262.93, 2281.49, 2299.92, 2318.24, 2336.44,
      2354.53, 2372.51, 2390.38, 2408.14,

      2425.8,  2443.36, 2460.82, 2478.18, 2495.44, 2512.61, 2529.69,
      2546.67, 2563.57, 2580.38, 2597.1,  2613.74, 2630.3,  2646.78,
      2663.17, 2679.49, 2695.73, 2711.89, 2727.98, 2743.99, 2759.93,
      2775.8,  2791.6,  2807.34, 2823,
  };
  /*
  DPCT1026:6: The call to cudnnDestroy was removed because this call is
  redundant in SYCL.
  */
  // Release all device allocations (the original freed only data and out).
  sycl::free(data, q_ct1);
  sycl::free(z, q_ct1);
  sycl::free(out, q_ct1);
  sycl::free(scale, q_ct1);
  sycl::free(bias, q_ct1);
  sycl::free(rmean, q_ct1);
  sycl::free(rvar, q_ct1);
  sycl::free(smean, q_ct1);
  sycl::free(svar, q_ct1);
  return check(expect, host_out, expect.size(), 1e-1);
}

int main(int argc, char *argv[]) {
  if (cublasCheck() && cudnnCheck()) {
    printf("Both cases passed\n");
    return 0;
  }
  printf("Tests failed\n");
  return -1;
}