@@ -36,14 +36,10 @@ void test(queue &Q, int M, int N, int K)
36
36
auto A = malloc_device<T>(lda * K, Q);
37
37
auto B = malloc_device<T>(ldb * N, Q);
38
38
auto C = malloc_device<T>(ldc * N, Q);
39
+ auto flag = malloc_shared<int >(1 , Q);
39
40
40
- /* Fill A/B with random data */
41
41
constexpr int rd_size = 1048576 ;
42
- auto random_data = malloc_host<T>(rd_size, Q);
43
- generate_random_data (rd_size, random_data);
44
-
45
- replicate_data (Q, A, lda * K, random_data, rd_size);
46
- replicate_data (Q, B, ldb * N, random_data, rd_size);
42
+ auto host_data = malloc_host<T>(rd_size, Q);
47
43
48
44
/* Measure time for a given number of GEMM calls */
49
45
auto time_gemms = [=, &Q](int runs) -> double {
@@ -57,7 +53,49 @@ void test(queue &Q, int M, int N, int K)
57
53
return duration<double >(end - start).count ();
58
54
};
59
55
60
- /* Do a warmup call to initialize MKL and ensure kernels are JIT'ed if needed */
56
+ /* Fill A/B with all ones to verify correctness */
57
+ generate_ones (rd_size, host_data);
58
+ replicate_data (Q, A, lda * K, host_data, rd_size);
59
+ replicate_data (Q, B, ldb * N, host_data, rd_size);
60
+
61
+ /* Verify that the leading entries of C are correct */
62
+ std::cout << " -> Verification...\n " ;
63
+ (void ) time_gemms (1 );
64
+ size_t elems = std::min (ldc * N, rd_size);
65
+ Q.copy (C, host_data, elems);
66
+ flag[0 ] = 0 ;
67
+ int linear_id = 0 ;
68
+ for (size_t j = 0 ; j < N; j++) {
69
+ for (size_t i = 0 ; i < M; i++) {
70
+ linear_id = j*ldc + i;
71
+ if (linear_id >= elems) break ;
72
+ if (host_data[linear_id] != T (K)) {
73
+ flag[0 ] = 1 ;
74
+ }
75
+ }
76
+ if (linear_id >= elems) break ;
77
+ }
78
+ /*
79
+ for (size_t i = 0; i < elems; i++) {
80
+ int count = 0;
81
+ if (host_data[i] != T(K)) {
82
+ flag[0] = 1;
83
+ if (count < 10) {
84
+ sycl::ext::oneapi::experimental::printf("error elem %d expect %f got %f\n",
85
+ i, T(K), host_data[i]);
86
+ count++;
87
+ }
88
+ }
89
+ }
90
+ */
91
+ std::cout << " verification " << (flag[0 ] == 0 ? " passes." : " FAILS!" ) << std::endl;
92
+
93
+ /* Fill A/B with random data */
94
+ generate_random_data (rd_size, host_data);
95
+ replicate_data (Q, A, lda * K, host_data, rd_size);
96
+ replicate_data (Q, B, ldb * N, host_data, rd_size);
97
+
98
+ /* Do a warmup call with random data to initialize MKL and ensure kernels are JIT'ed if needed */
61
99
std::cout << " -> Warmup...\n " ;
62
100
(void ) time_gemms (1 );
63
101
@@ -93,7 +131,8 @@ void test(queue &Q, int M, int N, int K)
93
131
free (A, Q);
94
132
free (B, Q);
95
133
free (C, Q);
96
- free (random_data, Q);
134
+ free (flag, Q);
135
+ free (host_data, Q);
97
136
}
98
137
99
138
void usage (const char *pname)
0 commit comments