Skip to content

Commit b7055f0

Browse files
committed
tabs->spaces
1 parent f32282c commit b7055f0

File tree

1 file changed

+18
-18
lines changed

1 file changed

+18
-18
lines changed

posts/mixed-precision/haxpy.cu

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -54,10 +54,10 @@ void haxpy(int n, half a, const half *x, half *y)
5454

5555
#if __CUDA_ARCH__ >= 530
5656
int n2 = n/2;
57-
half2 *x2 = (half2*)x, *y2 = (half2*)y;
57+
half2 *x2 = (half2*)x, *y2 = (half2*)y;
5858

59-
for (int i = start; i < n2; i+= stride)
60-
y2[i] = __hfma2(__halves2half2(a, a), x2[i], y2[i]);
59+
for (int i = start; i < n2; i+= stride)
60+
y2[i] = __hfma2(__halves2half2(a, a), x2[i], y2[i]);
6161

6262
// first thread handles singleton for odd arrays
6363
if (start == 0 && (n%2))
@@ -66,33 +66,33 @@ void haxpy(int n, half a, const half *x, half *y)
6666
#else
6767
for (int i = start; i < n; i+= stride) {
6868
y[i] = __float2half(__half2float(a) * __half2float(x[i])
69-
+ __half2float(y[i]));
69+
+ __half2float(y[i]));
7070
}
7171
#endif
7272
}
7373

7474
int main(void) {
75-
const int n = 100;
75+
const int n = 100;
7676

77-
const half a = approx_float_to_half(2.0f);
77+
const half a = approx_float_to_half(2.0f);
7878

79-
half *x, *y;
80-
checkCuda(cudaMallocManaged(&x, n * sizeof(half)));
81-
checkCuda(cudaMallocManaged(&y, n * sizeof(half)));
82-
83-
for (int i = 0; i < n; i++) {
84-
x[i] = approx_float_to_half(1.0f);
85-
y[i] = approx_float_to_half((float)i);
86-
}
79+
half *x, *y;
80+
checkCuda(cudaMallocManaged(&x, n * sizeof(half)));
81+
checkCuda(cudaMallocManaged(&y, n * sizeof(half)));
82+
83+
for (int i = 0; i < n; i++) {
84+
x[i] = approx_float_to_half(1.0f);
85+
y[i] = approx_float_to_half((float)i);
86+
}
8787

88-
const int blockSize = 256;
89-
const int nBlocks = (n + blockSize - 1) / blockSize;
88+
const int blockSize = 256;
89+
const int nBlocks = (n + blockSize - 1) / blockSize;
9090

91-
haxpy<<<nBlocks, blockSize>>>(n, a, x, y);
91+
haxpy<<<nBlocks, blockSize>>>(n, a, x, y);
9292

9393
// must wait for kernel to finish before CPU accesses
9494
checkCuda(cudaDeviceSynchronize());
95-
95+
9696
for (int i = 0; i < n; i++)
9797
printf("%f\n", half_to_float(y[i]));
9898

0 commit comments

Comments (0)