Skip to content

Commit 0507de3

Browse files
committed
[libclc] Optimize clc_hypot
1 parent 9cf4987 commit 0507de3

File tree

2 files changed

+112
-84
lines changed

2 files changed

+112
-84
lines changed

libclc/clc/lib/generic/math/clc_hypot.cl

Lines changed: 4 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* THE SOFTWARE.
2121
*/
2222

23+
#include <clc/clc_convert.h>
2324
#include <clc/clcmacro.h>
2425
#include <clc/integer/clc_abs.h>
2526
#include <clc/internal/clc.h>
@@ -31,87 +32,6 @@
3132
#include <clc/relational/clc_isnan.h>
3233
#include <clc/shared/clc_clamp.h>
3334

34-
// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result
35-
// warrants it
36-
_CLC_DEF _CLC_OVERLOAD float __clc_hypot(float x, float y) {
37-
uint ux = __clc_as_uint(x);
38-
uint aux = ux & EXSIGNBIT_SP32;
39-
uint uy = __clc_as_uint(y);
40-
uint auy = uy & EXSIGNBIT_SP32;
41-
float retval;
42-
int c = aux > auy;
43-
ux = c ? aux : auy;
44-
uy = c ? auy : aux;
45-
46-
int xexp =
47-
__clc_clamp((int)(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
48-
float fx_exp = __clc_as_float((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
49-
float fi_exp = __clc_as_float((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
50-
float fx = __clc_as_float(ux) * fi_exp;
51-
float fy = __clc_as_float(uy) * fi_exp;
52-
retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
53-
54-
retval = (ux > PINFBITPATT_SP32 || uy == 0) ? __clc_as_float(ux) : retval;
55-
retval = (ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32)
56-
? __clc_as_float(PINFBITPATT_SP32)
57-
: retval;
58-
return retval;
59-
}
60-
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_hypot, float, float)
61-
62-
#ifdef cl_khr_fp64
63-
64-
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
65-
66-
_CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y) {
67-
ulong ux = __clc_as_ulong(x) & ~SIGNBIT_DP64;
68-
int xexp = ux >> EXPSHIFTBITS_DP64;
69-
x = __clc_as_double(ux);
70-
71-
ulong uy = __clc_as_ulong(y) & ~SIGNBIT_DP64;
72-
int yexp = uy >> EXPSHIFTBITS_DP64;
73-
y = __clc_as_double(uy);
74-
75-
int c = xexp > EXPBIAS_DP64 + 500 | yexp > EXPBIAS_DP64 + 500;
76-
double preadjust = c ? 0x1.0p-600 : 1.0;
77-
double postadjust = c ? 0x1.0p+600 : 1.0;
78-
79-
c = xexp < EXPBIAS_DP64 - 500 | yexp < EXPBIAS_DP64 - 500;
80-
preadjust = c ? 0x1.0p+600 : preadjust;
81-
postadjust = c ? 0x1.0p-600 : postadjust;
82-
83-
double ax = x * preadjust;
84-
double ay = y * preadjust;
85-
86-
// The post adjust may overflow, but this can't be avoided in any case
87-
double r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
88-
89-
// If the difference in exponents between x and y is large
90-
double s = x + y;
91-
c = __clc_abs(xexp - yexp) > MANTLENGTH_DP64 + 1;
92-
r = c ? s : r;
93-
94-
// Check for NaN
95-
c = __clc_isnan(x) || __clc_isnan(y);
96-
r = c ? __clc_as_double(QNANBITPATT_DP64) : r;
97-
98-
// If either is Inf, we must return Inf
99-
c = x == __clc_as_double(PINFBITPATT_DP64) ||
100-
y == __clc_as_double(PINFBITPATT_DP64);
101-
r = c ? __clc_as_double(PINFBITPATT_DP64) : r;
102-
103-
return r;
104-
}
105-
106-
_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double,
107-
double)
108-
109-
#endif
110-
111-
#ifdef cl_khr_fp16
112-
113-
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
114-
115-
_CLC_DEFINE_BINARY_BUILTIN_FP16(__clc_hypot);
116-
117-
#endif
35+
#define __CLC_BODY <clc_hypot.inc>
36+
#include <clc/math/gentype.inc>
37+
#undef __CLC_BODY
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright (c) 2014 Advanced Micro Devices, Inc.
3+
*
4+
* Permission is hereby granted, free of charge, to any person obtaining a copy
5+
* of this software and associated documentation files (the "Software"), to deal
6+
* in the Software without restriction, including without limitation the rights
7+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
* copies of the Software, and to permit persons to whom the Software is
9+
* furnished to do so, subject to the following conditions:
10+
*
11+
* The above copyright notice and this permission notice shall be included in
12+
* all copies or substantial portions of the Software.
13+
*
14+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20+
* THE SOFTWARE.
21+
*/
22+
23+
// Returns sqrt(x*x + y*y) with no overflow or underflow unless the result
24+
// warrants it
25+
26+
#if __CLC_FPSIZE == 32
27+
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_hypot(__CLC_GENTYPE x,
28+
__CLC_GENTYPE y) {
29+
__CLC_UINTN ux = __CLC_AS_UINTN(x);
30+
__CLC_UINTN aux = ux & EXSIGNBIT_SP32;
31+
__CLC_UINTN uy = __CLC_AS_UINTN(y);
32+
__CLC_UINTN auy = uy & EXSIGNBIT_SP32;
33+
__CLC_INTN c = aux > auy;
34+
ux = c ? aux : auy;
35+
uy = c ? auy : aux;
36+
37+
__CLC_INTN xexp = __clc_clamp(
38+
__CLC_AS_INTN(ux >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32, -126, 126);
39+
__CLC_GENTYPE fx_exp =
40+
__CLC_AS_GENTYPE((xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
41+
__CLC_GENTYPE fi_exp =
42+
__CLC_AS_GENTYPE((-xexp + EXPBIAS_SP32) << EXPSHIFTBITS_SP32);
43+
__CLC_GENTYPE fx = __CLC_AS_GENTYPE(ux) * fi_exp;
44+
__CLC_GENTYPE fy = __CLC_AS_GENTYPE(uy) * fi_exp;
45+
46+
__CLC_GENTYPE retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;
47+
48+
retval = (ux > PINFBITPATT_SP32 || uy == 0) ? __CLC_AS_GENTYPE(ux) : retval;
49+
retval = (ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32)
50+
? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32)
51+
: retval;
52+
return retval;
53+
}
54+
55+
#elif __CLC_FPSIZE == 64
56+
57+
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_hypot(__CLC_GENTYPE x,
58+
__CLC_GENTYPE y) {
59+
__CLC_ULONGN ux = __CLC_AS_ULONGN(x) & ~SIGNBIT_DP64;
60+
__CLC_INTN xexp = __CLC_CONVERT_INTN(ux >> EXPSHIFTBITS_DP64);
61+
x = __CLC_AS_GENTYPE(ux);
62+
63+
__CLC_ULONGN uy = __CLC_AS_ULONGN(y) & ~SIGNBIT_DP64;
64+
__CLC_INTN yexp = __CLC_CONVERT_INTN(uy >> EXPSHIFTBITS_DP64);
65+
y = __CLC_AS_GENTYPE(uy);
66+
67+
__CLC_LONGN c = __CLC_CONVERT_LONGN(xexp > EXPBIAS_DP64 + 500 ||
68+
yexp > EXPBIAS_DP64 + 500);
69+
__CLC_GENTYPE preadjust = c ? 0x1.0p-600 : 1.0;
70+
__CLC_GENTYPE postadjust = c ? 0x1.0p+600 : 1.0;
71+
72+
c = __CLC_CONVERT_LONGN(xexp < EXPBIAS_DP64 - 500 ||
73+
yexp < EXPBIAS_DP64 - 500);
74+
preadjust = c ? 0x1.0p+600 : preadjust;
75+
postadjust = c ? 0x1.0p-600 : postadjust;
76+
77+
__CLC_GENTYPE ax = x * preadjust;
78+
__CLC_GENTYPE ay = y * preadjust;
79+
80+
// The post adjust may overflow, but this can't be avoided in any case
81+
__CLC_GENTYPE r = __clc_sqrt(__clc_fma(ax, ax, ay * ay)) * postadjust;
82+
83+
// If the difference in exponents between x and y is large
84+
__CLC_GENTYPE s = x + y;
85+
c = __CLC_CONVERT_LONGN(__clc_abs(xexp - yexp) > MANTLENGTH_DP64 + 1);
86+
r = c ? s : r;
87+
88+
// Check for NaN
89+
c = __clc_isnan(x) || __clc_isnan(y);
90+
r = c ? __CLC_AS_GENTYPE((__CLC_ULONGN)QNANBITPATT_DP64) : r;
91+
92+
// If either is Inf, we must return Inf
93+
c = x == __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) ||
94+
y == __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64);
95+
r = c ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : r;
96+
97+
return r;
98+
}
99+
100+
#elif __CLC_FPSIZE == 16
101+
102+
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_hypot(__CLC_GENTYPE x,
103+
__CLC_GENTYPE y) {
104+
return __CLC_CONVERT_GENTYPE(
105+
__clc_hypot(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
106+
}
107+
108+
#endif

0 commit comments

Comments
 (0)