Skip to content

Commit 4c96358

Browse files
committed
reverted erf_helper
1 parent d1ff526 commit 4c96358

File tree

2 files changed

+15
-160
lines changed

2 files changed

+15
-160
lines changed

include/nbl/builtin/hlsl/concepts/core.hlsl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@ template<typename T>
2929
NBL_BOOL_CONCEPT UnsignedIntegral = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T>;
3030

3131
template<typename T>
32-
NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T>;
32+
NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
3333

3434
template<typename T>
3535
NBL_BOOL_CONCEPT Boolean = nbl::hlsl::is_same_v<T, bool> || (nbl::hlsl::is_vector_v<T> && nbl::hlsl::is_same_v<typename vector_traits<T>::scalar_type, bool>);
3636

3737
template <typename T>
38-
NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T>;
38+
NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
3939

4040
template<typename T>
4141
NBL_BOOL_CONCEPT IntegralScalar = nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
@@ -47,7 +47,7 @@ template<typename T>
4747
NBL_BOOL_CONCEPT UnsignedIntegralScalar = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
4848

4949
template<typename T>
50-
NBL_BOOL_CONCEPT FloatingPointScalar = nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>;
50+
NBL_BOOL_CONCEPT FloatingPointScalar = (nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>) || nbl::hlsl::is_same_v<T, float16_t>;
5151

5252
template<typename T>
5353
NBL_BOOL_CONCEPT BooleanScalar = concepts::Boolean<T> && nbl::hlsl::is_scalar_v<T>;

include/nbl/builtin/hlsl/tgmath/impl.hlsl

Lines changed: 12 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -189,163 +189,18 @@ struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScala
189189
{
190190
static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x)
191191
{
192-
// glibc implementation
193-
const float64_t tiny = NBL_FP64_LITERAL(1e-300),
194-
one = NBL_FP64_LITERAL(1.00000000000000000000e+00), /* 0x3FF00000, 0x00000000 */
195-
erx = NBL_FP64_LITERAL(8.45062911510467529297e-01); /* 0x3FEB0AC1, 0x60000000 */
196-
197-
// Coefficients for approximation to erf in [0,0.84375]
198-
const float64_t efx = NBL_FP64_LITERAL(1.28379167095512586316e-01); /* 0x3FC06EBA, 0x8214DB69 */
199-
const float64_t pp0 = NBL_FP64_LITERAL(1.28379167095512558561e-01); /* 0x3FC06EBA, 0x8214DB68 */
200-
const float64_t pp1 = NBL_FP64_LITERAL(-3.25042107247001499370e-01); /* 0xBFD4CD7D, 0x691CB913 */
201-
const float64_t pp2 = NBL_FP64_LITERAL(-2.84817495755985104766e-02); /* 0xBF9D2A51, 0xDBD7194F */
202-
const float64_t pp3 = NBL_FP64_LITERAL(-5.77027029648944159157e-03); /* 0xBF77A291, 0x236668E4 */
203-
const float64_t pp4 = NBL_FP64_LITERAL(-2.37630166566501626084e-05); /* 0xBEF8EAD6, 0x120016AC */
204-
const float64_t qq1 = NBL_FP64_LITERAL(3.97917223959155352819e-01); /* 0x3FD97779, 0xCDDADC09 */
205-
const float64_t qq2 = NBL_FP64_LITERAL(6.50222499887672944485e-02); /* 0x3FB0A54C, 0x5536CEBA */
206-
const float64_t qq3 = NBL_FP64_LITERAL(5.08130628187576562776e-03); /* 0x3F74D022, 0xC4D36B0F */
207-
const float64_t qq4 = NBL_FP64_LITERAL(1.32494738004321644526e-04); /* 0x3F215DC9, 0x221C1A10 */
208-
const float64_t qq5 = NBL_FP64_LITERAL(-3.96022827877536812320e-06); /* 0xBED09C43, 0x42A26120 */
209-
210-
//Coefficients for approximation to erf in [0.84375,1.25]
211-
const float64_t pa0 = NBL_FP64_LITERAL(-2.36211856075265944077e-03); /* 0xBF6359B8, 0xBEF77538 */
212-
const float64_t pa1 = NBL_FP64_LITERAL(4.14856118683748331666e-01); /* 0x3FDA8D00, 0xAD92B34D */
213-
const float64_t pa2 = NBL_FP64_LITERAL(-3.72207876035701323847e-01); /* 0xBFD7D240, 0xFBB8C3F1 */
214-
const float64_t pa3 = NBL_FP64_LITERAL(3.18346619901161753674e-01); /* 0x3FD45FCA, 0x805120E4 */
215-
const float64_t pa4 = NBL_FP64_LITERAL(-1.10894694282396677476e-01); /* 0xBFBC6398, 0x3D3E28EC */
216-
const float64_t pa5 = NBL_FP64_LITERAL(3.54783043256182359371e-02); /* 0x3FA22A36, 0x599795EB */
217-
const float64_t pa6 = NBL_FP64_LITERAL(-2.16637559486879084300e-03); /* 0xBF61BF38, 0x0A96073F */
218-
const float64_t qa1 = NBL_FP64_LITERAL(1.06420880400844228286e-01); /* 0x3FBB3E66, 0x18EEE323 */
219-
const float64_t qa2 = NBL_FP64_LITERAL(5.40397917702171048937e-01); /* 0x3FE14AF0, 0x92EB6F33 */
220-
const float64_t qa3 = NBL_FP64_LITERAL(7.18286544141962662868e-02); /* 0x3FB2635C, 0xD99FE9A7 */
221-
const float64_t qa4 = NBL_FP64_LITERAL(1.26171219808761642112e-01); /* 0x3FC02660, 0xE763351F */
222-
const float64_t qa5 = NBL_FP64_LITERAL(1.36370839120290507362e-02); /* 0x3F8BEDC2, 0x6B51DD1C */
223-
const float64_t qa6 = NBL_FP64_LITERAL(1.19844998467991074170e-02); /* 0x3F888B54, 0x5735151D */
224-
225-
// Coefficients for approximation to erfc in [1.25,1/0.35]
226-
const float64_t ra0 = NBL_FP64_LITERAL(-9.86494403484714822705e-03); /* 0xBF843412, 0x600D6435 */
227-
const float64_t ra1 = NBL_FP64_LITERAL(-6.93858572707181764372e-01); /* 0xBFE63416, 0xE4BA7360 */
228-
const float64_t ra2 = NBL_FP64_LITERAL(-1.05586262253232909814e+01); /* 0xC0251E04, 0x41B0E726 */
229-
const float64_t ra3 = NBL_FP64_LITERAL(-6.23753324503260060396e+01); /* 0xC04F300A, 0xE4CBA38D */
230-
const float64_t ra4 = NBL_FP64_LITERAL(-1.62396669462573470355e+02); /* 0xC0644CB1, 0x84282266 */
231-
const float64_t ra5 = NBL_FP64_LITERAL(-1.84605092906711035994e+02); /* 0xC067135C, 0xEBCCABB2 */
232-
const float64_t ra6 = NBL_FP64_LITERAL(-8.12874355063065934246e+01); /* 0xC0545265, 0x57E4D2F2 */
233-
const float64_t ra7 = NBL_FP64_LITERAL(-9.81432934416914548592e+00); /* 0xC023A0EF, 0xC69AC25C */
234-
const float64_t sa1 = NBL_FP64_LITERAL(1.96512716674392571292e+01); /* 0x4033A6B9, 0xBD707687 */
235-
const float64_t sa2 = NBL_FP64_LITERAL(1.37657754143519042600e+02); /* 0x4061350C, 0x526AE721 */
236-
const float64_t sa3 = NBL_FP64_LITERAL(4.34565877475229228821e+02); /* 0x407B290D, 0xD58A1A71 */
237-
const float64_t sa4 = NBL_FP64_LITERAL(6.45387271733267880336e+02); /* 0x40842B19, 0x21EC2868 */
238-
const float64_t sa5 = NBL_FP64_LITERAL(4.29008140027567833386e+02); /* 0x407AD021, 0x57700314 */
239-
const float64_t sa6 = NBL_FP64_LITERAL(1.08635005541779435134e+02); /* 0x405B28A3, 0xEE48AE2C */
240-
const float64_t sa7 = NBL_FP64_LITERAL(6.57024977031928170135e+00); /* 0x401A47EF, 0x8E484A93 */
241-
const float64_t sa8 = NBL_FP64_LITERAL(-6.04244152148580987438e-02); /* 0xBFAEEFF2, 0xEE749A62 */
242-
243-
// Coefficients for approximation to erfc in [1/.35,28]
244-
const float64_t rb0 = NBL_FP64_LITERAL(-9.86494292470009928597e-03); /* 0xBF843412, 0x39E86F4A */
245-
const float64_t rb1 = NBL_FP64_LITERAL(-7.99283237680523006574e-01); /* 0xBFE993BA, 0x70C285DE */
246-
const float64_t rb2 = NBL_FP64_LITERAL(-1.77579549177547519889e+01); /* 0xC031C209, 0x555F995A */
247-
const float64_t rb3 = NBL_FP64_LITERAL(-1.60636384855821916062e+02); /* 0xC064145D, 0x43C5ED98 */
248-
const float64_t rb4 = NBL_FP64_LITERAL(-6.37566443368389627722e+02); /* 0xC083EC88, 0x1375F228 */
249-
const float64_t rb5 = NBL_FP64_LITERAL(-1.02509513161107724954e+03); /* 0xC0900461, 0x6A2E5992 */
250-
const float64_t rb6 = NBL_FP64_LITERAL(-4.83519191608651397019e+02); /* 0xC07E384E, 0x9BDC383F */
251-
const float64_t sb1 = NBL_FP64_LITERAL(3.03380607434824582924e+01); /* 0x403E568B, 0x261D5190 */
252-
const float64_t sb2 = NBL_FP64_LITERAL(3.25792512996573918826e+02); /* 0x40745CAE, 0x221B9F0A */
253-
const float64_t sb3 = NBL_FP64_LITERAL(1.53672958608443695994e+03); /* 0x409802EB, 0x189D5118 */
254-
const float64_t sb4 = NBL_FP64_LITERAL(3.19985821950859553908e+03); /* 0x40A8FFB7, 0x688C246A */
255-
const float64_t sb5 = NBL_FP64_LITERAL(2.55305040643316442583e+03); /* 0x40A3F219, 0xCEDF3BE6 */
256-
const float64_t sb6 = NBL_FP64_LITERAL(4.74528541206955367215e+02); /* 0x407DA874, 0xE79FE763 */
257-
const float64_t sb7 = NBL_FP64_LITERAL(-2.24409524465858183362e+01); /* 0xC03670E2, 0x42712D62 */
258-
259-
float64_t x = float64_t(_x);
260-
int32_t hx, ix;
261-
float64_t s, y, z, r;
262-
hx = int32_t(bit_cast<uint64_t, float64_t>(x) >> 32);
263-
ix = hx & 0x7fffffff;
264-
if (ix >= 0x7ff00000) // erf(nan)=nan, erf(+-inf)=+-1
265-
{
266-
int32_t i = ((uint32_t)hx >> 31) << 1;
267-
return (float64_t)(1.0 - i) + one / x;
268-
}
269-
270-
float64_t P, Q;
271-
if (ix < 0x3feb0000) // |x| < 0.84375
272-
{
273-
if (ix < 0x3e300000) // |x| < 2**-28
274-
{
275-
if (ix < 0x00800000)
276-
{
277-
// avoid underflow
278-
return FloatingPoint(0.0625 * (16.0 * x + (16.0 * efx) * x));
279-
}
280-
return FloatingPoint(x + efx * x);
281-
}
282-
z = x * x;
283-
r = pp0 + z * (pp1 + z * (pp2 + z * (pp3 + z * pp4)));
284-
s = one + z * (qq1 + z * (qq2 + z * (qq3 + z * (qq4 + z * qq5))));
285-
y = r / s;
286-
return FloatingPoint(x + x * y);
287-
}
288-
if (ix < 0x3ff40000) // 0.84375 <= |x| < 1.25
289-
{
290-
s = abs_helper<float64_t>::__call(x) - one;
291-
P = pa0 + s * (pa1 + s * (pa2 + s * (pa3 + s * (pa4 + s * (pa5 + s * pa6)))));
292-
Q = one + s * (qa1 + s * (qa2 + s * (qa3 + s * (qa4 + s * (qa5 + s * (qa5 + s * qa6))))));
293-
if (hx >= 0)
294-
return FloatingPoint(erx + P / Q);
295-
else
296-
return FloatingPoint(-erx - P / Q);
297-
}
298-
if (ix >= 0x40180000) // inf > |x| >= 6
299-
{
300-
if (hx >= 0)
301-
return FloatingPoint(one - tiny);
302-
else
303-
return FloatingPoint(tiny - one);
304-
}
305-
306-
x = abs_helper<float64_t>::__call(x);
307-
s = one / (x * x);
308-
float64_t R, S;
309-
if (ix < 0x4006DB6E) // |x| < 1/0.35 ~2.85714
310-
{
311-
R = ra0 + s * (ra1 + s * (ra2 + s * (ra3 + s * (ra4 + s * (ra5 + s * (ra6 + s * ra7))))));
312-
S = one + s * (sa1 + s * (sa2 + s * (sa3 + s * (sa4 + s * (sa5 + s * (sa6 + s * sa7))))));
313-
}
314-
else // |x| >= 1/0.35
315-
{
316-
R = rb0 + s * (rb1 + s * (rb2 + s * (rb3 + s * (rb4 + s * rb5))));
317-
S = one + s * (sb1 + s * (sb2 + s * (sb3 + s * (sb4 + s * (sb5 + s * (sb6 + s * sb7))))));
318-
}
319-
z = x;
320-
uint64_t z1 = bit_cast<uint64_t, float64_t>(x);
321-
z1 &= 0xffffffff00000000;
322-
z = bit_cast<float64_t, uint64_t>(z1);
323-
r = exp_helper<float64_t>::__call(-z * z - 0.5625) * exp_helper<float64_t>::__call((z - x) * (z + x) + R / S);
324-
if (hx >= 0)
325-
return FloatingPoint(one - r / x);
326-
else
327-
return FloatingPoint(r / x - one);
328-
}
329-
};
330-
331-
template<>
332-
struct erf_helper<float32_t>
333-
{
334-
static float32_t __call(NBL_CONST_REF_ARG(float32_t) _x)
335-
{
336-
// A&S approximation to 1.5x10-7
337-
const float32_t a1 = float32_t(NBL_FP64_LITERAL(0.254829592));
338-
const float32_t a2 = float32_t(NBL_FP64_LITERAL(-0.284496736));
339-
const float32_t a3 = float32_t(NBL_FP64_LITERAL(1.421413741));
340-
const float32_t a4 = float32_t(NBL_FP64_LITERAL(-1.453152027));
341-
const float32_t a5 = float32_t(NBL_FP64_LITERAL(1.061405429));
342-
const float32_t p = float32_t(NBL_FP64_LITERAL(0.3275911));
343-
344-
float32_t _sign = float32_t(sign(_x));
345-
float32_t x = abs(_x);
346-
347-
float32_t t = float32_t(NBL_FP64_LITERAL(1.0)) / (float32_t(NBL_FP64_LITERAL(1.0)) + p * x);
348-
float32_t y = float32_t(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
192+
const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592));
193+
const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736));
194+
const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741));
195+
const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027));
196+
const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429));
197+
const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911));
198+
199+
FloatingPoint _sign = FloatingPoint(sign(_x));
200+
FloatingPoint x = abs(_x);
201+
202+
FloatingPoint t = FloatingPoint(NBL_FP64_LITERAL(1.0)) / (FloatingPoint(NBL_FP64_LITERAL(1.0)) + p * x);
203+
FloatingPoint y = FloatingPoint(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
349204

350205
return _sign * y;
351206
}

0 commit comments

Comments
 (0)