Revert erf_helper to the generic A&S approximation; accept float16_t in the FloatingPoint/Scalar concepts

keptsecret · keptsecret · commit 4c963587b5cc · 2025-03-04T15:53:40.000+07:00
diff --git a/include/nbl/builtin/hlsl/concepts/core.hlsl b/include/nbl/builtin/hlsl/concepts/core.hlsl
@@ -29,13 +29,13 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegral = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T>;
+NBL_BOOL_CONCEPT FloatingPoint = nbl::hlsl::is_floating_point_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT Boolean = nbl::hlsl::is_same_v<T, bool> || (nbl::hlsl::is_vector_v<T> && nbl::hlsl::is_same_v<typename vector_traits<T>::scalar_type, bool>);
 
 template <typename T>
-NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T>;
+NBL_BOOL_CONCEPT Scalar = nbl::hlsl::is_scalar_v<T> || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT IntegralScalar = nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
@@ -47,7 +47,7 @@ template<typename T>
 NBL_BOOL_CONCEPT UnsignedIntegralScalar = !nbl::hlsl::is_signed_v<T> && ::nbl::hlsl::is_integral_v<T> && nbl::hlsl::is_scalar_v<T>;
 
 template<typename T>
-NBL_BOOL_CONCEPT FloatingPointScalar = nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>;
+NBL_BOOL_CONCEPT FloatingPointScalar = (nbl::hlsl::is_floating_point_v<T> && nbl::hlsl::is_scalar_v<T>) || nbl::hlsl::is_same_v<T, float16_t>;
 
 template<typename T>
 NBL_BOOL_CONCEPT BooleanScalar = concepts::Boolean<T> && nbl::hlsl::is_scalar_v<T>;
diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl
@@ -189,163 +189,18 @@ struct erf_helper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointScala
 {
 	static FloatingPoint __call(NBL_CONST_REF_ARG(FloatingPoint) _x)
 	{
-		// glibc implementation
-		const float64_t tiny = NBL_FP64_LITERAL(1e-300),
-			one = NBL_FP64_LITERAL(1.00000000000000000000e+00), /* 0x3FF00000, 0x00000000 */
-			erx = NBL_FP64_LITERAL(8.45062911510467529297e-01); /* 0x3FEB0AC1, 0x60000000 */
-
-		// Coefficients for approximation to erf in [0,0.84375]
-		const float64_t efx = NBL_FP64_LITERAL(1.28379167095512586316e-01); /* 0x3FC06EBA, 0x8214DB69 */
-		const float64_t pp0 = NBL_FP64_LITERAL(1.28379167095512558561e-01); /* 0x3FC06EBA, 0x8214DB68 */
-		const float64_t pp1 = NBL_FP64_LITERAL(-3.25042107247001499370e-01); /* 0xBFD4CD7D, 0x691CB913 */
-		const float64_t pp2 = NBL_FP64_LITERAL(-2.84817495755985104766e-02); /* 0xBF9D2A51, 0xDBD7194F */
-		const float64_t pp3 = NBL_FP64_LITERAL(-5.77027029648944159157e-03); /* 0xBF77A291, 0x236668E4 */
-		const float64_t pp4 = NBL_FP64_LITERAL(-2.37630166566501626084e-05); /* 0xBEF8EAD6, 0x120016AC */
-		const float64_t qq1 = NBL_FP64_LITERAL(3.97917223959155352819e-01); /* 0x3FD97779, 0xCDDADC09 */
-		const float64_t qq2 = NBL_FP64_LITERAL(6.50222499887672944485e-02); /* 0x3FB0A54C, 0x5536CEBA */
-		const float64_t qq3 = NBL_FP64_LITERAL(5.08130628187576562776e-03); /* 0x3F74D022, 0xC4D36B0F */
-		const float64_t qq4 = NBL_FP64_LITERAL(1.32494738004321644526e-04); /* 0x3F215DC9, 0x221C1A10 */
-		const float64_t qq5 = NBL_FP64_LITERAL(-3.96022827877536812320e-06); /* 0xBED09C43, 0x42A26120 */
-
-		//Coefficients for approximation to erf in [0.84375,1.25]
-		const float64_t pa0 = NBL_FP64_LITERAL(-2.36211856075265944077e-03); /* 0xBF6359B8, 0xBEF77538 */
-		const float64_t pa1 = NBL_FP64_LITERAL(4.14856118683748331666e-01); /* 0x3FDA8D00, 0xAD92B34D */
-		const float64_t pa2 = NBL_FP64_LITERAL(-3.72207876035701323847e-01); /* 0xBFD7D240, 0xFBB8C3F1 */
-		const float64_t pa3 = NBL_FP64_LITERAL(3.18346619901161753674e-01); /* 0x3FD45FCA, 0x805120E4 */
-		const float64_t pa4 = NBL_FP64_LITERAL(-1.10894694282396677476e-01); /* 0xBFBC6398, 0x3D3E28EC */
-		const float64_t pa5 = NBL_FP64_LITERAL(3.54783043256182359371e-02); /* 0x3FA22A36, 0x599795EB */
-		const float64_t pa6 = NBL_FP64_LITERAL(-2.16637559486879084300e-03); /* 0xBF61BF38, 0x0A96073F */
-		const float64_t qa1 = NBL_FP64_LITERAL(1.06420880400844228286e-01); /* 0x3FBB3E66, 0x18EEE323 */
-		const float64_t qa2 = NBL_FP64_LITERAL(5.40397917702171048937e-01); /* 0x3FE14AF0, 0x92EB6F33 */
-		const float64_t qa3 = NBL_FP64_LITERAL(7.18286544141962662868e-02); /* 0x3FB2635C, 0xD99FE9A7 */
-		const float64_t qa4 = NBL_FP64_LITERAL(1.26171219808761642112e-01); /* 0x3FC02660, 0xE763351F */
-		const float64_t qa5 = NBL_FP64_LITERAL(1.36370839120290507362e-02); /* 0x3F8BEDC2, 0x6B51DD1C */
-		const float64_t qa6 = NBL_FP64_LITERAL(1.19844998467991074170e-02); /* 0x3F888B54, 0x5735151D */
-
-		// Coefficients for approximation to erfc in [1.25,1/0.35]
-		const float64_t ra0 = NBL_FP64_LITERAL(-9.86494403484714822705e-03); /* 0xBF843412, 0x600D6435 */
-		const float64_t ra1 = NBL_FP64_LITERAL(-6.93858572707181764372e-01); /* 0xBFE63416, 0xE4BA7360 */
-		const float64_t ra2 = NBL_FP64_LITERAL(-1.05586262253232909814e+01); /* 0xC0251E04, 0x41B0E726 */
-		const float64_t ra3 = NBL_FP64_LITERAL(-6.23753324503260060396e+01); /* 0xC04F300A, 0xE4CBA38D */
-		const float64_t ra4 = NBL_FP64_LITERAL(-1.62396669462573470355e+02); /* 0xC0644CB1, 0x84282266 */
-		const float64_t ra5 = NBL_FP64_LITERAL(-1.84605092906711035994e+02); /* 0xC067135C, 0xEBCCABB2 */
-		const float64_t ra6 = NBL_FP64_LITERAL(-8.12874355063065934246e+01); /* 0xC0545265, 0x57E4D2F2 */
-		const float64_t ra7 = NBL_FP64_LITERAL(-9.81432934416914548592e+00); /* 0xC023A0EF, 0xC69AC25C */
-		const float64_t sa1 = NBL_FP64_LITERAL(1.96512716674392571292e+01); /* 0x4033A6B9, 0xBD707687 */
-		const float64_t sa2 = NBL_FP64_LITERAL(1.37657754143519042600e+02); /* 0x4061350C, 0x526AE721 */
-		const float64_t sa3 = NBL_FP64_LITERAL(4.34565877475229228821e+02); /* 0x407B290D, 0xD58A1A71 */
-		const float64_t sa4 = NBL_FP64_LITERAL(6.45387271733267880336e+02); /* 0x40842B19, 0x21EC2868 */
-		const float64_t sa5 = NBL_FP64_LITERAL(4.29008140027567833386e+02); /* 0x407AD021, 0x57700314 */
-		const float64_t sa6 = NBL_FP64_LITERAL(1.08635005541779435134e+02); /* 0x405B28A3, 0xEE48AE2C */
-		const float64_t sa7 = NBL_FP64_LITERAL(6.57024977031928170135e+00); /* 0x401A47EF, 0x8E484A93 */
-		const float64_t sa8 = NBL_FP64_LITERAL(-6.04244152148580987438e-02); /* 0xBFAEEFF2, 0xEE749A62 */
-
-		// Coefficients for approximation to erfc in [1/.35,28]
-		const float64_t rb0 = NBL_FP64_LITERAL(-9.86494292470009928597e-03); /* 0xBF843412, 0x39E86F4A */
-		const float64_t rb1 = NBL_FP64_LITERAL(-7.99283237680523006574e-01); /* 0xBFE993BA, 0x70C285DE */
-		const float64_t rb2 = NBL_FP64_LITERAL(-1.77579549177547519889e+01); /* 0xC031C209, 0x555F995A */
-		const float64_t rb3 = NBL_FP64_LITERAL(-1.60636384855821916062e+02); /* 0xC064145D, 0x43C5ED98 */
-		const float64_t rb4 = NBL_FP64_LITERAL(-6.37566443368389627722e+02); /* 0xC083EC88, 0x1375F228 */
-		const float64_t rb5 = NBL_FP64_LITERAL(-1.02509513161107724954e+03); /* 0xC0900461, 0x6A2E5992 */
-		const float64_t rb6 = NBL_FP64_LITERAL(-4.83519191608651397019e+02); /* 0xC07E384E, 0x9BDC383F */
-		const float64_t sb1 = NBL_FP64_LITERAL(3.03380607434824582924e+01); /* 0x403E568B, 0x261D5190 */
-		const float64_t sb2 = NBL_FP64_LITERAL(3.25792512996573918826e+02); /* 0x40745CAE, 0x221B9F0A */
-		const float64_t sb3 = NBL_FP64_LITERAL(1.53672958608443695994e+03); /* 0x409802EB, 0x189D5118 */
-		const float64_t sb4 = NBL_FP64_LITERAL(3.19985821950859553908e+03); /* 0x40A8FFB7, 0x688C246A */
-		const float64_t sb5 = NBL_FP64_LITERAL(2.55305040643316442583e+03); /* 0x40A3F219, 0xCEDF3BE6 */
-		const float64_t sb6 = NBL_FP64_LITERAL(4.74528541206955367215e+02); /* 0x407DA874, 0xE79FE763 */
-		const float64_t sb7 = NBL_FP64_LITERAL(-2.24409524465858183362e+01); /* 0xC03670E2, 0x42712D62 */
-
-		float64_t x = float64_t(_x);
-		int32_t hx, ix;
-		float64_t s, y, z, r;
-		hx = int32_t(bit_cast<uint64_t, float64_t>(x) >> 32);
-		ix = hx & 0x7fffffff;
-		if (ix >= 0x7ff00000)           // erf(nan)=nan, erf(+-inf)=+-1
-		{
-			int32_t i = ((uint32_t)hx >> 31) << 1;
-			return (float64_t)(1.0 - i) + one / x;
-		}
-
-		float64_t P, Q;
-		if (ix < 0x3feb0000)            // |x| < 0.84375
-		{
-			if (ix < 0x3e300000)        // |x| < 2**-28
-			{
-				if (ix < 0x00800000)
-				{
-					// avoid underflow
-					return FloatingPoint(0.0625 * (16.0 * x + (16.0 * efx) * x));
-				}
-				return FloatingPoint(x + efx * x);
-			}
-			z = x * x;
-			r = pp0 + z * (pp1 + z * (pp2 + z * (pp3 + z * pp4)));
-			s = one + z * (qq1 + z * (qq2 + z * (qq3 + z * (qq4 + z * qq5))));
-			y = r / s;
-			return FloatingPoint(x + x * y);
-		}
-		if (ix < 0x3ff40000)            // 0.84375 <= |x| < 1.25
-		{
-			s = abs_helper<float64_t>::__call(x) - one;
-			P = pa0 + s * (pa1 + s * (pa2 + s * (pa3 + s * (pa4 + s * (pa5 + s * pa6)))));
-			Q = one + s * (qa1 + s * (qa2 + s * (qa3 + s * (qa4 + s * (qa5 + s * (qa5 + s * qa6))))));
-			if (hx >= 0)
-				return FloatingPoint(erx + P / Q);
-			else
-				return FloatingPoint(-erx - P / Q);
-		}
-		if (ix >= 0x40180000)           // inf > |x| >= 6
-		{
-			if (hx >= 0)
-				return FloatingPoint(one - tiny);
-			else
-				return FloatingPoint(tiny - one);
-		}
-
-		x = abs_helper<float64_t>::__call(x);
-		s = one / (x * x);
-		float64_t R, S;
-		if (ix < 0x4006DB6E)            // |x| < 1/0.35     ~2.85714
-		{
-			R = ra0 + s * (ra1 + s * (ra2 + s * (ra3 + s * (ra4 + s * (ra5 + s * (ra6 + s * ra7))))));
-			S = one + s * (sa1 + s * (sa2 + s * (sa3 + s * (sa4 + s * (sa5 + s * (sa6 + s * sa7))))));
-		}
-		else                            // |x| >= 1/0.35
-		{
-			R = rb0 + s * (rb1 + s * (rb2 + s * (rb3 + s * (rb4 + s * rb5))));
-			S = one + s * (sb1 + s * (sb2 + s * (sb3 + s * (sb4 + s * (sb5 + s * (sb6 + s * sb7))))));
-		}
-		z = x;
-		uint64_t z1 = bit_cast<uint64_t, float64_t>(x);
-		z1 &= 0xffffffff00000000;
-		z = bit_cast<float64_t, uint64_t>(z1);
-		r = exp_helper<float64_t>::__call(-z * z - 0.5625) * exp_helper<float64_t>::__call((z - x) * (z + x) + R / S);
-		if (hx >= 0)
-			return FloatingPoint(one - r / x);
-		else
-			return FloatingPoint(r / x - one);
-	}
-};
-
-template<>
-struct erf_helper<float32_t>
-{
-	static float32_t __call(NBL_CONST_REF_ARG(float32_t) _x)
-	{
-		// A&S approximation to 1.5x10-7
-		const float32_t a1 = float32_t(NBL_FP64_LITERAL(0.254829592));
-		const float32_t a2 = float32_t(NBL_FP64_LITERAL(-0.284496736));
-		const float32_t a3 = float32_t(NBL_FP64_LITERAL(1.421413741));
-		const float32_t a4 = float32_t(NBL_FP64_LITERAL(-1.453152027));
-		const float32_t a5 = float32_t(NBL_FP64_LITERAL(1.061405429));
-		const float32_t p = float32_t(NBL_FP64_LITERAL(0.3275911));
-
-		float32_t _sign = float32_t(sign(_x));
-		float32_t x = abs(_x);
-
-		float32_t t = float32_t(NBL_FP64_LITERAL(1.0)) / (float32_t(NBL_FP64_LITERAL(1.0)) + p * x);
-		float32_t y = float32_t(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
+		const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592));
+		const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736));
+		const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741));
+		const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027));
+		const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429));
+		const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911));
+
+		FloatingPoint _sign = FloatingPoint(sign(_x));
+		FloatingPoint x = abs(_x);
+
+		FloatingPoint t = FloatingPoint(NBL_FP64_LITERAL(1.0)) / (FloatingPoint(NBL_FP64_LITERAL(1.0)) + p * x);
+		FloatingPoint y = FloatingPoint(NBL_FP64_LITERAL(1.0)) - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x);
 
 		return _sign * y;
 	}