@@ -1608,6 +1608,79 @@ void convertFloatToDouble (double* dest, const float* src, Size num) noexcept
16081608#endif
16091609}
16101610
1611+ template <typename Size>
1612+ double computeCorrelation (const float * a, const float * b, Size num) noexcept
1613+ {
1614+ if (num == 0 )
1615+ return 0.0 ;
1616+
1617+ double sumA2 = 0.0 ;
1618+ double sumB2 = 0.0 ;
1619+ double sumAB = 0.0 ;
1620+
1621+ #if YUP_USE_SSE_INTRINSICS
1622+ Size i = 0 ;
1623+ __m128 sumA = _mm_setzero_ps ();
1624+ __m128 sumB = _mm_setzero_ps ();
1625+ __m128 sumABv = _mm_setzero_ps ();
1626+
1627+ for (; i + 4 <= num; i += 4 )
1628+ {
1629+ __m128 av = _mm_loadu_ps (a + i);
1630+ __m128 bv = _mm_loadu_ps (b + i);
1631+ sumA = _mm_add_ps (sumA, _mm_mul_ps (av, av));
1632+ sumB = _mm_add_ps (sumB, _mm_mul_ps (bv, bv));
1633+ sumABv = _mm_add_ps (sumABv, _mm_mul_ps (av, bv));
1634+ }
1635+
1636+ alignas (16 ) float temp[4 ];
1637+ _mm_store_ps (temp, sumA);
1638+ sumA2 += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1639+ _mm_store_ps (temp, sumB);
1640+ sumB2 += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1641+ _mm_store_ps (temp, sumABv);
1642+ sumAB += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1643+
1644+ #elif YUP_USE_ARM_NEON
1645+ Size i = 0 ;
1646+ float32x4_t sumA = vdupq_n_f32 (0 .0f );
1647+ float32x4_t sumB = vdupq_n_f32 (0 .0f );
1648+ float32x4_t sumABv = vdupq_n_f32 (0 .0f );
1649+
1650+ for (; i + 4 <= num; i += 4 )
1651+ {
1652+ float32x4_t av = vld1q_f32 (a + i);
1653+ float32x4_t bv = vld1q_f32 (b + i);
1654+ sumA = vmlaq_f32 (sumA, av, av);
1655+ sumB = vmlaq_f32 (sumB, bv, bv);
1656+ sumABv = vmlaq_f32 (sumABv, av, bv);
1657+ }
1658+
1659+ alignas (16 ) float temp[4 ];
1660+ vst1q_f32 (temp, sumA);
1661+ sumA2 += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1662+ vst1q_f32 (temp, sumB);
1663+ sumB2 += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1664+ vst1q_f32 (temp, sumABv);
1665+ sumAB += temp[0 ] + temp[1 ] + temp[2 ] + temp[3 ];
1666+
1667+ #endif
1668+
1669+ for (; i < num; ++i)
1670+ {
1671+ const double av = a[i];
1672+ const double bv = b[i];
1673+ sumA2 += av * av;
1674+ sumB2 += bv * bv;
1675+ sumAB += av * bv;
1676+ }
1677+
1678+ if (sumA2 == 0.0 || sumB2 == 0.0 )
1679+ return 0.0 ;
1680+
1681+ return sumAB / std::sqrt (sumA2 * sumB2);
1682+ }
1683+
16111684} // namespace
16121685} // namespace FloatVectorHelpers
16131686
@@ -1949,6 +2022,16 @@ void YUP_CALLTYPE FloatVectorOperations::convertDoubleToFloat (float* dest, cons
19492022 FloatVectorHelpers::convertDoubleToFloat (dest, src, num);
19502023}
19512024
2025+ double YUP_CALLTYPE FloatVectorOperations::computeCorrelation (const float * a, const float * b, int num) noexcept
2026+ {
2027+ return FloatVectorHelpers::computeCorrelation (a, b, num);
2028+ }
2029+
2030+ double YUP_CALLTYPE FloatVectorOperations::computeCorrelation (const float * a, const float * b, size_t num) noexcept
2031+ {
2032+ return FloatVectorHelpers::computeCorrelation (a, b, num);
2033+ }
2034+
19522035// ==============================================================================
19532036
19542037intptr_t YUP_CALLTYPE FloatVectorOperations::getFpStatusRegister () noexcept
0 commit comments