Merged
Changes from 4 commits
3 changes: 2 additions & 1 deletion modules/fastcv/include/opencv2/fastcv.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -11,6 +11,7 @@
#include "opencv2/fastcv/arithm.hpp"
#include "opencv2/fastcv/bilateralFilter.hpp"
#include "opencv2/fastcv/blur.hpp"
#include "opencv2/fastcv/channel.hpp"
#include "opencv2/fastcv/cluster.hpp"
#include "opencv2/fastcv/draw.hpp"
#include "opencv2/fastcv/edges.hpp"
41 changes: 40 additions & 1 deletion modules/fastcv/include/opencv2/fastcv/arithm.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -8,6 +8,10 @@

#include <opencv2/core.hpp>

// Approximate floating-point equality check (within FLT_EPSILON)
#define FCV_CMP_EQ(val1,val2) (fabs(val1 - val2) < FLT_EPSILON)

// Pack a matrix depth and an operation code into a single dispatch key
#define FCV_OPTYPE(depth,op) ((depth<<3) + op)

namespace cv {
namespace fastcv {

@@ -26,6 +30,41 @@ CV_EXPORTS_W void matmuls8s32(InputArray src1, InputArray src2, OutputArray dst)

//! @}

//! @addtogroup fastcv
//! @{

/**
* @brief Arithmetic add and subtract operations for two matrices
* It is optimized for Qualcomm's processors
* @param src1 First source matrix, can be of type CV_8U, CV_16S, CV_32F.
* Note: CV_32F not supported for subtract
* @param src2 Second source matrix of same type and size as src1
* @param dst Resulting matrix of the same type as the source matrices
* @param op type of operation - 0 for add and 1 for subtract
*/
CV_EXPORTS_W void arithmetic_op(InputArray src1, InputArray src2, OutputArray dst, int op);

//! @}
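For illustration only (not part of this diff), a minimal caller of arithmetic_op might look like the sketch below, assuming the fastcv extension module is built and linked:

#include <opencv2/core.hpp>
#include <opencv2/fastcv/arithm.hpp>

int main()
{
    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(100));
    cv::Mat b(480, 640, CV_8UC1, cv::Scalar(60));
    cv::Mat sum, diff;
    cv::fastcv::arithmetic_op(a, b, sum, 0);   // op = 0: saturating add
    cv::fastcv::arithmetic_op(a, b, diff, 1);  // op = 1: saturating subtract
    return 0;
}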

//! @addtogroup fastcv
//! @{

/**
* @brief Matrix multiplication of two float type matrices
* R = a*A*B + b*C where A,B,C,R are matrices and a,b are constants
* It is optimized for Qualcomm's processors
* @param src1 First source matrix of type CV_32F
* @param src2 Second source matrix of type CV_32F with same rows as src1 cols
* @param dst Resulting matrix of type CV_32F
* @param alpha multiplying factor for src1 and src2
* @param src3 Optional third matrix of type CV_32F to be added to matrix product
* @param beta multiplying factor for src3
*/
CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, OutputArray dst, float alpha = 1.0,
InputArray src3 = noArray(), float beta = 0.0);
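As a usage sketch (illustrative only, not part of this diff; assumes the module is available), the call below computes R = 0.5*A*B + 2.0*C with the declaration above:

#include <opencv2/core.hpp>
#include <opencv2/fastcv/arithm.hpp>

int main()
{
    cv::Mat A(64, 128, CV_32FC1), B(128, 32, CV_32FC1), C(64, 32, CV_32FC1), R;
    cv::randu(A, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::randu(B, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::randu(C, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::fastcv::gemm(A, B, R, 0.5f, C, 2.0f);   // R = alpha*A*B + beta*C
    return 0;
}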
Contributor commented on lines +52 to +64:

OpenCV HAL has GEMM options:

/**
The function performs generalized matrix multiplication similar to the gemm functions in BLAS level 3:
\f$D = \alpha*AB+\beta*C\f$

@param src1 pointer to input \f$M\times N\f$ matrix \f$A\f$ or \f$A^T\f$ stored in row major order.
@param src1_step number of bytes between two consequent rows of matrix \f$A\f$ or \f$A^T\f$.
@param src2 pointer to input \f$N\times K\f$ matrix \f$B\f$ or \f$B^T\f$ stored in row major order.
@param src2_step number of bytes between two consequent rows of matrix \f$B\f$ or \f$B^T\f$.
@param alpha \f$\alpha\f$ multiplier before \f$AB\f$
@param src3 pointer to input \f$M\times K\f$ matrix \f$C\f$ or \f$C^T\f$ stored in row major order.
@param src3_step number of bytes between two consequent rows of matrix \f$C\f$ or \f$C^T\f$.
@param beta \f$\beta\f$ multiplier before \f$C\f$
@param dst pointer to input \f$M\times K\f$ matrix \f$D\f$ stored in row major order.
@param dst_step number of bytes between two consequent rows of matrix \f$D\f$.
@param m number of rows in matrix \f$A\f$ or \f$A^T\f$, equals to number of rows in matrix \f$D\f$
@param n number of columns in matrix \f$A\f$ or \f$A^T\f$
@param k number of columns in matrix \f$B\f$ or \f$B^T\f$, equals to number of columns in matrix \f$D\f$
@param flags algorithm options (combination of CV_HAL_GEMM_1_T, ...).
 */

//! @addtogroup core_hal_interface_matrix_multiplication Matrix multiplication
//! @{
inline int hal_ni_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                          float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                          double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                          float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                          double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}

I propose to implement the existing HAL entry rather than adding an extension.
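For reference, wiring FastCV into the existing HAL hook rather than adding an extension would roughly follow the custom-HAL convention sketched below. The function name and forwarding strategy are illustrative assumptions, not code from this PR; transposed-input flags (CV_HAL_GEMM_1_T and friends) would either need handling or a CV_HAL_ERROR_NOT_IMPLEMENTED fallback.

// Hypothetical FastCV HAL header (illustration only)
#include "opencv2/core/hal/interface.h"

int fastcv_hal_gemm32f(const float* src1, size_t src1_step,
                       const float* src2, size_t src2_step,
                       float alpha, const float* src3, size_t src3_step,
                       float beta, float* dst, size_t dst_step,
                       int m, int n, int k, int flags);
// The body would chain fcvMatrixMultiplyf32_v2 / fcvMultiplyScalarf32 / fcvAddf32_v2,
// mirroring the extension code added to modules/fastcv/src/arithm.cpp in this PR.

#undef  cv_hal_gemm32f
#define cv_hal_gemm32f fastcv_hal_gemm32f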


//! @}

} // fastcv::
} // cv::

45 changes: 45 additions & 0 deletions modules/fastcv/include/opencv2/fastcv/channel.hpp
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef OPENCV_FASTCV_CHANNEL_HPP
#define OPENCV_FASTCV_CHANNEL_HPP

#include <opencv2/core.hpp>

namespace cv {
namespace fastcv {

//! @addtogroup fastcv
//! @{

/**
* @brief Creates one multi-channel mat out of several single-channel CV_8U mats.
* Optimized for Qualcomm's processors
* @param mv input vector of matrices to be merged; all the matrices in mv must be of CV_8UC1 and have the same size
* Note: the number of mats can be 2, 3 or 4.
* @param dst output array of depth CV_8U and same size as mv[0]; The number of channels
* will be the total number of matrices in the matrix array
*/
CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);

//! @}

//! @addtogroup fastcv
//! @{

/**
* @brief Splits a CV_8U multi-channel mat into several CV_8UC1 mats
* Optimized for Qualcomm's processors
* @param src input 2,3 or 4 channel mat of depth CV_8U
* @param mv output vector of size src.channels() of CV_8UC1 mats
*/
CV_EXPORTS_W void split(InputArray src, OutputArrayOfArrays mv);
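A minimal round trip through merge and split could look like the sketch below (illustration only, not part of this diff):

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/fastcv/channel.hpp>

int main()
{
    std::vector<cv::Mat> planes = {
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(10)),
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(20)),
        cv::Mat(240, 320, CV_8UC1, cv::Scalar(30))
    };

    cv::Mat interleaved;
    cv::fastcv::merge(planes, interleaved);       // -> 240x320, CV_8UC3

    std::vector<cv::Mat> channels;
    cv::fastcv::split(interleaved, channels);     // back to three CV_8UC1 mats
    return 0;
}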
Contributor commented on lines +17 to +38:

The same question on HAL.

Contributor Author replied:

We are working on optimizations to achieve consistent performance across various targets, so for now we have added it as an extension.


//! @}

} // fastcv::
} // cv::

#endif // OPENCV_FASTCV_CHANNEL_HPP
9 changes: 6 additions & 3 deletions modules/fastcv/include/opencv2/fastcv/pyramid.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -15,14 +15,17 @@ namespace fastcv {
//! @{

/**
* @brief Creates a gradient pyramid from an image pyramid
* @brief Creates a gradient pyramid from an image pyramid.
* Note: The borders are ignored during gradient calculation.
*
* @param pyr Input pyramid of 1-channel 8-bit images. Only continuous images are supported.
* @param dx Horizontal Sobel gradient pyramid of the same size as pyr
* @param dy Vertical Sobel gradient pyramid of the same size as pyr
* @param outType Type of output data, can be CV_8S, CV_16S or CV_32F
* @param clearBuffers If set to 1, output buffers are set to 0 before computation, to remove garbage values.
*/
CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, OutputArrayOfArrays dy, int outType = CV_8S);
CV_EXPORTS_W void sobelPyramid(InputArrayOfArrays pyr, OutputArrayOfArrays dx, OutputArrayOfArrays dy, int outType = CV_8S,
int clearBuffers = 0);
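For illustration (not part of this diff), an 8-bit pyramid built with cv::buildPyramid from imgproc can be passed straight to sobelPyramid, assuming its levels are continuous:

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/fastcv/pyramid.hpp>

int main()
{
    cv::Mat img(480, 640, CV_8UC1);
    cv::randu(img, cv::Scalar::all(0), cv::Scalar::all(255));

    std::vector<cv::Mat> pyr;
    cv::buildPyramid(img, pyr, 3);                      // 4-level 8-bit pyramid

    std::vector<cv::Mat> dx, dy;
    cv::fastcv::sobelPyramid(pyr, dx, dy, CV_16S, 1);   // clearBuffers = 1 zeroes outputs first
    return 0;
}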

/**
* @brief Builds an image pyramid of float32 arising from a single
35 changes: 34 additions & 1 deletion modules/fastcv/perf/perf_matmul.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -10,6 +10,9 @@ namespace opencv_test {
typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/> MatMulPerfParams;
typedef perf::TestBaseWithParam<MatMulPerfParams> MatMulPerfTest;

typedef std::tuple<int /*rows1*/, int /*cols1*/, int /*cols2*/, float> MatMulGemmPerfParams;
typedef perf::TestBaseWithParam<MatMulGemmPerfParams> MatMulGemmPerfTest;

PERF_TEST_P(MatMulPerfTest, run,
::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
::testing::Values(8, 16, 128, 256), // cols1
@@ -37,4 +40,34 @@ PERF_TEST_P(MatMulPerfTest, run,
SANITY_CHECK_NOTHING();
}

PERF_TEST_P(MatMulGemmPerfTest, run,
::testing::Combine(::testing::Values(8, 16, 128, 256), // rows1
::testing::Values(8, 16, 128, 256), // cols1
::testing::Values(8, 16, 128, 256), // cols2
::testing::Values(2.5, 5.8)) // alpha
)
{
auto p = GetParam();
int rows1 = std::get<0>(p);
int cols1 = std::get<1>(p);
int cols2 = std::get<2>(p);
float alpha = std::get<3>(p);

RNG& rng = cv::theRNG();
Mat src1(rows1, cols1, CV_32FC1), src2(cols1, cols2, CV_32FC1);
cvtest::randUni(rng, src1, Scalar::all(-128.0), Scalar::all(128.0));
cvtest::randUni(rng, src2, Scalar::all(-128.0), Scalar::all(128.0));

Mat dst;

while (next())
{
startTimer();
cv::fastcv::gemm(src1, src2, dst, alpha, noArray(), 0);
stopTimer();
}

SANITY_CHECK_NOTHING();
}

} // namespace
4 changes: 2 additions & 2 deletions modules/fastcv/perf/perf_pyramid.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -66,7 +66,7 @@ PERF_TEST_P(SobelPyramidTest, checkAllTypes,
{
std::vector<cv::Mat> pyrDx, pyrDy;
startTimer();
cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type);
cv::fastcv::sobelPyramid(pyr, pyrDx, pyrDy, type, 1);
stopTimer();
}

150 changes: 149 additions & 1 deletion modules/fastcv/src/arithm.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2024-2025 Qualcomm Innovation Center, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -32,5 +32,153 @@ void matmuls8s32(InputArray _src1, InputArray _src2, OutputArray _dst)
(int32_t*)dst.data, dst.step);
}

void arithmetic_op(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
{
CV_Assert(!_src1.empty() && (_src1.depth() == CV_8U || _src1.depth() == CV_16S || _src1.depth() == CV_32F));
CV_Assert(!_src2.empty() && _src2.type() == _src1.type());
CV_Assert(_src2.size() == _src1.size());

Mat src1 = _src1.getMat();
Mat src2 = _src2.getMat();

_dst.create(_src1.rows(), _src1.cols(), _src1.type());
Mat dst = _dst.getMat();

INITIALIZATION_CHECK;

fcvConvertPolicy policy = FASTCV_CONVERT_POLICY_SATURATE;

int nStripes = cv::getNumThreads();

int func = FCV_OPTYPE(_src1.depth(), op);
switch(func)
{
case FCV_OPTYPE(CV_8U, 0):
cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const uchar* yS1 = src1.data + static_cast<size_t>(range.start)*src1.step[0];
const uchar* yS2 = src2.data + static_cast<size_t>(range.start)*src2.step[0];
uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
fcvAddu8(yS1, src1.cols, rangeHeight, src1.step[0],
yS2, src2.step[0], policy, yD, dst.step[0]);
}, nStripes);
break;
case FCV_OPTYPE(CV_16S, 0):
cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const short* yS1 = (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
const short* yS2 = (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
fcvAdds16_v2(yS1, src1.cols, rangeHeight, src1.step[0],
yS2, src2.step[0], policy, yD, dst.step[0]);
}, nStripes);
break;
case FCV_OPTYPE(CV_32F, 0):
cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const float* yS1 = (float*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(float));
const float* yS2 = (float*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(float));
float* yD = (float*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(float));
fcvAddf32(yS1, src1.cols, rangeHeight, src1.step[0],
yS2, src2.step[0], yD, dst.step[0]);
}, nStripes);
break;
case FCV_OPTYPE(CV_8U, 1):
cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const uchar* yS1 = src1.data + static_cast<size_t>(range.start)*src1.step[0];
const uchar* yS2 = src2.data + static_cast<size_t>(range.start)*src2.step[0];
uchar* yD = dst.data + static_cast<size_t>(range.start)*dst.step[0];
fcvSubtractu8(yS1, src1.cols, rangeHeight, src1.step[0],
yS2, src2.step[0], policy, yD, dst.step[0]);
}, nStripes);
break;
case FCV_OPTYPE(CV_16S, 1):
cv::parallel_for_(cv::Range(0, src1.rows), [&](const cv::Range &range){
int rangeHeight = range.end - range.start;
const short* yS1 = (short*)src1.data + static_cast<size_t>(range.start)*(src1.step[0]/sizeof(short));
const short* yS2 = (short*)src2.data + static_cast<size_t>(range.start)*(src2.step[0]/sizeof(short));
short* yD = (short*)dst.data + static_cast<size_t>(range.start)*(dst.step[0]/sizeof(short));
fcvSubtracts16(yS1, src1.cols, rangeHeight, src1.step[0],
yS2, src2.step[0], policy, yD, dst.step[0]);
}, nStripes);
break;
default:
CV_Error(cv::Error::StsBadArg, cv::format("op type is not supported"));
break;
}
}


void gemm(InputArray _src1, InputArray _src2, OutputArray _dst, float alpha, InputArray _src3, float beta)
{
CV_Assert(!_src1.empty() && _src1.type() == CV_32FC1);
CV_Assert(_src1.cols() == _src2.rows());
Mat src1 = _src1.getMat();

CV_Assert(!_src2.empty() && _src2.type() == CV_32FC1);
Mat src2 = _src2.getMat();

bool isSrc3 = !_src3.empty();

Mat src3 = _src3.getMat();

_dst.create(_src1.rows(), _src2.cols(), CV_32FC1);

Mat dst = _dst.getMat();

CV_Assert(!FCV_CMP_EQ(alpha,0));

cv::Mat dst_temp1, dst_temp2;
float *dstp = NULL;
bool inplace = false;
size_t dst_stride;
fcvStatus status = FASTCV_SUCCESS;

int n = src1.cols, m = src1.rows, k = src2.cols;

INITIALIZATION_CHECK;

// If dst shares memory with any input, compute into a temporary buffer and copy back at the end
if(src1.data == dst.data || src2.data == dst.data || (isSrc3 && (src3.data == dst.data)))
{
dst_temp1 = cv::Mat(m, k, CV_32FC1);
dstp = dst_temp1.ptr<float>();
inplace = true;
dst_stride = dst_temp1.step[0];
}
else
{
dstp = (float32_t*)dst.data;
dst_stride = dst.step[0];
}
float32_t *dstp1 = dstp;
status = fcvMatrixMultiplyf32_v2((float32_t*)src1.data, n, m, src1.step[0], (float32_t*)src2.data, k,
src2.step[0], dstp, dst_stride);

bool isAlpha = !(FCV_CMP_EQ(alpha,0) || FCV_CMP_EQ(alpha,1));
if(isAlpha && status == FASTCV_SUCCESS)
{
status = fcvMultiplyScalarf32(dstp, k, m, dst_stride, alpha, dstp1, dst_stride);
}

if(isSrc3 && (!FCV_CMP_EQ(beta,0)) && status == FASTCV_SUCCESS)
{
cv::Mat dst3 = cv::Mat(m, k, CV_32FC1);
if(!FCV_CMP_EQ(beta,1))
{
status = fcvMultiplyScalarf32((float32_t*)src3.data, k, m, src3.step[0], beta, (float32_t*)dst3.data, dst3.step[0]);
if(status == FASTCV_SUCCESS)
fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)dst3.data, dst3.step[0], dstp1, dst_stride);
}
else
fcvAddf32_v2(dstp, k, m, dst_stride, (float32_t*)src3.data, src3.step[0], dstp1, dst_stride);
}

if(inplace == true)
{
dst_temp1(cv::Rect(0, 0, k, m)).copyTo(dst(cv::Rect(0, 0, k, m)));
}
}

} // fastcv::
} // cv::