Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,51 @@
#include <vector>

#include <hipdnn_data_sdk/types.hpp>
#include <hipdnn_data_sdk/utilities/ShapeUtilities.hpp>
#include <hipdnn_data_sdk/utilities/Tensor.hpp>
#include <hipdnn_data_sdk/utilities/TensorView.hpp>

namespace hipdnn_test_sdk::utilities
{

/**
 * @brief Computes Higham's error growth factor γ_k for floating-point accumulation.
 *
 * For k accumulations with machine epsilon u, writing nU = 2*k*u:
 * - Linear regime (nU < 0.01):  γ_k = nU / (1 - nU)            [deterministic worst-case]
 * - Statistical regime (else):  γ_k = K_SIGMA * sqrt(2*k) * u  [probabilistic, K_SIGMA=6]
 *
 * The crossover at nU = 0.01 is chosen because the linear bound starts to inflate
 * noticeably past that point (~1% overshoot at nU=0.01, ~11% at nU=0.1).
 *
 * This is a pure math function that never throws. Callers are responsible for
 * checking whether the returned gamma is too large for their use case (e.g.
 * gamma >= 0.5 means the error bound exceeds 50% of the signal).
 *
 * Reusable across conv, matmul, and any operation that accumulates k products.
 *
 * @param k Number of accumulations (reduction dimension).
 * @param epsilon Machine epsilon for the compute type.
 * @return The error growth factor γ_k as a double.
 */
inline double computeGamma(uint64_t k, double epsilon)
{
    // Scaled accumulation count nU = 2*k*u from Higham's analysis.
    const double scaledCount = 2.0 * static_cast<double>(k) * epsilon;

    // Below this threshold the deterministic linear bound is still tight.
    constexpr double NU_THRESHOLD = 0.01;
    // Sigma multiplier for the probabilistic (random-walk) bound.
    constexpr double K_SIGMA = 6.0;

    return scaledCount < NU_THRESHOLD
               ? scaledCount / (1.0 - scaledCount)
               : K_SIGMA * std::sqrt(2.0 * static_cast<double>(k)) * epsilon;
}

} // namespace hipdnn_test_sdk::utilities

namespace hipdnn_test_sdk::utilities::conv
{
Expand Down Expand Up @@ -481,3 +526,192 @@ float calculateConvFpropTolerance(
}

} // namespace hipdnn_test_sdk::utilities::conv

namespace hipdnn_test_sdk::utilities::matmul
{
using hipdnn_data_sdk::types::bfloat16;
using hipdnn_data_sdk::types::half;
using hipdnn_data_sdk::utilities::ITensor;
using hipdnn_data_sdk::utilities::TensorView;

/**
* @brief Computes the infinity-norm (max absolute row sum) of a matrix tensor.
*
* The infinity-norm is defined as: ||A||_inf = max_i (sum_j |A[i,j]|)
*
* This is the appropriate subordinate matrix norm for element-wise error bounds
* via Higham's analysis: max_ij |error_ij| <= gamma_k * ||A||_inf * ||B||_inf.
*
* For batched tensors (>2D), the infinity-norm is computed across all batches,
* returning the maximum row sum over all rows in all batches.
*
* Uses iterateAlongDimensions + ConstTensorView to correctly handle padded
* and non-packed tensor layouts via stride-aware indexing.
*
* @tparam T The data type of the tensor elements.
* @param tensor The input tensor (must have at least 2 dimensions).
* @return The infinity-norm as a double.
*/
template <typename T>
double computeMatrixInfNorm(ITensor& tensor)
Comment on lines +537 to +556
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's good to have this version that uses the exact values, but for conv we used estimated values from the min/max to be able to quickly calculate an estimated worst-case tolerance.

Would it be possible to add a version that uses estimated values similar to conv instead of real values?
I think we should probably have both for every operation, and then we can add in some environment variable, or settings that let us swap between the modes.

That way for quick pre-submit checks we can use the quicker/looser estimation of tolerance, and for nightly / weekly we can instead use the calculated form. The calculated form will also likely need to be converted into a GPU version once we scale up as well. Having the estimated version gives us something to use until all this work is in place.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering if we could keep this pretty much the same but just extract out where the computation of the Norm happens, since that seems to be the expensive operation. Of course, I'm not amazing with the tolerance stuff yet, so please correct me if this won't work at all.

  // Extract the common math and take in the norm of A & B.
  template <typename OutputType, typename InputType, typename ComputeType = float>
  float calculateMatmulTolerance(double normA, double normB, uint64_t k)
  {
      static_assert(std::is_same_v<ComputeType, float> || std::is_same_v<ComputeType, double>
                        || std::is_same_v<ComputeType, half> || std::is_same_v<ComputeType, bfloat16>,
                    "ComputeType must be float, double, half, or bfloat16");

      auto epsilon = static_cast<double>(std::numeric_limits<ComputeType>::epsilon());
      double gamma = hipdnn_test_sdk::utilities::computeGamma(k, epsilon);

      constexpr double GAMMA_MAX = 0.5;
      if(gamma >= GAMMA_MAX)
      {
          throw std::overflow_error("Error growth factor gamma >= 0.5...");
      }

      double accumulatedTolerance = gamma * normA * normB;

      auto inputEpsilon = static_cast<double>(std::numeric_limits<InputType>::epsilon());
      if(inputEpsilon < epsilon)
      {
          accumulatedTolerance += 2.0 * normA * normB * epsilon;
      }

      auto outputEpsilon = static_cast<double>(std::numeric_limits<OutputType>::epsilon());
      double castTolerance = 0.0;
      if(outputEpsilon > epsilon)
      {
          castTolerance = normA * normB * outputEpsilon;
      }

      double totalTolerance = accumulatedTolerance + castTolerance;

      if(totalTolerance > static_cast<double>(std::numeric_limits<OutputType>::max()))
      {
          throw std::overflow_error("Calculated tolerance exceeds max of output type.");
      }

      return static_cast<float>(totalTolerance);
  }

Then we can make a fast/loose version that takes in the min/max, cols of A/B and computes a maxNorm of A/B.

 template <typename OutputType, typename InputType, typename ComputeType = float>
  float calculateMatmulToleranceLoose(double minA, double maxA, int64_t colsA,
                                 double minB, double maxB, int64_t colsB)
  {
      double maxAbsA = std::max(std::abs(minA), std::abs(maxA));
      double maxAbsB = std::max(std::abs(minB), std::abs(maxB));
      double normA = static_cast<double>(colsA) * maxAbsA;
      double normB = static_cast<double>(colsB) * maxAbsB;
      auto k = static_cast<uint64_t>(colsA);  // K = cols of A = rows of B
      return calculateMatmulTolerance<OutputType, InputType, ComputeType>(normA, normB, k);
  }

Lastly we can have the original implementation that uses the full tensor values

  template <typename OutputType, typename InputType, typename ComputeType = float>
  float calculateMatmulToleranceTight(ITensor& a, ITensor& b)
  {
      const auto& aDims = a.dims();
      const auto& bDims = b.dims();

      if(aDims.size() < 2 || bDims.size() < 2)
      {
          throw std::invalid_argument("Matrices must have at least 2 dimensions for matmul.");
      }

      int64_t k = aDims[aDims.size() - 1];
      int64_t bRows = bDims[bDims.size() - 2];

      if(k != bRows)
      {
          throw std::invalid_argument(
              "Matrix dimensions incompatible for multiplication: A columns != B rows.");
      }

      if(k <= 0)
      {
          throw std::invalid_argument("Reduction dimension k must be positive.");
      }

      double normA = computeMatrixInfNorm<InputType>(a);
      double normB = computeMatrixInfNorm<InputType>(b);

      return calculateMatmulTolerance<OutputType, InputType, ComputeType>(
          normA, normB, static_cast<uint64_t>(k));
  }

Copy link
Contributor Author

@oanaoana oanaoana Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fully agree this would be more expensive, especially now after adding batching without tensorization etc. This might be a wider conversation, and I support Adam's suggestion for layered APIs, since we might want a few separate categories of checks with different aims. Some that run on-the-fly, like the initial convs approach, some more accurate for rigorous tests, and some in between.
This particular dynamic tol is in-between, since I used the inf and not Frobenius norm. I haven't assessed how to make it faster and simplify the norm further, but it's a step toward removing the N^2 check for convs. The best for local tests would be to compute the norms as the tensors get generated, and pass Anorm, Bnorm as input arguments for the dynamic tol evaluation.

Let me know if you'd like me to add a second simplified option in the spirit of convs or Adam's suggestion. I am leaning toward leaving this PR simple and having a second round of different norm types for different purposes, especially since the incoming GPU refs might lead to more options to weed through.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am okay if we leave this PR simple and do a follow up PR to split this all up.

{
using hipdnn_data_sdk::utilities::iterateAlongDimensions;

const auto& dims = tensor.dims();
const TensorView<T> view(tensor);

auto cols = dims.back();

// outerDims = [batch..., rows] — everything except the last dim (cols)
auto outerDims = std::vector<int64_t>(dims.begin(), dims.end() - 1);

double maxRowSum = 0.0;

iterateAlongDimensions(outerDims, [&](const std::vector<int64_t>& outerIndices) {
double rowSum = 0.0;

auto fullIndices = outerIndices;
fullIndices.push_back(0);

for(int64_t j = 0; j < cols; ++j)
{
fullIndices.back() = j;
rowSum += static_cast<double>(
hipdnn_data_sdk::types::fabs(view.getHostValue(fullIndices)));
}

maxRowSum = std::max(maxRowSum, rowSum);
});

return maxRowSum;
}

/**
* @brief Calculates the expected tolerance for Matrix Multiplication operations.
*
* This function estimates the maximum expected element-wise error due to floating-point
* accumulation during the computation of C = A × B using Higham's error analysis.
*
* Norm Selection: Infinity-norm (max absolute row sum)
* =====================================================
* We use the infinity-norm ||A||_inf = max_i (sum_j |A[i,j]|) because Higham's element-wise
* error bound for matrix multiplication gives:
* max_ij |fl(C)_ij - C_ij| <= gamma_k * ||A||_inf * ||B||_inf
*
* This is the correct subordinate matrix norm for bounding the maximum element-wise error,
* which is what allClose-style validation checks.
*
* Error Bound (Higham's Analysis):
* =================================
* For C = A*B where A is m×k and B is k×n:
* max_ij |error_ij| <= γ_k * ||A||_inf * ||B||_inf
*
* where γ_k is the error growth factor:
* - High Precision (FP32, FP64): γ_k = (2*k*u) / (1 - 2*k*u) (linear worst-case bound)
* - Low Precision (FP16, BF16): γ_k = K_SIGMA * sqrt(2*k) * u (statistical bound, K_SIGMA=6)
* - u = machine epsilon for ComputeType
*
* The function also accounts for precision loss from input/output casting if needed.
*
* @tparam OutputType The data type of the output matrix C.
* @tparam InputType The data type of the input matrices A and B.
* @tparam ComputeType The data type used for accumulation (default: float).
* @param a The input matrix A.
* @param b The input matrix B.
* @return The calculated tolerance value as float.
*/
template <typename OutputType, typename InputType, typename ComputeType = float>
float calculateMatmulTolerance(ITensor& a, ITensor& b)
Copy link
Contributor

@BrianHarrisonAMD BrianHarrisonAMD Mar 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should update the existing hipBLASLt provider tests to use this path for the tolerance calc, and also verify that it's passing with these changes on a variety of shapes. We don't need to commit the extra tests for now, but we should do some exploratory testing to ensure this calc is working across different matmul problems with the hipBLASLt provider.

Tests in the folder here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I am all on board with this. I'll take a look at the tests.

{
// Validate ComputeType
static_assert(std::is_same_v<ComputeType, float> || std::is_same_v<ComputeType, double>
|| std::is_same_v<ComputeType, half> || std::is_same_v<ComputeType, bfloat16>,
"ComputeType must be float, double, half, or bfloat16");

// Validate tensor dimensions for matmul compatibility
const auto& aDims = a.dims();
const auto& bDims = b.dims();

if(aDims.size() < 2 || bDims.size() < 2)
{
throw std::invalid_argument("Matrices must have at least 2 dimensions for matmul.");
}

// Extract reduction dimension K (last dim of A, second-to-last dim of B)
const int64_t k = aDims[aDims.size() - 1];
const int64_t bRows = bDims[bDims.size() - 2];

if(k != bRows)
{
throw std::invalid_argument(
"Matrix dimensions incompatible for multiplication: A columns != B rows.");
}

if(k <= 0)
{
throw std::invalid_argument("Reduction dimension k must be positive.");
}

auto numberOfAccumulations = static_cast<uint64_t>(k);

// Compute infinity-norms of input matrices (max absolute row sum)
const double normA = computeMatrixInfNorm<InputType>(a);
const double normB = computeMatrixInfNorm<InputType>(b);

// Apply Higham's error bound: max_ij |error_ij| <= γ_k * ||A||_inf * ||B||_inf
auto epsilon = static_cast<double>(std::numeric_limits<ComputeType>::epsilon());
const double gamma = hipdnn_test_sdk::utilities::computeGamma(numberOfAccumulations, epsilon);

constexpr double GAMMA_MAX = 0.5;
if(gamma >= GAMMA_MAX)
{
throw std::overflow_error(
"Error growth factor gamma >= 0.5: the accumulation error exceeds 50% of the signal. "
"The computation may be numerically meaningless at this precision and reduction size.");
}

double accumulatedTolerance = gamma * normA * normB;

// Calculate input casting error
// If InputType has higher precision (smaller epsilon) than ComputeType, we lose precision
// when loading inputs (downcasting). Example: double -> float.
// If InputType has lower precision than ComputeType, no additional error (upcasting).
auto inputEpsilon = static_cast<double>(std::numeric_limits<InputType>::epsilon());
if(inputEpsilon < epsilon)
{
// Input precision is higher than compute precision, so we have casting error.
// Each element loses precision proportional to epsilon when cast.
// Worst-case error contribution from input casting:
// Error ≈ 2 * ||A||_inf * ||B||_inf * epsilon
const double castingError = 2.0 * normA * normB * epsilon;
accumulatedTolerance += castingError;
}

// Calculate output casting error
// If OutputType has lower precision (larger epsilon) than ComputeType, we lose precision
// when storing the result (downcasting). Example: float -> half.
auto outputEpsilon = static_cast<double>(std::numeric_limits<OutputType>::epsilon());
double castTolerance = 0.0;

if(outputEpsilon > epsilon)
{
// The error is bounded by the precision of the OutputType at the final accumulated value.
// Maximum possible output magnitude: ||A||_inf * ||B||_inf
const double maxPossibleOutputValue = normA * normB;
castTolerance = maxPossibleOutputValue * outputEpsilon;
}

// Total tolerance is the sum of accumulation error and casting errors
const double totalTolerance = accumulatedTolerance + castTolerance;

// Check if totalTolerance exceeds the maximum representable value of OutputType
if(totalTolerance > static_cast<double>(std::numeric_limits<OutputType>::max()))
{
throw std::overflow_error(
"Calculated tolerance exceeds the maximum representable value of the output type.");
}

return static_cast<float>(totalTolerance);
}

} // namespace hipdnn_test_sdk::utilities::matmul
Loading
Loading