Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion projects/hipblaslt/clients/bench/src/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ try
("initialization",
value<std::string>(&initialization)->default_value("hpl"),
"Initialize matrix data."
"Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01")
"Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01, integer_exact")

("transA",
value<char>(&arg.transA)->default_value('N'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@

// Matrix-data initialization schemes selectable from the client/bench CLI
// (see the "initialization" option in client.cpp).
// Enumerator values are distinct non-zero sentinels (111, 222, ...) so that
// the zero returned by string2hipblaslt_initialization for an unknown string
// is unambiguously invalid.
enum class hipblaslt_initialization
{
    rand_int      = 111,
    trig_float    = 222,
    hpl           = 333,
    special       = 444,
    zero          = 555,
    norm_dist     = 666,
    uniform_01    = 777,
    // A,C drawn from {0,1,2}; B from +/-{0,1,2}; alpha=2, beta 0 or -2.
    // Results are bit-exact versus the CPU reference when K is bounded
    // (see the K-limit check in testing_matmul).
    integer_exact = 888,
};

typedef enum class _hipblaslt_activation_type
Expand Down Expand Up @@ -212,6 +213,8 @@ constexpr auto hipblaslt_initialization2string(hipblaslt_initialization init)
return "norm_dist";
case hipblaslt_initialization::uniform_01:
return "uniform_01";
case hipblaslt_initialization::integer_exact:
return "integer_exact";
}
return "invalid";
}
Expand All @@ -233,6 +236,7 @@ inline hipblaslt_initialization string2hipblaslt_initialization(const std::strin
value == "zero" ? hipblaslt_initialization::zero :
value == "norm_dist" ? hipblaslt_initialization::norm_dist :
value == "uniform_01" ? hipblaslt_initialization::uniform_01 :
value == "integer_exact" ? hipblaslt_initialization::integer_exact :
static_cast<hipblaslt_initialization>(0);
}
// clang-format on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,8 @@ inline void hipblaslt_init_sin(void* A,

// Initialize matrix so adjacent entries have alternating sign.
// In gemm if either A or B are initialized with alternating
// sign the reduction sum will be summing positive
// and negative numbers, so it should not get too large.
// This helps reduce floating point inaccuracies for 16bit
// arithmetic where the exponent has only 5 bits, and the
// mantissa 10 bits.
// Checkerboard ± so first element of each row and column alternates; keeps
// reduction sums from growing too large (helps 16bit with 5-bit exponent).
template <typename T>
inline void hipblaslt_init_alternating_sign(
T* A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1)
Expand Down
57 changes: 55 additions & 2 deletions projects/hipblaslt/clients/common/include/testing_matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,17 @@ void check(hipStream_t stream,
}
else
{
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
unit_print_first_mismatch(M[gemmIdx],
N[gemmIdx],
ldd[gemmIdx],
stride_d[gemmIdx],
hD_gold[gemmIdx].buf(),
hD_1[gemmIdx].buf(),
num_batches[gemmIdx],
To);
}
unit_check_general(M[gemmIdx],
N[gemmIdx],
ldd[gemmIdx],
Expand Down Expand Up @@ -1248,6 +1259,37 @@ void testing_matmul(const Arguments& arg)
hipblasltSetColdIterationsValue(arg.cold_iters);
hipblasltSetHotIterationsValue(arg.iters);

// integer_exact: the gfx11 (Navi) skip below is temporarily disabled (guarded by #if 0);
// change the guard to #if 1 to restore the skip once the experiments conclude.
// (Background: Navi GPU-vs-CPU exact match failed; general fp16 GEMM on gfx11 still uses a wider tolerance.)
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
#if 0
if(hipblaslt_get_arch_major() == 11)
{
hipblaslt_cout << "Skipping integer_exact on gfx11 (Navi)"
<< std::endl;
return;
}
#endif
const bool is_16bit = (tiA == HIP_R_16F || tiA == HIP_R_16BF);
if(is_16bit)
{
// With A in [0,2] and B in +/-[0,2], |dot| <= 4K, so alpha=2 gives |2*dot| <= 8K;
// beta=-2 adds at most 2*C. fp16 represents integers exactly only up to 2048,
// hence 8K <= 2048 => K <= 256 for the alpha/beta pairs used here.
const int32_t k_limit
= (arg.alpha == 2.0f && (arg.beta == 0.0f || arg.beta == -2.0f)) ? 256 : 512;
const int32_t gemm_count = std::max(1, arg.grouped_gemm);
for(int32_t i = 0; i < gemm_count; i++)
{
if(arg.K[i] > k_limit)
{
hipblaslt_cout << "Skipping integer_exact: 16-bit format with K=" << arg.K[i]
<< " > " << k_limit << " (exact representability limit)" << std::endl;
return;
}
}
}
}

// for all f8/bf8 cases including mix mode
if((realDataTypeSize(tiA) == 1 || realDataTypeSize(tiB) == 1) && tc != HIP_R_32I)
{
Expand Down Expand Up @@ -2041,11 +2083,12 @@ void testing_matmul_with_bias(const Arguments& arg,
lda[i],
realDataTypeSize(TiA),
do_swizzle_a));
// B is always stored as K×N in memory; use (K, N, ldb) not (B_row, B_col) to avoid row > lda when transB=T
CHECK_HIP_ERROR(synchronize(hB[i],
dB[i],
num_batches[i],
B_row[i],
B_col[i],
K[i],
N[i],
ldb[i],
realDataTypeSize(TiB),
do_swizzle_b));
Expand Down Expand Up @@ -3653,6 +3696,11 @@ void testing_matmul_with_bias(const Arguments& arg,
tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To);
}
}
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++)
tol[gemmIdx] = 0;
}

if(arg.unit_check || arg.norm_check || arg.allclose_check)
{
Expand Down Expand Up @@ -4069,6 +4117,11 @@ void testing_matmul_with_bias(const Arguments& arg,
tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To);
}
}
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++)
tol[gemmIdx] = 0;
}
if(arg.unit_check || arg.norm_check || arg.allclose_check)
{
if(arg.dump_matrix)
Expand Down
76 changes: 76 additions & 0 deletions projects/hipblaslt/clients/common/include/unit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,82 @@ inline int64_t unit_check_diff(
return error;
}

/*! \brief For integer_exact / debugging: scan a (possibly batched) column-major
 *  matrix pair and report, via hipblaslt_cerr, the first CPU/GPU mismatch
 *  (i, j, batch, CPU value, GPU value) and the total mismatch count.
 *
 *  Elements are compared for exact equality, except that two NaNs compare
 *  equal so NaN-producing cases are not flagged. The function only prints
 *  diagnostics; it never asserts or throws. Call it before unit_check_general
 *  when a failure is suspected to get diagnostic output.
 *
 *  \param M           number of rows
 *  \param N           number of columns
 *  \param lda         leading dimension (column stride) of both buffers
 *  \param strideA     batch stride of both buffers
 *  \param hCPU        reference (CPU) results, elements of type \p type
 *  \param hGPU        device results, same layout as \p hCPU
 *  \param batch_count number of matrices in the batch
 *  \param type        element type; only f32/f64/f16/bf16 are handled,
 *                     anything else prints an error and returns */
inline void unit_print_first_mismatch(int64_t     M,
                                      int64_t     N,
                                      int64_t     lda,
                                      int64_t     strideA,
                                      const void* hCPU,
                                      const void* hGPU,
                                      int64_t     batch_count,
                                      hipDataType type)
{
    auto diff_count = int64_t(0);

// Column-major walk over all batches. The first inequality is reported
// immediately (diff_count transitions 0 -> 1); later ones are only counted.
// EQ_EXPR must evaluate true for equal elements, including NaN == NaN.
#define FIND_FIRST_MISMATCH(T, EQ_EXPR)                                                   \
    do                                                                                    \
    {                                                                                     \
        const T* cpu = static_cast<const T*>(hCPU);                                       \
        const T* gpu = static_cast<const T*>(hGPU);                                       \
        for(int64_t k = 0; k < batch_count; k++)                                          \
            for(int64_t j = 0; j < N; j++)                                                \
                for(int64_t i = 0; i < M; i++)                                            \
                {                                                                         \
                    int64_t idx = i + j * lda + k * strideA;                              \
                    if(!(EQ_EXPR))                                                        \
                    {                                                                     \
                        diff_count++;                                                     \
                        if(diff_count == 1)                                               \
                            hipblaslt_cerr                                                \
                                << "First mismatch at (i=" << i << ", j=" << j            \
                                << ", batch=" << k                                        \
                                << "): CPU=" << static_cast<double>(cpu[idx])             \
                                << " GPU=" << static_cast<double>(gpu[idx]) << std::endl; \
                    }                                                                     \
                }                                                                         \
    } while(0)

    switch(type)
    {
    case HIP_R_32F:
        FIND_FIRST_MISMATCH(float,
                            (cpu[idx] == gpu[idx]
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_64F:
        FIND_FIRST_MISMATCH(double,
                            (cpu[idx] == gpu[idx]
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_16F:
        // 16-bit types are widened to float for the comparison.
        FIND_FIRST_MISMATCH(hipblasLtHalf,
                            (float(cpu[idx]) == float(gpu[idx])
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_16BF:
        FIND_FIRST_MISMATCH(hip_bfloat16,
                            (float(cpu[idx]) == float(gpu[idx])
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    default:
        hipblaslt_cerr << "unit_print_first_mismatch: unhandled type (value="
                       << static_cast<int>(type) << ")" << std::endl;
        return;
    }
#undef FIND_FIRST_MISMATCH

    if(diff_count > 0)
        hipblaslt_cerr << "Total mismatches: " << diff_count << " (matrix " << M << "x" << N
                       << ", batch_count=" << batch_count << ")" << std::endl;
}

inline void unit_check_general(int64_t M,
int64_t N,
int64_t lda,
Expand Down
52 changes: 52 additions & 0 deletions projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,31 @@ __device__ int8_t random_int<int8_t>(size_t idx)
return pseudo_random_device(idx) % 3 + 1;
}

/*! \brief Draw a value uniformly from {0, 1, 2} for the integer_exact
 *  initialization; the draw is a pure function of \p rng_index. */
template <typename T>
__device__ T small_int_positive(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return static_cast<T>(r);
}

/*! \brief {0,1,2} draw converted through the hipblasLtHalf constructor. */
template <>
__device__ hipblasLtHalf small_int_positive<hipblasLtHalf>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return hipblasLtHalf(r);
}

/*! \brief {0,1,2} draw converted through the hip_bfloat16 constructor. */
template <>
__device__ hip_bfloat16 small_int_positive<hip_bfloat16>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return hip_bfloat16(r);
}

/*! \brief {0,1,2} draw narrowed to int8_t (always fits: values are 0..2). */
template <>
__device__ int8_t small_int_positive<int8_t>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return static_cast<int8_t>(r);
}

/*! \brief generate a random number in HPL-like [-0.5,0.5] doubles */
template <typename T>
__device__ T random_hpl(size_t idx)
Expand Down Expand Up @@ -259,6 +284,33 @@ void hipblaslt_init_device(ABC_dims abc,
return uniform_01<T>(idx);
});
break;
case hipblaslt_initialization::integer_exact:
// A and C: [0,1,2] (C with beta); B: checkerboard ±[0,1,2]
if(abc == ABC_dims::A || abc == ABC_dims::C)
{
fill_batch(A, M, N, lda, stride, batch_count, [](size_t idx) -> T {
return small_int_positive<T>(idx);
});
}
else if(abc == ABC_dims::B)
{
// Checkerboard ±: (i^j)&1 so first element of each row and column alternates
// Use an effective stride that is never zero and at least large enough
// to contain one full matrix, to avoid division by a potentially zero stride.
// Offset PRNG index for B so {0,1,2} magnitudes differ from A (same idx would
// correlate via pseudo_random_device).
constexpr size_t kBSeedOffset = 1000003; // large prime
size_t effective_stride = stride ? std::max(stride, lda * N) : lda * N;
fill_batch(A, M, N, lda, effective_stride, batch_count, [effective_stride, lda](size_t idx) -> T {
auto b = idx / effective_stride;
auto in_batch = idx - b * effective_stride;
auto j = in_batch / lda;
auto i = in_batch - j * lda;
auto value = small_int_positive<T>(idx + kBSeedOffset);
return (i ^ j) & 1 ? value : negate(value);
});
}
break;
default:
hipblaslt_cerr << "Error type in hipblaslt_init_device" << std::endl;
break;
Expand Down
5 changes: 5 additions & 0 deletions projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Datatypes:
zero: 555
norm_dist: 666
uniform_01: 777
integer_exact: 888
- hipblaslt_activation_type:
bases: [ c_int ]
attr:
Expand Down Expand Up @@ -280,6 +281,10 @@ Real precisions f32_bf16_r: &real_precisions_intermeddiate_bf16
- &single_precision_bf16computeIn_precision
{ a_type: f32_r, b_type: f32_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_fast_bf16_r, scale_type: f32_r}

# TF32x1 only: f32 A+B with compute input type bf16. Use for integer_exact to avoid "no solution" from f16+bf16 path.
Real precisions TF32x1 only: &real_precisions_tf32x1_only
- *single_precision_bf16computeIn_precision

Real precisions f32_f8_fnuz_r: &real_precisions_intermeddiate_f8_fnuz
- &hpa_half_fp16dst_f8_fnuz_computeIn_precision
{ a_type: f16_r, b_type: f16_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, compute_input_typeA: f8_fnuz_r, compute_input_typeB: f8_fnuz_r, scale_type: f32_r}
Expand Down
Loading
Loading