Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion projects/hipblaslt/clients/bench/src/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ try
("initialization",
value<std::string>(&initialization)->default_value("hpl"),
"Initialize matrix data."
"Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01")
"Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01, integer_exact")

("transA",
value<char>(&arg.transA)->default_value('N'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@

// Matrix-data initialization schemes selectable from the client/bench CLI
// (see the "initialization" option in client.cpp).
// Enumerator values are distinct non-zero sentinels (111, 222, ...) so that
// the zero returned by string2hipblaslt_initialization for an unknown string
// is unambiguously invalid.
enum class hipblaslt_initialization
{
    rand_int      = 111,
    trig_float    = 222,
    hpl           = 333,
    special       = 444,
    zero          = 555,
    norm_dist     = 666,
    uniform_01    = 777,
    // A,C drawn from {0,1,2}; B from +/-{0,1,2}; alpha=2, beta 0 or -2.
    // Results are bit-exact versus the CPU reference when K is bounded
    // (see the K-limit check in testing_matmul).
    integer_exact = 888,
};

typedef enum class _hipblaslt_activation_type
Expand Down Expand Up @@ -212,6 +213,8 @@ constexpr auto hipblaslt_initialization2string(hipblaslt_initialization init)
return "norm_dist";
case hipblaslt_initialization::uniform_01:
return "uniform_01";
case hipblaslt_initialization::integer_exact:
return "integer_exact";
}
return "invalid";
}
Expand All @@ -233,6 +236,7 @@ inline hipblaslt_initialization string2hipblaslt_initialization(const std::strin
value == "zero" ? hipblaslt_initialization::zero :
value == "norm_dist" ? hipblaslt_initialization::norm_dist :
value == "uniform_01" ? hipblaslt_initialization::uniform_01 :
value == "integer_exact" ? hipblaslt_initialization::integer_exact :
static_cast<hipblaslt_initialization>(0);
}
// clang-format on
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,8 @@ inline void hipblaslt_init_sin(void* A,

// Initialize matrix so adjacent entries have alternating sign.
// In gemm if either A or B are initialized with alternating
// sign the reduction sum will be summing positive
// and negative numbers, so it should not get too large.
// This helps reduce floating point inaccuracies for 16bit
// arithmetic where the exponent has only 5 bits, and the
// mantissa 10 bits.
// Checkerboard ± so first element of each row and column alternates; keeps
// reduction sums from growing too large (helps 16bit with 5-bit exponent).
template <typename T>
inline void hipblaslt_init_alternating_sign(
T* A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1)
Expand Down
57 changes: 55 additions & 2 deletions projects/hipblaslt/clients/common/include/testing_matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,17 @@ void check(hipStream_t stream,
}
else
{
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
unit_print_first_mismatch(M[gemmIdx],
N[gemmIdx],
ldd[gemmIdx],
stride_d[gemmIdx],
hD_gold[gemmIdx].buf(),
hD_1[gemmIdx].buf(),
num_batches[gemmIdx],
To);
}
unit_check_general(M[gemmIdx],
N[gemmIdx],
ldd[gemmIdx],
Expand Down Expand Up @@ -1248,6 +1259,37 @@ void testing_matmul(const Arguments& arg)
hipblasltSetColdIterationsValue(arg.cold_iters);
hipblasltSetHotIterationsValue(arg.iters);

// integer_exact: the gfx11 (Navi) skip below is temporarily disabled (guarded by #if 0);
// change the guard to #if 1 to restore the skip once the experiments conclude.
// (Background: Navi GPU-vs-CPU exact match failed; general fp16 GEMM on gfx11 still uses a wider tolerance.)
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
#if 0
if(hipblaslt_get_arch_major() == 11)
{
hipblaslt_cout << "Skipping integer_exact on gfx11 (Navi)"
<< std::endl;
return;
}
#endif
const bool is_16bit = (tiA == HIP_R_16F || tiA == HIP_R_16BF);
if(is_16bit)
{
// With A in [0,2] and B in +/-[0,2], |dot| <= 4K, so alpha=2 gives |2*dot| <= 8K;
// beta=-2 adds at most 2*C. fp16 represents integers exactly only up to 2048,
// hence 8K <= 2048 => K <= 256 for the alpha/beta pairs used here.
const int32_t k_limit
= (arg.alpha == 2.0f && (arg.beta == 0.0f || arg.beta == -2.0f)) ? 256 : 512;
const int32_t gemm_count = std::max(1, arg.grouped_gemm);
for(int32_t i = 0; i < gemm_count; i++)
{
if(arg.K[i] > k_limit)
{
hipblaslt_cout << "Skipping integer_exact: 16-bit format with K=" << arg.K[i]
<< " > " << k_limit << " (exact representability limit)" << std::endl;
return;
}
}
}
}

// for all f8/bf8 cases including mix mode
if((realDataTypeSize(tiA) == 1 || realDataTypeSize(tiB) == 1) && tc != HIP_R_32I)
{
Expand Down Expand Up @@ -2041,11 +2083,12 @@ void testing_matmul_with_bias(const Arguments& arg,
lda[i],
realDataTypeSize(TiA),
do_swizzle_a));
// B is always stored as K×N in memory; use (K, N, ldb) not (B_row, B_col) to avoid row > lda when transB=T
CHECK_HIP_ERROR(synchronize(hB[i],
dB[i],
num_batches[i],
B_row[i],
B_col[i],
K[i],
N[i],
ldb[i],
realDataTypeSize(TiB),
do_swizzle_b));
Expand Down Expand Up @@ -3653,6 +3696,11 @@ void testing_matmul_with_bias(const Arguments& arg,
tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To);
}
}
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++)
tol[gemmIdx] = 0;
}

if(arg.unit_check || arg.norm_check || arg.allclose_check)
{
Expand Down Expand Up @@ -4069,6 +4117,11 @@ void testing_matmul_with_bias(const Arguments& arg,
tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To);
}
}
if(arg.initialization == hipblaslt_initialization::integer_exact)
{
for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++)
tol[gemmIdx] = 0;
}
if(arg.unit_check || arg.norm_check || arg.allclose_check)
{
if(arg.dump_matrix)
Expand Down
76 changes: 76 additions & 0 deletions projects/hipblaslt/clients/common/include/unit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,82 @@ inline int64_t unit_check_diff(
return error;
}

/*! \brief For integer_exact / debugging: scan a (possibly batched) column-major
 *  matrix pair and report, via hipblaslt_cerr, the first CPU/GPU mismatch
 *  (i, j, batch, CPU value, GPU value) and the total mismatch count.
 *
 *  Elements are compared for exact equality, except that two NaNs compare
 *  equal so NaN-producing cases are not flagged. The function only prints
 *  diagnostics; it never asserts or throws. Call it before unit_check_general
 *  when a failure is suspected to get diagnostic output.
 *
 *  \param M           number of rows
 *  \param N           number of columns
 *  \param lda         leading dimension (column stride) of both buffers
 *  \param strideA     batch stride of both buffers
 *  \param hCPU        reference (CPU) results, elements of type \p type
 *  \param hGPU        device results, same layout as \p hCPU
 *  \param batch_count number of matrices in the batch
 *  \param type        element type; only f32/f64/f16/bf16 are handled,
 *                     anything else prints an error and returns */
inline void unit_print_first_mismatch(int64_t     M,
                                      int64_t     N,
                                      int64_t     lda,
                                      int64_t     strideA,
                                      const void* hCPU,
                                      const void* hGPU,
                                      int64_t     batch_count,
                                      hipDataType type)
{
    auto diff_count = int64_t(0);

// Column-major walk over all batches. The first inequality is reported
// immediately (diff_count transitions 0 -> 1); later ones are only counted.
// EQ_EXPR must evaluate true for equal elements, including NaN == NaN.
#define FIND_FIRST_MISMATCH(T, EQ_EXPR)                                                   \
    do                                                                                    \
    {                                                                                     \
        const T* cpu = static_cast<const T*>(hCPU);                                       \
        const T* gpu = static_cast<const T*>(hGPU);                                       \
        for(int64_t k = 0; k < batch_count; k++)                                          \
            for(int64_t j = 0; j < N; j++)                                                \
                for(int64_t i = 0; i < M; i++)                                            \
                {                                                                         \
                    int64_t idx = i + j * lda + k * strideA;                              \
                    if(!(EQ_EXPR))                                                        \
                    {                                                                     \
                        diff_count++;                                                     \
                        if(diff_count == 1)                                               \
                            hipblaslt_cerr                                                \
                                << "First mismatch at (i=" << i << ", j=" << j            \
                                << ", batch=" << k                                        \
                                << "): CPU=" << static_cast<double>(cpu[idx])             \
                                << " GPU=" << static_cast<double>(gpu[idx]) << std::endl; \
                    }                                                                     \
                }                                                                         \
    } while(0)

    switch(type)
    {
    case HIP_R_32F:
        FIND_FIRST_MISMATCH(float,
                            (cpu[idx] == gpu[idx]
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_64F:
        FIND_FIRST_MISMATCH(double,
                            (cpu[idx] == gpu[idx]
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_16F:
        // 16-bit types are widened to float for the comparison.
        FIND_FIRST_MISMATCH(hipblasLtHalf,
                            (float(cpu[idx]) == float(gpu[idx])
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    case HIP_R_16BF:
        FIND_FIRST_MISMATCH(hip_bfloat16,
                            (float(cpu[idx]) == float(gpu[idx])
                             || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx]))));
        break;
    default:
        hipblaslt_cerr << "unit_print_first_mismatch: unhandled type (value="
                       << static_cast<int>(type) << ")" << std::endl;
        return;
    }
#undef FIND_FIRST_MISMATCH

    if(diff_count > 0)
        hipblaslt_cerr << "Total mismatches: " << diff_count << " (matrix " << M << "x" << N
                       << ", batch_count=" << batch_count << ")" << std::endl;
}

inline void unit_check_general(int64_t M,
int64_t N,
int64_t lda,
Expand Down
52 changes: 52 additions & 0 deletions projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,31 @@ __device__ int8_t random_int<int8_t>(size_t idx)
return pseudo_random_device(idx) % 3 + 1;
}

/*! \brief Draw a value uniformly from {0, 1, 2} for the integer_exact
 *  initialization; the draw is a pure function of \p rng_index. */
template <typename T>
__device__ T small_int_positive(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return static_cast<T>(r);
}

/*! \brief {0,1,2} draw converted through the hipblasLtHalf constructor. */
template <>
__device__ hipblasLtHalf small_int_positive<hipblasLtHalf>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return hipblasLtHalf(r);
}

/*! \brief {0,1,2} draw converted through the hip_bfloat16 constructor. */
template <>
__device__ hip_bfloat16 small_int_positive<hip_bfloat16>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return hip_bfloat16(r);
}

/*! \brief {0,1,2} draw narrowed to int8_t (always fits: values are 0..2). */
template <>
__device__ int8_t small_int_positive<int8_t>(size_t rng_index)
{
    const auto r = pseudo_random_device(rng_index) % 3;
    return static_cast<int8_t>(r);
}

/*! \brief generate a random number in HPL-like [-0.5,0.5] doubles */
template <typename T>
__device__ T random_hpl(size_t idx)
Expand Down Expand Up @@ -259,6 +284,33 @@ void hipblaslt_init_device(ABC_dims abc,
return uniform_01<T>(idx);
});
break;
case hipblaslt_initialization::integer_exact:
// A and C: [0,1,2] (C with beta); B: checkerboard ±[0,1,2]
if(abc == ABC_dims::A || abc == ABC_dims::C)
{
fill_batch(A, M, N, lda, stride, batch_count, [](size_t idx) -> T {
return small_int_positive<T>(idx);
});
}
else if(abc == ABC_dims::B)
{
// Checkerboard ±: (i^j)&1 so first element of each row and column alternates
// Use an effective stride that is never zero and at least large enough
// to contain one full matrix, to avoid division by a potentially zero stride.
// Offset PRNG index for B so {0,1,2} magnitudes differ from A (same idx would
// correlate via pseudo_random_device).
constexpr size_t kBSeedOffset = 1000003; // large prime
size_t effective_stride = stride ? std::max(stride, lda * N) : lda * N;
fill_batch(A, M, N, lda, effective_stride, batch_count, [effective_stride, lda](size_t idx) -> T {
auto b = idx / effective_stride;
auto in_batch = idx - b * effective_stride;
auto j = in_batch / lda;
auto i = in_batch - j * lda;
auto value = small_int_positive<T>(idx + kBSeedOffset);
return (i ^ j) & 1 ? value : negate(value);
});
}
break;
default:
hipblaslt_cerr << "Error type in hipblaslt_init_device" << std::endl;
break;
Expand Down
5 changes: 5 additions & 0 deletions projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Datatypes:
zero: 555
norm_dist: 666
uniform_01: 777
integer_exact: 888
- hipblaslt_activation_type:
bases: [ c_int ]
attr:
Expand Down Expand Up @@ -280,6 +281,10 @@ Real precisions f32_bf16_r: &real_precisions_intermeddiate_bf16
- &single_precision_bf16computeIn_precision
{ a_type: f32_r, b_type: f32_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_fast_bf16_r, scale_type: f32_r}

# TF32x1 only: f32 A+B with compute input type bf16. Use for integer_exact to avoid "no solution" from f16+bf16 path.
Real precisions TF32x1 only: &real_precisions_tf32x1_only
- *single_precision_bf16computeIn_precision

Real precisions f32_f8_fnuz_r: &real_precisions_intermeddiate_f8_fnuz
- &hpa_half_fp16dst_f8_fnuz_computeIn_precision
{ a_type: f16_r, b_type: f16_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, compute_input_typeA: f8_fnuz_r, compute_input_typeB: f8_fnuz_r, scale_type: f32_r}
Expand Down
Loading
Loading