diff --git a/projects/hipblaslt/clients/bench/src/client.cpp b/projects/hipblaslt/clients/bench/src/client.cpp index 6eee8871da9d..27450d485b4c 100644 --- a/projects/hipblaslt/clients/bench/src/client.cpp +++ b/projects/hipblaslt/clients/bench/src/client.cpp @@ -431,7 +431,7 @@ try ("initialization", value(&initialization)->default_value("hpl"), "Initialize matrix data." - "Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01") + "Options: rand_int, trig_float, hpl(floating), special, zero, norm_dist, uniform_01, integer_exact") ("transA", value(&arg.transA)->default_value('N'), diff --git a/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp b/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp index 994c6082ee4a..a4b8c0598c94 100644 --- a/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp +++ b/projects/hipblaslt/clients/common/include/hipblaslt_datatype2string.hpp @@ -33,13 +33,14 @@ enum class hipblaslt_initialization { - rand_int = 111, - trig_float = 222, - hpl = 333, - special = 444, - zero = 555, - norm_dist = 666, - uniform_01 = 777, + rand_int = 111, + trig_float = 222, + hpl = 333, + special = 444, + zero = 555, + norm_dist = 666, + uniform_01 = 777, + integer_exact = 888, // A,C in [0,1,2], B ±[0,1,2]; alpha=2, beta 0 or -2; exact when K bounded }; typedef enum class _hipblaslt_activation_type @@ -212,6 +213,8 @@ constexpr auto hipblaslt_initialization2string(hipblaslt_initialization init) return "norm_dist"; case hipblaslt_initialization::uniform_01: return "uniform_01"; + case hipblaslt_initialization::integer_exact: + return "integer_exact"; } return "invalid"; } @@ -233,6 +236,7 @@ inline hipblaslt_initialization string2hipblaslt_initialization(const std::strin value == "zero" ? hipblaslt_initialization::zero : value == "norm_dist" ? hipblaslt_initialization::norm_dist : value == "uniform_01" ? 
hipblaslt_initialization::uniform_01 : + value == "integer_exact" ? hipblaslt_initialization::integer_exact : static_cast<hipblaslt_initialization>(0); } // clang-format on diff --git a/projects/hipblaslt/clients/common/include/hipblaslt_init.hpp b/projects/hipblaslt/clients/common/include/hipblaslt_init.hpp index ec1b820d7131..fa28dcec47c0 100644 --- a/projects/hipblaslt/clients/common/include/hipblaslt_init.hpp +++ b/projects/hipblaslt/clients/common/include/hipblaslt_init.hpp @@ -280,11 +280,8 @@ inline void hipblaslt_init_sin(void* A, // Initialize matrix so adjacent entries have alternating sign. // In gemm if either A or B are initialized with alternating -// sign the reduction sum will be summing positive -// and negative numbers, so it should not get too large. -// This helps reduce floating point inaccuracies for 16bit -// arithmetic where the exponent has only 5 bits, and the -// mantissa 10 bits. +// Checkerboard ± so first element of each row and column alternates; keeps +// reduction sums from growing too large (helps 16bit with 5-bit exponent).
template <typename T> inline void hipblaslt_init_alternating_sign( T* A, size_t M, size_t N, size_t lda, size_t stride = 0, size_t batch_count = 1) diff --git a/projects/hipblaslt/clients/common/include/testing_matmul.hpp b/projects/hipblaslt/clients/common/include/testing_matmul.hpp index 0d8705df0811..854b6b4ffcfd 100644 --- a/projects/hipblaslt/clients/common/include/testing_matmul.hpp +++ b/projects/hipblaslt/clients/common/include/testing_matmul.hpp @@ -903,6 +903,17 @@ void check(hipStream_t stream, } else { + if(arg.initialization == hipblaslt_initialization::integer_exact) + { + unit_print_first_mismatch(M[gemmIdx], + N[gemmIdx], + ldd[gemmIdx], + stride_d[gemmIdx], + hD_gold[gemmIdx].buf(), + hD_1[gemmIdx].buf(), + num_batches[gemmIdx], + To); + } unit_check_general(M[gemmIdx], N[gemmIdx], ldd[gemmIdx], @@ -1248,6 +1259,37 @@ void testing_matmul(const Arguments& arg) hipblasltSetColdIterationsValue(arg.cold_iters); hipblasltSetHotIterationsValue(arg.iters); + // integer_exact: gfx11 skip disabled temporarily—re-enable with #if 1 after experiments. + // (Previously: Navi GPU vs CPU exact match failed; general fp16 GEMM on gfx11 still uses wider tol.) + if(arg.initialization == hipblaslt_initialization::integer_exact) + { +#if 0 + if(hipblaslt_get_arch_major() == 11) + { + hipblaslt_cout << "Skipping integer_exact on gfx11 (Navi)" + << std::endl; + return; + } +#endif + const bool is_16bit = (tiA == HIP_R_16F || tiA == HIP_R_16BF); + if(is_16bit) + { + // alpha=2: |2*dot|<=8K; beta=-2 adds 2*C. fp16 exact int ~2048 => K<=256 for both betas used + const int32_t k_limit + = (arg.alpha == 2.0f && (arg.beta == 0.0f || arg.beta == -2.0f)) ? 
256 : 512; + const int32_t gemm_count = std::max(1, arg.grouped_gemm); + for(int32_t i = 0; i < gemm_count; i++) + { + if(arg.K[i] > k_limit) + { + hipblaslt_cout << "Skipping integer_exact: 16-bit format with K=" << arg.K[i] + << " > " << k_limit << " (exact representability limit)" << std::endl; + return; + } + } + } + } + // for all f8/bf8 cases including mix mode if((realDataTypeSize(tiA) == 1 || realDataTypeSize(tiB) == 1) && tc != HIP_R_32I) { @@ -2041,11 +2083,12 @@ void testing_matmul_with_bias(const Arguments& arg, lda[i], realDataTypeSize(TiA), do_swizzle_a)); + // B is always stored as K×N in memory; use (K, N, ldb) not (B_row, B_col) to avoid row > lda when transB=T CHECK_HIP_ERROR(synchronize(hB[i], dB[i], num_batches[i], - B_row[i], - B_col[i], + K[i], + N[i], ldb[i], realDataTypeSize(TiB), do_swizzle_b)); @@ -3653,6 +3696,11 @@ void testing_matmul_with_bias(const Arguments& arg, tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To); } } + if(arg.initialization == hipblaslt_initialization::integer_exact) + { + for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++) + tol[gemmIdx] = 0; + } if(arg.unit_check || arg.norm_check || arg.allclose_check) { @@ -4069,6 +4117,11 @@ void testing_matmul_with_bias(const Arguments& arg, tol[gemmIdx] = K[gemmIdx] * sum_error_tolerance_for_gfx11_type(Tc, TiA, To); } } + if(arg.initialization == hipblaslt_initialization::integer_exact) + { + for(int gemmIdx = 0; gemmIdx < gemm_count; gemmIdx++) + tol[gemmIdx] = 0; + } if(arg.unit_check || arg.norm_check || arg.allclose_check) { if(arg.dump_matrix) diff --git a/projects/hipblaslt/clients/common/include/unit.hpp b/projects/hipblaslt/clients/common/include/unit.hpp index b5aab34262fc..4f1fdb0bbe97 100644 --- a/projects/hipblaslt/clients/common/include/unit.hpp +++ b/projects/hipblaslt/clients/common/include/unit.hpp @@ -553,6 +553,82 @@ inline int64_t unit_check_diff( return error; } +/*! 
\brief For integer_exact / debugging: print first mismatch (i, j, batch, CPU, GPU) and total error count. + * Call before unit_check_general when a failure is suspected to get diagnostic output. */ +inline void unit_print_first_mismatch(int64_t M, + int64_t N, + int64_t lda, + int64_t strideA, + const void* hCPU, + const void* hGPU, + int64_t batch_count, + hipDataType type) +{ + auto diff_count = int64_t(0); + auto first_i = int64_t(-1), first_j = int64_t(-1), first_b = int64_t(-1); + auto found = false; + +#define FIND_FIRST_MISMATCH(T, EQ_EXPR) \ + do \ + { \ + const T* cpu = static_cast<const T*>(hCPU); \ + const T* gpu = static_cast<const T*>(hGPU); \ + for(int64_t k = 0; k < batch_count; k++) \ + for(int64_t j = 0; j < N; j++) \ + for(int64_t i = 0; i < M; i++) \ + { \ + int64_t idx = i + j * lda + k * strideA; \ + if(!(EQ_EXPR)) \ + { \ + diff_count++; \ + if(!found) \ + { \ + first_i = i; \ + first_j = j; \ + first_b = k; \ + found = true; \ + hipblaslt_cerr << "First mismatch at (i=" << i << ", j=" << j \ + << ", batch=" << k << "): CPU=" << static_cast<double>(cpu[idx]) \ + << " GPU=" << static_cast<double>(gpu[idx]) << std::endl; \ + } \ + } \ + } \ + } while(0) + + switch(type) + { + case HIP_R_32F: + FIND_FIRST_MISMATCH(float, + (cpu[idx] == gpu[idx] + || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx])))); + break; + case HIP_R_64F: + FIND_FIRST_MISMATCH(double, + (cpu[idx] == gpu[idx] + || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx])))); + break; + case HIP_R_16F: + FIND_FIRST_MISMATCH(hipblasLtHalf, + (float(cpu[idx]) == float(gpu[idx]) + || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx])))); + break; + case HIP_R_16BF: + FIND_FIRST_MISMATCH(hip_bfloat16, + (float(cpu[idx]) == float(gpu[idx]) + || (hipblaslt_isnan(cpu[idx]) && hipblaslt_isnan(gpu[idx])))); + break; + default: + hipblaslt_cerr << "unit_print_first_mismatch: unhandled type (value=" + << static_cast<int>(type) << ")" << std::endl; + return; + } +#undef FIND_FIRST_MISMATCH + + if(diff_count > 0) + 
hipblaslt_cerr << "Total mismatches: " << diff_count << " (matrix " << M << "x" << N + << ", batch_count=" << batch_count << ")" << std::endl; +} + inline void unit_check_general(int64_t M, int64_t N, int64_t lda, diff --git a/projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp b/projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp index 8d9573e626c1..bece13d48631 100644 --- a/projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp +++ b/projects/hipblaslt/clients/common/src/hipblaslt_init_device.cpp @@ -95,6 +95,31 @@ __device__ int8_t random_int(size_t idx) { return pseudo_random_device(idx) % 3 + 1; } +/*! \brief generate a random number in range [0, 1, 2] for integer_exact init */ +template <typename T> +__device__ T small_int_positive(size_t idx) +{ + return T(pseudo_random_device(idx) % 3); +} + +template <> +__device__ hipblasLtHalf small_int_positive<hipblasLtHalf>(size_t idx) +{ + return hipblasLtHalf(pseudo_random_device(idx) % 3); +} + +template <> +__device__ hip_bfloat16 small_int_positive<hip_bfloat16>(size_t idx) +{ + return hip_bfloat16(pseudo_random_device(idx) % 3); +} + +template <> +__device__ int8_t small_int_positive<int8_t>(size_t idx) +{ + return static_cast<int8_t>(pseudo_random_device(idx) % 3); +} + /*! \brief generate a random number in HPL-like [-0.5,0.5] doubles */ template <typename T> __device__ T random_hpl(size_t idx) @@ -259,6 +284,33 @@ void hipblaslt_init_device(ABC_dims abc, return uniform_01<T>(idx); }); break; + case hipblaslt_initialization::integer_exact: + // A and C: [0,1,2] (C with beta); B: checkerboard ±[0,1,2] + if(abc == ABC_dims::A || abc == ABC_dims::C) + { + fill_batch(A, M, N, lda, stride, batch_count, [](size_t idx) -> T { + return small_int_positive<T>(idx); + }); + } + else if(abc == ABC_dims::B) + { + // Checkerboard ±: (i^j)&1 so first element of each row and column alternates + // Use an effective stride that is never zero and at least large enough + // to contain one full matrix, to avoid division by a potentially zero stride. 
+ // Offset PRNG index for B so {0,1,2} magnitudes differ from A (same idx would + // correlate via pseudo_random_device). + constexpr size_t kBSeedOffset = 1000003; // large prime + size_t effective_stride = stride ? std::max(stride, lda * N) : lda * N; + fill_batch(A, M, N, lda, effective_stride, batch_count, [effective_stride, lda](size_t idx) -> T { + auto b = idx / effective_stride; + auto in_batch = idx - b * effective_stride; + auto j = in_batch / lda; + auto i = in_batch - j * lda; + auto value = small_int_positive<T>(idx + kBSeedOffset); + return (i ^ j) & 1 ? value : negate(value); + }); + } + break; default: hipblaslt_cerr << "Error type in hipblaslt_init_device" << std::endl; break; diff --git a/projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml b/projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml index 8d68d8e4b4e1..a7bd949d160e 100644 --- a/projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml +++ b/projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml @@ -44,6 +44,7 @@ Datatypes: zero: 555 norm_dist: 666 uniform_01: 777 + integer_exact: 888 - hipblaslt_activation_type: bases: [ c_int ] attr: @@ -280,6 +281,10 @@ Real precisions f32_bf16_r: &real_precisions_intermeddiate_bf16 - &single_precision_bf16computeIn_precision { a_type: f32_r, b_type: f32_r, c_type: f32_r, d_type: f32_r, compute_type: c_f32_fast_bf16_r, scale_type: f32_r} +# TF32x1 only: f32 A+B with compute input type bf16. Use for integer_exact to avoid "no solution" from f16+bf16 path. 
+Real precisions TF32x1 only: &real_precisions_tf32x1_only + - *single_precision_bf16computeIn_precision + Real precisions f32_f8_fnuz_r: &real_precisions_intermeddiate_f8_fnuz - &hpa_half_fp16dst_f8_fnuz_computeIn_precision { a_type: f16_r, b_type: f16_r, c_type: f16_r, d_type: f16_r, compute_type: c_f32_r, compute_input_typeA: f8_fnuz_r, compute_input_typeB: f8_fnuz_r, scale_type: f32_r} diff --git a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml index d99c5dcb62d4..e01fd508cb87 100755 --- a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml +++ b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml @@ -23,6 +23,10 @@ Definitions: - { alpha: 1, beta: 0 } - { alpha: 1, beta: 1 } + - &integer_exact_alpha_beta + - { alpha: 2.0, beta: 0.0 } + - { alpha: 2.0, beta: -2.0 } + - &deepbench_transA_transB_range - { transA: N, transB: N } - { transA: N, transB: T } @@ -81,6 +85,114 @@ Tests: alpha_beta: *alpha_beta_range c_equal_d: [false, true] +# Integer-exact: A,C in [0,1,2], B ±[0,1,2]; alpha=2, beta in {0,-2} (D = 2 A*B or 2 A*B - 2 C). Exact GPU vs CPU. +# integer_exact: gfx11 runtime skip is off in testing_matmul.hpp (#if 0); turn back on after experiments. +# TF32x1 (f32 A+B, compute input bf16) runs only on gfx950 where solutions exist (see matmul_integer_exact_tf32x1_gfx950 below). 
+- name: matmul_integer_exact_one + category: quick + function: + matmul: *real_precisions + matrix_size: *one_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + +- name: matmul_integer_exact_small + category: quick + function: + matmul: *real_precisions + matrix_size: *small_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + +- name: matmul_integer_exact_medium + category: pre_checkin + function: + matmul: *real_precisions + matrix_size: *medium_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + +- name: matmul_integer_exact_batch_medium + category: pre_checkin + function: + matmul: *real_precisions + matrix_size: *medium_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + batch_count: 10 + initialization: integer_exact + unit_check: 1 + +- name: matmul_integer_exact_chunk + category: pre_checkin + function: + matmul: *real_precisions + matrix_size: *chunk_matrix_size_range + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + +- name: matmul_integer_exact_grid_limit_real + category: nightly + function: + matmul: *real_precisions + matrix_size: *grid_limit_matrix_size_real + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + gpu_arch: '120[0-1]' + +- name: matmul_integer_exact_grid_limit_double + category: nightly + function: + matmul: *double_precision + matrix_size: *grid_limit_matrix_size_double + transA_transB: *transA_transB_range + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + api_method: [0, 2] + gpu_arch: '120[0-1]' + +- name: 
matmul_integer_exact_deepbench + category: nightly + function: + - matmul: *real_precisions + - matmul: *real_precisions_gemm_only + matrix_size: *deepbench_sizes + alpha_beta: *integer_exact_alpha_beta + transA_transB: *deepbench_transA_transB_range + initialization: integer_exact + unit_check: 1 + +# Integer-exact + TF32x1 (f32 A+B, compute input bf16). gfx950 only. Sizes match matmul_gemm_f32_fast_bf16. +# Omit TT: TF32x1 often has no solution for both-transposed on gfx950 (CI saw 21× NO solution, all TT). +- name: matmul_integer_exact_tf32x1_gfx950 + category: quick + function: + matmul: *real_precisions_tf32x1_only + matrix_size: + - { M: 128, N: 128, K: 128 } + - { M: 131, N: 131, K: 131 } + - { M: 1024, N: 1024, K: 1024 } + - { M: 1031, N: 1031, K: 1031 } + transA_transB: + - { transA: N, transB: N } + - { transA: N, transB: T } + - { transA: T, transB: N } + alpha_beta: *integer_exact_alpha_beta + initialization: integer_exact + unit_check: 1 + gpu_arch: '950' + - name: matmul_conj_small category: quick function: