包含以下内容:
- hgemv_k32_f16_kernel
- hgemv_k128_f16x4_kernel
- hgemv_k16_f16_kernel
- hgemv_f16_cute_kernel
- hgemv_f16x8_cute_kernel
- hgemv_tensor_core_cute_kernel
- PyTorch bindings
# 只测试 Ada 架构;若不指定,则默认编译所有架构(Volta, Ampere, Ada, Hopper, ...),耗时较长
export TORCH_CUDA_ARCH_LIST=Ada
python3 hgemv.py 输出:
--------------------------------------------------------------------------------
out_k32f16: [15.609375, 2.15234375, -10.9296875], time:0.00324011ms
out_k128f16x4: [15.609375, 2.15625, -10.9296875], time:0.00322700ms
out_hgemv_f16_cute: [15.609375, 2.15234375, -10.9296875], time:0.00318646ms
out_hgemv_f16x8_cute: [15.609375, 2.16015625, -10.9375], time:0.00323176ms
out_hgemv_tensor_core_cute: [15.6171875, 2.15625, -10.9375], time:0.00531912ms
out_f16_th: [15.6171875, 2.15429688, -10.9375], time:0.00889659ms
--------------------------------------------------------------------------------
out_k16f16: [-6.69140625, -7.2265625, -6.4921875], time:0.00339985ms
out_hgemv_f16_cute: [-6.69140625, -7.2265625, -6.4921875], time:0.00323296ms
out_hgemv_f16x8_cute: [-6.6875, -7.2265625, -6.4921875], time:0.00319839ms
out_hgemv_tensor_core_cute: [-6.6875, -7.22265625, -6.4921875], time:0.00305891ms
out_f16_th: [-6.69140625, -7.2265625, -6.4921875], time:0.00872254ms
--------------------------------------------------------------------------------