Skip to content

Commit e77f8f8

Browse files
committed
Update Something?
1 parent 7abdec4 commit e77f8f8

File tree

9 files changed

+136
-36
lines changed

9 files changed

+136
-36
lines changed

configs/lib-tests.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
OpTest:
22
VecAdd:
3-
- nElems: 32
43
- nElems: 1024
5-
- nElems: 2048
6-
- nElems: 1048576 # 1024*1024
7-
- nElems: 8388608 # 1024*1024
4+
- nElems: 8388608 # 8*1024*1024
85
Conv2D:
96
- inputHeight: 32
107
inputWidth: 32

csrc/cmake/compilers/cuda-compiler-configs.cmake

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,5 @@ set(CMAKE_CUDA_ARCHITECTURES native)
2323
log_info("CMAKE_CUDA_STANDARD: ${CMAKE_CUDA_STANDARD}")
2424

2525
string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr")
26-
string(APPEND CMAKE_CUDA_FLAGS_RELEASE " -O3")
27-
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -O3 -lineinfo")
26+
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -lineinfo")
2827
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")

csrc/cmake/compilers/cxx-compiler-configs.cmake

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,26 @@ log_info("CMAKE_CXX_STANDARD: ${CMAKE_CXX_STANDARD}")
2727
log_info("CMAKE_CXX_SCAN_FOR_MODULES: ${CMAKE_CXX_SCAN_FOR_MODULES}")
2828
log_info("STACK_SIZE: ${STACK_SIZE}")
2929

30-
# Compiler flags for MSVC
30+
# MSVC ----------------------------------------------------------------------------------------------------------------
3131
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
3232
string(APPEND CMAKE_CXX_FLAGS " /permissive- /Zc:forScope /openmp /Zc:__cplusplus")
33-
string(APPEND CMAKE_CXX_FLAGS_RELEASE " /O2")
34-
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " /O2 /Zi")
35-
string(APPEND CMAKE_CXX_FLAGS_DEBUG " /Zi")
36-
# Set stack size
3733
string(APPEND CMAKE_EXE_LINKER_FLAGS " /STACK:${STACK_SIZE}")
38-
# Compiler flags for Clang
34+
# Clang ---------------------------------------------------------------------------------------------------------------
3935
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
4036
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
41-
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
42-
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
43-
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
44-
# Set stack size
4537
if (WIN32)
4638
string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-stack,${STACK_SIZE}")
4739
else()
4840
string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-zstack-size=${STACK_SIZE}")
4941
endif()
50-
# Compiler flags for GNU
42+
# GNU -----------------------------------------------------------------------------------------------------------------
5143
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
5244
string(APPEND CMAKE_CXX_FLAGS " -fopenmp -Wall -Wextra -Werror")
53-
string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O3")
54-
string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g")
55-
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g")
56-
# Set stack size
5745
if (WIN32)
5846
string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,--stack,${STACK_SIZE}")
5947
else()
6048
string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-zstack-size=${STACK_SIZE}")
6149
endif()
62-
# [TODO] @jamesnulliu
63-
# Support more compilers
6450
else()
6551
log_fatal("Unsupported compiler")
6652
endif()

csrc/include/pmpp/pch.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <torch/python.h>
66
#include <torch/torch.h>
77
#include <type_traits>
8+
#include <format>
89

910
#include "pmpp/system.hpp"
1011
#include "pmpp/types/cu_types.cuh"

csrc/include/pmpp/utils/common.cuh

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,22 @@
1616
do { \
1717
cudaError_t err_ = (err); \
1818
if (err_ != cudaSuccess) { \
19-
::fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\"", \
19+
::fprintf(stderr, \
20+
"CUDA error at %s:%d; Error code: %d(%s) \"%s\"", \
2021
__FILE__, __LINE__, err, \
2122
::cudaGetErrorString(err_), #err); \
2223
::cudaDeviceReset(); \
23-
throw ::std::runtime_error("CUDA error"); \
24+
::std::abort(); \
2425
} \
2526
} while (0)
27+
28+
#define PMPP_ABORT(msg) \
29+
do { \
30+
::fprintf(stderr, "Abort at %s:%d \"%s\"", __FILE__, __LINE__, \
31+
msg); \
32+
::cudaDeviceReset(); \
33+
::std::abort(); \
34+
} while (0)
2635
#endif
2736

2837
#ifdef PMPP_DEBUG_CUDA_ERR_CHECK

csrc/lib/ops/torch_bind.cu

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit?tab=t.0#heading=h.fu2gkc7w0nrc
88
TORCH_LIBRARY(pmpp, m)
99
{
10-
m.def("vector_add(Tensor a, Tensor b) -> Tensor");
10+
m.def("vector_add_v0(Tensor a, Tensor b) -> Tensor");
11+
m.def("vector_add_v1(Tensor a, Tensor b) -> Tensor");
12+
m.def("vector_add_v2(Tensor a, Tensor b) -> Tensor");
1113
m.def("cvt_rgb_to_gray(Tensor img) -> Tensor");
1214
m.def("matmul(Tensor A, Tensor B) -> Tensor");
1315
m.def("conv2d(Tensor input, Tensor kernel) -> Tensor");
@@ -21,7 +23,9 @@ TORCH_LIBRARY(pmpp, m)
2123
// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit?tab=t.0#heading=h.jc288bcufw9a
2224
TORCH_LIBRARY_IMPL(pmpp, CPU, m)
2325
{
24-
m.impl("vector_add", &pmpp::ops::cpu::torch_impl::vectorAdd);
26+
m.impl("vector_add_v0", &pmpp::ops::cpu::torch_impl::vectorAdd);
27+
m.impl("vector_add_v1", &pmpp::ops::cpu::torch_impl::vectorAdd);
28+
m.impl("vector_add_v2", &pmpp::ops::cpu::torch_impl::vectorAdd);
2529
m.impl("cvt_rgb_to_gray", &pmpp::ops::cpu::torch_impl::cvtRGBtoGray);
2630
m.impl("matmul", &pmpp::ops::cpu::torch_impl::matmul);
2731
m.impl("conv2d", &pmpp::ops::cpu::torch_impl::conv2d);
@@ -33,7 +37,9 @@ TORCH_LIBRARY_IMPL(pmpp, CPU, m)
3337

3438
TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
3539
{
36-
m.impl("vector_add", &pmpp::ops::cuda::torch_impl::vectorAdd);
40+
m.impl("vector_add_v0", &pmpp::ops::cuda::torch_impl::vectorAdd<0>);
41+
m.impl("vector_add_v1", &pmpp::ops::cuda::torch_impl::vectorAdd<1>);
42+
m.impl("vector_add_v2", &pmpp::ops::cuda::torch_impl::vectorAdd<2>);
3743
m.impl("cvt_rgb_to_gray", &pmpp::ops::cuda::torch_impl::cvtRGBtoGray);
3844
m.impl("matmul", &pmpp::ops::cuda::torch_impl::matmul);
3945
m.impl("conv2d", &pmpp::ops::cuda::torch_impl::conv2d);

csrc/lib/ops/vecAdd/op.cuh

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,70 @@
66

77
namespace pmpp::ops::cuda
88
{
9-
__global__ void vecAddKernel(const fp32_t* a, const fp32_t* b, fp32_t* c,
10-
int32_t n)
9+
__global__ void vecAddKernelv0(const fp32_t* a, const fp32_t* b, fp32_t* c,
10+
int32_t n)
1111
{
1212

1313
int gtid = threadIdx.x + blockDim.x * blockIdx.x;
1414
if (gtid < n) {
15-
// [GM] 2 load, 1 store, 3 inst
15+
// [DRAM] 2 load, 1 store, 3 inst
1616
c[gtid] = a[gtid] + b[gtid];
1717
}
1818
}
1919

20+
__global__ void vecAddKernelv1(const fp32_t* a, const fp32_t* b, fp32_t* c,
21+
int32_t n)
22+
{
23+
24+
int gtid = threadIdx.x + blockDim.x * blockIdx.x;
25+
gtid = gtid % 2 == 0 ? gtid + 1 : gtid - 1;
26+
if (gtid < n) {
27+
// [DRAM] 2 load, 1 store, 3 inst
28+
c[gtid] = a[gtid] + b[gtid];
29+
}
30+
}
31+
32+
__global__ void vecAddKernelv2(const fp32_t* a, const fp32_t* b, fp32_t* c,
33+
int32_t n)
34+
{
35+
36+
int gtid = threadIdx.x + blockDim.x * blockIdx.x + 1;
37+
if (gtid < n) {
38+
// [DRAM] 2 load, 1 store, 3 inst
39+
c[gtid] = a[gtid] + b[gtid];
40+
}
41+
}
42+
43+
template <uint8_t VERSION = 0>
2044
void launchVecAdd(const fp32_t* d_A, const fp32_t* d_B, fp32_t* d_C, size_t n)
2145
{
2246
dim3 blockSize = {std::min<uint32_t>(n, 1024), 1, 1};
2347
dim3 gridSize = {ceilDiv<uint32_t>(n, blockSize.x), 1, 1};
2448

25-
vecAddKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, int32_t(n));
49+
if constexpr (VERSION == 0) {
50+
vecAddKernelv0<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
51+
} else if (VERSION == 1) {
52+
vecAddKernelv1<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
53+
} else if (VERSION == 2) {
54+
vecAddKernelv2<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
55+
} else {
56+
PMPP_ABORT(std::format("Unsupported version: {}", VERSION).c_str());
57+
}
2658

2759
PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
2860
}
2961

3062
namespace torch_impl
3163
{
64+
template <uint8_t VERSION = 0>
3265
inline auto vectorAdd(const torch::Tensor& A, const torch::Tensor& B)
3366
-> torch::Tensor
3467
{
3568
torch::Tensor C = torch::empty_like(A);
3669

3770
switch (A.scalar_type()) {
3871
case torch::kFloat32: {
39-
pmpp::ops::cuda::launchVecAdd(
72+
pmpp::ops::cuda::launchVecAdd<VERSION>(
4073
A.data_ptr<fp32_t>(), B.data_ptr<fp32_t>(), C.data_ptr<fp32_t>(),
4174
A.flatten().size(0));
4275
break;

csrc/test/OpTest/vecAdd.cpp

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,13 @@ namespace f = torch::nn::functional;
99
namespace pmpp::test::ops
1010
{
1111

12-
TEST_F(OpTest, VecAdd)
12+
TEST_F(OpTest, VecAddv0)
1313
{
1414

1515
const YAML::Node& configs = getConfigs()["OpTest"]["VecAdd"];
1616

1717
static auto custom_op = torch::Dispatcher::singleton()
18-
.findSchemaOrThrow("pmpp::vector_add", "")
18+
.findSchemaOrThrow("pmpp::vector_add_v0", "")
1919
.typed<torch::Tensor(const torch::Tensor&,
2020
const torch::Tensor&)>();
2121

@@ -36,7 +36,76 @@ TEST_F(OpTest, VecAdd)
3636
f::cosine_similarity(matCh.flatten(), matCd2h.flatten(),
3737
f::CosineSimilarityFuncOptions().dim(0));
3838

39+
EXPECT_TRUE(matCh.allclose(matCd2h));
3940
EXPECT_GE(cosSim.item<fp32_t>(), 0.99);
4041
}
4142
}
43+
44+
TEST_F(OpTest, VecAddv1)
45+
{
46+
47+
const YAML::Node& configs = getConfigs()["OpTest"]["VecAdd"];
48+
49+
static auto custom_op = torch::Dispatcher::singleton()
50+
.findSchemaOrThrow("pmpp::vector_add_v1", "")
51+
.typed<torch::Tensor(const torch::Tensor&,
52+
const torch::Tensor&)>();
53+
54+
for (const auto& cfg : configs) {
55+
56+
auto nElems = cfg["nElems"].as<pmpp::size_t>();
57+
58+
torch::Tensor matAh = torch::rand(nElems, torch::kF32);
59+
torch::Tensor matBh = torch::rand(nElems, torch::kF32);
60+
torch::Tensor matCh = custom_op.call(matAh, matBh);
61+
62+
ASSERT_TRUE(torch::cuda::is_available());
63+
torch::Tensor matAd = matAh.to(torch::kCUDA);
64+
torch::Tensor matBd = matBh.to(matAd.device());
65+
torch::Tensor matCd2h = custom_op.call(matAd, matBd).to(torch::kCPU);
66+
67+
Tensor cosSim =
68+
f::cosine_similarity(matCh.flatten(), matCd2h.flatten(),
69+
f::CosineSimilarityFuncOptions().dim(0));
70+
71+
EXPECT_TRUE(matCh.allclose(matCd2h));
72+
EXPECT_GE(cosSim.item<fp32_t>(), 0.99);
73+
}
74+
}
75+
76+
TEST_F(OpTest, VecAddv2)
77+
{
78+
79+
const YAML::Node& configs = getConfigs()["OpTest"]["VecAdd"];
80+
81+
static auto custom_op = torch::Dispatcher::singleton()
82+
.findSchemaOrThrow("pmpp::vector_add_v2", "")
83+
.typed<torch::Tensor(const torch::Tensor&,
84+
const torch::Tensor&)>();
85+
86+
for (const auto& cfg : configs) {
87+
88+
auto nElems = cfg["nElems"].as<pmpp::size_t>();
89+
90+
torch::Tensor matAh = torch::rand(nElems, torch::kF32);
91+
torch::Tensor matBh = torch::rand(nElems, torch::kF32);
92+
torch::Tensor matCh = custom_op.call(matAh, matBh);
93+
94+
ASSERT_TRUE(torch::cuda::is_available());
95+
torch::Tensor matAd = matAh.to(torch::kCUDA);
96+
torch::Tensor matBd = matBh.to(matAd.device());
97+
torch::Tensor matCd2h = custom_op.call(matAd, matBd).to(torch::kCPU);
98+
99+
Tensor cosSim =
100+
f::cosine_similarity(matCh.flatten(), matCd2h.flatten(),
101+
f::CosineSimilarityFuncOptions().dim(0));
102+
103+
std::cout << std::format("nElems: {}, cosSim: {}\n", nElems,
104+
cosSim.item<fp32_t>());
105+
106+
// // [NOTE] This won't pass because the kernel is deliberately wrong
107+
// EXPECT_TRUE(matCh.allclose(matCd2h));
108+
// EXPECT_GE(cosSim.item<fp32_t>(), 0.99);
109+
}
110+
}
42111
} // namespace pmpp::test::ops

scripts/nsight-profile.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
TEST_FILE="./build/test/pmpp_test"
2-
GTEST_FILTER="OpTest.VecAdd"
2+
GTEST_FILTER="OpTest.VecAdd*"
33
OUTPUT_FILE="outputs/nsight_profile.ncu-rep"
44

55
while [[ $# -gt 0 ]]; do

0 commit comments

Comments
 (0)