Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ LIB_NAME = libparcagpucupti.so.$(CUDA_MAJOR)
# Default target: build all CUDA versions (12 & 13) for both architectures and test infrastructure
all: cupti-all-versions test-infra

# Configure and build the CUPTI library natively (no Docker) into ./build.
# RelWithDebInfo is used as the build type.
# Note: `$$(nproc)` is deliberately escaped so the *shell* runs nproc; a bare
# `$(nproc)` is expanded by Make as an (undefined) variable and would turn the
# job flag into an unlimited `-j`.
local:
	cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -B build -S cupti
	cmake --build build --parallel $$(nproc)

# Build libparcagpucupti.so for AMD64 using Docker
cupti-amd64:
@echo "=== Building $(LIB_NAME) for AMD64 with Docker (CUDA $(CUDA_MAJOR)) ==="
Expand Down Expand Up @@ -104,21 +109,28 @@ push-cuda-headers:
.
@echo "CUDA header images pushed to $(CUDA_HEADERS_REGISTRY):12 and :13"

# Build test infrastructure with Zig
# Build test infrastructure with CMake
test-infra:
@echo "=== Building test infrastructure with Zig ==="
@zig build
@echo "=== Building test infrastructure with CMake ==="
@mkdir -p test/build
@cd test/build && cmake .. && make

# Run tests (using AMD64 library)
test: cupti-amd64 test-infra
@./test.sh

# Advanced stress test: builds the AMD64 library and the test infrastructure
# first, then runs test_cupti_prof from test/build with 4 threads, 8 GPUs,
# a launch rate of 2500 and a duration of 10.
# The current directory (test/build) is prepended to LD_LIBRARY_PATH for the run.
test-advanced: cupti-amd64 test-infra
	@echo "=== Running advanced test (8 GPUs @ 2500 launches/s) ==="
	@cd test/build && \
		LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH ./test_cupti_prof \
			../../build/$(CUDA_MAJOR)/amd64/libparcagpucupti.so \
			--threads=4 --num-gpus=8 --launch-rate=2500 --duration=10

# Clean build artifacts
clean:
@echo "=== Cleaning build artifacts ==="
@rm -rf cupti/build cupti/build-amd64 cupti/build-arm64 build
@rm -rf zig-out
@rm -rf .zig-cache
@rm -rf test/build
@echo "Clean complete"

# Build and push multi-arch Docker images to ghcr.io
Expand Down Expand Up @@ -155,4 +167,4 @@ docker-test-run: docker-test-build
@docker run --rm parcagpu-test:latest $(ARGS)

format:
clang-format -i -style=file cupti/*.[ch]
clang-format -i -style=file cupti/*.[ch]
94 changes: 0 additions & 94 deletions build.zig

This file was deleted.

63 changes: 63 additions & 0 deletions kernel_names.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x16x32_stage1_warpsize4x1x1_g1_tensor16x8x8_aligna4_alignc4_execute_kernel__5x_cudnn
sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_tensor16x8x8_aligna4_execute_kernel__5x_cudnn
sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_tf32f32_tf32f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_tensor16x8x8_alignc8_execute_kernel__5x_cudnn
sm80_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage4_warpsize2x2x1_g1_tensor16x8x8_alignc4_execute_kernel__5x_cudnn
_Z17wgrad_alg0_engineIfLi128ELi5ELi5ELi3ELi3ELi3ELb0ELi512EEviiiPKT_iPS0_S2_18kernel_grad_paramsyifiiii
_Z20magma_sgemmEx_kernelIfffLb0ELb0ELi6ELi3ELi5ELi3ELi3EEviii6TensoriS0_iS0_iS0_iiiPKT_S3_S1_S1_i18cublasLtEpilogue_tiPKvl
_Z20magma_sgemmEx_kernelIfffLb0ELb1ELi6ELi4ELi6ELi3ELi4EEviii6TensoriS0_iS0_iS0_iiiPKT_S3_S1_S1_i18cublasLtEpilogue_tiPKvl
_Z22foldedNhwcToNchwKernelIfffLb1EL21cudnnKernelDataType_t0EEviiiiiiiPKT_PT0_iiiiiiiiiiT1_S6_N5cudnn15reduced_divisorES8_S8_
_Z22nchwToFoldedNhwcKernelIfffLb1EL21cudnnKernelDataType_t2EEviiiiPKT_PT0_iiiiiiiiiiT1_S6_N5cudnn15reduced_divisorES8_S8_
_Z22tensorTransformGenericIfffLb1ELb0ELb0EL21cudnnKernelDataType_t2EEv26cudnnTensorTransformStruct21tensorTransformParamsimPKT_PT0_T1_S8_
_Z23implicit_convolve_sgemmIffLi1024ELi5ELi5ELi3ELi3ELi3ELi1ELb0ELb0ELb1EEviiiPKT_iPT0_S2_18kernel_conv_paramsyiffiPKS3_S7_bbii
_Z23implicit_convolve_sgemmIffLi128ELi5ELi5ELi3ELi3ELi3ELi1ELb0ELb0ELb1EEviiiPKT_iPT0_S2_18kernel_conv_paramsyiffiPKS3_S7_bbii
_ZN17cutlass__5x_cudnn6KernelI65cutlass_tensorop_s1688wgrad_optimized_tf32_64x64_32x5_nhwc_align4EEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelI66cutlass_tensorop_s1688dgrad_optimized_tf32_256x64_16x4_nhwc_align4EEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelI66cutlass_tensorop_s1688fprop_optimized_tf32_64x64_16x10_nhwc_align4EEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelI79cutlass_tensorop_s1688dgrad_optimized_tf32_256x64_16x4_nhwc_unity_stride_align4EEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelI79cutlass_tensorop_s1688dgrad_optimized_tf32_64x64_16x10_nhwc_unity_stride_align4EEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelINS_4conv6kernel23ImplicitGemmConvolutionINS1_11threadblock22ImplicitGemmMultistageINS_4gemm9GemmShapeILi64ELi64ELi32EEENS4_52Conv2dWgradOutputGradientTileAccessIteratorOptimizedINS_11MatrixShapeILi64ELi32EEENS_10tfloat32_tENS_9transform29PitchLinearWarpRakedThreadMapINS_16PitchLinearShapeILi64ELi32EEELi128ENSF_ILi8ELi4EEELi4EEENS_12AlignedArrayISC_Li4ELi16EEEEENSD_11threadblock25RegularTileAccessIteratorISB_SC_NS_6layout40ColumnMajorTensorOpMultiplicandCongruousILi32ELi32EEELi1ESI_Li16EEELNS_4arch14CacheOperation4KindE0ENS4_48Conv2dWgradActivationTileAccessIteratorOptimizedINSA_ILi32ELi64EEESC_SI_SK_EENSN_ISW_SC_NSO_37RowMajorTensorOpMultiplicandCongruousILi32ELi32EEELi0ESI_Li16EEELSU_0ENS6_11threadblock9MmaPolicyINS6_4warp11MmaTensorOpINS7_ILi32ELi32ELi32EEESC_SQ_SC_SZ_fNSO_8RowMajorENS13_17MmaTensorOpPolicyINSS_3MmaINS7_ILi16ELi8ELi8EEELi32ESC_S16_SC_NSO_11ColumnMajorEfS16_NSS_13OpMultiplyAddEEENSA_ILi1ELi1EEEEELi1ELb0EbEENSA_ILi0ELi0EEES1G_Li1EEELi5EbEENS_8epilogue11threadblock8EpilogueIS8_S1F_Li1ENS1K_22PredicatedTileIteratorINS1K_26OutputTileOptimalThreadMapINS1K_15OutputTileShapeILi64ELi8ELi2ELi1ELi1EEENS1O_ILi1ELi4ELi1ELi1ELi4EEELi128ELi4ELi32EEEfLb0ENSO_9NoPermuteELb0EEENS1J_4warp24FragmentIteratorTensorOpIS15_S19_fNS_5ArrayIfLi4ELb1EEES16_EENS1U_20TileIteratorTensorOpIS15_S19_fS16_EENS1K_18SharedLoadIteratorINS1R_18CompactedThreadMapEfLi16EEENS1J_6thread17LinearCombinationIfLi4EffLNS24_9ScaleType4KindE0ELNS_15FloatRoundStyleE2EEENSA_ILi0ELi8EEELi2ELi1EEENS11_30GemmIdentityThreadblockSwizzleILi4EEELNS1_8OperatorE2ENS1_17Conv2dProblemSizeELNS1_9GroupModeE0EEEEEvNT_6ParamsE
_ZN17cutlass__5x_cudnn6KernelINS_9reduction6kernel12ReduceSplitKINS_11MatrixShapeILi4ELi128EEENS_8epilogue6thread17LinearCombinationIfLi4EffLNS7_9ScaleType4KindE0ELNS_15FloatRoundStyleE2EEENS1_6thread9ReduceAddIffLi4EEELi4EEEEEvNT_6ParamsE
_ZN2at6native13reduce_kernelILi128ELi4ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_11sum_functorIfffEclERNS_14TensorIteratorEEUlffE_EEjfLi4ELi4EEEEEvT1_
_ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_14func_wrapper_tIfZNS0_11sum_functorIfffEclERNS_14TensorIteratorEEUlffE_EEjfLi4ELi4EEEEEvT1_
_ZN2at6native13reduce_kernelILi512ELi1ENS0_8ReduceOpIfNS0_7MeanOpsIffffEEjfLi4ELi4EEEEEvT1_
_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastINS0_13BinaryFunctorIfffNS0_15binary_internal10DivFunctorIfEEEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_
_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastINS0_15CUDAFunctor_addIfEEEEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_
_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastIZZZNS0_23direct_copy_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE1_clEvENKUlvE5_clEvEUlfE_EEvS4_RKT_EUliE_EEviT1_
_ZN2at6native18elementwise_kernelILi128ELi2EZNS0_22gpu_kernel_impl_nocastIZZZNS0_24mse_backward_cuda_kernelERNS_14TensorIteratorERKN3c106ScalarEENKUlvE_clEvENKUlvE0_clEvEUlfffE_EEvRNS_18TensorIteratorBaseERKT_EUliE_EEviT1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_11FillFunctorIfEESt5arrayIPcLm1EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_13AUnaryFunctorIfffNS0_15binary_internal10MulFunctorIfEEEESt5arrayIPcLm2EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_13BinaryFunctorIfffZNS0_67_GLOBAL__N__06c19b41_28_ActivationThresholdKernel_cu_c1096568_2903721threshold_kernel_implIfEEvRNS_18TensorIteratorBaseET_S7_EUlffE_EESt5arrayIPcLm3EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_13BUnaryFunctorIfffNS0_15binary_internal10MulFunctorIfEEEESt5arrayIPcLm2EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_15CUDAFunctor_addIfEESt5arrayIPcLm3EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4ENS0_21CUDAFunctorOnSelf_addIfEESt5arrayIPcLm2EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_15mse_kernel_cudaERNS_18TensorIteratorBaseEENKUlvE_clEvENKUlvE0_clEvEUlffE_St5arrayIPcLm3EEEEviT0_T1_
_ZN2at6native29vectorized_elementwise_kernelILi4EZZZNS0_49_GLOBAL__N__d2ba64fb_16_TensorCompare_cu_71e06f4e19launch_clamp_scalarERNS_18TensorIteratorBaseEN3c106ScalarES6_NS0_6detail11ClampLimitsEENKUlvE_clEvENKUlvE5_clEvEUlfE_St5arrayIPcLm2EEEEviT0_T1_
_ZN2at6native49_GLOBAL__N__7861cdb6_16_ReflectionPad_cu_f800513927reflection_pad2d_out_kernelIfEEvPKT_PS3_lliiiiiii
_ZN2at6native49_GLOBAL__N__7861cdb6_16_ReflectionPad_cu_f800513936reflection_pad2d_backward_out_kernelIfEEvPT_PKS3_lliiiiiii
_ZN2at6native50_GLOBAL__N__b6ca21f8_17_ForeachUnaryOp_cu_77fe623d25multi_tensor_apply_kernelINS1_18TensorListMetadataILi2EEENS1_14UnaryOpFunctorIfLi2ELi1ELi1EEEJNS0_4SqrtIfEEEEEvT_T0_DpT1_
_ZN2at6native52_GLOBAL__N__4671fab5_19_ForeachTernaryOp_cu_5285c63625multi_tensor_apply_kernelINS1_18TensorListMetadataILi2EEENS1_22TernaryOpScalarFunctorIfLi2ELi2ELi0EEEJNS0_11LerpFunctorIfEEfEEEvT_T0_DpT1_
_ZN2at6native52_GLOBAL__N__e57809e0_19_DilatedMaxPool2d_cu_6258b57421max_pool_forward_nchwIfEEviPKT_llliiiiiiiiiiPS3_Pl
_ZN2at6native52_GLOBAL__N__e57809e0_19_DilatedMaxPool2d_cu_6258b57422max_pool_backward_nchwIffEEvPKT_PKlillliiiiiiiiiiPS3_
_ZN2at6native53_GLOBAL__N__222ba01c_20_UpSampleNearest2d_cu_198c5ce228upsample_nearest2d_out_frameIfXadL_ZNS0_37nearest_neighbor_compute_source_indexEfiiEEEEvPKT_PS3_mmmmmff
_ZN2at6native53_GLOBAL__N__222ba01c_20_UpSampleNearest2d_cu_198c5ce237upsample_nearest2d_backward_out_frameIffXadL_ZNS0_40nearest_neighbor_bw_compute_source_indexEfiiEEEEvPKT_mmmmmmPS3_ff
_ZN2at6native54_GLOBAL__N__f8a10d72_21_ForeachPointwiseOp_cu_ef478fac25multi_tensor_apply_kernelINS1_18TensorListMetadataILi3EEENS1_24PointwiseOpScalarFunctorIfLi3ELi3ELi0EEEJSt10multipliesIfEfEEEvT_T0_DpT1_
_ZN2at6native54_GLOBAL__N__f8a10d72_21_ForeachPointwiseOp_cu_ef478fac25multi_tensor_apply_kernelINS1_28TensorListScalarListMetadataIfLi3EEENS1_28PointwiseOpScalarListFunctorIfLi3ELi3ELi0EEEJSt7dividesIfEEEEvT_T0_DpT1_
_ZN2at6native57_GLOBAL__N__e658eeb9_24_ForeachBinaryOpScalar_cu_86b9896c25multi_tensor_apply_kernelINS1_18TensorListMetadataILi1EEENS1_21BinaryOpScalarFunctorIfLi1ELi1ELi0EEEJSt10multipliesIfEfEEEvT_T0_DpT1_
_ZN2at6native57_GLOBAL__N__e658eeb9_24_ForeachBinaryOpScalar_cu_86b9896c25multi_tensor_apply_kernelINS1_18TensorListMetadataILi1EEENS1_21BinaryOpScalarFunctorIfLi1ELi1ELi0EEEJSt4plusIfEfEEEvT_T0_DpT1_
_ZN2at6native61_GLOBAL__N__538ad94b_28_ForeachBinaryOpScalarList_cu_dda10a6925multi_tensor_apply_kernelINS1_28TensorListScalarListMetadataIfLi1EEENS1_25BinaryOpScalarListFunctorIfLi1ELi1ELi0EEEJSt7dividesIfEEEEvT_T0_DpT1_
_ZN5cudnn17winograd_nonfused20winogradWgradData4x4IffEEvNS0_18WinogradDataParamsIT_T0_EE
_ZN5cudnn17winograd_nonfused21winogradWgradDelta4x4IffEEvNS0_19WinogradDeltaParamsIT_T0_EE
_ZN5cudnn17winograd_nonfused22winogradWgradOutput4x4IffEEvNS0_25WinogradWgradOutputParamsIT_T0_EE
_ZN5cudnn19engines_precompiled16nchwToNhwcKernelIfffLb0ELb1EL21cudnnKernelDataType_t2EEEvNS0_18nchw2nhwc_params_tIT1_EEPKT_PT0_
_ZN5cudnn19engines_precompiled16nhwcToNchwKernelIfffLb1ELb0EL21cudnnKernelDataType_t0EEEvNS0_18nhwc2nchw_params_tIT1_EEPKT_PT0_
_ZN5cudnn19engines_precompiled24scalePackedTensor_kernelIffEEvlPT_T0_
_ZN5cudnn21bn_bw_1C11_kernel_newIff6float2Li512ELb1ELi1EEEvT0_S2_S2_S2_17cudnnTensorStructPKT_S3_S6_S3_PS4_PKS2_PS2_SA_S9_S9_S2_
_ZN5cudnn21bn_bw_1C11_singlereadIfLi512ELb1ELi1ELi2ELi0EEEvNS_15bn_bw_1C11_argsIT_EE
_ZN5cudnn24bn_fw_tr_1C11_singlereadIfLi512ELb1ELi1ELi2ELi0EEEvNS_18bn_fw_tr_1C11_argsIT_EE
_ZN5cudnn25bn_fw_tr_1C11_kernel_NCHWIffiLi512ELb1ELi1ELb1EEEv17cudnnTensorStructPKT_S1_PS2_PKT0_S8_S6_S6_PS6_S9_S9_S9_S6_S6_
_ZN5cudnn3cnn17wgrad_alg1_engineIffLi512ELi6ELi5ELi3ELi3ELi3ELb0ELb1EEEviiiPKT_iPT0_S4_18kernel_grad_paramsyiffiiPiS8_ii
_ZN5cudnn6detail12dgrad_engineIfLi128ELi6ELi8ELi3ELi3ELi5ELb0EEEviiiPKT_iS4_iPS2_18kernel_grad_paramsyiyifiii
_ZN7cutlass7Kernel2I41cutlass_80_simt_sgemm_64x64_8x5_tn_align1EEvNT_6ParamsE
_ZN7cutlass7Kernel2I42cutlass_80_simt_sgemm_128x32_8x5_nn_align1EEvNT_6ParamsE
_ZN7cutlass7Kernel2I42cutlass_80_simt_sgemm_128x32_8x5_nt_align1EEvNT_6ParamsE
_ZN7cutlass7Kernel2I42cutlass_80_simt_sgemm_128x32_8x5_tn_align1EEvNT_6ParamsE
13 changes: 7 additions & 6 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ for arg in "$@"; do
done

echo ""
echo "=== Building test infrastructure with Zig ==="
zig build
echo "=== Building test infrastructure with CMake ==="
mkdir -p test/build
cd test/build && cmake .. && make && cd ../..

# Start bpftrace if requested
if [ "$USE_BPFTRACE" -eq 1 ]; then
Expand All @@ -48,11 +49,11 @@ fi
echo ""
echo "=== Running test program ==="
# Set LD_LIBRARY_PATH so the test can find libcupti.so at runtime
# Set PARCAGPU_DEBUG to enable debug output
export LD_LIBRARY_PATH="$(pwd)/zig-out/lib:$LD_LIBRARY_PATH"
export PARCAGPU_DEBUG=1
# Set PARCAGPU_DEBUG externally to enable debug output
# Set PARCAGPU_RATE_LIMIT externally to override default (100/s)
export LD_LIBRARY_PATH="$(pwd)/test/build:$LD_LIBRARY_PATH"
# Use the CMake-built library with real CUPTI
zig-out/bin/test_cupti_prof build/$CUDA_MAJOR/$ARCH/libparcagpucupti.so "$@"
test/build/test_cupti_prof build/libparcagpucupti.so --kernel-names=kernel_names.txt "$@"

# If bpftrace was started, stop it and show results
if [ "$USE_BPFTRACE" -eq 1 ]; then
Expand Down
59 changes: 59 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
cmake_minimum_required(VERSION 3.18)
project(parcagpu_test LANGUAGES C)

# Location of the CUDA toolkit. Defaults to /usr/local/cuda; override with
# -DCUDA_ROOT=<path> on the cmake command line.
if(NOT DEFINED CUDA_ROOT)
  set(CUDA_ROOT "/usr/local/cuda" CACHE PATH "CUDA installation directory")
endif()

# Header search paths shared by the targets below: the toolkit's main include
# directory and the CUPTI headers under extras/.
set(CUDA_INCLUDE_DIRS "${CUDA_ROOT}/include" "${CUDA_ROOT}/extras/CUPTI/include")

# Mock CUPTI: a shared library named "cupti" built from mock_cupti.c.
# It is compiled as position-independent C11 against the CUDA/CUPTI headers.
add_library(cupti SHARED mock_cupti.c)
target_include_directories(cupti PRIVATE ${CUDA_INCLUDE_DIRS})
set_property(TARGET cupti PROPERTY C_STANDARD 11)
set_property(TARGET cupti PROPERTY C_STANDARD_REQUIRED ON)
set_property(TARGET cupti PROPERTY POSITION_INDEPENDENT_CODE ON)

# Determine the CUDA major version used to name the versioned mock symlink.
#
# Resolution order:
#   1. A value supplied by the user (-DCUDA_MAJOR_VERSION=<n>) is kept as-is.
#   2. Otherwise the first file in ${CUDA_ROOT}/lib64 whose name is exactly
#      libcupti.so.<digits> provides the version (e.g. libcupti.so.12 -> 12).
#   3. If nothing matches, fall back to 12.
#
# The lookup is done with file(GLOB) instead of a shell pipeline so that it
# works when CUDA_ROOT contains spaces and does not depend on bash/grep.
if(CUDA_MAJOR_VERSION)
  message(STATUS "Using provided CUDA major version: ${CUDA_MAJOR_VERSION}")
else()
  # file(GLOB) returns results in lexicographic order, so the first match is
  # deterministic.
  file(GLOB cupti_lib_candidates "${CUDA_ROOT}/lib64/libcupti.so.*")
  foreach(cupti_lib_candidate IN LISTS cupti_lib_candidates)
    # Only names ending in a single numeric component qualify; fully
    # versioned names such as libcupti.so.2024.1.0 are skipped.
    if("${cupti_lib_candidate}" MATCHES "libcupti\\.so\\.([0-9]+)$")
      set(CUDA_MAJOR_VERSION "${CMAKE_MATCH_1}")
      break()
    endif()
  endforeach()

  if(CUDA_MAJOR_VERSION)
    message(STATUS "Detected CUDA major version: ${CUDA_MAJOR_VERSION}")
  else()
    set(CUDA_MAJOR_VERSION "12")
    message(STATUS
      "No libcupti.so.<major> found under ${CUDA_ROOT}/lib64; "
      "defaulting CUDA major version to ${CUDA_MAJOR_VERSION}")
  endif()
endif()

# After the mock library is built, create libcupti.so.<CUDA major> in the
# build directory as a symlink to it, matching the installed CUDA version.
# The link target is the bare library file name, so the link is relative to
# the directory that contains it.
# VERBATIM keeps argument escaping consistent across platforms; BYPRODUCTS
# tells the generator that this step produces the symlink so it can be
# tracked and cleaned.
add_custom_command(TARGET cupti POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E create_symlink
          $<TARGET_FILE_NAME:cupti>
          "${CMAKE_CURRENT_BINARY_DIR}/libcupti.so.${CUDA_MAJOR_VERSION}"
  BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libcupti.so.${CUDA_MAJOR_VERSION}"
  COMMENT "Creating libcupti.so.${CUDA_MAJOR_VERSION} symlink for mock CUPTI"
  VERBATIM
)

# Test executable built from test_cupti_prof.c as C11 with -Wall -Wextra.
# It links the platform's dynamic-loading library and the threads library;
# NOTE(review): presumably it dlopen()s the profiler library passed on its
# command line and drives it from multiple threads - confirm in the source.
#
# Threads::Threads and CMAKE_DL_LIBS are used instead of the raw names
# "pthread" and "dl" so the correct libraries (or none, where libc already
# provides them) are chosen for the platform.
find_package(Threads REQUIRED)

add_executable(test_cupti_prof test_cupti_prof.c)
target_include_directories(test_cupti_prof PRIVATE ${CUDA_INCLUDE_DIRS})
# Feature-test macro selecting the POSIX.1b (199309L) API level for the headers.
target_compile_definitions(test_cupti_prof PRIVATE _POSIX_C_SOURCE=199309L)
target_compile_options(test_cupti_prof PRIVATE
  -Wall
  -Wextra
)
target_link_libraries(test_cupti_prof PRIVATE ${CMAKE_DL_LIBS} Threads::Threads)
set_target_properties(test_cupti_prof PROPERTIES
  C_STANDARD 11
  C_STANDARD_REQUIRED ON
)

# Install rules: the mock shared library goes to <prefix>/lib and the test
# executable goes to <prefix>/bin.
install(TARGETS cupti
  LIBRARY DESTINATION lib
)
install(TARGETS test_cupti_prof
  RUNTIME DESTINATION bin
)
Loading
Loading