accel-sim · tgrogers · Jul 4, 2025 · Jul 3, 2025 · Jul 4, 2025 · Jul 4, 2025
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
@@ -13,19 +13,23 @@ on:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:  
-  test-12-6:
+  test-12-8:
     runs-on: ubuntu-latest
     container:
      image: ghcr.io/accel-sim/accel-sim-framework:ubuntu-24.04-cuda-12.8
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - uses: actions/checkout@v4
+      - name: Checkout repository (submodules fetched selectively in Build Apps)
+        uses: actions/checkout@v4
+        with:
+          submodules: false  # Skip submodules here; Build Apps initializes only src/cuda/cuda-samples
 
       - name: Build Apps
         run: |
              git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
+             git submodule update --init -- src/cuda/cuda-samples
              /bin/bash test-build.sh
 
       - name: Print Successful Apps

diff --git a/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw/Makefile b/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_conflict/Makefile b/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_conflict/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_diverge/Makefile b/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_diverge/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_profile/Makefile b/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_bw_profile/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_lat/Makefile b/src/cuda/GPU_Microbenchmark/Atomic_ubench/Atomic_add/Atomic_add_lat/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/Makefile b/src/cuda/GPU_Microbenchmark/Makefile
@@ -1,32 +1,21 @@
-
 BASE_DIR := $(shell pwd)
 BIN_DIR := $(BASE_DIR)/bin
+SUB_DIRS        := $(wildcard ubench/*/*/)
+SUB_DIRS_ALL    := $(SUB_DIRS:%=all-%)
+SUB_DIRS_CLEAN  := $(SUB_DIRS:%=clean-%)
 
-all: 	
-	mkdir -p $(BIN_DIR)
-	cd l1_bw_32f &&		make &&	cp l1_bw_32f $(BIN_DIR)
-	cd l1_bw_64f && 		make &&	cp l1_bw_64f $(BIN_DIR)
-	cd l1_bw_128 && 		make &&	cp l1_bw_128 $(BIN_DIR)
-	cd l1_lat && 		make &&	cp l1_lat $(BIN_DIR)
-	cd l2_bw_32f &&  		make &&	cp l2_bw_32f $(BIN_DIR)
-	cd l2_bw_64f && 		make &&	cp l2_bw_64f $(BIN_DIR)
-	cd l2_bw_128 && 		make &&	cp l2_bw_128 $(BIN_DIR)
-	cd l2_lat && 		make &&	cp l2_lat $(BIN_DIR)
-	cd mem_bw && 		make &&	cp mem_bw $(BIN_DIR)
-	cd mem_lat && 		make &&	cp mem_lat $(BIN_DIR)
-	cd shared_bw && 		make &&	cp shared_bw $(BIN_DIR)
-	cd shared_lat && 		make &&	cp shared_lat $(BIN_DIR)
-	cd MaxFlops && 		make &&	cp MaxFlops $(BIN_DIR)
-	cd l1_shared_bw && 		make &&	cp l1_shared_bw $(BIN_DIR)
-	cd shared_bank_conflicts && 		make &&	cp shared_bank_conflicts $(BIN_DIR)
-	cd l1_bw_32f_unroll && 		make &&	cp l1_bw_32f_unroll $(BIN_DIR)
-	cd l1_bw_32f_unroll_large && 		make &&	cp l1_bw_32f_unroll_large $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw &&            make && cp atomic_add_bw $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_conflict &&            make && cp atomic_add_bw_conflict $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_profile &&            make && cp atomic_add_bw_profile $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_bw_diverge &&            make && cp atomic_add_bw_diverge $(BIN_DIR)
-	cd Atomic_ubench/Atomic_add/Atomic_add_lat &&            make && cp atomic_add_lat $(BIN_DIR)
+.PHONY: all clean create_dir delete_dir $(SUB_DIRS_ALL) $(SUB_DIRS_CLEAN)
+all: create_dir $(SUB_DIRS_ALL)  # default goal: create bin/, then build every ubench/*/*/ directory
+clean: delete_dir $(SUB_DIRS_CLEAN)  # empty bin/, then run the clean target of every ubench/*/*/ directory
 
+$(SUB_DIRS_ALL): | create_dir  # order-only: bin/ must exist before sub-makes run, even under make -j
+	$(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%)
+
+$(SUB_DIRS_CLEAN):
+	$(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean
+
+create_dir:
+	mkdir -p $(BIN_DIR)
 
-clean:
-	cd $(BIN_DIR) && rm -f *
+delete_dir:  # path-anchored glob: a missing bin/ can no longer make rm run in the current directory
+	rm -f $(BIN_DIR)/*
diff --git a/src/cuda/GPU_Microbenchmark/MaxFlops/Makefile b/src/cuda/GPU_Microbenchmark/MaxFlops/Makefile
diff --git a/src/cuda/GPU_Microbenchmark/common/common.mk b/src/cuda/GPU_Microbenchmark/common/common.mk
@@ -0,0 +1,48 @@
+BASE_DIR := $(shell pwd)
+BIN_DIR := $(BASE_DIR)/../../../bin/
+
+GENCODE_SM30 ?= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_SM35 ?= -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+GENCODE_SM50 ?= -gencode=arch=compute_50,code=\"sm_50,compute_50\"
+GENCODE_SM60 ?= -gencode=arch=compute_60,code=\"sm_60,compute_60\"
+GENCODE_SM62 ?= -gencode=arch=compute_62,code=\"sm_62,compute_62\"
+GENCODE_SM70 ?= -gencode=arch=compute_70,code=\"sm_70,compute_70\"
+GENCODE_SM75 ?= -gencode=arch=compute_75,code=\"sm_75,compute_75\"
+GENCODE_SM80 ?= -gencode=arch=compute_80,code=\"sm_80,compute_80\"
+GENCODE_SM86 ?= -gencode=arch=compute_86,code=\"sm_86,compute_86\"
+
+CUOPTS =  $(GENCODE_ARCH) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM62) $(GENCODE_SM70) $(GENCODE_SM75) $(GENCODE_SM80)
+
+CC := nvcc
+
+# CUDA_PATH ?= /usr/local/cuda-10.1/
+INCLUDE := $(BASE_DIR)/../../../../cuda-samples/Common/
+LIB :=
+
+release:  # NOTE(review): NVCC_FLGAS looks like a misspelling of NVCC_FLAGS; confirm callers before renaming
+	$(CC) $(NVCC_FLGAS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) $(addprefix -L,$(LIB)) -lcudart
+	cp $(EXE) $(BIN_DIR)
+
+clean:
+	rm -f *.o; rm -f $(EXE)
+
+run:
+	./$(EXE)
+
+profile:
+	nvprof ./$(EXE)
+
+events:
+	nvprof  --events elapsed_cycles_sm ./$(EXE)
+
+profileall:
+	nvprof --concurrent-kernels off --print-gpu-trace -u us --metrics all --demangling off --csv --log-file data.csv ./$(EXE)
+
+nvsight:
+	nv-nsight-cu-cli --metrics gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum  --csv --page raw ./$(EXE) | tee nsight.csv
+
+ptx:  # print the cuobjdump PTX listing and also save it to ptx.txt
+	cuobjdump -ptx ./$(EXE) | tee ptx.txt
+
+sass:  # print the cuobjdump SASS listing and also save it to sass.txt
+	cuobjdump -sass ./$(EXE) | tee sass.txt
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/ampere_A100_hw_def.h
@@ -0,0 +1,33 @@
+// These are the configuration parameters that can be found publicly
+// Sources (NOTE(review): same list as the RTX 3070 header; confirm it covers the A100):
+// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf
+// https://en.wikipedia.org/wiki/GeForce_30_series
+// https://en.wikipedia.org/wiki/CUDA
+
+#ifndef AMPERE_A100_DEF_H // guard must be unique per hw_def header (was AMPERE_RTX3070_DEF_H)
+#define AMPERE_A100_DEF_H
+
+#include "./common/common.h"
+#include "./common/deviceQuery.h"
+
+#define L1_SIZE (192 * 1024) // Max L1 size in bytes
+
+#define CLK_FREQUENCY 1410 // frequency in MHz
+
+#define ISSUE_MODEL issue_model::single // single issue core or dual issue
+#define CORE_MODEL core_model::subcore  // subcore model or shared model
+#define DRAM_MODEL dram_model::HBM    // memory type
+#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
+
+// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumulate operation
+// see slide 22 at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define SASS_hmma_per_PTX_wmma 2
+
+// These vars are almost constant across HW generations
+// see slide 24 from Nvidia at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define L2_BANKS_PER_MEM_CHANNEL 2
+#define L2_BANK_WIDTH_in_BYTE 32
+
+#endif
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/ampere_RTX3070_hw_def.h
@@ -0,0 +1,33 @@
+// These are the configuration parameters that can be found publicly
+// Sources:
+// https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf
+// https://en.wikipedia.org/wiki/GeForce_30_series
+// https://en.wikipedia.org/wiki/CUDA
+
+#ifndef AMPERE_RTX3070_DEF_H
+#define AMPERE_RTX3070_DEF_H
+
+#include "./common/common.h"
+#include "./common/deviceQuery.h"
+
+#define L1_SIZE (128 * 1024) // Max L1 size in bytes
+
+#define CLK_FREQUENCY 1132 // frequency in MHz
+
+#define ISSUE_MODEL issue_model::single // single issue core or dual issue
+#define CORE_MODEL core_model::subcore  // subcore model or shared model
+#define DRAM_MODEL dram_model::GDDR6    // memory type
+#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
+
+// number of SASS HMMA per 16x16 PTX WMMA for FP16 - FP32 accumulate operation
+// see slide 22 at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define SASS_hmma_per_PTX_wmma 2
+
+// These vars are almost constant across HW generations
+// see slide 24 from Nvidia at
+// https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf
+#define L2_BANKS_PER_MEM_CHANNEL 2
+#define L2_BANK_WIDTH_in_BYTE 32
+
+#endif