Skip to content

Commit 6a9bc05

Browse files
JRPan and claude authored
Fix LUD kernel syncthreads bug and clean up microbenchmark binaries (#77)
- Fix critical CUDA synchronization bug in lud_diagonal kernel where __syncthreads() was called inside conditional blocks. This caused undefined behavior as not all threads reached the barrier. Moved syncthreads outside the if-blocks so all threads participate.
- Change GPU_Microbenchmark build to use mv instead of cp to avoid leaving duplicate binaries in source directories.
- Add clean_GPU_Microbenchmark to the main clean target.

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ae0cce2 commit 6a9bc05

File tree

3 files changed

+8
-8
lines changed

3 files changed

+8
-8
lines changed

src/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw
1919
#Disable clean for now, It has a bug!
2020
# clean_dragon-naive clean_pannotia clean_proxy-apps
2121

22-
clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface
22+
clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark
2323
clean_accelwattch: clean_rodinia-3.1 clean_parboil clean_cutlass clean_cuda_samples-11.0 clean_cuda_samples_hw_power clean_rodinia-3.1_hw_power clean_parboil_hw_power clean_accelwattch_ubench
2424

2525
clean_data:
@@ -108,7 +108,7 @@ dragon-cdp: dragon-naive
108108
GPU_Microbenchmark:
109109
mkdir -p $(BINDIR)/$(BINSUBDIR)/
110110
$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/GPU_Microbenchmark
111-
cp -r cuda/GPU_Microbenchmark/bin/* $(BINDIR)/$(BINSUBDIR)/
111+
mv cuda/GPU_Microbenchmark/bin/* $(BINDIR)/$(BINSUBDIR)/
112112
clean_GPU_Microbenchmark:
113113
find cuda/GPU_Microbenchmark/ubench -type f -executable -delete
114114

src/cuda/GPU_Microbenchmark/common/common.mk

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ LIB :=
1111

1212
release:
1313
$(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
14-
cp $(EXE) $(BIN_DIR)
14+
mv $(EXE) $(BIN_DIR)
1515

1616
tuner:
1717
$(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart
18-
cp $(EXE) $(BIN_DIR)
18+
mv $(EXE) $(BIN_DIR)
1919

2020
clean:
2121
rm -f *.o; rm -f $(EXE)

src/cuda/rodinia/2.0-ft/lud/lud_kernel.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,14 @@ lud_diagonal(float *m, int matrix_dim, int offset)
2222
for(j=0; j < i; j++)
2323
shadow[threadIdx.x][i] -= shadow[threadIdx.x][j]*shadow[j][i];
2424
shadow[threadIdx.x][i] /= shadow[i][i];
25+
}
26+
__syncthreads();
2527

26-
__syncthreads();
27-
28+
if (threadIdx.x>i){
2829
for(j=0; j < i+1; j++)
2930
shadow[i+1][threadIdx.x] -= shadow[i+1][j]*shadow[j][threadIdx.x];
30-
31-
__syncthreads();
3231
}
32+
__syncthreads();
3333
}
3434

3535
/*

0 commit comments

Comments
 (0)