Reduced shared memory requirement

ryanstocks00 · ryanstocks00 · commit e9bf3a2b32dd · 2024-07-18T22:20:49.000+10:00
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   if(do_lapl)
diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,7 +193,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
 
 }
 
-#define GGA_KERNEL_SM_BLOCK_Y 32
+#define GGA_KERNEL_SM_BLOCK_Y 16
 
 template <bool need_lapl>
 __global__ void eval_uvars_mgga_kernel( size_t           ntasks,
@@ -319,7 +319,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   if(do_lapl)

Original file line number	Diff line number	Diff line change
`@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,`
`306`	`306`	`{`
`307`	`307`	`dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );`
`308`	`308`	`dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),`
`309`		`- std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),`
	`309`	`+ std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),`
`310`	`310`	`ntasks );`
`311`	`311`	`eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );`
`312`	`312`	`}`
`@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,`
`330`	`330`	`{`
`331`	`331`	`dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );`
`332`	`332`	`dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),`
`333`		`- std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),`
	`333`	`+ std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),`
`334`	`334`	`ntasks );`
`335`	`335`	`eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );`
`336`	`336`	`if(do_lapl)`
Original file line number	Diff line number	Diff line change
`@@ -193,7 +193,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,`
`193`	`193`
`194`	`194`	`}`
`195`	`195`
`196`		`-#define GGA_KERNEL_SM_BLOCK_Y 32`
	`196`	`+#define GGA_KERNEL_SM_BLOCK_Y 16`
`197`	`197`
`198`	`198`	`template <bool need_lapl>`
`199`	`199`	`__global__ void eval_uvars_mgga_kernel( size_t ntasks,`
`@@ -319,7 +319,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,`
`319`	`319`	`{`
`320`	`320`	`dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );`
`321`	`321`	`dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),`
`322`		`- std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),`
	`322`	`+ std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),`
`323`	`323`	`ntasks );`
`324`	`324`	`eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );`
`325`	`325`	`if(do_lapl)`