Skip to content

Commit e9bf3a2

Browse files
committed
Reduced shared memory requirement
1 parent c2e3cc2 commit e9bf3a2

File tree

2 files changed

Lines changed: 4 additions & 4 deletions

2 files changed

Lines changed: 4 additions & 4 deletions

src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
306306
{
307307
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
308308
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
309-
std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
309+
std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
310310
ntasks );
311311
eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
312312
}
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
330330
{
331331
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
332332
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
333-
std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
333+
std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
334334
ntasks );
335335
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
336336
if(do_lapl)

src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -193,7 +193,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
193193

194194
}
195195

196-
#define GGA_KERNEL_SM_BLOCK_Y 32
196+
#define GGA_KERNEL_SM_BLOCK_Y 16
197197

198198
template <bool need_lapl>
199199
__global__ void eval_uvars_mgga_kernel( size_t ntasks,
@@ -319,7 +319,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
319319
{
320320
dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
321321
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
322-
std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
322+
std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
323323
ntasks );
324324
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
325325
if(do_lapl)

0 commit comments

Comments
 (0)