Skip to content

Commit c16933e

Browse files
Nov 2025 v4.0.1 patch (#115)
* Major reconciliation of the device-side array allocations to remove the redundant factor of 4. Also generally a move to Nelems consistently being typed as size_t and passed through allocations as size_t, allowing arrays much larger than signed int maximum. * Bugfix v4.0.1 release patch items for November 2025: gpu_type=a100 added to batch FE submission scripts, changes to Example08 .json files, cast simTime_it to unsigned long long for curandSetPseudoRandomGeneratorSeed(), update binary converter batch script to point to pre-built conda environment on casper, and new enhanced performance Simgrid.py. * Remove extra factor of rho on tauxz and tauyz used in the formulation of the output value of fricvel in surfacelayer. Added Li and Zhou paper to publications list. --------- Co-authored-by: Domingo Muñoz-Esparza <[email protected]>
1 parent e2c4783 commit c16933e

30 files changed

+193
-170
lines changed

SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -102,26 +102,26 @@ extern "C" int cuda_GADDeviceSetup(){
102102
cudaMemcpyToSymbol(numgridCells_away_d, &numgridCells_away, sizeof(int));
103103

104104
/*Device memory allocations and Host-to-Device memcopy for turbine arrays */
105-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineType_d);
106-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRank_d);
107-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefi_d);
108-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefj_d);
109-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefk_d);
110-
fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineYawing_d);
105+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineType_d);
106+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRank_d);
107+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefi_d);
108+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefj_d);
109+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefk_d);
110+
fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineYawing_d);
111111
cudaMemcpy(GAD_turbineType_d, GAD_turbineType, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
112112
cudaMemcpy(GAD_turbineRank_d, GAD_turbineRank, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
113113
cudaMemcpy(GAD_turbineRefi_d, GAD_turbineRefi, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
114114
cudaMemcpy(GAD_turbineRefj_d, GAD_turbineRefj, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
115115
cudaMemcpy(GAD_turbineRefk_d, GAD_turbineRefk, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
116116
cudaMemcpy(GAD_turbineYawing_d, GAD_turbineYawing, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
117117

118-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefMag_d);
119-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefDir_d);
120-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Xcoords_d);
121-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Ycoords_d);
122-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_rotorTheta_d);
123-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_yawError_d);
124-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_anFactor_d);
118+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefMag_d);
119+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefDir_d);
120+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Xcoords_d);
121+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Ycoords_d);
122+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_rotorTheta_d);
123+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_yawError_d);
124+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_anFactor_d);
125125
cudaMemcpy(GAD_turbineRefMag_d, GAD_turbineRefMag, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
126126
cudaMemcpy(GAD_turbineRefDir_d, GAD_turbineRefDir, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
127127
cudaMemcpy(GAD_Xcoords_d, GAD_Xcoords, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
@@ -136,10 +136,10 @@ extern "C" int cuda_GADDeviceSetup(){
136136
cudaMemcpy(GAD_yawError_d, GAD_yawError, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
137137
cudaMemcpy(GAD_anFactor_d, GAD_anFactor, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
138138

139-
fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineUseries_d);
140-
fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineVseries_d);
141-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &u_sampAvg_d);
142-
fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &v_sampAvg_d);
139+
fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineUseries_d);
140+
fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineVseries_d);
141+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &u_sampAvg_d);
142+
fecuda_DeviceMalloc((size_t)(GADNumTurbines), &v_sampAvg_d);
143143

144144
//Initialize u_sampAvg & GAD_turbineUseries as constant (per-turbine) then send down to the device
145145
tmp_vector = (float *) malloc(GADrefSeriesLength*sizeof(float));
@@ -171,40 +171,40 @@ extern "C" int cuda_GADDeviceSetup(){
171171
}
172172
free(tmp_vector);
173173

174-
fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_hubHeights_d);
175-
fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_rotorD_d);
176-
fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_nacelleD_d);
174+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_hubHeights_d);
175+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_rotorD_d);
176+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_nacelleD_d);
177177
cudaMemcpy(GAD_hubHeights_d, GAD_hubHeights, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
178178
cudaMemcpy(GAD_rotorD_d, GAD_rotorD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
179179
cudaMemcpy(GAD_nacelleD_d, GAD_nacelleD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
180180

181181

182-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyTwist_d);
183-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyChord_d);
184-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyPitch_d);
185-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyOmega_d);
182+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyTwist_d);
183+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyChord_d);
184+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyPitch_d);
185+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyOmega_d);
186186
cudaMemcpy(turbinePolyTwist_d, turbinePolyTwist, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
187187
cudaMemcpy(turbinePolyChord_d, turbinePolyChord, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
188188
cudaMemcpy(turbinePolyPitch_d, turbinePolyPitch, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
189189
cudaMemcpy(turbinePolyOmega_d, turbinePolyOmega, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
190190

191-
fecuda_DeviceMalloc(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), &rnorm_vect_d);
192-
fecuda_DeviceMalloc(GADNumTurbineTypes*alphaBounds*sizeof(float), &alpha_minmax_vect_d);
193-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCl_d);
194-
fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCd_d);
191+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)), &rnorm_vect_d);
192+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*alphaBounds), &alpha_minmax_vect_d);
193+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCl_d);
194+
fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCd_d);
195195

196196
cudaMemcpy(rnorm_vect_d, rnorm_vect, GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), cudaMemcpyHostToDevice);
197197
cudaMemcpy(alpha_minmax_vect_d, alpha_minmax_vect, GADNumTurbineTypes*alphaBounds*sizeof(float), cudaMemcpyHostToDevice);
198198
cudaMemcpy(turbinePolyCd_d, turbinePolyCd, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
199199
cudaMemcpy(turbinePolyCl_d, turbinePolyCl, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
200200

201-
fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_turbineVolMask_d);
201+
fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_turbineVolMask_d);
202202
cudaMemcpy(GAD_turbineVolMask_d, GAD_turbineVolMask, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
203203

204204
if (GADoutputForces == 1){
205-
fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceX_d);
206-
fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceY_d);
207-
fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceZ_d);
205+
fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceX_d);
206+
fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceY_d);
207+
fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)), &GAD_forceZ_d);
208208
cudaMemcpy(GAD_forceX_d, GAD_forceX, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
209209
cudaMemcpy(GAD_forceY_d, GAD_forceY, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
210210
cudaMemcpy(GAD_forceZ_d, GAD_forceZ, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);

SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,21 @@ float *urban_heat_redis_d; /* Base Address of memory containing
2929
*/
3030
extern "C" int cuda_urbanDeviceSetup(){
3131
int errorCode = CUDA_URBAN_SUCCESS;
32-
int Nelems;
32+
size_t Nelems;
3333

3434
cudaMemcpyToSymbol(urbanSelector_d, &urbanSelector, sizeof(int));
3535
cudaMemcpyToSymbol(cd_build_d, &cd_build, sizeof(float));
3636
cudaMemcpyToSymbol(ct_build_d, &ct_build, sizeof(float));
3737

38-
Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
39-
fecuda_DeviceMalloc(Nelems*sizeof(float), &building_mask_d);
38+
Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
39+
fecuda_DeviceMalloc(Nelems, &building_mask_d);
4040
cudaMemcpy(building_mask_d, building_mask, Nelems*sizeof(float), cudaMemcpyHostToDevice);
4141

4242
cudaMemcpyToSymbol(delta_aware_bdg_d, &delta_aware_bdg, sizeof(float));
4343

4444
if(urban_heatRedis > 0){
4545
Nelems = (Nxp+2*Nh)*(Nyp+2*Nh);
46-
fecuda_DeviceMalloc(Nelems*sizeof(float), &urban_heat_redis_d);
46+
fecuda_DeviceMalloc(Nelems, &urban_heat_redis_d);
4747
cudaMemcpy(urban_heat_redis_d, urban_heat_redis, Nelems*sizeof(float), cudaMemcpyHostToDevice);
4848
}
4949

SRC/FECUDA/fecuda_Device_cu.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,6 @@ extern __constant__ int rankYid_d;
5555
*/
5656
extern "C" int fecuda_DeviceSetup(int tBx, int tBy, int tBz);
5757

58-
/*----->>>>> void fecuda_DeviceMallocInt(); -----------------------------------------------------------
59-
* Used to allocate device memory integer blocks and set the host memory addresses of device memory pointers.
60-
*/
61-
extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d);
62-
6358
/*----->>>>> int fecuda_SetBlocksPerGrid(); ------------------------------------------------------------------
6459
* Used to set the "dim3 grid" module variable that is passed to any device kernel
6560
* to specify the number of blocks per grid in each dimension

SRC/FECUDA/fecuda_Utils.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -180,19 +180,19 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers(){
180180
/*----->>>>> void fecuda_DeviceMalloc(); -----------------------------------------------------------
181181
* Used to allocate device memory float blocks and set the host memory addresses of device memory pointers.
182182
*/
183-
extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d) {
184-
cudaMalloc((void**)memBlock_d,sizeof(float)*Nelems);
183+
extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d) {
184+
cudaMalloc((void**)memBlock_d,(size_t)(sizeof(float))*Nelems);
185185
gpuErrchk( cudaPeekAtLastError() );
186-
cudaMemset(*memBlock_d,'\0',sizeof(float)*Nelems);
186+
cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(float))*Nelems);
187187
gpuErrchk( cudaPeekAtLastError() );
188188
#ifdef DEBUG
189189
printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);
190190
#endif
191191
}
192-
extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d) {
193-
cudaMalloc((void**)memBlock_d,sizeof(int)*Nelems);
192+
extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d) {
193+
cudaMalloc((void**)memBlock_d,(size_t)(sizeof(int))*Nelems);
194194
gpuErrchk( cudaPeekAtLastError() );
195-
cudaMemset(*memBlock_d,'\0',sizeof(int)*Nelems);
195+
cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(int))*Nelems);
196196
gpuErrchk( cudaPeekAtLastError() );
197197
#ifdef DEBUG
198198
printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);

SRC/FECUDA/fecuda_Utils_cu.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,12 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers();
4141
/*----->>>>> void fecuda_DeviceMalloc(); -----------------------------------------------------------
4242
* Used to allocate device memory float blocks and set the host memory addresses of device memory pointers.
4343
*/
44-
extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d);
44+
extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d);
45+
46+
/*----->>>>> void fecuda_DeviceMallocInt(); -----------------------------------------------------------
47+
* Used to allocate device memory integer blocks and set the host memory addresses of device memory pointers.
48+
*/
49+
extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d);
4550

4651
/*----->>>>> int fecuda_SendRecvWestEast(); -------------------------------------------------------------------
4752
Used to perform western/eastern device domain halo exchange for an arbitrary field.

SRC/GRID/CUDA/cuda_gridDevice.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ float *invD_Jac_d; //inverse Determinant of the Jacbian
6464
*/
6565
extern "C" int cuda_gridDeviceSetup(){
6666
int errorCode = CUDA_GRID_SUCCESS;
67-
int Nelems;
67+
size_t Nelems;
6868
#ifdef DEBUG
6969
cudaEvent_t startE, stopE;
7070
float elapsedTime;
@@ -100,21 +100,21 @@ extern "C" int cuda_gridDeviceSetup(){
100100
gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
101101

102102
/*Set the full memory block number of elements for grid fields*/
103-
Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
103+
Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
104104
/* Allocate the GRID arrays */
105105
/* Coordinate Arrays */
106-
fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d);
107-
fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d);
108-
fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d);
109-
fecuda_DeviceMalloc(((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), &topoPos_d);
106+
fecuda_DeviceMalloc(Nelems, &xPos_d);
107+
fecuda_DeviceMalloc(Nelems, &yPos_d);
108+
fecuda_DeviceMalloc(Nelems, &zPos_d);
109+
fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)), &topoPos_d);
110110
/* Metric Tensors Fields */
111-
fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d);
112-
fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d);
113-
fecuda_DeviceMalloc(Nelems*sizeof(float), &J31_d);
114-
fecuda_DeviceMalloc(Nelems*sizeof(float), &J32_d);
115-
fecuda_DeviceMalloc(Nelems*sizeof(float), &J33_d);
116-
fecuda_DeviceMalloc(Nelems*sizeof(float), &D_Jac_d);
117-
fecuda_DeviceMalloc(Nelems*sizeof(float), &invD_Jac_d);
111+
fecuda_DeviceMalloc(Nelems, &J13_d);
112+
fecuda_DeviceMalloc(Nelems, &J23_d);
113+
fecuda_DeviceMalloc(Nelems, &J31_d);
114+
fecuda_DeviceMalloc(Nelems, &J32_d);
115+
fecuda_DeviceMalloc(Nelems, &J33_d);
116+
fecuda_DeviceMalloc(Nelems, &D_Jac_d);
117+
fecuda_DeviceMalloc(Nelems, &invD_Jac_d);
118118
gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/
119119

120120
/* cudaMemcpy the GRID arrays from Host to Device*/

SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -66,22 +66,22 @@ extern "C" int cuda_BCsDeviceSetup(){
6666
/*Allocate arrays*/
6767
if(hydroBCs==1){ //Using LAD BCs
6868
if((rankYid == 0)||(rankYid == numProcsY-1)){
69-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanes_d);
70-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesNext_d);
71-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesBuffer_d);
69+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanes_d);
70+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesNext_d);
71+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesBuffer_d);
7272
}
7373
if((rankXid == 0)||(rankXid == numProcsX-1)){
74-
fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanes_d);
75-
fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesNext_d);
76-
fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesBuffer_d);
74+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanes_d);
75+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesNext_d);
76+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesBuffer_d);
7777
}
78-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanes_d);
79-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesNext_d);
80-
fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesBuffer_d);
78+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanes_d);
79+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesNext_d);
80+
fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesBuffer_d);
8181
if(surflayerSelector == 3){
82-
fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanes_d);
83-
fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesNext_d);
84-
fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesBuffer_d);
82+
fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanes_d);
83+
fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesNext_d);
84+
fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesBuffer_d);
8585
}
8686
}//end if hydroBCs == 1
8787

SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@ float *hydroBaseStatePres_d; /*Base Adress of memory containing the diagnostic
2424
*/
2525
extern "C" int cuda_BaseStateDeviceSetup(){
2626
int errorCode = CUDA_BASESTATE_SUCCESS;
27-
int Nelems;
27+
size_t Nelems;
2828

2929
/*Set the full memory block number of elements for base-state fields*/
30-
Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
30+
Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
3131
/* Allocate the Base State arrays on the device */
32-
fecuda_DeviceMalloc(Nelems*2*sizeof(float), &hydroBaseStateFlds_d); //Only rho and theta base-state variables
33-
fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroBaseStatePres_d); //Only base-state pressure
32+
fecuda_DeviceMalloc(Nelems*2, &hydroBaseStateFlds_d); //Only rho and theta base-state variables
33+
fecuda_DeviceMalloc(Nelems, &hydroBaseStatePres_d); //Only base-state pressure
3434

3535
/* Send the Base State arrays down to the device */
3636
cudaMemcpy(hydroBaseStateFlds_d, hydroBaseStateFlds, Nelems*2*sizeof(float), cudaMemcpyHostToDevice);

SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,15 @@ __constant__ float b_hyb_d; /*hybrid advection scheme param
2525
*/
2626
extern "C" int cuda_advectionDeviceSetup(){
2727
int errorCode = CUDA_ADVECTION_SUCCESS;
28-
int Nelems;
28+
size_t Nelems;
2929

3030
cudaMemcpyToSymbol(advectionSelector_d, &advectionSelector, sizeof(int));
3131
cudaMemcpyToSymbol(ceilingAdvectionBC_d, &ceilingAdvectionBC, sizeof(int));
3232
cudaMemcpyToSymbol(b_hyb_d, &b_hyb, sizeof(float));
3333

3434
/*Set the full memory block number of elements for hydroCore fields*/
35-
Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
36-
fecuda_DeviceMalloc(Nelems*3*sizeof(float), &hydroFaceVels_d); /*Cell-face Velocities*/
35+
Nelems = (size_t)((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh));
36+
fecuda_DeviceMalloc(Nelems*3, &hydroFaceVels_d); /*Cell-face Velocities*/
3737

3838
return(errorCode);
3939
} //end cuda_advectionDeviceSetup()

0 commit comments

Comments
 (0)