NCAR
diff --git a/‎SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu‎
Lines changed: 32 additions & 32 deletions b/‎SRC/EXTENSIONS/GAD/CUDA/cuda_GADDevice.cu‎
Lines changed: 32 additions & 32 deletions
diff --git a/‎SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu‎
Lines changed: 4 additions & 4 deletions b/‎SRC/EXTENSIONS/URBAN/CUDA/cuda_urbanDevice.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎SRC/FECUDA/fecuda_Device_cu.h‎
Lines changed: 0 additions & 5 deletions b/‎SRC/FECUDA/fecuda_Device_cu.h‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎SRC/FECUDA/fecuda_Utils.cu‎
Lines changed: 6 additions & 6 deletions b/‎SRC/FECUDA/fecuda_Utils.cu‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎SRC/FECUDA/fecuda_Utils_cu.h‎
Lines changed: 6 additions & 1 deletion b/‎SRC/FECUDA/fecuda_Utils_cu.h‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎SRC/GRID/CUDA/cuda_gridDevice.cu‎
Lines changed: 13 additions & 13 deletions b/‎SRC/GRID/CUDA/cuda_gridDevice.cu‎
Lines changed: 13 additions & 13 deletions
diff --git a/‎SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu‎
Lines changed: 12 additions & 12 deletions b/‎SRC/HYDRO_CORE/CUDA/cuda_BCsDevice.cu‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu‎
Lines changed: 4 additions & 4 deletions b/‎SRC/HYDRO_CORE/CUDA/cuda_BaseStateDevice.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu‎
Lines changed: 3 additions & 3 deletions b/‎SRC/HYDRO_CORE/CUDA/cuda_advectionDevice.cu‎
Lines changed: 3 additions & 3 deletions
@@ -102,26 +102,26 @@ extern "C" int cuda_GADDeviceSetup(){
     cudaMemcpyToSymbol(numgridCells_away_d, &numgridCells_away, sizeof(int));
 
     /*Device memory allocations and Host-to-Device memcopy for turbine arrays */
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineType_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRank_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefi_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefj_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineRefk_d);
-    fecuda_DeviceMallocInt(GADNumTurbines*sizeof(int), &GAD_turbineYawing_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineType_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRank_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefi_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefj_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineRefk_d);
+    fecuda_DeviceMallocInt((size_t)(GADNumTurbines), &GAD_turbineYawing_d);
     cudaMemcpy(GAD_turbineType_d, GAD_turbineType, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRank_d, GAD_turbineRank, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefi_d, GAD_turbineRefi, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefj_d, GAD_turbineRefj, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefk_d, GAD_turbineRefk, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineYawing_d, GAD_turbineYawing, GADNumTurbines*sizeof(int), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefMag_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_turbineRefDir_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Xcoords_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_Ycoords_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_rotorTheta_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_yawError_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &GAD_anFactor_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefMag_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_turbineRefDir_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Xcoords_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_Ycoords_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_rotorTheta_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_yawError_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &GAD_anFactor_d);
     cudaMemcpy(GAD_turbineRefMag_d, GAD_turbineRefMag, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_turbineRefDir_d, GAD_turbineRefDir, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_Xcoords_d, GAD_Xcoords, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
@@ -136,10 +136,10 @@ extern "C" int cuda_GADDeviceSetup(){
     cudaMemcpy(GAD_yawError_d, GAD_yawError, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_anFactor_d, GAD_anFactor, GADNumTurbines*sizeof(float), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineUseries_d);
-    fecuda_DeviceMalloc(GADNumTurbines*GADrefSeriesLength*sizeof(float), &GAD_turbineVseries_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &u_sampAvg_d);
-    fecuda_DeviceMalloc(GADNumTurbines*sizeof(float), &v_sampAvg_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineUseries_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines*GADrefSeriesLength), &GAD_turbineVseries_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &u_sampAvg_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbines), &v_sampAvg_d);
 
     //Initialize u_sampAvg & GAD_turbineUseries as constant (per-turbine) then send down to the device 
     tmp_vector = (float *) malloc(GADrefSeriesLength*sizeof(float));
@@ -171,40 +171,40 @@ extern "C" int cuda_GADDeviceSetup(){
     }
     free(tmp_vector);
 
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_hubHeights_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_rotorD_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*sizeof(float), &GAD_nacelleD_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_hubHeights_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_rotorD_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes), &GAD_nacelleD_d);
     cudaMemcpy(GAD_hubHeights_d, GAD_hubHeights, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_rotorD_d, GAD_rotorD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(GAD_nacelleD_d, GAD_nacelleD, GADNumTurbineTypes*sizeof(float), cudaMemcpyHostToDevice);
 
 
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyTwist_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyChord_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyPitch_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), &turbinePolyOmega_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyTwist_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyChord_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyPitch_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyOrderMax), &turbinePolyOmega_d);
     cudaMemcpy(turbinePolyTwist_d, turbinePolyTwist, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyChord_d, turbinePolyChord, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyPitch_d, turbinePolyPitch, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyOmega_d, turbinePolyOmega, GADNumTurbineTypes*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), &rnorm_vect_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*alphaBounds*sizeof(float), &alpha_minmax_vect_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCl_d);
-    fecuda_DeviceMalloc(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), &turbinePolyCd_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)), &rnorm_vect_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*alphaBounds), &alpha_minmax_vect_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCl_d);
+    fecuda_DeviceMalloc((size_t)(GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax), &turbinePolyCd_d);
 
     cudaMemcpy(rnorm_vect_d, rnorm_vect, GADNumTurbineTypes*(turbinePolyClCdrNormSegments+1)*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(alpha_minmax_vect_d, alpha_minmax_vect, GADNumTurbineTypes*alphaBounds*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyCd_d, turbinePolyCd, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
     cudaMemcpy(turbinePolyCl_d, turbinePolyCl, GADNumTurbineTypes*turbinePolyClCdrNormSegments*turbinePolyOrderMax*sizeof(float), cudaMemcpyHostToDevice);
 
-    fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_turbineVolMask_d);
+    fecuda_DeviceMalloc((size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh), &GAD_turbineVolMask_d); /*cast each factor first so the element count is multiplied in size_t*/
     cudaMemcpy(GAD_turbineVolMask_d, GAD_turbineVolMask, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
 
     if (GADoutputForces == 1){
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceX_d);
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceY_d);
-      fecuda_DeviceMalloc((Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &GAD_forceZ_d);
+      fecuda_DeviceMalloc((size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh), &GAD_forceX_d); /*cast each factor first so the element count is multiplied in size_t*/
+      fecuda_DeviceMalloc((size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh), &GAD_forceY_d);
+      fecuda_DeviceMalloc((size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh), &GAD_forceZ_d);
       cudaMemcpy(GAD_forceX_d, GAD_forceX, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
       cudaMemcpy(GAD_forceY_d, GAD_forceY, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
       cudaMemcpy(GAD_forceZ_d, GAD_forceZ, (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), cudaMemcpyHostToDevice);
 
@@ -29,21 +29,21 @@ float *urban_heat_redis_d;                 /* Base Address of memory containing
 */
 extern "C" int cuda_urbanDeviceSetup(){
    int errorCode = CUDA_URBAN_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    cudaMemcpyToSymbol(urbanSelector_d, &urbanSelector, sizeof(int));
    cudaMemcpyToSymbol(cd_build_d, &cd_build, sizeof(float));
    cudaMemcpyToSymbol(ct_build_d, &ct_build, sizeof(float));
 
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &building_mask_d);
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*cast each factor first so the product is formed in size_t*/
+   fecuda_DeviceMalloc(Nelems, &building_mask_d); /*first argument is an element count; the helper multiplies by sizeof(float)*/
    cudaMemcpy(building_mask_d, building_mask, Nelems*sizeof(float), cudaMemcpyHostToDevice);
 
    cudaMemcpyToSymbol(delta_aware_bdg_d, &delta_aware_bdg, sizeof(float));
 
    if(urban_heatRedis > 0){
      Nelems = (Nxp+2*Nh)*(Nyp+2*Nh);
-     fecuda_DeviceMalloc(Nelems*sizeof(float), &urban_heat_redis_d);
+     fecuda_DeviceMalloc(Nelems, &urban_heat_redis_d);
      cudaMemcpy(urban_heat_redis_d, urban_heat_redis, Nelems*sizeof(float), cudaMemcpyHostToDevice);
    }
 
 
@@ -55,11 +55,6 @@ extern __constant__ int rankYid_d;
 */
 extern "C" int fecuda_DeviceSetup(int tBx, int tBy, int tBz);
 
-/*----->>>>> void fecuda_DeviceMallocInt();    -----------------------------------------------------------
-* Used to allocate device memory integer blocks and set the  host memory addresses of device memory pointers.
-*/
-extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d);
-
 /*----->>>>> int fecuda_SetBlocksPerGrid();   ------------------------------------------------------------------
  * Used to set the "dim3 grid" module variable that is passed to any device kernel 
  * to specify the number of blocks per grid in each dimenaion
 
@@ -180,19 +180,19 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers(){
 /*----->>>>> void fecuda_DeviceMalloc();    -----------------------------------------------------------
 * Used to allocate device memory float blocks and set the  host memory addresses of device memory pointers.
 */
-extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d) {
-    cudaMalloc((void**)memBlock_d,sizeof(float)*Nelems);
+extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d) { /* Nelems: number of floats to allocate (not bytes) */
+    cudaMalloc((void**)memBlock_d, Nelems*sizeof(float)); /* sizeof yields size_t, so the byte count is computed in size_t */
     gpuErrchk( cudaPeekAtLastError() );
-    cudaMemset(*memBlock_d,'\0',sizeof(float)*Nelems);    
+    cudaMemset(*memBlock_d, '\0', Nelems*sizeof(float)); /* zero-fill the newly allocated block */
     gpuErrchk( cudaPeekAtLastError() );
 #ifdef DEBUG
     printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);
 #endif
 }
-extern "C" void fecuda_DeviceMallocInt(int Nelems, int** memBlock_d) {
-    cudaMalloc((void**)memBlock_d,sizeof(int)*Nelems);
+extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d) {
+    cudaMalloc((void**)memBlock_d,(size_t)(sizeof(int))*Nelems);
     gpuErrchk( cudaPeekAtLastError() );
-    cudaMemset(*memBlock_d,'\0',sizeof(int)*Nelems);
+    cudaMemset(*memBlock_d,'\0',(size_t)(sizeof(int))*Nelems);
     gpuErrchk( cudaPeekAtLastError() );
 #ifdef DEBUG
     printf("New device memory allocation, device pointer is stored at host address %p as %p\n",memBlock_d, *memBlock_d);
 
@@ -41,7 +41,12 @@ extern "C" int fecuda_UtilsDeallocateHaloBuffers();
 /*----->>>>> void fecuda_DeviceMalloc();    -----------------------------------------------------------
 * Used to allocate device memory float blocks and set the  host memory addresses of device memory pointers.
 */
-extern "C" void fecuda_DeviceMalloc(int Nelems, float** memBlock_d);
+extern "C" void fecuda_DeviceMalloc(size_t Nelems, float** memBlock_d);
+
+/*----->>>>> void fecuda_DeviceMallocInt();    -----------------------------------------------------------
+* Used to allocate device memory integer blocks and set the  host memory addresses of device memory pointers.
+*/
+extern "C" void fecuda_DeviceMallocInt(size_t Nelems, int** memBlock_d);
 
 /*----->>>>> int fecuda_SendRecvWestEast(); -------------------------------------------------------------------
 Used to perform western/eastern device domain halo exchange for an arbitrary field.
 
@@ -64,7 +64,7 @@ float *invD_Jac_d; //inverse Determinant of the Jacbian
 */
 extern "C" int cuda_gridDeviceSetup(){
    int errorCode = CUDA_GRID_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 #ifdef DEBUG 
    cudaEvent_t startE, stopE;
    float elapsedTime;
@@ -100,21 +100,21 @@ extern "C" int cuda_gridDeviceSetup(){
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMemCpy calls*/
 
    /*Set the full memory block number of elements for grid fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh); 
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*cast each factor first so the product is formed in size_t*/
    /* Allocate the GRID arrays */
    /* Coordinate Arrays */
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &xPos_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &yPos_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &zPos_d);
-   fecuda_DeviceMalloc(((Nxp+2*Nh)*(Nyp+2*Nh))*sizeof(float), &topoPos_d);
+   fecuda_DeviceMalloc(Nelems, &xPos_d);
+   fecuda_DeviceMalloc(Nelems, &yPos_d);
+   fecuda_DeviceMalloc(Nelems, &zPos_d);
+   fecuda_DeviceMalloc((size_t)((Nxp+2*Nh)*(Nyp+2*Nh)), &topoPos_d);
    /* Metric Tensors Fields */
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J13_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J23_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J31_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J32_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &J33_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &D_Jac_d);
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &invD_Jac_d);
+   fecuda_DeviceMalloc(Nelems, &J13_d);
+   fecuda_DeviceMalloc(Nelems, &J23_d);
+   fecuda_DeviceMalloc(Nelems, &J31_d);
+   fecuda_DeviceMalloc(Nelems, &J32_d);
+   fecuda_DeviceMalloc(Nelems, &J33_d);
+   fecuda_DeviceMalloc(Nelems, &D_Jac_d);
+   fecuda_DeviceMalloc(Nelems, &invD_Jac_d);
    gpuErrchk( cudaPeekAtLastError() ); /*Check for errors in the cudaMalloc calls*/
 
    /* cudaMemcpy the GRID arrays from Host to Device*/
 
@@ -66,22 +66,22 @@ extern "C" int cuda_BCsDeviceSetup(){
    /*Allocate arrays*/
    if(hydroBCs==1){ //Using LAD BCs
      if((rankYid == 0)||(rankYid == numProcsY-1)){
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanes_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesNext_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &XZBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nzp+2*Nh)), &XZBdyPlanesBuffer_d);
      }
      if((rankXid == 0)||(rankXid == numProcsX-1)){
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanes_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesNext_d);
-       fecuda_DeviceMalloc(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)*sizeof(float), &YZBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nyp+2*Nh)*(Nzp+2*Nh)), &YZBdyPlanesBuffer_d);
      }
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanes_d);
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesNext_d);
-     fecuda_DeviceMalloc(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &XYBdyPlanesBuffer_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanes_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesNext_d);
+     fecuda_DeviceMalloc((size_t)(2*nBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &XYBdyPlanesBuffer_d);
      if(surflayerSelector == 3){
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanes_d);
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesNext_d);
-       fecuda_DeviceMalloc(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)*sizeof(float), &SURFBdyPlanesBuffer_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanes_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesNext_d);
+       fecuda_DeviceMalloc((size_t)(nSurfBndyVars*(Nxp+2*Nh)*(Nyp+2*Nh)), &SURFBdyPlanesBuffer_d);
      }
    }//end if hydroBCs == 1
 
 
@@ -24,13 +24,13 @@ float *hydroBaseStatePres_d;   /*Base Adress of memory containing the diagnostic
 */
 extern "C" int cuda_BaseStateDeviceSetup(){
    int errorCode = CUDA_BASESTATE_SUCCESS;
-   int Nelems;
+   size_t Nelems;
 
    /*Set the full memory block number of elements for base-state fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*cast each factor first so the product is formed in size_t*/
    /* Allocate the Base State arrays on the device */
-   fecuda_DeviceMalloc(Nelems*2*sizeof(float), &hydroBaseStateFlds_d);  //Only rho and theta base-state variables
-   fecuda_DeviceMalloc(Nelems*sizeof(float), &hydroBaseStatePres_d);  //Only base-state pressure 
+   fecuda_DeviceMalloc(Nelems*2, &hydroBaseStateFlds_d);  //Only rho and theta base-state variables
+   fecuda_DeviceMalloc(Nelems, &hydroBaseStatePres_d);  //Only base-state pressure 
 
    /* Send the Base State arrays down to the device */
    cudaMemcpy(hydroBaseStateFlds_d, hydroBaseStateFlds, Nelems*2*sizeof(float), cudaMemcpyHostToDevice);
 
@@ -25,15 +25,15 @@ __constant__ float b_hyb_d;                      /*hybrid advection scheme param
 */
 extern "C" int cuda_advectionDeviceSetup(){
    int errorCode = CUDA_ADVECTION_SUCCESS;
-   int Nelems;
+   size_t Nelems; /*number of elements in one full field memory block*/
 
    cudaMemcpyToSymbol(advectionSelector_d, &advectionSelector, sizeof(int));
    cudaMemcpyToSymbol(ceilingAdvectionBC_d, &ceilingAdvectionBC, sizeof(int));
    cudaMemcpyToSymbol(b_hyb_d, &b_hyb, sizeof(float));
 
    /*Set the full memory block number of elements for hydroCore fields*/
-   Nelems = (Nxp+2*Nh)*(Nyp+2*Nh)*(Nzp+2*Nh);
-   fecuda_DeviceMalloc(Nelems*3*sizeof(float), &hydroFaceVels_d); /*Cell-face Velocities*/
+   Nelems = (size_t)(Nxp+2*Nh)*(size_t)(Nyp+2*Nh)*(size_t)(Nzp+2*Nh); /*cast each factor first so the product is formed in size_t*/
+   fecuda_DeviceMalloc(Nelems*3, &hydroFaceVels_d); /*Cell-face Velocities*/
 
    return(errorCode);
 } //end cuda_advectionDeviceSetup()