@@ -357,7 +357,6 @@ extern char* gitversion;
357357 * Set the device that QUDA uses.
358358 */
359359void initQudaDevice (int dev) {
360-
361360 // static bool initialized = false;
362361 if (initialized) return ;
363362 initialized = true ;
@@ -434,13 +433,17 @@ void initQudaDevice(int dev) {
434433 cudaDeviceSetCacheConfig (cudaFuncCachePreferL1);
435434 // cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
436435 cudaGetDeviceProperties (&deviceProp, dev);
436+
437+ profileInit.TPSTOP (QUDA_PROFILE_TOTAL);
437438}
438439
439440/*
440441 * Any persistent memory allocations that QUDA uses are done here.
441442 */
442443void initQudaMemory ()
443444{
445+ profileInit.TPSTART (QUDA_PROFILE_TOTAL);
446+
444447 if (!comms_initialized) init_default_comms ();
445448
446449 streams = new cudaStream_t[Nstream];
@@ -470,6 +473,8 @@ void initQudaMemory()
470473 cudaHostGetDevicePointer (&num_failures_d, num_failures_h, 0 );
471474
472475 loadTuneCache (getVerbosity ());
476+
477+ profileInit.TPSTOP (QUDA_PROFILE_TOTAL);
473478}
474479
475480void initQuda (int dev)
@@ -489,8 +494,6 @@ void initQuda(int dev)
489494 pthread_mutexattr_settype (&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
490495 pthread_mutex_init (&pthread_mutex, &mutex_attr);
491496#endif
492-
493- profileInit.TPSTOP (QUDA_PROFILE_TOTAL);
494497}
495498
496499
@@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
33913394 if (qudaGaugeParam->use_resident_mom ) {
33923395 if (!gaugePrecise) errorQuda (" No resident momentum field to use" );
33933396 cudaMom = momResident;
3397+ if (qudaGaugeParam->overwrite_mom ) cudaMom->zero ();
33943398 profileGaugeForce.TPSTOP (QUDA_PROFILE_INIT);
33953399 } else {
3396- gParamMom .create = QUDA_ZERO_FIELD_CREATE;
3400+ gParamMom .create = qudaGaugeParam-> overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
33973401 gParamMom .order = QUDA_FLOAT2_GAUGE_ORDER;
33983402 gParamMom .reconstruct = QUDA_RECONSTRUCT_10;
33993403 gParamMom .link_type = QUDA_ASQTAD_MOM_LINKS;
34003404 gParamMom .precision = qudaGaugeParam->cuda_prec ;
34013405 gParamMom .create = QUDA_ZERO_FIELD_CREATE;
34023406 cudaMom = new cudaGaugeField (gParamMom );
3407+ if (!qudaGaugeParam->overwrite_mom ) cudaMom->loadCPUField (*cpuMom, QUDA_CPU_FIELD_LOCATION);
34033408 profileGaugeForce.TPSTOP (QUDA_PROFILE_INIT);
34043409 }
34053410
@@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
34093414 path_length, loop_coeff, num_paths, max_length);
34103415 profileGaugeForce.TPSTOP (QUDA_PROFILE_COMPUTE);
34113416
3412- if (qudaGaugeParam->return_mom ) {
3417+ if (qudaGaugeParam->return_result_mom ) {
34133418 profileGaugeForce.TPSTART (QUDA_PROFILE_D2H);
34143419 cudaMom->saveCPUField (*cpuMom, QUDA_CPU_FIELD_LOCATION);
34153420 profileGaugeForce.TPSTOP (QUDA_PROFILE_D2H);
@@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
43184323 updateMomentum (*momResident, 1.0 , *cudaMom);
43194324 }
43204325
4321- if (gParam ->return_mom ) {
4326+ if (gParam ->return_result_mom ) {
43224327 profileHISQForce.TPSTART (QUDA_PROFILE_D2H);
43234328 // Close the paths, make anti-hermitian, and store in compressed format
4324- if (gParam ->return_mom ) cudaMom->saveCPUField (*cpuMom, QUDA_CPU_FIELD_LOCATION);
4329+ if (gParam ->return_result_mom ) cudaMom->saveCPUField (*cpuMom, QUDA_CPU_FIELD_LOCATION);
43254330 profileHISQForce.TPSTOP (QUDA_PROFILE_D2H);
43264331 }
43274332
@@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,
43654370
43664371#ifdef GPU_STAGGERED_OPROD
43674372#ifndef BUILD_QDP_INTERFACE
4368- #error "Staggerd oprod requires BUILD_QDP_INTERFACE";
4373+ #error "Staggered oprod requires BUILD_QDP_INTERFACE";
43694374#endif
43704375 using namespace quda ;
43714376 profileStaggeredOprod.TPSTART (QUDA_PROFILE_TOTAL);
@@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
48254830 gParam .reconstruct = QUDA_RECONSTRUCT_NO;
48264831 gParam .gauge = gauge;
48274832 gParam .ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4828- cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField (gParam ) : NULL ;
4833+ bool need_cpu = !param->use_resident_gauge || param->return_result_gauge ;
4834+ cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField (gParam ) : NULL ;
48294835
48304836 gParam .reconstruct = gParam .order == QUDA_TIFR_GAUGE_ORDER ?
48314837 QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
@@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
48754881 (bool )conj_mom, (bool )exact);
48764882 profileGaugeUpdate.TPSTOP (QUDA_PROFILE_COMPUTE);
48774883
4878- if (param->return_gauge ) {
4884+ if (param->return_result_gauge ) {
48794885 // copy the gauge field back to the host
48804886 profileGaugeUpdate.TPSTART (QUDA_PROFILE_D2H);
48814887 cudaOutGauge->saveCPUField (*cpuGauge, QUDA_CPU_FIELD_LOCATION);
@@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
49234929 gParam .reconstruct = QUDA_RECONSTRUCT_NO;
49244930 gParam .link_type = QUDA_GENERAL_LINKS;
49254931 gParam .gauge = gauge_h;
4926- bool need_cpu = !param->use_resident_gauge || param->return_gauge ;
4932+ bool need_cpu = !param->use_resident_gauge || param->return_result_gauge ;
49274933 cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField (gParam ) : NULL ;
49284934
49294935 // create the device fields
@@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
49544960 errorQuda (" Error in the SU(3) unitarization: %d failures\n " , *num_failures_h);
49554961
49564962 profileProject.TPSTART (QUDA_PROFILE_D2H);
4957- if (param->return_gauge ) cudaGauge->saveCPUField (*cpuGauge, QUDA_CPU_FIELD_LOCATION);
4963+ if (param->return_result_gauge ) cudaGauge->saveCPUField (*cpuGauge, QUDA_CPU_FIELD_LOCATION);
49584964 profileProject.TPSTOP (QUDA_PROFILE_D2H);
49594965
49604966 if (param->make_resident_gauge ) {
@@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
49854991 gParam .reconstruct = QUDA_RECONSTRUCT_NO;
49864992 gParam .link_type = QUDA_GENERAL_LINKS;
49874993 gParam .gauge = gauge_h;
4988- bool need_cpu = !param->use_resident_gauge || param->return_gauge ;
4994+ bool need_cpu = !param->use_resident_gauge || param->return_result_gauge ;
49894995 cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField (gParam ) : NULL ;
49904996
49914997 // create the device fields
@@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
50145020 profilePhase.TPSTOP (QUDA_PROFILE_COMPUTE);
50155021
50165022 profilePhase.TPSTART (QUDA_PROFILE_D2H);
5017- if (param->return_gauge ) cudaGauge->saveCPUField (*cpuGauge, QUDA_CPU_FIELD_LOCATION);
5023+ if (param->return_result_gauge ) cudaGauge->saveCPUField (*cpuGauge, QUDA_CPU_FIELD_LOCATION);
50185024 profilePhase.TPSTOP (QUDA_PROFILE_D2H);
50195025
50205026 if (param->make_resident_gauge ) {
0 commit comments