Merge pull request #434 from lattice/hotfix/0.8_update

mathiaswagner · mathiaswagner · commit 9425ca6f301b · 2016-02-01T13:35:53.000+01:00
added date to NEWS,README file and fixed some typos
diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,5 @@
 
-Copyright (c) 2009-2015 QUDA Developers
+Copyright (c) 2009-2016 QUDA Developers
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/NEWS b/NEWS
@@ -1,16 +1,17 @@
-Version 0.8.0 - xxth December 2015
+Version 0.8.0 - 1st February 2016
 
 - Removed all Tesla-generation GPU support from QUDA (sm_1x).  As a
   result, QUDA now requires a minimum of the Fermi-generation GPUs.
 
 - Added support for building QUDA using cmake.  This gives a much more
   flexible and extensible build system as well as allowing
-  out-of-source-directory building.
+  out-of-source-directory building. For details see:
+  https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake
 
 - Improved strong scaling of the multi-shift solver by overlapping the
   shift updates with the subsequent iteration's dslash comms waiting.
 
-- Improved performance of multi-shift solver by preventing unecessary
+- Improved performance of multi-shift solver by preventing unnecessary
   refinement of shifted solutions once the residual falls below
   floating point precision.
 
@@ -45,9 +46,9 @@ Version 0.8.0 - xxth December 2015
   force kernels.  This also improves compilation time and reduces
   library size.
 
-- Added support for imaginary chemical potential to the staggeed phase
+- Added support for imaginary chemical potential to the staggered phase
   application / removal kernel, as well as fixing bugs in this
-  reoutine.
+  routine.
 
 - Algorithms that previously used double-precision atomics now use a
   cub reduction.  This drastically improves performance of such
diff --git a/README b/README
@@ -1,4 +1,4 @@
-Release Notes for QUDA v0.8.0                         xxth December 2015
+Release Notes for QUDA v0.8.0                         1st February 2016
 -----------------------------
 
 Overview:
diff --git a/include/quda.h b/include/quda.h
@@ -68,12 +68,14 @@ extern "C" {
 
     int overlap; /**< Width of overlapping domains */
 
-    int use_resident_gauge;  /**< Use the resident gauge field */
-    int use_resident_mom;    /**< Use the resident mom field */
-    int make_resident_gauge; /**< Make the gauge field resident */
-    int make_resident_mom;   /**< Make the mom field resident */
-    int return_gauge;        /**< Return the new gauge field */
-    int return_mom;          /**< Return the new mom field */
+    int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */
+
+    int use_resident_gauge;  /**< Use the resident gauge field as input */
+    int use_resident_mom;    /**< Use the resident momentum field as input */
+    int make_resident_gauge; /**< Make the result gauge field resident */
+    int make_resident_mom;   /**< Make the result momentum field resident */
+    int return_result_gauge; /**< Return the result gauge field */
+    int return_result_mom;   /**< Return the result momentum field */
 
   } QudaGaugeParam;
 
diff --git a/lib/check_params.h b/lib/check_params.h
@@ -109,19 +109,21 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
 #endif
 
 #if defined INIT_PARAM
+  P(overwrite_mom, 0);
   P(use_resident_gauge, 0);
   P(use_resident_mom, 0);
   P(make_resident_gauge, 0);
   P(make_resident_mom, 0);
-  P(return_gauge, 1);
-  P(return_mom, 1);
+  P(return_result_gauge, 1);
+  P(return_result_mom, 1);
 #else
+  P(overwrite_mom, INVALID_INT);
   P(use_resident_gauge, INVALID_INT);
   P(use_resident_mom, INVALID_INT);
   P(make_resident_gauge, INVALID_INT);
   P(make_resident_mom, INVALID_INT);
-  P(return_gauge, INVALID_INT);
-  P(return_mom, INVALID_INT);
+  P(return_result_gauge, INVALID_INT);
+  P(return_result_mom, INVALID_INT);
 #endif
 
 #ifdef INIT_PARAM
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
@@ -357,7 +357,6 @@ extern char* gitversion;
  * Set the device that QUDA uses.
  */
 void initQudaDevice(int dev) {
-
   //static bool initialized = false;
   if (initialized) return;
   initialized = true;
@@ -434,13 +433,17 @@ void initQudaDevice(int dev) {
   cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
   //cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
   cudaGetDeviceProperties(&deviceProp, dev);
+
+  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 /*
  * Any persistent memory allocations that QUDA uses are done here.
  */
 void initQudaMemory()
 {
+  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+
   if (!comms_initialized) init_default_comms();
 
   streams = new cudaStream_t[Nstream];
@@ -470,6 +473,8 @@ void initQudaMemory()
   cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);
 
   loadTuneCache(getVerbosity());
+
+  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void initQuda(int dev)
@@ -489,8 +494,6 @@ void initQuda(int dev)
   pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
   pthread_mutex_init(&pthread_mutex, &mutex_attr);
 #endif
-
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 
@@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   if (qudaGaugeParam->use_resident_mom) {
     if (!gaugePrecise) errorQuda("No resident momentum field to use");
     cudaMom = momResident;
+    if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
   } else {
-    gParamMom.create = QUDA_ZERO_FIELD_CREATE;  
+    gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
     gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
     gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.precision = qudaGaugeParam->cuda_prec;
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
     cudaMom = new cudaGaugeField(gParamMom);
+    if (!qudaGaugeParam->overwrite_mom) cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
   }
 
@@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
       path_length, loop_coeff, num_paths, max_length);
   profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  if (qudaGaugeParam->return_mom) {
+  if (qudaGaugeParam->return_result_mom) {
     profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
     cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
     profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
@@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
     updateMomentum(*momResident, 1.0, *cudaMom);
   }
 
-  if (gParam->return_mom) {
+  if (gParam->return_result_mom) {
     profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
     // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
+    if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
     profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
   }
 
@@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,
 
 #ifdef  GPU_STAGGERED_OPROD
 #ifndef BUILD_QDP_INTERFACE
-#error "Staggerd oprod requires BUILD_QDP_INTERFACE";
+#error "Staggered oprod requires BUILD_QDP_INTERFACE";
 #endif
   using namespace quda;
   profileStaggeredOprod.TPSTART(QUDA_PROFILE_TOTAL);
@@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.gauge = gauge;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField(gParam) : NULL;
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
 
   gParam.reconstruct = gParam.order == QUDA_TIFR_GAUGE_ORDER ? 
    QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
@@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
       (bool)conj_mom, (bool)exact);
   profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  if (param->return_gauge) {
+  if (param->return_result_gauge) {
     // copy the gauge field back to the host
     profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
     cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
@@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
    gParam.reconstruct = QUDA_RECONSTRUCT_NO;
    gParam.link_type = QUDA_GENERAL_LINKS;
    gParam.gauge = gauge_h;
-   bool need_cpu = !param->use_resident_gauge || param->return_gauge;
+   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
    
    // create the device fields
@@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
      errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
    
    profileProject.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
+   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
    profileProject.TPSTOP(QUDA_PROFILE_D2H);
 
    if (param->make_resident_gauge) {
@@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
    gParam.reconstruct = QUDA_RECONSTRUCT_NO;
    gParam.link_type = QUDA_GENERAL_LINKS;
    gParam.gauge = gauge_h;
-   bool need_cpu = !param->use_resident_gauge || param->return_gauge;
+   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
    cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
    
    // create the device fields
@@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
    profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
    
    profilePhase.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
+   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
    profilePhase.TPSTOP(QUDA_PROFILE_D2H);
 
    if (param->make_resident_gauge) {
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
@@ -293,11 +293,11 @@ void qudaHisqForce(int prec, const double level2_coeff[6], const double fat7_coe
   if (!invalidate_quda_mom) {
     gParam.use_resident_mom = true;
     gParam.make_resident_mom = true;
-    gParam.return_mom = false;
+    gParam.return_result_mom = false;
   } else {
     gParam.use_resident_mom = false;
     gParam.make_resident_mom = false;
-    gParam.return_mom = true;
+    gParam.return_result_mom = true;
   }
 
   long long flops;
@@ -546,11 +546,20 @@ void qudaGaugeForce( int precision,
   if (!invalidate_quda_mom) {
     qudaGaugeParam.use_resident_mom = true;
     qudaGaugeParam.make_resident_mom = true;
-    qudaGaugeParam.return_mom = false;
+    qudaGaugeParam.return_result_mom = false;
+
+    // this means when we compute the momentum, we accumulate to the
+    // preexisting resident momentum instead of overwriting it
+    qudaGaugeParam.overwrite_mom = false;
   } else {
     qudaGaugeParam.use_resident_mom = false;
     qudaGaugeParam.make_resident_mom = false;
-    qudaGaugeParam.return_mom = true;
+    qudaGaugeParam.return_result_mom = true;
+
+    // this means we compute momentum into a fresh field, copy it back
+    // and sum to current momentum in MILC.  This saves an initial
+    // CPU->GPU download of the current momentum.
+    qudaGaugeParam.overwrite_mom = true;
   }
 
   int max_length = 6;
diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90
@@ -59,12 +59,21 @@ module quda_fortran
      ! Whether the staggered phase has already been applied to the links
      integer(4) :: staggered_phase_applied
 
+     ! Imaginary chemical potential
+     real(8) :: i_mu
+
      integer(4) :: overlap ! width of domain overlap
 
+     ! When computing momentum, should we overwrite it or accumulate
+     ! to it (only presently supported in gauge-force)
+     integer(4) :: overwrite_mom
+
      integer(4) :: use_resident_gauge  ! Use the resident gauge field
-     integer(4) :: use_resident_mom    ! Use the resident mom field
-     integer(4) :: make_resident_gauge ! Make the gauge field resident
-     integer(4) :: make_resident_mom   ! Make the mom field resident
+     integer(4) :: use_resident_mom    ! Use the resident momentum field
+     integer(4) :: make_resident_gauge ! Make the result gauge field resident
+     integer(4) :: make_resident_mom   ! Make the result momentum field resident
+     integer(4) :: return_result_gauge ! Return the result gauge field
+     integer(4) :: return_result_mom   ! Return the result momentum field
 
   end type quda_gauge_param
 
@@ -114,6 +123,9 @@ module quda_fortran
      ! Actual L2 residual norm achieved in solver for each offset
      real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_offset
 
+     ! Iterated L2 residual achieved in multi shift solver for each offset
+     real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: iter_res_offset
+
      ! Actual heavy quark residual norm achieved in solver for each offset
      real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_hq_offset
 
@@ -206,6 +218,18 @@ module quda_fortran
      integer(4)::max_search_dim ! for magma library this parameter must be multiple 16?
      integer(4)::rhs_idx
      integer(4)::deflation_grid !total deflation space is nev*deflation_grid
+     integer(4)::use_reduced_vector_set ! eigCG: specifies whether to use reduced eigenvector set
+     real(8):: eigenval_tol ! eigCG: selection criterion for the reduced eigenvector set
+     integer(4)::use_cg_updates ! mixed precision eigCG:whether to use cg refinement corrections in the incremental stage
+     real(8)::cg_iterref_tol ! mixed precision eigCG:  tolerance for cg refinement corrections in the incremental stage
+     integer(4)::eigcg_max_restarts ! mixed precision eigCG tuning parameter:  minimum search vector space restarts
+     integer(4)::max_restart_num     ! initCG tuning parameter:  maximum restarts
+     real(8)::inc_tol     ! initCG tuning parameter:  decrease in absolute value of the residual within each restart cycle
+
+     ! Parameters for setting data residency of the solver
+     integer(8)::make_resident_solution ! Whether to make the solution vector(s) resident after the solve
+     integer(8)::use_resident_solution  ! Whether to use the resident solution vector(s)
+
   end type quda_invert_param
 
 end module quda_fortran

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-Release Notes for QUDA v0.8.0 xxth December 2015`
	`1`	`+Release Notes for QUDA v0.8.0 1st February 2016`
`2`	`2`	`-----------------------------`
`3`	`3`
`4`	`4`	`Overview:`