Skip to content

Commit 9425ca6

Browse files
committed
Merge pull request #434 from lattice/hotfix/0.8_update
added date to NEWS,README file and fixed some typos
2 parents d38ed85 + d744cc1 commit 9425ca6

File tree

8 files changed

+82
-38
lines changed

8 files changed

+82
-38
lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
Copyright (c) 2009-2015 QUDA Developers
2+
Copyright (c) 2009-2016 QUDA Developers
33

44
Permission is hereby granted, free of charge, to any person obtaining a copy
55
of this software and associated documentation files (the "Software"), to deal

NEWS

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
Version 0.8.0 - xxth December 2015
1+
Version 0.8.0 - 1st February 2016
22

33
- Removed all Tesla-generation GPU support from QUDA (sm_1x). As a
44
result, QUDA now requires a minimum of the Fermi-generation GPUs.
55

66
- Added support for building QUDA using cmake. This gives a much more
77
flexible and extensible build system as well as allowing
8-
out-of-source-directory building.
8+
out-of-source-directory building. For details see:
9+
https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake
910

1011
- Improved strong scaling of the multi-shift solver by overlapping the
1112
shift updates with the subsequent iteration's dslash comms waiting.
1213

13-
- Improved performance of multi-shift solver by preventing unecessary
14+
- Improved performance of multi-shift solver by preventing unnecessary
1415
refinement of shifted solutions once the residual falls below
1516
floating point precision.
1617

@@ -45,9 +46,9 @@ Version 0.8.0 - xxth December 2015
4546
force kernels. This also improves compilation time and reduces
4647
library size.
4748

48-
- Added support for imaginary chemical potential to the staggeed phase
49+
- Added support for imaginary chemical potential to the staggered phase
4950
application / removal kernel, as well as fixing bugs in this
50-
reoutine.
51+
routine.
5152

5253
- Algorithms that previously used double-precision atomics now use a
5354
cub reduction. This drastically improves performance of such

README

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Release Notes for QUDA v0.8.0 xxth December 2015
1+
Release Notes for QUDA v0.8.0 1st February 2016
22
-----------------------------
33

44
Overview:

include/quda.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,14 @@ extern "C" {
6868

6969
int overlap; /**< Width of overlapping domains */
7070

71-
int use_resident_gauge; /**< Use the resident gauge field */
72-
int use_resident_mom; /**< Use the resident mom field */
73-
int make_resident_gauge; /**< Make the gauge field resident */
74-
int make_resident_mom; /**< Make the mom field resident */
75-
int return_gauge; /**< Return the new gauge field */
76-
int return_mom; /**< Return the new mom field */
71+
int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */
72+
73+
int use_resident_gauge; /**< Use the resident gauge field as input */
74+
int use_resident_mom; /**< Use the resident momentum field as input */
75+
int make_resident_gauge; /**< Make the result gauge field resident */
76+
int make_resident_mom; /**< Make the result momentum field resident */
77+
int return_result_gauge; /**< Return the result gauge field */
78+
int return_result_mom; /**< Return the result momentum field */
7779

7880
} QudaGaugeParam;
7981

lib/check_params.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,19 +109,21 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
109109
#endif
110110

111111
#if defined INIT_PARAM
112+
P(overwrite_mom, 0);
112113
P(use_resident_gauge, 0);
113114
P(use_resident_mom, 0);
114115
P(make_resident_gauge, 0);
115116
P(make_resident_mom, 0);
116-
P(return_gauge, 1);
117-
P(return_mom, 1);
117+
P(return_result_gauge, 1);
118+
P(return_result_mom, 1);
118119
#else
120+
P(overwrite_mom, INVALID_INT);
119121
P(use_resident_gauge, INVALID_INT);
120122
P(use_resident_mom, INVALID_INT);
121123
P(make_resident_gauge, INVALID_INT);
122124
P(make_resident_mom, INVALID_INT);
123-
P(return_gauge, INVALID_INT);
124-
P(return_mom, INVALID_INT);
125+
P(return_result_gauge, INVALID_INT);
126+
P(return_result_mom, INVALID_INT);
125127
#endif
126128

127129
#ifdef INIT_PARAM

lib/interface_quda.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,6 @@ extern char* gitversion;
357357
* Set the device that QUDA uses.
358358
*/
359359
void initQudaDevice(int dev) {
360-
361360
//static bool initialized = false;
362361
if (initialized) return;
363362
initialized = true;
@@ -434,13 +433,17 @@ void initQudaDevice(int dev) {
434433
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
435434
//cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
436435
cudaGetDeviceProperties(&deviceProp, dev);
436+
437+
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
437438
}
438439

439440
/*
440441
* Any persistent memory allocations that QUDA uses are done here.
441442
*/
442443
void initQudaMemory()
443444
{
445+
profileInit.TPSTART(QUDA_PROFILE_TOTAL);
446+
444447
if (!comms_initialized) init_default_comms();
445448

446449
streams = new cudaStream_t[Nstream];
@@ -470,6 +473,8 @@ void initQudaMemory()
470473
cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);
471474

472475
loadTuneCache(getVerbosity());
476+
477+
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
473478
}
474479

475480
void initQuda(int dev)
@@ -489,8 +494,6 @@ void initQuda(int dev)
489494
pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
490495
pthread_mutex_init(&pthread_mutex, &mutex_attr);
491496
#endif
492-
493-
profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
494497
}
495498

496499

@@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
33913394
if (qudaGaugeParam->use_resident_mom) {
33923395
if (!gaugePrecise) errorQuda("No resident momentum field to use");
33933396
cudaMom = momResident;
3397+
if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
33943398
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
33953399
} else {
3396-
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
3400+
gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
33973401
gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
33983402
gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
33993403
gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
34003404
gParamMom.precision = qudaGaugeParam->cuda_prec;
34013405
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
34023406
cudaMom = new cudaGaugeField(gParamMom);
3407+
if (!qudaGaugeParam->overwrite_mom) cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
34033408
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
34043409
}
34053410

@@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
34093414
path_length, loop_coeff, num_paths, max_length);
34103415
profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
34113416

3412-
if (qudaGaugeParam->return_mom) {
3417+
if (qudaGaugeParam->return_result_mom) {
34133418
profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
34143419
cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
34153420
profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
@@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
43184323
updateMomentum(*momResident, 1.0, *cudaMom);
43194324
}
43204325

4321-
if (gParam->return_mom) {
4326+
if (gParam->return_result_mom) {
43224327
profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
43234328
// Close the paths, make anti-hermitian, and store in compressed format
4324-
if (gParam->return_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
4329+
if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
43254330
profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
43264331
}
43274332

@@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,
43654370

43664371
#ifdef GPU_STAGGERED_OPROD
43674372
#ifndef BUILD_QDP_INTERFACE
4368-
#error "Staggerd oprod requires BUILD_QDP_INTERFACE";
4373+
#error "Staggered oprod requires BUILD_QDP_INTERFACE";
43694374
#endif
43704375
using namespace quda;
43714376
profileStaggeredOprod.TPSTART(QUDA_PROFILE_TOTAL);
@@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
48254830
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
48264831
gParam.gauge = gauge;
48274832
gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
4828-
cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField(gParam) : NULL;
4833+
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
4834+
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
48294835

48304836
gParam.reconstruct = gParam.order == QUDA_TIFR_GAUGE_ORDER ?
48314837
QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
@@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
48754881
(bool)conj_mom, (bool)exact);
48764882
profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);
48774883

4878-
if (param->return_gauge) {
4884+
if (param->return_result_gauge) {
48794885
// copy the gauge field back to the host
48804886
profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
48814887
cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
@@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
49234929
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
49244930
gParam.link_type = QUDA_GENERAL_LINKS;
49254931
gParam.gauge = gauge_h;
4926-
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
4932+
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
49274933
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
49284934

49294935
// create the device fields
@@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
49544960
errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
49554961

49564962
profileProject.TPSTART(QUDA_PROFILE_D2H);
4957-
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
4963+
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
49584964
profileProject.TPSTOP(QUDA_PROFILE_D2H);
49594965

49604966
if (param->make_resident_gauge) {
@@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
49854991
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
49864992
gParam.link_type = QUDA_GENERAL_LINKS;
49874993
gParam.gauge = gauge_h;
4988-
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
4994+
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
49894995
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;
49904996

49914997
// create the device fields
@@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
50145020
profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
50155021

50165022
profilePhase.TPSTART(QUDA_PROFILE_D2H);
5017-
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
5023+
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
50185024
profilePhase.TPSTOP(QUDA_PROFILE_D2H);
50195025

50205026
if (param->make_resident_gauge) {

lib/milc_interface.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -293,11 +293,11 @@ void qudaHisqForce(int prec, const double level2_coeff[6], const double fat7_coe
293293
if (!invalidate_quda_mom) {
294294
gParam.use_resident_mom = true;
295295
gParam.make_resident_mom = true;
296-
gParam.return_mom = false;
296+
gParam.return_result_mom = false;
297297
} else {
298298
gParam.use_resident_mom = false;
299299
gParam.make_resident_mom = false;
300-
gParam.return_mom = true;
300+
gParam.return_result_mom = true;
301301
}
302302

303303
long long flops;
@@ -546,11 +546,20 @@ void qudaGaugeForce( int precision,
546546
if (!invalidate_quda_mom) {
547547
qudaGaugeParam.use_resident_mom = true;
548548
qudaGaugeParam.make_resident_mom = true;
549-
qudaGaugeParam.return_mom = false;
549+
qudaGaugeParam.return_result_mom = false;
550+
551+
// this means when we compute the momentum, we accumulate to the
552+
// preexisting resident momentum instead of overwriting it
553+
qudaGaugeParam.overwrite_mom = false;
550554
} else {
551555
qudaGaugeParam.use_resident_mom = false;
552556
qudaGaugeParam.make_resident_mom = false;
553-
qudaGaugeParam.return_mom = true;
557+
qudaGaugeParam.return_result_mom = true;
558+
559+
// this means we compute momentum into a fresh field, copy it back
560+
// and sum to current momentum in MILC. This saves an initial
561+
// CPU->GPU download of the current momentum.
562+
qudaGaugeParam.overwrite_mom = true;
554563
}
555564

556565
int max_length = 6;

lib/quda_fortran.F90

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,21 @@ module quda_fortran
5959
! Whether the staggered phase has already been applied to the links
6060
integer(4) :: staggered_phase_applied
6161

62+
! Imaginary chemical potential
63+
real(8) :: i_mu
64+
6265
integer(4) :: overlap ! width of domain overlap
6366

67+
! When computing momentum, should we overwrite it or accumulate
68+
! to it (only presently supported in gauge-force)
69+
integer(4) :: overwrite_mom
70+
6471
integer(4) :: use_resident_gauge ! Use the resident gauge field
65-
integer(4) :: use_resident_mom ! Use the resident mom field
66-
integer(4) :: make_resident_gauge ! Make the gauge field resident
67-
integer(4) :: make_resident_mom ! Make the mom field resident
72+
integer(4) :: use_resident_mom ! Use the resident momentum field
73+
integer(4) :: make_resident_gauge ! Make the result gauge field resident
74+
integer(4) :: make_resident_mom ! Make the result momentum field resident
75+
integer(4) :: return_result_gauge ! Return the result gauge field
76+
integer(4) :: return_result_mom ! Return the result momentum field
6877

6978
end type quda_gauge_param
7079

@@ -114,6 +123,9 @@ module quda_fortran
114123
! Actual L2 residual norm achieved in solver for each offset
115124
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_offset
116125

126+
! Iterated L2 residual achieved in multi shift solver for each offset
127+
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: iter_res_offset
128+
117129
! Actual heavy quark residual norm achieved in solver for each offset
118130
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_hq_offset
119131

@@ -206,6 +218,18 @@ module quda_fortran
206218
integer(4)::max_search_dim ! for magma library this parameter must be multiple 16?
207219
integer(4)::rhs_idx
208220
integer(4)::deflation_grid !total deflation space is nev*deflation_grid
221+
integer(4)::use_reduced_vector_set ! eigCG: specifies whether to use reduced eigenvector set
222+
real(8):: eigenval_tol ! eigCG: selection criterion for the reduced eigenvector set
223+
integer(4)::use_cg_updates ! mixed precision eigCG:whether to use cg refinement corrections in the incremental stage
224+
real(8)::cg_iterref_tol ! mixed precision eigCG: tolerance for cg refinement corrections in the incremental stage
225+
integer(4)::eigcg_max_restarts ! mixed precision eigCG tuning parameter: minimum search vector space restarts
226+
integer(4)::max_restart_num ! initCG tuning parameter: maximum restarts
227+
real(8)::inc_tol ! initCG tuning parameter: decrease in absolute value of the residual within each restart cycle
228+
229+
! Parameters for setting data residency of the solver
230+
integer(8)::make_resident_solution ! Whether to make the solution vector(s) resident after the solve
231+
integer(8)::use_resident_solution ! Whether to use the resident solution vector(s)
232+
209233
end type quda_invert_param
210234

211235
end module quda_fortran

0 commit comments

Comments
 (0)