Skip to content

Commit a7b4357

Browse files
author
Vasileios Karakasis
authored
Merge branch 'master' into bugfix/print_null_config_default
2 parents 4ffeb08 + fd59db9 commit a7b4357

File tree

9 files changed

+73
-57
lines changed

9 files changed

+73
-57
lines changed
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1+
all: dgemm sgemm
12
dgemm:
2-
nvcc $@.cu -o $@.x ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
3+
nvcc xgemm.cu -o $@.x -DGEMM_TYPE=double -DXBLAS_GEMM=XblasDgemm ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
4+
sgemm:
5+
nvcc xgemm.cu -o $@.x -DGEMM_TYPE=float -DXBLAS_GEMM=XblasSgemm ${CXXFLAGS} -lnvidia-ml -lcublas -std=c++14
6+
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
ROCM_ROOT?=/opt/rocm
22
RSMI_ROOT?=/opt/rocm/rocm_smi
33

4+
all: dgemm sgemm
5+
46
dgemm:
5-
hipcc -O3 $@.cu -o $@.x -DTARGET_HIP ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas
7+
hipcc -O3 xgemm.cu -o $@.x -DTARGET_HIP -DGEMM_TYPE=double -DXBLAS_GEMM=XblasDgemm ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas
8+
9+
sgemm:
10+
hipcc -O3 xgemm.cu -o $@.x -DTARGET_HIP -DGEMM_TYPE=float -DXBLAS_GEMM=XblasSgemm ${CXXFLAGS} -std=c++14 -I${ROCM_ROOT} -I${RSMI_ROOT}/include -lnuma -lrocm_smi64 -lrocblas

hpctestlib/microbenchmarks/gpu/dgemm/src/dgemm.cu renamed to hpctestlib/microbenchmarks/gpu/dgemm/src/xgemm.cu

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -55,23 +55,25 @@ double tflops = SIZE*SIZE*SIZE*2.0 * 1E-12;
5555
int totalErrors = 0;
5656
std::mutex mtx;
5757

58+
5859
#define BLOCK_SIZE 128
59-
void dgemm(int device)
60+
template<class T>
61+
void xgemm_test(int device)
6062
{
6163
XSetDevice(device);
6264

63-
double * A;
64-
double * B;
65-
double * C;
66-
const double alpha = 1.0;
67-
const double beta = 0.0;
65+
T * A;
66+
T * B;
67+
T * C;
68+
const T alpha = 1.0;
69+
const T beta = 0.0;
6870

69-
XMalloc((void**)&A, sizeof(double)*SIZE*SIZE);
70-
XMalloc((void**)&B, sizeof(double)*SIZE*SIZE);
71-
XMalloc((void**)&C, sizeof(double)*SIZE*SIZE);
71+
XMalloc((void**)&A, sizeof(T)*SIZE*SIZE);
72+
XMalloc((void**)&B, sizeof(T)*SIZE*SIZE);
73+
XMalloc((void**)&C, sizeof(T)*SIZE*SIZE);
7274

73-
kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
74-
kernels::init_as_ones<double><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
75+
kernels::init_as_ones<T><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(A, SIZE*SIZE);
76+
kernels::init_as_ones<T><<<(SIZE*SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(B, SIZE*SIZE);
7577
XDeviceSynchronize();
7678

7779
XStream_t stream;
@@ -81,12 +83,13 @@ void dgemm(int device)
8183
XblasSetStream(blas_handle, stream);
8284

8385
// Warmup call
84-
XblasDgemm(blas_handle,
86+
// define either as XblasDgemm or XblasSgemm
87+
XBLAS_GEMM(blas_handle,
8588
XBLAS_OP_N, XBLAS_OP_N,
8689
SIZE, SIZE, SIZE,
8790
&alpha,
88-
(const double*)A, SIZE,
89-
(const double*)B, SIZE,
91+
(const T*)A, SIZE,
92+
(const T*)B, SIZE,
9093
&beta,
9194
C, SIZE);
9295
XDeviceSynchronize();
@@ -96,12 +99,13 @@ void dgemm(int device)
9699
t.start();
97100
for (int i = 0; i < REPEAT; i++)
98101
{
99-
XblasDgemm(blas_handle,
102+
// define either as XblasDgemm or XblasSgemm
103+
XBLAS_GEMM(blas_handle,
100104
XBLAS_OP_N, XBLAS_OP_N,
101105
SIZE, SIZE, SIZE,
102106
&alpha,
103-
(const double*)A, SIZE,
104-
(const double*)B, SIZE,
107+
(const T*)A, SIZE,
108+
(const T*)B, SIZE,
105109
&beta,
106110
C, SIZE);
107111
}
@@ -116,7 +120,7 @@ void dgemm(int device)
116120
int * err, h_err = 0;
117121
XMalloc((void**)&err, sizeof(int));
118122
XMemcpy( err, &h_err, sizeof(int), XMemcpyHostToDevice);
119-
kernels::verify<double><<<(SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
123+
kernels::verify<T><<<(SIZE+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>>(C, SIZE*SIZE, err);
120124
XMemcpy(&h_err, err, sizeof(int), XMemcpyDeviceToHost);
121125
{
122126
std::lock_guard<std::mutex> lg(mtx);
@@ -145,10 +149,11 @@ int main(int argc, char **argv)
145149
// Create vector of threads.
146150
std::vector<std::thread> threads;
147151

152+
148153
// Do the dgemm for all devices in the node.
149154
for (int device = 0; device < num_devices; device++)
150155
{
151-
threads.push_back(std::thread(dgemm,device));
156+
threads.push_back(std::thread(xgemm_test<GEMM_TYPE>,device));
152157
}
153158

154159
// Join all threads

reframe/core/pipeline.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,8 +1518,7 @@ def _clone_to_stagedir(self, url):
15181518
self.logger.debug(f'Cloning URL {url} into stage directory')
15191519
osext.git_clone(
15201520
self.sourcesdir, self._stagedir,
1521-
# FIXME: cast to float explicitly due to GH #2246
1522-
timeout=float(rt.runtime().get_option('general/0/git_timeout'))
1521+
timeout=rt.runtime().get_option('general/0/git_timeout')
15231522
)
15241523

15251524
@final

reframe/frontend/argparse.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __getattr__(self, name):
8282
if name not in self.__option_map:
8383
return ret
8484

85-
envvar, _, action = self.__option_map[name]
85+
envvar, _, action, arg_type = self.__option_map[name]
8686
if ret is None and envvar is not None:
8787
# Try the environment variable
8888
envvar, *delim = envvar.split(maxsplit=2)
@@ -99,6 +99,14 @@ def __getattr__(self, name):
9999
raise ValueError(
100100
f'environment variable {envvar!r} not a boolean'
101101
) from None
102+
elif action == 'store' and arg_type != str:
103+
try:
104+
ret = arg_type(ret)
105+
except ValueError as err:
106+
raise ValueError(
107+
f'cannot convert environment variable {envvar!r} '
108+
f'to {arg_type.__name__!r}'
109+
) from err
102110

103111
return ret
104112

@@ -107,7 +115,7 @@ def update_config(self, site_config):
107115
namespace'''
108116
errors = []
109117
for option, spec in self.__option_map.items():
110-
_, confvar, action = spec
118+
confvar, action = spec[1:3]
111119
if action == 'version' or confvar is None:
112120
continue
113121

@@ -174,7 +182,8 @@ def add_argument(self, *flags, **kwargs):
174182
self._option_map[opt_name] = (
175183
kwargs.get('envvar', None),
176184
kwargs.get('configvar', None),
177-
kwargs.get('action', 'store')
185+
kwargs.get('action', 'store'),
186+
kwargs.get('type', str)
178187
)
179188
# Remove envvar and configvar keyword arguments and force dest
180189
# argument, even if we guessed it, in order to guard against changes

reframe/frontend/cli.py

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -391,11 +391,11 @@ def main():
391391
run_options.add_argument(
392392
'--max-retries', metavar='NUM', action='store', default=0,
393393
help='Set the maximum number of times a failed regression test '
394-
'may be retried (default: 0)'
394+
'may be retried (default: 0)', type=int
395395
)
396396
run_options.add_argument(
397397
'--maxfail', metavar='NUM', action='store', default=sys.maxsize,
398-
help='Exit after first NUM failures'
398+
help='Exit after first NUM failures', type=int
399399
)
400400
run_options.add_argument(
401401
'--mode', action='store', help='Execution mode to use'
@@ -527,8 +527,10 @@ def main():
527527
dest='git_timeout',
528528
envvar='RFM_GIT_TIMEOUT',
529529
configvar='general/git_timeout',
530+
action='store',
530531
help=('Timeout in seconds when checking if the url is a '
531-
'valid repository.')
532+
'valid repository.'),
533+
type=float
532534
)
533535
argparser.add_argument(
534536
dest='graylog_server',
@@ -568,7 +570,8 @@ def main():
568570
envvar='RFM_PIPELINE_TIMEOUT',
569571
configvar='general/pipeline_timeout',
570572
action='store',
571-
help='Timeout for advancing the pipeline'
573+
help='Timeout for advancing the pipeline',
574+
type=float
572575
)
573576
argparser.add_argument(
574577
dest='remote_detect',
@@ -1175,26 +1178,14 @@ def module_unuse(*paths):
11751178
parsed_job_options.append(f'--{optstr} {valstr}')
11761179

11771180
exec_policy.sched_options = parsed_job_options
1178-
try:
1179-
max_retries = int(options.max_retries)
1180-
except ValueError:
1181-
raise errors.ConfigError(
1182-
f'--max-retries is not a valid integer: {max_retries}'
1183-
) from None
1184-
1185-
try:
1186-
max_failures = int(options.maxfail)
1187-
if max_failures < 0:
1188-
raise errors.ConfigError(
1189-
f'--maxfail should be a non-negative integer: '
1190-
f'{options.maxfail!r}'
1191-
)
1192-
except ValueError:
1181+
if options.maxfail < 0:
11931182
raise errors.ConfigError(
1194-
f'--maxfail is not a valid integer: {options.maxfail!r}'
1195-
) from None
1183+
f'--maxfail should be a non-negative integer: '
1184+
f'{options.maxfail!r}'
1185+
)
11961186

1197-
runner = Runner(exec_policy, printer, max_retries, max_failures)
1187+
runner = Runner(exec_policy, printer, options.max_retries,
1188+
options.maxfail)
11981189
try:
11991190
time_start = time.time()
12001191
session_info['time_start'] = time.strftime(

reframe/frontend/executors/policies.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -339,10 +339,6 @@ def exit(self):
339339
'general/0/pipeline_timeout'
340340
)
341341

342-
# FIXME: Always convert due to #GH 2246
343-
if timeout is not None:
344-
timeout = float(timeout)
345-
346342
self._advance_all(self._current_tasks, timeout)
347343
if self._pipeline_statistics:
348344
num_retired = len(self._retired_tasks)

unittests/test_argparser.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ def extended_parser():
103103
dest='keep_stage_files', action='store_true',
104104
envvar='RFM_KEEP_STAGE_FILES', configvar='general/keep_stage_files'
105105
)
106+
parser.add_argument(
107+
'--git-timeout', envvar='RFM_GIT_TIMEOUT', action='store',
108+
configvar='general/git_timeout', type=float
109+
)
106110
foo_options.add_argument(
107111
'--timestamp', action='store',
108112
envvar='RFM_TIMESTAMP_DIRS', configvar='general/timestamp_dirs'
@@ -154,7 +158,8 @@ def test_option_with_config(default_exec_ctx, extended_parser, tmp_path):
154158
'RFM_TIMESTAMP': '%F',
155159
'RFM_NON_DEFAULT_CRAYPE': 'yes',
156160
'RFM_MODULES_PRELOAD': 'a,b,c',
157-
'RFM_KEEP_STAGE_FILES': 'no'
161+
'RFM_KEEP_STAGE_FILES': 'no',
162+
'RFM_GIT_TIMEOUT': '0.3'
158163
}):
159164
site_config = rt.runtime().site_config
160165
options = extended_parser.parse_args(
@@ -167,6 +172,7 @@ def test_option_with_config(default_exec_ctx, extended_parser, tmp_path):
167172
assert site_config.get('systems/0/prefix') == str(tmp_path)
168173
assert site_config.get('general/0/colorize') is False
169174
assert site_config.get('general/0/keep_stage_files') is False
175+
assert site_config.get('general/0/git_timeout') == 0.3
170176

171177
# Defaults specified in parser override those in configuration file
172178
assert site_config.get('systems/0/stagedir') == '/foo'
@@ -175,8 +181,9 @@ def test_option_with_config(default_exec_ctx, extended_parser, tmp_path):
175181
def test_option_envvar_conversion_error(default_exec_ctx, extended_parser):
176182
with rt.temp_environment(variables={
177183
'RFM_NON_DEFAULT_CRAYPE': 'foo',
184+
'RFM_GIT_TIMEOUT': 'non-float'
178185
}):
179186
site_config = rt.runtime().site_config
180187
options = extended_parser.parse_args(['--nocolor'])
181188
errors = options.update_config(site_config)
182-
assert len(errors) == 1
189+
assert len(errors) == 2

unittests/test_cli.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -792,8 +792,8 @@ def test_maxfail_invalid_option(run_reframe):
792792
)
793793
assert 'Traceback' not in stdout
794794
assert 'Traceback' not in stderr
795-
assert "--maxfail is not a valid integer: 'foo'" in stdout
796-
assert returncode == 1
795+
assert "--maxfail: invalid int value: 'foo'" in stderr
796+
assert returncode == 2
797797

798798

799799
def test_maxfail_negative(run_reframe):
@@ -804,7 +804,7 @@ def test_maxfail_negative(run_reframe):
804804
)
805805
assert 'Traceback' not in stdout
806806
assert 'Traceback' not in stderr
807-
assert "--maxfail should be a non-negative integer: '-2'" in stdout
807+
assert "--maxfail should be a non-negative integer: -2" in stdout
808808
assert returncode == 1
809809

810810

0 commit comments

Comments
 (0)