Skip to content

Commit 500d269

Browse files
committed
auto single loop generator
1 parent 45a1c33 commit 500d269

File tree

4 files changed

+145
-71
lines changed

4 files changed

+145
-71
lines changed

CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,9 +415,8 @@ elseif (GPU_LANG STREQUAL "OPENACC")
415415
# set this line before project()
416416
set (CMAKE_CXX_COMPILER "${T9_ACC_COMPILER}")
417417
# 11 unrecognized preprocessing directive
418-
# 82 storage class is not first
419418
# 177 variable was declared but never referenced
420-
string (APPEND CMAKE_CXX_FLAGS " --diag_suppress=11,82,177")
419+
string (APPEND CMAKE_CXX_FLAGS " --diag_suppress=11,177")
421420
# Compute Capability 60,70 -> ,cc60,cc70
422421
set (__T9_ACC_CCLST4) # ""
423422
foreach (var ${T9_CUCCLIST})

ext/ext/ck3.py

Lines changed: 125 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,17 @@
2929
'20': 'u', '21': 'v', '22': 'w', '23': 'x', '24': 'y',
3030
'25': 'z'}
3131

32-
rc_kernel2c = '''
32+
rc_kernel11 = '''
33+
TEMPLATE_PARAMS __global__
34+
STATIC_KERNEL void KERNEL_NAME(SINGLE_LOOP_LIMIT_PARAM EXTRA_KERNEL_PARAMS)
35+
{
36+
KERNEL_SINGLE_LOOP_BEGIN
37+
KERNEL_SINGLE_LOOP_CODE
38+
KERNEL_SINGLE_LOOP_END
39+
}
40+
'''
41+
42+
rc_kernel23c = '''
3343
TEMPLATE_PARAMS \
3444
__global__ \
3545
void KERNEL_NAMEc( \
@@ -69,7 +79,7 @@
6979
'''
7080

7181

72-
rc_kernel2b = '''
82+
rc_kernel23b = '''
7383
TEMPLATE_PARAMS \
7484
__global__ \
7585
void KERNEL_NAMEb( \
@@ -130,7 +140,7 @@
130140
'''
131141

132142

133-
rc_kernel2a = '''
143+
rc_kernel23a = '''
134144
TEMPLATE_PARAMS \
135145
__global__ \
136146
void KERNEL_NAMEa( \
@@ -181,7 +191,7 @@
181191
'''
182192

183193

184-
rc_kernel1 = '''
194+
rc_kernel21 = '''
185195
TEMPLATE_PARAMS \
186196
__global__ \
187197
void KERNEL_NAME(int n, \
@@ -393,6 +403,13 @@ def ikreplace(self, code:str) -> str:
393403
return code
394404

395405

406+
def iterreplace(self, code:str) -> str:
407+
old_name = '@{}@'.format('i')
408+
new_name = self.name
409+
code = code.replace(old_name, new_name)
410+
return code
411+
412+
396413
class VariableDefinitions:
397414
def __init__(self, iork:str, lst:list) -> None:
398415
self.shared, self.register = {}, {}
@@ -548,9 +565,10 @@ def _load_scale_param(ptype:str, stem:str, input:str, separate_scaled_pairwise:b
548565
def __init__(self, config) -> None:
549566
self.config = config
550567

551-
self.yk_split_kernel = 'SPLIT_KERNEL'
568+
self.yk_kernel_version_number = 'KERNEL_VERSION_NUMBER'
552569

553570
self.yk_output_dir = 'OUTPUT_DIR'
571+
self.yk_kernel_is_static = 'KERNEL_IS_STATIC'
554572
self.yk_kernel_name = 'KERNEL_NAME'
555573
self.yk_template_params = 'TEMPLATE_PARAMS'
556574
self.yk_constexpr_flags = 'CONSTEXPR_FLAGS'
@@ -570,6 +588,9 @@ def __init__(self, config) -> None:
570588
self.yk_scaled_pairwise = 'SCALED_PAIRWISE_INTERACTION'
571589
self.yk_full_pairwise = 'FULL_PAIRWISE_INTERACTION'
572590

591+
self.yk_single_loop_limit = 'SINGLE_LOOP_LIMIT'
592+
self.yk_single_loop_iter = 'SINGLE_LOOP_ITER'
593+
self.yk_single_loop_code = 'SINGLE_LOOP_CODE'
573594

574595
def _kv(self, k:str):
575596
if k in self.config.keys():
@@ -588,6 +609,13 @@ def cudaReplaceDict(self) -> dict:
588609
d[k] = v
589610

590611
# kernel name
612+
k, v = 'STATIC_KERNEL', ''
613+
kcfg, vcfg = self.yk_kernel_is_static, False
614+
if kcfg in keys:
615+
vcfg = config[kcfg]
616+
if vcfg:
617+
v = 'static'
618+
d[k] = v
591619
k, v = 'KERNEL_NAME', self._kv(self.yk_kernel_name)
592620
d[k] = v
593621

@@ -619,25 +647,29 @@ def cudaReplaceDict(self) -> dict:
619647
k, v = 'KERNEL_CONSTEXPR_FLAGS', self._kv(self.yk_constexpr_flags)
620648
d[k] = v
621649

622-
# i and k declaration
623-
ivars, kvars = VariableDefinitions('i', config[self.yk_i_variables]), VariableDefinitions('k', config[self.yk_k_variables])
624-
ifrcs, kfrcs = VariableDefinitions('i', config[self.yk_i_force]), VariableDefinitions('k', config[self.yk_k_force])
625-
if len(ifrcs.shared.keys()):
626-
raise ValueError('I_FORCE cannot be put on shared memory.')
627-
if len(kfrcs.shared.keys()):
628-
raise ValueError('F_FORCE cannot be put on shared memory.')
629-
k1, v1 = 'DECLARE_PARAMS_I_AND_K', ivars.declare() + kvars.declare()
630-
k2, v2 = 'DECLARE_FORCE_I_AND_K', ifrcs.declare() + kfrcs.declare()
631-
d[k1], d[k2] = v1, v2
632-
633-
# i and k in exclude block
634-
k1, v1 = 'KERNEL_INIT_EXCLUDE_PARAMS_I_AND_K', ''
635-
k2, v2 = 'KERNEL_INIT_PARAMS_I_AND_K', ''
636-
k3, v3 = 'KERNEL_SHUFFLE_PARAMS_I', ''
637-
v1 = v1 + ivars.init_exclude() + kvars.init_exclude()
638-
v2 = v2 + ivars.init_block() + kvars.init_block()
639-
v3 = v3 + ivars.shuffle()
640-
d[k1], d[k2], d[k3] = v1, v2, v3
650+
use_ikvars = False
651+
if self.yk_i_variables in keys and self.yk_k_variables in keys:
652+
use_ikvars = True
653+
if use_ikvars:
654+
# i and k declaration
655+
ivars, kvars = VariableDefinitions('i', config[self.yk_i_variables]), VariableDefinitions('k', config[self.yk_k_variables])
656+
ifrcs, kfrcs = VariableDefinitions('i', config[self.yk_i_force]), VariableDefinitions('k', config[self.yk_k_force])
657+
if len(ifrcs.shared.keys()):
658+
raise ValueError('I_FORCE cannot be put on shared memory.')
659+
if len(kfrcs.shared.keys()):
660+
raise ValueError('F_FORCE cannot be put on shared memory.')
661+
k1, v1 = 'DECLARE_PARAMS_I_AND_K', ivars.declare() + kvars.declare()
662+
k2, v2 = 'DECLARE_FORCE_I_AND_K', ifrcs.declare() + kfrcs.declare()
663+
d[k1], d[k2] = v1, v2
664+
665+
# i and k in exclude block
666+
k1, v1 = 'KERNEL_INIT_EXCLUDE_PARAMS_I_AND_K', ''
667+
k2, v2 = 'KERNEL_INIT_PARAMS_I_AND_K', ''
668+
k3, v3 = 'KERNEL_SHUFFLE_PARAMS_I', ''
669+
v1 = v1 + ivars.init_exclude() + kvars.init_exclude()
670+
v2 = v2 + ivars.init_block() + kvars.init_block()
671+
v3 = v3 + ivars.shuffle()
672+
d[k1], d[k2], d[k3] = v1, v2, v3
641673

642674
# count
643675
k1, v1 = 'COUNT_KERNEL_PARAMS', ''
@@ -696,39 +728,46 @@ def cudaReplaceDict(self) -> dict:
696728
v3 = 'if CONSTEXPR (do_v) {%s}' % total
697729
d[k1], d[k2], d[k3] = v1, v2, v3
698730

699-
# gradient
700-
k, v = 'GRADIENT_KERNEL_PARAMS', ''
701-
kcfg = self.yk_gradient
702-
if kcfg in keys:
703-
vcfg = config[kcfg]
704-
for t in vcfg:
705-
v = v + ', grad_prec* restrict {}'.format(t)
706-
k1, v1 = 'KERNEL_ZERO_LOCAL_FORCE', ifrcs.zero() + kfrcs.zero()
707-
k2, v2 = 'KERNEL_SAVE_LOCAL_FORCE', ifrcs.save() + kfrcs.save()
708-
k3, v3 = 'KERNEL_SHUFFLE_LOCAL_FORCE_I', ifrcs.shuffle()
709-
kcfg = self.yk_constexpr_flags
710-
if kcfg in keys:
711-
vcfg = config[kcfg]
712-
if 'constexpr bool do_g =' in vcfg:
713-
v1 = 'if CONSTEXPR (do_g) {%s}' % v1
714-
v2 = 'if CONSTEXPR (do_g) {%s}' % v2
715-
if v3 != '':
716-
v3 = 'if CONSTEXPR (do_g) {%s}' % v3
717-
d[k], d[k1], d[k2], d[k3] = v, v1, v2, v3
718-
719-
# klane -- True only if ivar uses shared memory
720-
k1, v1 = 'KERNEL_KLANE1', ''
721-
k2, v2 = 'KERNEL_KLANE2', ''
722-
k3, v3 = 'KERNEL_SCALED_KLANE', ''
723-
use_klane = False
724-
if len(ivars.shared.keys()):
725-
use_klane = True
726-
if use_klane:
727-
v1 = 'int klane = srclane + threadIdx.x - ilane;'
728-
v2 = v2 + 'int srclane = (ilane + j) & (WARP_SIZE - 1);'
729-
v2 = v2 + 'int klane = srclane + threadIdx.x - ilane;'
730-
v3 = 'const int klane = threadIdx.x;'
731-
d[k1], d[k2], d[k3] = v1, v2, v3
731+
if use_ikvars:
732+
# sync warp
733+
k1, v1 = 'KERNEL_SYNCWARP', '__syncwarp();'
734+
if len(ivars.shared) == 0 and len(kvars.shared) == 0 and len(ifrcs.shared) == 0 and len(kfrcs.shared) == 0:
735+
v1 = ''
736+
d[k1] = v1
737+
738+
# gradient
739+
k, v = 'GRADIENT_KERNEL_PARAMS', ''
740+
kcfg = self.yk_gradient
741+
if kcfg in keys:
742+
vcfg = config[kcfg]
743+
for t in vcfg:
744+
v = v + ', grad_prec* restrict {}'.format(t)
745+
k1, v1 = 'KERNEL_ZERO_LOCAL_FORCE', ifrcs.zero() + kfrcs.zero()
746+
k2, v2 = 'KERNEL_SAVE_LOCAL_FORCE', ifrcs.save() + kfrcs.save()
747+
k3, v3 = 'KERNEL_SHUFFLE_LOCAL_FORCE_I', ifrcs.shuffle()
748+
kcfg = self.yk_constexpr_flags
749+
if kcfg in keys:
750+
vcfg = config[kcfg]
751+
if 'constexpr bool do_g =' in vcfg:
752+
v1 = 'if CONSTEXPR (do_g) {%s}' % v1
753+
v2 = 'if CONSTEXPR (do_g) {%s}' % v2
754+
if v3 != '':
755+
v3 = 'if CONSTEXPR (do_g) {%s}' % v3
756+
d[k], d[k1], d[k2], d[k3] = v, v1, v2, v3
757+
758+
# klane -- True only if ivar uses shared memory
759+
k1, v1 = 'KERNEL_KLANE1', ''
760+
k2, v2 = 'KERNEL_KLANE2', ''
761+
k3, v3 = 'KERNEL_SCALED_KLANE', ''
762+
use_klane = False
763+
if len(ivars.shared.keys()):
764+
use_klane = True
765+
if use_klane:
766+
v1 = 'int klane = srclane + threadIdx.x - ilane;'
767+
v2 = v2 + 'int srclane = (ilane + j) & (WARP_SIZE - 1);'
768+
v2 = v2 + 'int klane = srclane + threadIdx.x - ilane;'
769+
v3 = 'const int klane = threadIdx.x;'
770+
d[k1], d[k2], d[k3] = v1, v2, v3
732771

733772
# exclude
734773
k1, v1 = 'EXCLUDE_INFO_KERNEL_PARAMS', ''
@@ -780,18 +819,31 @@ def cudaReplaceDict(self) -> dict:
780819
v2 = kfrcs.ikreplace(v2)
781820
d[k1], d[k2] = v1, v2
782821

783-
# sync warp
784-
k1, v1 = 'KERNEL_SYNCWARP', '__syncwarp();'
785-
if len(ivars.shared) == 0 and len(kvars.shared) == 0 and len(ifrcs.shared) == 0 and len(kfrcs.shared) == 0:
786-
v1 = ''
787-
d[k1] = v1
822+
# single loop
823+
k0, v0 = 'SINGLE_LOOP_LIMIT_PARAM', ''
824+
k1, v1 = 'KERNEL_SINGLE_LOOP_CODE', ''
825+
k2, v2 = 'KERNEL_SINGLE_LOOP_BEGIN', ''
826+
k3, v3 = 'KERNEL_SINGLE_LOOP_END', ''
827+
kcfg = self.yk_single_loop_code
828+
if kcfg in keys:
829+
v0 = config[self.yk_single_loop_limit]
830+
v1 = config[kcfg]
831+
sl_limit, sl_iter = config[self.yk_single_loop_limit], config[self.yk_single_loop_iter]
832+
sl_limit = 'register ' + sl_limit + ' from:dummy'
833+
sl_iter = 'register ' + sl_iter + ' from:dummy'
834+
var_limit = Variable('k', sl_limit)
835+
var_iter = Variable('k', sl_iter)
836+
v2 = 'for(%s %s = ITHREAD; %s < %s; %s += STRIDE) {' % (var_iter.type, var_iter.name, var_iter.name, var_limit.name, var_iter.name)
837+
v3 = '}'
838+
v1 = var_iter.iterreplace(v1)
839+
d[k0], d[k1], d[k2], d[k3] = v0, v1, v2, v3
788840

789841
return d
790842

791843

792844
@staticmethod
793845
def version() -> str:
794-
return '3.0.2'
846+
return '3.1.0'
795847

796848

797849
@staticmethod
@@ -808,13 +860,18 @@ def _replace(s:str, d:dict) -> str:
808860
def write(self, output) -> None:
809861
d = self.cudaReplaceDict()
810862
outstr = '// ck.py Version {}'.format(self.version())
811-
if self.yk_split_kernel in self.config.keys():
863+
kernel_num = 21 # default
864+
if self.yk_kernel_version_number in self.config.keys():
865+
kernel_num = self.config[self.yk_kernel_version_number]
866+
if kernel_num == 11:
867+
outstr = outstr + self._replace(rc_kernel11, d)
868+
elif kernel_num == 23:
812869
if self.yk_scale_1x_type in self.config.keys():
813-
outstr = outstr + self._replace(rc_kernel2c, d)
814-
outstr = outstr + self._replace(rc_kernel2b, d)
815-
outstr = outstr + self._replace(rc_kernel2a, d)
870+
outstr = outstr + self._replace(rc_kernel23c, d)
871+
outstr = outstr + self._replace(rc_kernel23b, d)
872+
outstr = outstr + self._replace(rc_kernel23a, d)
816873
else:
817-
outstr = outstr + self._replace(rc_kernel1, d)
874+
outstr = outstr + self._replace(rc_kernel21, d)
818875
print(outstr, file=output)
819876

820877

ext/ext/y3/emplar_cu1.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
SPLIT_KERNEL: TRUE
1+
KERNEL_VERSION_NUMBER: 23
22

33
OUTPUT_DIR: src/cu/amoeba
44
KERNEL_NAME: emplar_cu1

ext/ext/y3/mdPos_cu1.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
KERNEL_VERSION_NUMBER: 11
2+
3+
KERNEL_IS_STATIC: True
4+
KERNEL_NAME: mdPos_cu1
5+
SINGLE_LOOP_LIMIT: int n
6+
SINGLE_LOOP_ITER: int i
7+
SINGLE_LOOP_CODE: |
8+
qx[@i@] += dt * vlx[@i@];
9+
qy[@i@] += dt * vly[@i@];
10+
qz[@i@] += dt * vlz[@i@];
11+
EXTRA_PARAMS: |
12+
, time_prec dt
13+
, pos_prec* restrict qx
14+
, pos_prec* restrict qy
15+
, pos_prec* restrict qz
16+
, const vel_prec* restrict vlx
17+
, const vel_prec* restrict vly
18+
, const vel_prec* restrict vlz

0 commit comments

Comments
 (0)