Skip to content

Commit b1aac2f

Browse files
committed
fix omp target
1 parent 6ef606d commit b1aac2f

File tree

5 files changed

+122
-57
lines changed

5 files changed

+122
-57
lines changed

src/backend/coalesced.nim

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,11 @@ proc newCoalesced*[T](V,M:static[int], p:ptr T, n:int):auto {.noinit.} =
109109
template `[]`*(x:Coalesced, ix:int):untyped = CoalescedObj[x.V,x.M,x.T](o:x, i:ix)
110110
template len*(x:Coalesced):untyped = x.n
111111

112-
template fromCoalesced*(x:CoalescedObj):untyped =
112+
template fromCoalesced*(x:CoalescedObj): auto =
113113
const N = getSize(x.T) div (x.M*sizeof(RegisterWord))
114114
type A = ptr Uncheckedarray[MemoryWord(x.M)]
115-
var r {.noinit.}: x.T
115+
#var r {.noinit.}: x.T
116+
var r: x.T
116117
let offset = (x.i div x.V)*N*x.V + x.i mod x.V
117118
unrollfor:
118119
for j in 0..N-1: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V]
@@ -145,8 +146,8 @@ template `:=`*[V,M:static[int],X,Y](x:CoalescedObj[V,M,X], y:Y) =
145146

146147
proc `*`*[VX,MX,VY,MY:static[int],X,Y](x:CoalescedObj[VX,MX,X], y:CoalescedObj[VY,MY,Y]):auto {.noinit,inline.} =
147148
let
148-
tx {.noinit.} = fromCoalesced(x)
149-
ty {.noinit.} = fromCoalesced(y)
149+
tx = fromCoalesced(x)
150+
ty = fromCoalesced(y)
150151
mixin `*`
151152
tx * ty
152153

src/backend/linalg.nim

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ type SomeNumber2* = SomeNumber
44
template `:=`*(x: var SomeNumber, y: SomeNumber2) =
55
type tx = type(x)
66
x = (tx)(y)
7-
template `+=`*(x: var SomeNumber, y: SomeNumber2) =
8-
bind `+=` # So the following += doesn't call this template again.
9-
type tx = type(x)
10-
x += (tx)(y)
7+
#template `+=`*(x: var SomeNumber, y: SomeNumber2) =
8+
# bind `+=` # So the following += doesn't call this template again.
9+
# type tx = type(x)
10+
# x += (tx)(y)
1111

1212
type
1313
Complex*[T] = object
@@ -63,7 +63,8 @@ template `*`*[T](x,y: Complex[T]): untyped =
6363
xx = x
6464
yy = y
6565
type tx = type(xx)
66-
var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re*y.re)]
66+
#var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re*y.re)]
67+
var r: tx #Complex[x.T] #Complex[type(x.re*y.re)]
6768
r.re := xx.re*yy.re - xx.im*yy.im
6869
r.im := xx.re*yy.im + xx.im*yy.re
6970
r
@@ -110,7 +111,8 @@ template `+`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
110111
template `*`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
111112
let xx = x
112113
let yy = y
113-
var r {.noInit.}: Colmat[N,type(x.d[0][0].re)]
114+
#var r {.noInit.}: Colmat[N,type(x.d[0][0].re)]
115+
var r: Colmat[N,type(x.d[0][0].re)]
114116
forstatic i, 0, N-1:
115117
forstatic j, 0, N-1:
116118
r.d[i][j] := xx.d[i][0] * yy.d[0][j]

src/backend/openmp.nim

Lines changed: 75 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ import base/metaUtils
33
import base/omp
44

55
{.pragma: omp, header:"omp.h".}
6+
{.passC:"-fcf-protection=none -no-pie -fno-stack-protector" .}
7+
{.passL:"-fcf-protection=none -no-pie -fno-stack-protector" .}
8+
69
template mkMemoryPragma*:untyped =
710
{.pragma: restrict, codegenDecl: "$# __restrict__ $#".}
811
{.pragma: aligned, codegenDecl: "$# $# __attribute__((aligned))".}
@@ -34,7 +37,7 @@ proc alignatImpl(n:NimNode, byte:int): NimNode =
3437
for c in n:
3538
result.add c.alignatImpl byte
3639
macro alignat*(byte:static[int], n:untyped): untyped =
37-
if byte notin {1,2,4,8,16,32,64,128,256}:
40+
if byte notin [1,2,4,8,16,32,64,128,256]:
3841
error("macro alignat: unsupported alignment: " & $byte, n)
3942
#echo "alignatImpl ", byte
4043
#echo n.treerepr
@@ -61,11 +64,11 @@ proc omp_get_initial_device*: cint {.omp.}
6164
proc omp_get_num_teams*: cint {.omp.}
6265
proc omp_get_team_num*: cint {.omp.}
6366

64-
template omp_target_alloc*(size: int): pointer =
67+
template omp_target_alloc*(size: SomeNumber): pointer =
6568
omp_target_alloc(csize_t size, omp_get_default_device())
6669
template omp_target_memcpy_tocpu*(dst: pointer, src: pointer; length: csize_t): cint =
6770
omp_target_memcpy(dst, src, length, 0, 0, omp_get_initial_device(), omp_get_default_device())
68-
template omp_target_memcpy_togpu*(dst: pointer, src: pointer; length: int): cint =
71+
template omp_target_memcpy_togpu*(dst: pointer, src: pointer; length: csize_t): cint =
6972
omp_target_memcpy(dst, src, csize_t length, 0, 0, omp_get_default_device(), omp_get_initial_device())
7073
template omp_target_free*(device_ptr: pointer) =
7174
omp_target_free(device_ptr, omp_get_default_device())
@@ -130,7 +133,8 @@ proc prepareVars(n:NimNode):seq[NimNode] =
130133
nnkWhileStmt, nnkForStmt} + RoutineNodes:
131134
# New lexical scope
132135
newscope = true
133-
ignoreStack.add newPar()
136+
#ignoreStack.add newPar()
137+
ignoreStack.add newNimNode(nnkTupleConstr)
134138
for i in 0..<n.len:
135139
#echo "### ",n[i].lisprepr
136140
case n[i].kind
@@ -176,7 +180,8 @@ proc prepareVars(n:NimNode):seq[NimNode] =
176180
let np = gensym(nsklet, "gpu_ptr_" & $n[i])
177181
ignoreStack[0].add nv
178182
ignoreStack[0].add np
179-
openvars.add newpar(n[i], nv, np)
183+
#openvars.add newpar(n[i], nv, np)
184+
openvars.add newNimNode(nnkTupleConstr).add(n[i], nv, np)
180185
n[i] = newcall("gpuVarPtr",nv,np)
181186
else:
182187
discard
@@ -199,6 +204,7 @@ proc genCpuPrepare(n:seq[NimNode]):NimNode =
199204
var v{.noinit.}:OffloadDummy[typeof(x)]
200205
result = newstmtlist()
201206
for c in n:
207+
#echo c.treerepr
202208
result.add getast r(c[0],c[1],c[2])
203209
proc genGpuPrepare(n:seq[NimNode]):NimNode =
204210
template r(x,v,p:untyped):untyped =
@@ -214,31 +220,55 @@ proc genCpuFinalize(n:seq[NimNode]):NimNode =
214220
result = newstmtlist()
215221
for c in n:
216222
result.add getast r(c[0],c[1],c[2])
217-
proc declarePtrString(n:seq[NimNode]):NimNode =
218-
template res(ptrlist:untyped):untyped =
219-
const s = ptrlist
220-
when s.len == 0: "" else: "is_device_ptr(" & s[0..^2] & ")"
221-
template varname(x, xp:untyped):untyped =
222-
mixin offloadPtr
223-
when compiles(offloadPtr(x)): xp&"," else: ""
224-
var ps = newlit""
223+
#proc declarePtrString(n:seq[NimNode]):NimNode =
224+
# template res(ptrlist:untyped):untyped =
225+
# const s = ptrlist
226+
# when s.len == 0: "" else: "is_device_ptr(" & s[0..^2] & ")"
227+
# template varname(x, xp:untyped):untyped =
228+
# mixin offloadPtr
229+
# when compiles(offloadPtr(x)): xp&"," else: ""
230+
# var ps = newlit""
231+
# for c in n:
232+
# ps = infix(getast varname(c[0], $c[2]), "&", ps)
233+
# result = getast res(ps)
234+
proc declarePtrTuple(n:seq[NimNode]):NimNode =
  ## Builds the pieces of an OpenMP `is_device_ptr(...)` clause as an
  ## `nnkTupleConstr` node.
  ##
  ## Each entry `c` of `n` is indexed as `c[0]` (the expression tested with
  ## `offloadPtr`) and `c[2]` (the node placed inside the clause).
  ##
  ## Returns an empty tuple constructor when no entry qualifies; otherwise
  ## `("is_device_ptr(", p0, ",", p1, ..., ")")`.
  mixin offloadPtr
  var ps = newNimNode(nnkTupleConstr)
  ps.add newLit"is_device_ptr("
  for c in n:
    # NOTE(review): `c[0]` is a NimNode here, so this `when` is resolved once,
    # while this proc itself is compiled, not per offloaded variable --
    # confirm that this is the intended check.
    when compiles(offloadPtr(c[0])):
      # Elements of this tuple are spliced one after another into an emit
      # list (see ompPragma2), so consecutive names need an explicit comma.
      if ps.len > 1:
        ps.add newLit","
      ps.add c[2]
  if ps.len == 1:
    # Only the clause header was added: emit no clause at all.
    result = newNimNode(nnkTupleConstr)
  else:
    ps.add newLit")"
    result = ps
246+
#echo result.treerepr
228247

229-
macro isDevicePtr(x: typed): untyped =
230-
let n = $x
231-
result = newLit(" is_device_ptr("&n&")")
248+
#macro isDevicePtr(x: typed): untyped =
249+
# let n = $x
250+
# result = newLit(" is_device_ptr("&n&")")
232251

233-
macro useDevicePtr(x: typed): untyped =
234-
let n = $x
235-
let p = newLit("#pragma omp target data use_device_ptr("&n&")")
236-
result = quote do:
237-
{.emit: `p`.}
252+
#macro useDevicePtr(x: auto): auto =
253+
#echo x.treerepr
254+
#let n = x.strVal
255+
#echo "useDevicePtr: ", n
256+
#let p = newLit("#pragma omp target data use_device_ptr("&n&")")
257+
# result = quote do:
258+
# {.emit: ["#pragma omp target data use_device_ptr(",`x`,")"].}
238259

239-
macro mapto(x: typed): untyped =
240-
let n = $x
241-
result = newLit(" map(to:"&n&")")
260+
#macro getrepr(x: untyped): auto =
261+
# echo x.treerepr
262+
# result = x
263+
264+
template useDevicePtr(x: auto) =
  ## Emits `#pragma omp target data use_device_ptr(<x>)` at the call site,
  ## with `x` passed through the emit list.
  {.emit: [
    "#pragma omp target data use_device_ptr(",
    x,
    ")"
  ].}
267+
268+
#macro mapto(x: typed): untyped =
269+
# let n = $x
270+
# result = newLit(" map(to:"&n&")")
271+
#macro mapto(x: typed): untyped =
242272

243273
macro onGpu*(body: untyped): untyped =
244274
# the architecture for cpugpuarray requires us replace body before it gets expanded, so we require untyped.
@@ -248,8 +278,8 @@ macro onGpu*(body: untyped): untyped =
248278
{.push stacktrace: off.}
249279
proc gpuProc {.gensym.} =
250280
cpuPrepare # a let section declare and save device pointers
251-
const isDevicePtrList = devicePtrDeclare # is_device_ptr(ptrList) in string
252-
ompBlock("target teams " & isDevicePtrList):
281+
#const isDevicePtrList = devicePtrDeclare # is_device_ptr(ptrList) in string
282+
ompBlock2("target teams ", devicePtrDeclare):
253283
openmpDefs:
254284
gpuPrepare
255285
body
@@ -260,7 +290,7 @@ macro onGpu*(body: untyped): untyped =
260290
cpuPrepare = genCpuPrepare v
261291
gpuPrepare = genGpuPrepare v
262292
cpuFinalize = genCpuFinalize v
263-
isDevicePtrs = declarePtrString v
293+
isDevicePtrs = declarePtrTuple v
264294
result = getast(target(cpuPrepare, gpuPrepare, cpuFinalize, isDevicePtrs, body))
265295
#echo result.repr
266296

@@ -306,10 +336,10 @@ proc identStr(n:NimNode):string =
306336
result = n.repr
307337
for i in 0..<result.len:
308338
if result[i] in {'.','[',']',':'}: result[i] = '_'
309-
proc isIndex(n,i:NimNode):bool =
310-
result = n.eqident i
311-
if n.kind == nnkHiddenStdConv:
312-
result = n[1].eqident i
339+
#proc isIndex(n,i:NimNode):bool =
340+
# result = n.eqident i
341+
# if n.kind == nnkHiddenStdConv:
342+
# result = n[1].eqident i
313343
macro simdForImpl(n:typed):untyped =
314344
proc getIndexedPtrs(n,i:NimNode):(NimNode,seq[NimNode]) =
315345
#echo "### getIndexedPtrs: ", i.repr
@@ -323,7 +353,8 @@ macro simdForImpl(n:typed):untyped =
323353
break
324354
if m < 0:
325355
let v = gensym(nskVar, n.cleanAst.identStr)
326-
ptrs.add newPar(v, n)
356+
#ptrs.add newPar(v, n)
357+
ptrs.add newNimNode(nnkTupleConstr).add(v, n)
327358
return v
328359
else:
329360
return ptrs[m][0]
@@ -437,20 +468,19 @@ when isMainModule:
437468
useDevicePtr(y)
438469
discard omp_target_memcpy_togpu(y, addr x, sizeof(float32))
439470
#ompBlock("target teams"&isDevicePtr(x)):
440-
ompBlock("target teams"&mapto(x)):
471+
#ompBlock("target teams"&mapto(x)):
472+
ompBlock2("target teams", " map(to:", x, ")"):
441473
{.emit:"#pragma omp parallel".}
442474
{.emit:"for(int ii=0; ii<1; ii++)".}
443475
block:
444476
x = 1.0
445477

446-
macro dump(n:typed):typed =
447-
echo n.repr
448-
n
449-
#[
450-
dump:
451-
onGpu:
452-
let i = getThreadNum()
453-
if i < n:
454-
c[i] = a[i] + b[i]
455-
]#
478+
#macro dump(n:auto):auto =
479+
# echo n.repr
480+
# n
481+
onGpu:
482+
let i = getThreadNum()
483+
if i < n:
484+
c[i] = a[i] + b[i]
485+
456486
test()

src/backend/vectorized.nim

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import macros
2929
mkMemoryPragma()
3030

3131
const CPUVLEN* {.intdefine.} = 0 ## CPU SIMD vector length in bits. Zero lets compiler auto-vectorize.
32-
const SupportedCPUVLENs = {128,256,512}
32+
const SupportedCPUVLENs = [128,256,512]
3333
const oneByte = 8
3434
macro defsimd:auto =
3535
var s,d:NimNode

src/base/omp.nim

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
import os, macros
22

33
when defined(noOpenmp):
44
static: echo "OpenMP disabled"
@@ -42,6 +42,38 @@ else:
4242
body
4343
#{. emit:"} /* End ompBlock " & p & " */".}
4444

45+
macro ompPragma2*(p: varargs[untyped]): auto =
  ## Generates an `emit` pragma whose pieces are `_Pragma("omp `, then every
  ## argument in order, then `")`.
  ##
  ## An argument that is a tuple constructor is flattened: its elements are
  ## added individually instead of the tuple node itself.
  var pieces = newNimNode(nnkBracket)
  pieces.add newLit("_Pragma(\"omp ")
  for arg in p:
    if arg.kind != nnkTupleConstr:
      pieces.add arg
    else:
      for elem in arg:
        pieces.add elem
  pieces.add newLit("\")")
  result = quote do:
    {. emit:`pieces` .}
58+
#template ompBlock2*(p: varargs[untyped]) =
59+
# ompPragma2(p[0..^2])
60+
# block:
61+
# p[^1]
62+
macro ompBlock2*(p: varargs[untyped]): auto =
  ## Expands to an `ompPragma2(...)` call followed by a `block:` that holds
  ## the body.
  ##
  ## The last argument is the body; all preceding arguments are forwarded to
  ## `ompPragma2` unchanged. Calling the macro with no arguments is reported
  ## as a compile-time error.
  # NOTE(review): this push/pop pair brackets the macro's own statements and
  # is not part of the generated code below -- confirm that is intended.
  {. push stackTrace:off, lineTrace:off, line_dir:off .}
  if p.len == 0:
    # Without this guard `p[^1]` fails with an opaque index error in the VM.
    error("ompBlock2: expected pragma arguments followed by a body", p)
  let body = p[^1]
  var pragmaCall = newCall(bindSym"ompPragma2")
  for i in 0 ..< p.len-1:
    pragmaCall.add p[i]
  result = quote do:
    `pragmaCall`
    block:
      `body`
  {. pop .}
76+
4577
template ompBarrier* = ompPragma("barrier")
4678
template ompFlush* = ompPragma("flush")
4779
template ompFlushAcquire* = ompPragma("flush acquire")

0 commit comments

Comments
 (0)