Skip to content

Commit 482ddf9

Browse files
committed
Next optimization: tiles in thread-local arrays.
1 parent c0d5141 commit 482ddf9

File tree

1 file changed

+71
-2
lines changed

1 file changed

+71
-2
lines changed

src/MatrixMultiplication/Matrices.fs

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module ImageProcessing.Matrices
22

33
open Brahma.FSharp
44

5-
type Kernels = K0 = 0 | K1 = 1 | K2 = 2 | K3 = 3
5+
type Kernels = K0 = 0 | K1 = 1 | K2 = 2 | K3 = 3 | K4 = 4
66
let rand = new System.Random()
77

88
let getRandomMatrix (n: uint) init =
@@ -23,7 +23,7 @@ let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>
2323
let res = Array.init (m1.Length * m1.Length) (fun _ -> zero)
2424
m1
2525
|> Array.Parallel.iteri (fun i row ->
26-
for j in 0..m1.Length - 1 do
26+
for j in 0..m1.Length - 1 do
2727
for k in 0..m1.Length - 1 do
2828
res.[i*m1.Length + j] <- opAdd res.[i * m1.Length + j] (opMult row.[k] m2.[k].[j])
2929
)
@@ -39,6 +39,74 @@ let getRandomByteMatrix n = getRandomMatrix n (fun i -> rand.Next() |> byte)
3939
let getRandomFloat32Matrix n = getRandomMatrix n (fun i -> rand.NextSingle())
4040
let getRandomOptionIntMatrix n = getRandomMatrix n (fun i -> let x = rand.Next(-10,10) in if x % 3 = 0 then Some x else None)
4141

42+
let multiplyKernel4 (clContext: ClContext) (localWorkSize:uint) (threadTileSize:uint) opAdd opMult zero =
43+
let localWorkSize = int localWorkSize
44+
let threadTileSize = int threadTileSize
45+
let localBufSize = FSharp.Quotations.Evaluator.QuotationEvaluator.Evaluate <@localWorkSize * localWorkSize@>
46+
let threadLocalBufSize = FSharp.Quotations.Evaluator.QuotationEvaluator.Evaluate <@threadTileSize * threadTileSize@>
47+
let kernel =
48+
<@
49+
fun (r: Range2D) (m1: ClArray<_>) (m2: ClArray<_>) (m3: ClArray<_>) n ->
50+
let localBaseRow = r.LocalID0 * threadTileSize
51+
let localBaseCol = r.LocalID1 * threadTileSize
52+
let globalBaseRow = r.GlobalID0 * threadTileSize
53+
let globalBaseCol = r.GlobalID1 * threadTileSize
54+
55+
let m1Submatrix = localArray localBufSize
56+
let m2Submatrix = localArray localBufSize
57+
58+
let m2Buf = threadLocalArray threadTileSize
59+
60+
let res = threadLocalArray threadLocalBufSize
61+
62+
for i in 0 .. threadLocalBufSize - 1 do res.[i] <- %zero
63+
64+
for t in 0 .. (n / localWorkSize) - 1 do
65+
for i in 0 .. threadTileSize - 1 do
66+
for j in 0 .. threadTileSize - 1 do
67+
let tiledRow = localWorkSize * t + localBaseRow + i
68+
let tiledCol = localWorkSize * t + localBaseCol + j
69+
let targetElem = (localBaseRow + i) * localWorkSize + localBaseCol + j
70+
m1Submatrix[targetElem] <- m1[(globalBaseRow + i) * n + tiledCol]
71+
m2Submatrix[targetElem] <- m2[tiledRow * n + globalBaseCol + j]
72+
73+
barrierLocal()
74+
75+
for k in 0 .. localWorkSize - 1 do
76+
77+
for i in 0 .. threadTileSize - 1 do
78+
m2Buf[i] <- m2Submatrix[k * localWorkSize + localBaseCol + i]
79+
80+
for i in 0 .. threadTileSize - 1 do
81+
let m1Val = m1Submatrix[(localBaseRow + i) * localWorkSize + k]
82+
for j in 0 .. threadTileSize - 1 do
83+
let x = (%opMult) m1Val m2Buf[j]
84+
let y = (%opAdd) res[i * threadTileSize + j] x
85+
res[i * threadTileSize + j] <- y
86+
barrierLocal()
87+
88+
for i in 0 .. threadTileSize - 1 do
89+
for j in 0 .. threadTileSize - 1 do
90+
m3.[(globalBaseRow + i) * n + globalBaseCol + j] <- res[i * threadTileSize + j]
91+
@>
92+
93+
let kernel = clContext.Compile kernel
94+
95+
fun (commandQueue: MailboxProcessor<_>) (m1: ClArray<_>) (m2: ClArray<_>) (m3: ClArray<_>) n ->
96+
97+
let ndRange =
98+
Range2D(
99+
n / threadTileSize,
100+
n / threadTileSize,
101+
localWorkSize / threadTileSize,
102+
localWorkSize / threadTileSize
103+
)
104+
105+
let kernel = kernel.GetKernel()
106+
commandQueue.Post(Msg.MsgSetArguments(fun () -> kernel.KernelFunc ndRange m1 m2 m3 n))
107+
commandQueue.Post(Msg.CreateRunMsg<_, _> kernel)
108+
m3
109+
42110
let multiplyKernel3 (clContext: ClContext) (localWorkSize:uint) (workPerThread:uint) opAdd opMult zero =
43111
let localWorkSize = int localWorkSize
44112
let workPerThread = int workPerThread
@@ -209,6 +277,7 @@ let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localW
209277
| Kernels.K1 -> multiplyKernel1 clContext localWorkSize opAdd opMult zero
210278
| Kernels.K2 -> multiplyKernel2 clContext localWorkSize opAdd opMult zero
211279
| Kernels.K3 -> multiplyKernel3 clContext localWorkSize workPerThread opAdd opMult zero
280+
| Kernels.K4 -> multiplyKernel4 clContext localWorkSize workPerThread opAdd opMult zero
212281
| x -> failwithf $"Unexpected kernel {x}."
213282
let queue = clContext.QueueProvider.CreateQueue()
214283

0 commit comments

Comments
 (0)