@@ -2,7 +2,7 @@ module ImageProcessing.Matrices
22
33open Brahma.FSharp
44
5- type Kernels = K0 = 0 | K1 = 1 | K2 = 2 | K3 = 3
5+ type Kernels = K0 = 0 | K1 = 1 | K2 = 2 | K3 = 3 | K4 = 4
66let rand = new System.Random()
77
88let getRandomMatrix ( n : uint ) init =
@@ -23,7 +23,7 @@ let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>
2323 let res = Array.init ( m1.Length * m1.Length) ( fun _ -> zero)
2424 m1
2525 |> Array.Parallel.iteri ( fun i row ->
26- for j in 0 .. m1.Length - 1 do
26+ for j in 0 .. m1.Length - 1 do
2727 for k in 0 .. m1.Length - 1 do
2828 res.[ i* m1.Length + j] <- opAdd res.[ i * m1.Length + j] ( opMult row.[ k] m2.[ k].[ j])
2929 )
@@ -39,6 +39,74 @@ let getRandomByteMatrix n = getRandomMatrix n (fun i -> rand.Next() |> byte)
3939let getRandomFloat32Matrix n = getRandomMatrix n ( fun i -> rand.NextSingle())
4040let getRandomOptionIntMatrix n = getRandomMatrix n ( fun i -> let x = rand.Next(- 10 , 10 ) in if x % 3 = 0 then Some x else None)
4141
42+ let multiplyKernel4 ( clContext : ClContext ) ( localWorkSize : uint ) ( threadTileSize : uint ) opAdd opMult zero =
43+ let localWorkSize = int localWorkSize
44+ let threadTileSize = int threadTileSize
45+ let localBufSize = FSharp.Quotations.Evaluator.QuotationEvaluator.Evaluate <@ localWorkSize * localWorkSize@>
46+ let threadLocalBufSize = FSharp.Quotations.Evaluator.QuotationEvaluator.Evaluate <@ threadTileSize * threadTileSize@>
47+ let kernel =
48+ <@
49+ fun ( r : Range2D ) ( m1 : ClArray < _ >) ( m2 : ClArray < _ >) ( m3 : ClArray < _ >) n ->
50+ let localBaseRow = r.LocalID0 * threadTileSize
51+ let localBaseCol = r.LocalID1 * threadTileSize
52+ let globalBaseRow = r.GlobalID0 * threadTileSize
53+ let globalBaseCol = r.GlobalID1 * threadTileSize
54+
55+ let m1Submatrix = localArray localBufSize
56+ let m2Submatrix = localArray localBufSize
57+
58+ let m2Buf = threadLocalArray threadTileSize
59+
60+ let res = threadLocalArray threadLocalBufSize
61+
62+ for i in 0 .. threadLocalBufSize - 1 do res.[ i] <- % zero
63+
64+ for t in 0 .. ( n / localWorkSize) - 1 do
65+ for i in 0 .. threadTileSize - 1 do
66+ for j in 0 .. threadTileSize - 1 do
67+ let tiledRow = localWorkSize * t + localBaseRow + i
68+ let tiledCol = localWorkSize * t + localBaseCol + j
69+ let targetElem = ( localBaseRow + i) * localWorkSize + localBaseCol + j
70+ m1Submatrix[ targetElem] <- m1[( globalBaseRow + i) * n + tiledCol]
71+ m2Submatrix[ targetElem] <- m2[ tiledRow * n + globalBaseCol + j]
72+
73+ barrierLocal()
74+
75+ for k in 0 .. localWorkSize - 1 do
76+
77+ for i in 0 .. threadTileSize - 1 do
78+ m2Buf[ i] <- m2Submatrix[ k * localWorkSize + localBaseCol + i]
79+
80+ for i in 0 .. threadTileSize - 1 do
81+ let m1Val = m1Submatrix[( localBaseRow + i) * localWorkSize + k]
82+ for j in 0 .. threadTileSize - 1 do
83+ let x = (% opMult) m1Val m2Buf[ j]
84+ let y = (% opAdd) res[ i * threadTileSize + j] x
85+ res[ i * threadTileSize + j] <- y
86+ barrierLocal()
87+
88+ for i in 0 .. threadTileSize - 1 do
89+ for j in 0 .. threadTileSize - 1 do
90+ m3.[( globalBaseRow + i) * n + globalBaseCol + j] <- res[ i * threadTileSize + j]
91+ @>
92+
93+ let kernel = clContext.Compile kernel
94+
95+ fun ( commandQueue : MailboxProcessor < _ >) ( m1 : ClArray < _ >) ( m2 : ClArray < _ >) ( m3 : ClArray < _ >) n ->
96+
97+ let ndRange =
98+ Range2D(
99+ n / threadTileSize,
100+ n / threadTileSize,
101+ localWorkSize / threadTileSize,
102+ localWorkSize / threadTileSize
103+ )
104+
105+ let kernel = kernel.GetKernel()
106+ commandQueue.Post( Msg.MsgSetArguments( fun () -> kernel.KernelFunc ndRange m1 m2 m3 n))
107+ commandQueue.Post( Msg.CreateRunMsg<_, _> kernel)
108+ m3
109+
42110let multiplyKernel3 ( clContext : ClContext ) ( localWorkSize : uint ) ( workPerThread : uint ) opAdd opMult zero =
43111 let localWorkSize = int localWorkSize
44112 let workPerThread = int workPerThread
@@ -209,6 +277,7 @@ let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localW
209277 | Kernels.K1 -> multiplyKernel1 clContext localWorkSize opAdd opMult zero
210278 | Kernels.K2 -> multiplyKernel2 clContext localWorkSize opAdd opMult zero
211279 | Kernels.K3 -> multiplyKernel3 clContext localWorkSize workPerThread opAdd opMult zero
280+ | Kernels.K4 -> multiplyKernel4 clContext localWorkSize workPerThread opAdd opMult zero
212281 | x -> failwithf $" Unexpected kernel {x}."
213282 let queue = clContext.QueueProvider.CreateQueue()
214283
0 commit comments