Skip to content

Commit f73afea

Browse files
committed
Tuner for matrices.
1 parent 1e4eb06 commit f73afea

File tree

5 files changed

+131
-34
lines changed

5 files changed

+131
-34
lines changed

src/MatrixMultiplication/Main.fs

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ open Argu.ArguAttributes
55
open Brahma.FSharp
66
open FSharp.Quotations.Evaluator.QuotationEvaluationExtensions
77

8-
type Platforms = CPU = 1 | CPUParallel = 2 | NVidia = 3 | IntelGPU = 4 | AnyGPU = 5
9-
type MatrixTypes = MT_byte = 1 | MT_int = 2 | MT_float32 = 3 | MT_OptInt = 4
8+
type Platforms = CPU = 1 | CPUParallel = 2 | NVidia = 3 | IntelGPU = 4 | AnyGPU = 5 | PoclCPU = 6
9+
type MatrixTypes = MT_byte = 1 | MT_int = 2 | MT_float32 = 3 | MT_OptInt = 4 | MT_float64 = 5
1010
type Semirings = MinPlus = 1 | Arithmetic = 2
1111

1212
[<CliPrefix(CliPrefix.DoubleDash)>]
@@ -20,6 +20,8 @@ type ImageProcessingArguments =
2020
| Check of bool
2121
| MatrixType of MatrixTypes
2222
| Semiring of Semirings
23+
| NumToRun of uint
24+
| Tune of bool
2325

2426
with
2527
interface IArgParserTemplate with
@@ -33,6 +35,8 @@ type ImageProcessingArguments =
3335
| Check _ -> "Whether check result correctness."
3436
| MatrixType _ -> "Type of elements of matrices."
3537
| Semiring _ -> "Semiring to operate with matrices."
38+
| NumToRun _ -> "How many times run the kernel specified."
39+
| Tune _ -> "Run parameters tuning, not benchmarks."
3640

3741
module Main =
3842
let optIntZero = <@None@>
@@ -49,11 +53,13 @@ module Main =
4953
let check = results.GetResult(Check, defaultValue = false)
5054
let matrixType = results.GetResult(MatrixType, defaultValue = MatrixTypes.MT_int)
5155
let semiring = results.GetResult(Semiring, defaultValue = Semirings.Arithmetic)
56+
let numToRun = results.GetResult(NumToRun, defaultValue = 1u)
57+
let tune = results.GetResult(Tune, defaultValue = false)
5258

5359

5460
let time mXm checker m1 m2 =
5561
let start = System.DateTime.Now
56-
let res = mXm m1 m2
62+
let res,_ = mXm m1 m2
5763
printfn $"Processing time: {(System.DateTime.Now - start).TotalMilliseconds} ms"
5864
if check then checker res
5965

@@ -79,14 +85,17 @@ module Main =
7985
match platform with
8086
| Platforms.NVidia -> Platform.Nvidia
8187
| Platforms.IntelGPU -> Platform.Intel
88+
| Platforms.PoclCPU -> Platform.Custom "Portable*"
8289
ClDevice.GetAvailableDevices(platform = platform)
8390
|> Seq.head
8491

8592
printfn $"Device: %A{device.Name}"
8693

8794
let context = ClContext(device)
8895

89-
Matrices.applyMultiplyGPU kernel context workGroupSize workPerThread opAdd opMult zero
96+
if tune
97+
then ImageProcessing.Tuner.tune kernel context numToRun opAdd opMult zero
98+
else Matrices.applyMultiplyGPU kernel context numToRun workGroupSize workPerThread opAdd opMult zero
9099

91100
let inline mXmKernel opAdd opMult zero =
92101
match platform with
@@ -181,5 +190,20 @@ module Main =
181190

182191
time mXm checker m1 m2
183192

193+
| MatrixTypes.MT_float64 ->
194+
let m1 = Matrices.getRandomFloat64Matrix matrixSize
195+
let m2 = Matrices.getRandomFloat64Matrix matrixSize
196+
let mXm, checker =
197+
match semiring with
198+
| Semirings.Arithmetic ->
199+
mXmKernel <@(+)@> <@( * )@> <@0.0@>
200+
, Matrices.check (+) ( * ) 0.0 m1 m2
201+
| Semirings.MinPlus ->
202+
mXmKernel <@min@> <@(+)@> <@System.Double.MaxValue@>
203+
, Matrices.check min (+) System.Double.PositiveInfinity m1 m2
204+
| x -> failwithf $"Unexpected semiring {x}."
205+
206+
time mXm checker m1 m2
207+
184208

185209
0

src/MatrixMultiplication/Matrices.fs

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ let cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) =
1717
for j in 0..m1.Length - 1 do
1818
for k in 0..m1.Length - 1 do
1919
res.[i*m1.Length + j] <- opAdd res.[i * m1.Length + j] (opMult m1.[i].[k] m2.[k].[j])
20-
res
20+
res, 0.0
2121

2222
let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) =
2323
let res = Array.init (m1.Length * m1.Length) (fun _ -> zero)
@@ -27,16 +27,17 @@ let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>
2727
for k in 0..m1.Length - 1 do
2828
res.[i*m1.Length + j] <- opAdd res.[i * m1.Length + j] (opMult row.[k] m2.[k].[j])
2929
)
30-
res
30+
res, 0.0
3131

3232
let check opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) (m3:array<_>) =
33-
let res = cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>)
33+
let res,_ = cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>)
3434
Array.iteri2 (fun i r1 r2 -> if r1 <> r2 then printfn $"Expected {r1}, got {r2}") res m3
3535

3636

3737
let getRandomIntMatrix n = getRandomMatrix n (fun i -> rand.Next(-10,10))
3838
let getRandomByteMatrix n = getRandomMatrix n (fun i -> rand.Next() |> byte)
3939
let getRandomFloat32Matrix n = getRandomMatrix n (fun i -> rand.NextSingle())
40+
let getRandomFloat64Matrix n = getRandomMatrix n (fun i -> rand.NextDouble())
4041
let getRandomOptionIntMatrix n = getRandomMatrix n (fun i -> let x = rand.Next(-10,10) in if x % 3 = 0 then Some x else None)
4142

4243
let multiplyKernel4 (clContext: ClContext) (localWorkSize:uint) (threadTileSize:uint) opAdd opMult zero =
@@ -270,7 +271,7 @@ let multiplyKernel0 (clContext: ClContext) (localWorkSize: uint) opAdd opMult ze
270271
commandQueue.Post(Msg.CreateRunMsg<_, _> kernel)
271272
m3
272273

273-
let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localWorkSize workPerThread (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) =
274+
let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) (numToRun:uint) localWorkSize workPerThread (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) =
274275
let kernel =
275276
match kernel with
276277
| Kernels.K0 -> multiplyKernel0 clContext localWorkSize opAdd opMult zero
@@ -280,34 +281,38 @@ let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localW
280281
| Kernels.K4 -> multiplyKernel4 clContext localWorkSize workPerThread opAdd opMult zero
281282
| x -> failwithf $"Unexpected kernel {x}."
282283
let queue = clContext.QueueProvider.CreateQueue()
284+
//queue.Error.Add(fun x -> printfn "%A" x)
285+
let numToRun = int numToRun
283286

284287
fun (m1: 'e[][]) (m2: 'f[][]) ->
285-
286-
let m1_gpu =
287-
clContext.CreateClArray<_>(Array.concat m1, HostAccessMode.NotAccessible)
288-
289-
let m2_gpu =
290-
clContext.CreateClArray<_>(Array.concat m2, HostAccessMode.NotAccessible)
291-
292-
let m3_gpu =
293-
clContext.CreateClArray(
294-
m1.Length * m1.Length,
295-
HostAccessMode.NotAccessible,
296-
deviceAccessMode=DeviceAccessMode.WriteOnly,
297-
allocationMode = AllocationMode.Default
298-
)
299-
300-
let x = kernel queue m1_gpu m2_gpu m3_gpu m1.Length
301-
288+
let start = System.DateTime.Now
302289
let result : 'a[] = Array.zeroCreate(m1.Length * m1.Length)
290+
for i in 0 .. numToRun - 1 do
291+
let m1_gpu =
292+
clContext.CreateClArray<_>(Array.concat m1, HostAccessMode.NotAccessible)
293+
294+
let m2_gpu =
295+
clContext.CreateClArray<_>(Array.concat m2, HostAccessMode.NotAccessible)
296+
297+
let m3_gpu =
298+
clContext.CreateClArray(
299+
m1.Length * m1.Length,
300+
HostAccessMode.NotAccessible,
301+
deviceAccessMode=DeviceAccessMode.WriteOnly,
302+
allocationMode = AllocationMode.Default
303+
)
304+
305+
let x = kernel queue m1_gpu m2_gpu m3_gpu m1.Length
306+
307+
let result = queue.PostAndReply(fun ch -> Msg.CreateToHostMsg(m3_gpu, result, ch))
308+
309+
queue.Post(Msg.CreateFreeMsg m1_gpu)
310+
311+
queue.Post(Msg.CreateFreeMsg m2_gpu)
312+
313+
queue.Post(Msg.CreateFreeMsg m3_gpu)
314+
315+
let totalTime = (System.DateTime.Now - start).TotalMilliseconds
303316

304-
let result = queue.PostAndReply(fun ch -> Msg.CreateToHostMsg(m3_gpu, result, ch))
305-
306-
queue.Post(Msg.CreateFreeMsg m1_gpu)
307-
308-
queue.Post(Msg.CreateFreeMsg m2_gpu)
309-
310-
queue.Post(Msg.CreateFreeMsg m3_gpu)
311-
312-
result
317+
result, (totalTime / (float numToRun))
313318

src/MatrixMultiplication/MatrixMultiplication.fsproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<Compile Include="AssemblyInfo.fs" />
1919
<None Include="App.config" />
2020
<Compile Include="Matrices.fs" />
21+
<Compile Include="Tuner.fs" />
2122
<Compile Include="Main.fs" />
2223
</ItemGroup>
2324
<ItemGroup>

src/MatrixMultiplication/Tuner.fs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
module ImageProcessing.Tuner
2+
3+
open ImageProcessing.Matrices
4+
open Brahma.FSharp
5+
6+
/// Exhaustively tries launch parameters for the given kernel and prints the fastest pair.
///
/// Every combination of local work size and work per thread taken from the powers of two
/// 2..256 is passed to `applyMultiplyGPU` together with `numToRun`, the semiring operations
/// and the input matrices `m1` and `m2`. The time reported by `applyMultiplyGPU` for each
/// combination is printed; a combination that raises is reported as a bad configuration
/// and skipped.
///
/// Returns a placeholder `([||], 0.0)` so that the result has the same shape as the
/// value produced by `applyMultiplyGPU`; the tuning outcome itself is only printed.
let tune (kernel:Kernels) (clContext: ClContext) (numToRun:uint) (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) m1 m2 =
    // Best (time, local work size, work per thread) seen so far.
    // Stays None until at least one configuration succeeds, so that a default
    // pair is never reported as "best" when nothing actually ran.
    let mutable best : (float * uint * uint) option = None
    let powOf2 = [for i in 1..8 -> pown 2 i |> uint]
    for lws in powOf2 do
        for wpt in powOf2 do
            try
                let _,time = applyMultiplyGPU kernel clContext numToRun lws wpt opAdd opMult zero m1 m2
                printfn $"local work size: {lws}; work per thread: {wpt} --- {time} ms."
                // Same acceptance threshold as before: only finite times below Double.MaxValue count.
                let bestTime =
                    match best with
                    | Some (t, _, _) -> t
                    | None -> System.Double.MaxValue
                if time < bestTime
                then best <- Some (time, lws, wpt)
            with
            | _ -> printfn $"local work size: {lws}; work per thread: {wpt} --- bad configuration."

    match best with
    | Some (_, localWorkSize, workPerThread) ->
        printfn $"Best configuration: local work size: {localWorkSize}; work per thread: {workPerThread}."
    | None ->
        printfn "No working configuration found."
    [||],0.0

tune.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""Grid-search tuner for the MatrixMultiplication benchmark.

For every kernel and element type the benchmark executable is launched once per
(work-group size, work-per-thread) pair. The "Processing time" it prints is
collected, and the results, sorted fastest first, are printed and written to a
log file in ``out_directory``.
"""
import datetime
import os
import re
import subprocess
import sys
from operator import itemgetter

workGroupSizes = [2**(x+3) for x in range(5)]
workPerThreads = [2**(x+1) for x in range(7)]

matrixSize = 1024
kernels = ['k2', 'k3', 'k4']
semiring = 'arithmetic'
numToRun = 10
platform = 'nvidia'
types = ['mt-byte', 'mt-int', 'mt-float32', 'mt-float64']

out_directory = 'tuning_results'

dll_path = './src/MatrixMultiplication/bin/Release/net9.0/MatrixMultiplication.dll'

# Matches the benchmark's "Processing time: <number> ms" report line.
_TIME_RE = re.compile(r'Processing time:\s*([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*ms')


def parse_processing_time(output):
    """Return the time in ms reported in ``output``, or None if there is none.

    The value is located by its "Processing time:" label rather than by its
    position in the output, so text printed after the timing line (for example
    by the result checker) does not break parsing.
    """
    match = _TIME_RE.search(output)
    return float(match.group(1)) if match else None


def build_command(kernel, matrixType, wgs, wpt):
    """Return the argument list that runs one benchmark configuration."""
    return [
        'dotnet', dll_path,
        '--platform', platform,
        '--kernel', kernel,
        '--matrixsize', str(matrixSize),
        '--matrixtype', matrixType,
        '--semiring', semiring,
        '--numtorun', str(numToRun),
        '--workperthread', str(wpt),
        '--workgroupsize', str(wgs),
    ]


def main():
    """Run the whole grid search and write one log file per kernel/type pair."""
    os.makedirs(out_directory, exist_ok=True)

    for kernel in kernels:
        for matrixType in types:
            res = []
            print(f'Tuning for {matrixType} and kernel {kernel} started.')
            for wgs in workGroupSizes:
                for wpt in workPerThreads:
                    cmd = build_command(kernel, matrixType, wgs, wpt)
                    try:
                        raw = subprocess.check_output(cmd)
                    except (subprocess.CalledProcessError, OSError) as err:
                        # An unsupported configuration is expected during tuning:
                        # report it and move on. Ctrl-C is deliberately NOT caught.
                        print(f'wgs={wgs}, wpt={wpt}: run failed ({err})', file=sys.stderr)
                        continue
                    elapsed = parse_processing_time(raw.decode("utf-8", errors="replace"))
                    if elapsed is None:
                        continue
                    res.append([wgs, wpt, elapsed])
                    print (f'wgs={wgs}, wpt={wpt}, time={elapsed}')

            res.sort(key=itemgetter(2))
            # No spaces or colons in the timestamp: they are awkward or invalid in file names.
            stamp = datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
            log_name = f'{kernel}_{platform}_{matrixType}_{matrixSize}_{semiring}_{stamp}.log'
            with open(os.path.join(out_directory, log_name), 'a') as f:
                for r in res:
                    print(r)
                    f.write(f'{r[0]}, {r[1]}, {r[2]}\n')


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)