Tuner for matrices.

gsvgit · gsvgit · commit f73afea33dcd · 2025-04-22T10:32:44.000+03:00
diff --git a/src/MatrixMultiplication/Main.fs b/src/MatrixMultiplication/Main.fs
@@ -5,8 +5,8 @@ open Argu.ArguAttributes
 open Brahma.FSharp
 open FSharp.Quotations.Evaluator.QuotationEvaluationExtensions
 
-type Platforms = CPU = 1 | CPUParallel = 2 | NVidia = 3 | IntelGPU = 4 | AnyGPU = 5
-type MatrixTypes = MT_byte = 1 | MT_int = 2 | MT_float32 = 3 | MT_OptInt = 4
+type Platforms = CPU = 1 | CPUParallel = 2 | NVidia = 3 | IntelGPU = 4 | AnyGPU = 5 | PoclCPU = 6
+type MatrixTypes = MT_byte = 1 | MT_int = 2 | MT_float32 = 3 | MT_OptInt = 4 | MT_float64 = 5
 type Semirings = MinPlus = 1 | Arithmetic = 2
 
 [<CliPrefix(CliPrefix.DoubleDash)>]
@@ -20,6 +20,8 @@ type ImageProcessingArguments =
     | Check of bool
     | MatrixType of MatrixTypes 
     | Semiring of Semirings
+    | NumToRun of uint
+    | Tune of bool
 
     with
     interface IArgParserTemplate with
@@ -33,6 +35,8 @@ type ImageProcessingArguments =
             | Check _ -> "Whether check result correctness."
             | MatrixType _ -> "Type of elements of matrices."
             | Semiring _ -> "Semiring to operate with matrices."
+            | NumToRun _ -> "How many times run the kernel specified."
+            | Tune _ -> "Run parameters tuning, not benchmarks."
 
 module Main =
     let optIntZero = <@None@>
@@ -49,11 +53,13 @@ module Main =
         let check = results.GetResult(Check, defaultValue = false)
         let matrixType = results.GetResult(MatrixType, defaultValue = MatrixTypes.MT_int)
         let semiring = results.GetResult(Semiring, defaultValue = Semirings.Arithmetic)
+        let numToRun = results.GetResult(NumToRun, defaultValue = 1u)
+        let tune = results.GetResult(Tune, defaultValue = false)
 
 
         let time mXm checker m1 m2 =
             let start = System.DateTime.Now
-            let res = mXm m1 m2
+            let res,_ = mXm m1 m2
             printfn $"Processing time: {(System.DateTime.Now - start).TotalMilliseconds} ms"
             if check then checker res
 
@@ -79,14 +85,17 @@ module Main =
                         match platform with 
                         | Platforms.NVidia -> Platform.Nvidia
                         | Platforms.IntelGPU -> Platform.Intel
+                        | Platforms.PoclCPU -> Platform.Custom "Portable*"
                     ClDevice.GetAvailableDevices(platform = platform)
                     |> Seq.head
 
             printfn $"Device: %A{device.Name}"
 
             let context = ClContext(device)
 
-            Matrices.applyMultiplyGPU kernel context workGroupSize workPerThread opAdd opMult zero
+            if tune 
+            then ImageProcessing.Tuner.tune kernel context numToRun opAdd opMult zero
+            else Matrices.applyMultiplyGPU kernel context numToRun workGroupSize workPerThread opAdd opMult zero
         
         let inline mXmKernel opAdd opMult zero = 
             match platform with 
@@ -181,5 +190,20 @@ module Main =
 
             time mXm checker m1 m2
 
+        | MatrixTypes.MT_float64 ->
+            let m1 = Matrices.getRandomFloat64Matrix matrixSize
+            let m2 = Matrices.getRandomFloat64Matrix matrixSize
+            let mXm, checker  = 
+                match semiring with 
+                | Semirings.Arithmetic -> 
+                    mXmKernel <@(+)@> <@( * )@> <@0.0@>
+                    , Matrices.check (+) ( * ) 0.0 m1 m2
+                | Semirings.MinPlus -> 
+                    mXmKernel <@min@> <@(+)@> <@System.Double.MaxValue@>
+                    , Matrices.check min (+) System.Double.PositiveInfinity m1 m2
+                | x -> failwithf $"Unexpected semiring {x}."      
+
+            time mXm checker m1 m2
+
         
         0
diff --git a/src/MatrixMultiplication/Matrices.fs b/src/MatrixMultiplication/Matrices.fs
@@ -17,7 +17,7 @@ let cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) =
       for j in 0..m1.Length - 1 do
         for k in 0..m1.Length - 1 do
             res.[i*m1.Length + j] <- opAdd res.[i * m1.Length + j]  (opMult m1.[i].[k] m2.[k].[j])
-    res
+    res, 0.0
 
 let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) =
     let res = Array.init (m1.Length * m1.Length) (fun _ -> zero)
@@ -27,16 +27,17 @@ let cpuParallelMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>
           for k in 0..m1.Length - 1 do
             res.[i*m1.Length + j] <- opAdd res.[i * m1.Length + j]  (opMult row.[k] m2.[k].[j])
        )
-    res
+    res, 0.0
 
 let check opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>) (m3:array<_>) =
-    let res = cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>)
+    let res,_ = cpuMxM opAdd opMult zero (m1 : array<array<_>>) (m2: array<array<_>>)
     Array.iteri2 (fun i r1 r2 -> if r1 <> r2 then printfn $"Expected {r1}, got {r2}") res m3
 
 
 let getRandomIntMatrix n = getRandomMatrix n (fun i -> rand.Next(-10,10))
 let getRandomByteMatrix n = getRandomMatrix n (fun i -> rand.Next() |> byte)
 let getRandomFloat32Matrix n = getRandomMatrix n (fun i -> rand.NextSingle())
+let getRandomFloat64Matrix n = getRandomMatrix n (fun i -> rand.NextDouble())
 let getRandomOptionIntMatrix n = getRandomMatrix n (fun i -> let x = rand.Next(-10,10) in if x % 3 = 0 then Some x else None)
 
 let multiplyKernel4 (clContext: ClContext) (localWorkSize:uint) (threadTileSize:uint) opAdd opMult zero =
@@ -270,7 +271,7 @@ let multiplyKernel0 (clContext: ClContext) (localWorkSize: uint) opAdd opMult ze
         commandQueue.Post(Msg.CreateRunMsg<_, _> kernel)
         m3
 
-let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localWorkSize workPerThread (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) =    
+let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) (numToRun:uint) localWorkSize workPerThread (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) =    
     let kernel = 
         match kernel with 
         | Kernels.K0 -> multiplyKernel0 clContext localWorkSize opAdd opMult zero
@@ -280,34 +281,38 @@ let applyMultiplyGPU<'a,'b,'e,'f> (kernel:Kernels) (clContext: ClContext) localW
         | Kernels.K4 -> multiplyKernel4 clContext localWorkSize workPerThread opAdd opMult zero
         | x -> failwithf $"Unexpected kernel {x}."
     let queue = clContext.QueueProvider.CreateQueue()
+    //queue.Error.Add(fun x -> printfn "%A" x)
+    let numToRun = int numToRun
 
     fun (m1: 'e[][]) (m2: 'f[][]) ->
-        
-        let m1_gpu =
-            clContext.CreateClArray<_>(Array.concat m1, HostAccessMode.NotAccessible)
-        
-        let m2_gpu =
-            clContext.CreateClArray<_>(Array.concat m2, HostAccessMode.NotAccessible)
-        
-        let m3_gpu =
-            clContext.CreateClArray(
-                m1.Length * m1.Length,
-                HostAccessMode.NotAccessible,
-                deviceAccessMode=DeviceAccessMode.WriteOnly,
-                allocationMode = AllocationMode.Default
-            )
-        
-        let x = kernel queue m1_gpu m2_gpu m3_gpu m1.Length
-        
+        let start = System.DateTime.Now
         let result : 'a[] = Array.zeroCreate(m1.Length * m1.Length)
+        for i in 0 .. numToRun - 1 do
+            let m1_gpu =
+                clContext.CreateClArray<_>(Array.concat m1, HostAccessMode.NotAccessible)
+            
+            let m2_gpu =
+                clContext.CreateClArray<_>(Array.concat m2, HostAccessMode.NotAccessible)
+            
+            let m3_gpu =
+                clContext.CreateClArray(
+                    m1.Length * m1.Length,
+                    HostAccessMode.NotAccessible,
+                    deviceAccessMode=DeviceAccessMode.WriteOnly,
+                    allocationMode = AllocationMode.Default
+                )
+            
+            let x = kernel queue m1_gpu m2_gpu m3_gpu m1.Length
+            
+            let result = queue.PostAndReply(fun ch -> Msg.CreateToHostMsg(m3_gpu, result, ch))
+            
+            queue.Post(Msg.CreateFreeMsg m1_gpu)
+            
+            queue.Post(Msg.CreateFreeMsg m2_gpu)
+            
+            queue.Post(Msg.CreateFreeMsg m3_gpu)
+
+        let totalTime = (System.DateTime.Now - start).TotalMilliseconds
         
-        let result = queue.PostAndReply(fun ch -> Msg.CreateToHostMsg(m3_gpu, result, ch))
-        
-        queue.Post(Msg.CreateFreeMsg m1_gpu)
-        
-        queue.Post(Msg.CreateFreeMsg m2_gpu)
-        
-        queue.Post(Msg.CreateFreeMsg m3_gpu)
-        
-        result
+        result, (totalTime / (float numToRun))
         
diff --git a/src/MatrixMultiplication/MatrixMultiplication.fsproj b/src/MatrixMultiplication/MatrixMultiplication.fsproj
@@ -18,6 +18,7 @@
     <Compile Include="AssemblyInfo.fs" />
     <None Include="App.config" />
     <Compile Include="Matrices.fs" />
+    <Compile Include="Tuner.fs" />
     <Compile Include="Main.fs" />
   </ItemGroup>
   <ItemGroup>
diff --git a/src/MatrixMultiplication/Tuner.fs b/src/MatrixMultiplication/Tuner.fs
@@ -0,0 +1,32 @@
+module ImageProcessing.Tuner
+
+open ImageProcessing.Matrices
+open Brahma.FSharp
+
+/// Calls `applyMultiplyGPU` on `m1` and `m2` for every pair of local work size and work per thread
+/// taken from the powers of two 2..256, prints the time reported for each pair, and finally prints
+/// the pair with the smallest time. A pair for which the call throws is reported with the exception
+/// message and skipped. Always returns an empty array and 0.0; the multiplication result is discarded.
+let tune (kernel:Kernels) (clContext: ClContext) (numToRun:uint) (opAdd:Quotations.Expr<'a -> 'b -> 'a>) (opMult:Quotations.Expr<'e -> 'f -> 'b>) (zero:Quotations.Expr<'a>) m1 m2 = 
+    // Fastest (time, local work size, work per thread) so far; None until one configuration has run.
+    let mutable best : (float * uint * uint) option = None
+    let powOf2 = [for i in 1..8 -> pown 2 i |> uint]
+    for lws in powOf2 do
+       for wpt in powOf2 do
+          try 
+            let _,time = applyMultiplyGPU kernel clContext numToRun lws wpt opAdd opMult zero m1 m2
+            printfn $"local work size: {lws}; work per thread: {wpt} --- {time} ms."
+            let improved =
+                match best with
+                | Some (bestTime, _, _) -> time < bestTime
+                | None -> true
+            if improved then best <- Some (time, lws, wpt)
+          with 
+          | e -> printfn $"local work size: {lws}; work per thread: {wpt} --- bad configuration: {e.Message}"
+
+    match best with
+    | Some (_, localWorkSize, workPerThread) ->
+        printfn $"Best configuration: local work size: {localWorkSize}; work per thread: {workPerThread}."
+    | None ->
+        printfn "No working configuration found."
+    [||],0.0
diff --git a/tune.py b/tune.py
@@ -0,0 +1,55 @@
+# Sweeps work-group size and work-per-thread for the MatrixMultiplication benchmark:
+# runs the built DLL once per (kernel, matrix type, wgs, wpt) combination and writes the
+# measured times, fastest first, to one log file per kernel and matrix type.
+import subprocess
+import os
+import datetime
+from operator import itemgetter
+
+workGroupSizes = [2**(x+3) for x in range(5)]  # 8, 16, 32, 64, 128
+workPerThreads = [2**(x+1) for x in range(7)]  # 2, 4, ..., 128
+
+matrixSize = 1024
+kernels = ['k2', 'k3', 'k4']
+semiring = 'arithmetic'
+numToRun = 10
+platform = 'nvidia'
+types = ['mt-byte', 'mt-int', 'mt-float32', 'mt-float64']
+
+out_directory = 'tuning_results'
+
+os.makedirs(out_directory, exist_ok=True)
+
+for kernel in kernels:
+    for matrixType in types:
+        res = []
+        print(f'Tuning for {matrixType} and kernel {kernel} started.')
+        for wgs in workGroupSizes:
+            for wpt in workPerThreads:
+                # Passed as an argument list, so no shell is involved in starting the process.
+                cmd = [
+                    'dotnet', './src/MatrixMultiplication/bin/Release/net9.0/MatrixMultiplication.dll',
+                    '--platform', platform, '--kernel', kernel,
+                    '--matrixsize', str(matrixSize), '--matrixtype', matrixType,
+                    '--semiring', semiring, '--numtorun', str(numToRun),
+                    '--workperthread', str(wpt), '--workgroupsize', str(wgs),
+                ]
+                try:
+                    output = subprocess.check_output(cmd).decode("utf-8")
+                    if 'Processing time:' in output:
+                        # The value is read as the second-to-last token, i.e. the number before the final "ms".
+                        time = float(output.split()[-2])
+                        res.append([wgs, wpt, time])
+                        print (f'wgs={wgs}, wpt={wpt}, time={time}')
+                except Exception as err:
+                    # A failing configuration is skipped and reported. Exception, unlike
+                    # BaseException, does not cover KeyboardInterrupt, so Ctrl-C stops the sweep.
+                    print(f'wgs={wgs}, wpt={wpt} skipped: {err}')
+
+        res = sorted(res, key=itemgetter(2))
+        log_name = f'{kernel}_{platform}_{matrixType}_{matrixSize}_{semiring}_{datetime.datetime.now()}.log'
+        # The with-block closes the log even if a write fails.
+        with open(os.path.join(out_directory, log_name), 'a') as f:
+            for r in res:
+                print(r)
+                f.write(f'{r[0]}, {r[1]}, {r[2]}\n')