Partial changes: add a standalone kernel library file and a test input file, and add a kernel-library-path option to the linalg-to-kernel pass

arpitj1 · arpitj1 · commit d765bb90332e · 2025-06-12T16:07:16.000-07:00
diff --git a/generic_solver/cublas_example.mlir b/generic_solver/cublas_example.mlir
@@ -3,26 +3,6 @@ module {
   // Define a collection of kernel operation definitions
   kernel.defn_collection {
     
-    // GEMM operation definition with linalg.generic representation
-    kernel.defn @simple_gemm_linalg(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
-      // Implementation using linalg.generic
-      %result = linalg.generic {
-        indexing_maps = [
-          affine_map<(i, j, k) -> (i, k)>,  // A(i,k)
-          affine_map<(i, j, k) -> (k, j)>,  // B(k,j)
-          affine_map<(i, j, k) -> (i, j)>   // C(i,j)
-        ],
-        iterator_types = ["parallel", "parallel", "reduction"]
-      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
-        outs(%C : tensor<?x?xf32>) {
-        ^bb0(%a: f32, %b: f32, %c: f32):
-          %product = arith.mulf %a, %b : f32
-          %result = arith.addf %product, %c : f32
-          linalg.yield %result : f32
-      } -> tensor<?x?xf32>
-      kernel.yield %result : tensor<?x?xf32>
-    }
-
     // GEMM operation definition with arbitrary code implementation
     kernel.defn @gemm(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) {
       // This could include arbitrary code to implement the GEMM operation
@@ -89,6 +69,27 @@ module {
       } -> tensor<?x?x?xf32>
       kernel.yield
     }
+    
+    // GEMM operation definition with linalg.generic representation
+    kernel.defn @simple_gemm_linalg(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+      // Implementation using linalg.generic
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(i, j, k) -> (i, k)>,  // A(i,k)
+          affine_map<(i, j, k) -> (k, j)>,  // B(k,j)
+          affine_map<(i, j, k) -> (i, j)>   // C(i,j)
+        ],
+        iterator_types = ["parallel", "parallel", "reduction"]
+      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
+        outs(%C : tensor<?x?xf32>) {
+        ^bb0(%a: f32, %b: f32, %c: f32):
+          %product = arith.mulf %a, %b : f32
+          %result = arith.addf %product, %c : f32
+          linalg.yield %result : f32
+      } -> tensor<?x?xf32>
+      kernel.yield %result : tensor<?x?xf32>
+    }
+
 
     // Index of maximum absolute value operation definition with arbitrary code
     kernel.defn @iamax(%X: tensor<?xf32>) -> tensor<i32> {
@@ -195,26 +196,6 @@ module {
       kernel.yield %result : tensor<f32>
     }
   
-    //Func that uses simple gemm
-    func.func @simple_gemm(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
-            // Implementation using linalg.generic
-      %result = linalg.generic {
-        indexing_maps = [
-          affine_map<(i, j, k) -> (i, k)>,  // A(i,k)
-          affine_map<(i, j, k) -> (k, j)>,  // B(k,j)
-          affine_map<(i, j, k) -> (i, j)>   // C(i,j)
-        ],
-        iterator_types = ["parallel", "parallel", "reduction"]
-      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
-        outs(%C : tensor<?x?xf32>) {
-        ^bb0(%a: f32, %b: f32, %c: f32):
-          %product = arith.mulf %a, %b : f32
-          %result = arith.addf %product, %c : f32
-          linalg.yield %result : f32
-      } -> tensor<?x?xf32>
-      return %result : tensor<?x?xf32>
-    }
-
     // Mathematical definitions (commented, for reference)
     // kernel.defn @gemm(...) {
     //   C(i,j) += alpha * A(i,k) * B(k,j);
@@ -236,4 +217,25 @@ module {
     //   result = sum_i |x_i|;
     // }
   }
+    
+    // Function containing a simple GEMM expressed as linalg.generic
+    func.func @simple_gemm(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+            // Implementation using linalg.generic
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(i, j, k) -> (i, k)>,  // A(i,k)
+          affine_map<(i, j, k) -> (k, j)>,  // B(k,j)
+          affine_map<(i, j, k) -> (i, j)>   // C(i,j)
+        ],
+        iterator_types = ["parallel", "parallel", "reduction"]
+      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
+        outs(%C : tensor<?x?xf32>) {
+        ^bb0(%a: f32, %b: f32, %c: f32):
+          %product = arith.mulf %a, %b : f32
+          %result = arith.addf %product, %c : f32
+          linalg.yield %result : f32
+      } -> tensor<?x?xf32>
+      return %result : tensor<?x?xf32>
+    }
+
 } 
diff --git a/generic_solver/kernel_library_simple.mlir b/generic_solver/kernel_library_simple.mlir
@@ -0,0 +1,101 @@
+// Kernel Library - Reusable kernel definitions
+// This file contains a collection of kernel definitions that can be loaded
+// by the linalg-to-kernel pass and applied to different MLIR modules.
+
+module {
+  // Collection of kernel operation definitions
+  kernel.defn_collection {
+    
+    // Simple GEMM operation definition with linalg.generic representation
+    kernel.defn @simple_gemm_linalg(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+      // Simple matrix multiplication: C = A * B + C
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(d0, d1, d2) -> (d0, d2)>,
+          affine_map<(d0, d1, d2) -> (d2, d1)>,
+          affine_map<(d0, d1, d2) -> (d0, d1)>
+        ],
+        iterator_types = ["parallel", "parallel", "reduction"]
+      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
+        outs(%C : tensor<?x?xf32>) {
+        ^bb0(%a: f32, %b: f32, %c: f32):
+          %product = arith.mulf %a, %b : f32
+          %result = arith.addf %product, %c : f32
+          linalg.yield %result : f32
+      } -> tensor<?x?xf32>
+      kernel.yield %result : tensor<?x?xf32>
+    }
+
+    // Scaled GEMM operation definition with alpha and beta coefficients
+    kernel.defn @gemm_linalg(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+      %alpha = arith.constant 1.0 : f32
+      %beta = arith.constant 0.0 : f32
+      
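+      // GEMM with scaling; the body computes c = alpha * (a * b) + beta * c on every reduction step (NOTE(review): beta is applied per k iteration — confirm intended)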
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(d0, d1, d2) -> (d0, d2)>,
+          affine_map<(d0, d1, d2) -> (d2, d1)>,
+          affine_map<(d0, d1, d2) -> (d0, d1)>
+        ],
+        iterator_types = ["parallel", "parallel", "reduction"]
+      } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
+        outs(%C : tensor<?x?xf32>) {
+        ^bb0(%a: f32, %b: f32, %c: f32):
+          %product = arith.mulf %a, %b : f32
+          %scaled = arith.mulf %product, %alpha : f32
+          %scaled_c = arith.mulf %c, %beta : f32
+          %result = arith.addf %scaled, %scaled_c : f32
+          linalg.yield %result : f32
+      } -> tensor<?x?xf32>
+      kernel.yield %result : tensor<?x?xf32>
+    }
+
+    // Sum of absolute values operation (ASUM)
+    kernel.defn @asum_linalg(%X: tensor<?xf32>) -> tensor<f32> {
+      %c0 = arith.constant 0.0 : f32
+      %init = tensor.empty() : tensor<f32>
+      %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<f32>) -> tensor<f32>
+      
+      // Sum of absolute values: result = sum_i |x_i|
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(d0) -> (d0)>,
+          affine_map<(d0) -> ()>
+        ],
+        iterator_types = ["reduction"]
+      } ins(%X : tensor<?xf32>) 
+        outs(%fill : tensor<f32>) {
+        ^bb0(%in: f32, %out: f32):
+          %abs_val = math.absf %in : f32
+          %result = arith.addf %abs_val, %out : f32
+          linalg.yield %result : f32
+      } -> tensor<f32>
+      kernel.yield %result : tensor<f32>
+    }
+
+    // Vector dot product
+    kernel.defn @dot_linalg(%X: tensor<?xf32>, %Y: tensor<?xf32>) -> tensor<f32> {
+      %c0 = arith.constant 0.0 : f32
+      %init = tensor.empty() : tensor<f32>
+      %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<f32>) -> tensor<f32>
+      
+      // Dot product: result = sum_i x_i * y_i
+      %result = linalg.generic {
+        indexing_maps = [
+          affine_map<(d0) -> (d0)>,
+          affine_map<(d0) -> (d0)>,
+          affine_map<(d0) -> ()>
+        ],
+        iterator_types = ["reduction"]
+      } ins(%X, %Y : tensor<?xf32>, tensor<?xf32>) 
+        outs(%fill : tensor<f32>) {
+        ^bb0(%x: f32, %y: f32, %out: f32):
+          %product = arith.mulf %x, %y : f32
+          %result = arith.addf %product, %out : f32
+          linalg.yield %result : f32
+      } -> tensor<f32>
+      kernel.yield %result : tensor<f32>
+    }
+  }
+} 
diff --git a/generic_solver/test_input_simple.mlir b/generic_solver/test_input_simple.mlir
@@ -0,0 +1,71 @@
+// Test input file - contains linalg.generic operations to be matched
+// This file does NOT contain kernel.defn_collection - those will be loaded externally
+
+module {
+  // Function that performs simple matrix multiplication
+  func.func @simple_gemm(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+    // This linalg.generic should match @simple_gemm_linalg from kernel_library_simple.mlir
+    %result = linalg.generic {
+      indexing_maps = [
+        affine_map<(d0, d1, d2) -> (d0, d2)>,
+        affine_map<(d0, d1, d2) -> (d2, d1)>,
+        affine_map<(d0, d1, d2) -> (d0, d1)>
+      ],
+      iterator_types = ["parallel", "parallel", "reduction"]
+    } ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>) 
+      outs(%C : tensor<?x?xf32>) {
+      ^bb0(%a: f32, %b: f32, %c: f32):
+        %product = arith.mulf %a, %b : f32
+        %result = arith.addf %product, %c : f32
+        linalg.yield %result : f32
+    } -> tensor<?x?xf32>
+    return %result : tensor<?x?xf32>
+  }
+
+  // Function that computes sum of absolute values
+  func.func @compute_asum(%X: tensor<?xf32>) -> tensor<f32> {
+    %c0 = arith.constant 0.0 : f32
+    %init = tensor.empty() : tensor<f32>
+    %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<f32>) -> tensor<f32>
+    
+    // This linalg.generic should match @asum_linalg from kernel_library_simple.mlir
+    %result = linalg.generic {
+      indexing_maps = [
+        affine_map<(d0) -> (d0)>,
+        affine_map<(d0) -> ()>
+      ],
+      iterator_types = ["reduction"]
+    } ins(%X : tensor<?xf32>) 
+      outs(%fill : tensor<f32>) {
+      ^bb0(%in: f32, %out: f32):
+        %abs_val = math.absf %in : f32
+        %result = arith.addf %abs_val, %out : f32
+        linalg.yield %result : f32
+    } -> tensor<f32>
+    return %result : tensor<f32>
+  }
+
+  // Function that computes dot product
+  func.func @compute_dot(%X: tensor<?xf32>, %Y: tensor<?xf32>) -> tensor<f32> {
+    %c0 = arith.constant 0.0 : f32
+    %init = tensor.empty() : tensor<f32>
+    %fill = linalg.fill ins(%c0 : f32) outs(%init : tensor<f32>) -> tensor<f32>
+    
+    // This linalg.generic should match @dot_linalg from kernel_library_simple.mlir
+    %result = linalg.generic {
+      indexing_maps = [
+        affine_map<(d0) -> (d0)>,
+        affine_map<(d0) -> (d0)>,
+        affine_map<(d0) -> ()>
+      ],
+      iterator_types = ["reduction"]
+    } ins(%X, %Y : tensor<?xf32>, tensor<?xf32>) 
+      outs(%fill : tensor<f32>) {
+      ^bb0(%x: f32, %y: f32, %out: f32):
+        %product = arith.mulf %x, %y : f32
+        %result = arith.addf %product, %out : f32
+        linalg.yield %result : f32
+    } -> tensor<f32>
+    return %result : tensor<f32>
+  }
+} 
diff --git a/include/polygeist/Passes/Passes.h b/include/polygeist/Passes/Passes.h
@@ -74,6 +74,7 @@ createGpuSerializeToHsacoPass(StringRef arch, StringRef features,
                               std::string rocmPath, bool outputIntermediate);
 
 std::unique_ptr<Pass> createLinalgToKernelPass();
+std::unique_ptr<Pass> createLinalgToKernelPass(const std::string& kernelLibraryPath);
 
 void registerGpuSerializeToCubinPass();
 void registerGpuSerializeToHsacoPass();
diff --git a/include/polygeist/Passes/Passes.td b/include/polygeist/Passes/Passes.td
@@ -294,6 +294,12 @@ def LinalgToKernel : Pass<"linalg-to-kernel", "mlir::ModuleOp"> {
     "tensor::TensorDialect",
     "arith::ArithDialect",
   ];
+  let options = [
+    Option<"kernelLibraryPath", "kernel-library-path", "std::string", 
+           /*default=*/"\"\"", 
+           "Path to external MLIR file containing kernel.defn_collection definitions. "
+           "If empty, looks for kernel.defn_collection in the input module.">
+  ];
 }
 
 def ConvertPolygeistToLLVM : Pass<"convert-polygeist-to-llvm", "mlir::ModuleOp"> {
diff --git a/lib/polygeist/Passes/LinalgToKernel.cpp b/lib/polygeist/Passes/LinalgToKernel.cpp