Towards more AMD and Intel matrix ops

davschneller · davschneller · commit ae43f64fca9b · 2026-02-09T07:04:12.000+01:00
diff --git a/tensorforge/backend/instructions/compute/primitives/amd.py b/tensorforge/backend/instructions/compute/primitives/amd.py
@@ -544,34 +544,64 @@ def hfma(writer: Writer, C, A, B, repeat, datatype, threads, ctx):
                 if b is not None:
                     func(writer, c, a, b, j)
 
-def wmma3atom(threads):
+def wmma3atom(writer, A, B, C, threads):
+    """Emit a float 16x16x16 product from bf16 WMMA calls on split operands.
+
+    Every float operand is split into three bf16 parts by
+    ``tensorforge::splitFloatBF16``; six of the nine part-by-part products are
+    accumulated with ``__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32``.
+
+    :param writer: code writer; called with each generated source line.
+    :param A: name prefix of the input variables ``{A}_0`` .. ``{A}_15``.
+    :param B: name prefix of the input variables ``{B}_0`` .. ``{B}_15``.
+    :param C: output name prefix. NOTE(review): not written yet, still WIP.
+    :param threads: thread count; only 32 is supported (asserted).
+    """
+
+    a = writer.varalloc()
+    b = writer.varalloc()
+    c = writer.varalloc()
+
     assert threads == 32
 
     N = 16
     M = 16
     K = 16
 
-    for i in range(N):
-        writer(f'const auto {a}_{i} = tensorforge::broadcast<32, 16, 0>({A}_{i});')
-    for j in range(N):
-        writer(f'const auto {b}_{j} = tensorforge::broadcast<32, 16, 0>({B}_{j});')
-
-    writer(f'tensorforge::transpose16x16({",".join(f"{b}_{i}" for i in range(N))});')
-
-    writer(f'VectorT<short, 16> {a}_p1;')
-    writer(f'VectorT<short, 16> {a}_p2;')
-    writer(f'VectorT<short, 16> {a}_p3;')
-    writer(f'VectorT<short, 16> {b}_p1;')
-    writer(f'VectorT<short, 16> {b}_p2;')
-    writer(f'VectorT<short, 16> {b}_p3;')
-
-    writer(f'VectorT<float, 8> {c}{"{}"};')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p1, {b}_p1, {c});')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p2, {b}_p1, {c});')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p1, {b}_p2, {c});')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p3, {b}_p1, {c});')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p1, {b}_p3, {c});')
-    writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p2, {b}_p2, {c});')
+    # Length of the VectorT<float, ...> accumulator declared below (16 * 16 / 32 == 8).
+    acc_len = (M * N) // threads
+
+    for m in range(2):
+        with writer.AnonymousScope():
+            for i in range(N):
+                writer(f'const auto {a}_{i} = tensorforge::broadcast<32, 16, {m}>({A}_{i});')
+            for j in range(N):
+                writer(f'const auto {b}_{j} = tensorforge::broadcast<32, 16, {m}>({B}_{j});')
+
+            # NOTE(review): the operands above are const; confirm transpose16x16 does not need to modify them.
+            writer(f'tensorforge::transpose16x16({",".join(f"{b}_{i}" for i in range(N))});')
+
+            for operand in (a, b):
+                for part in (1, 2, 3):
+                    writer(f'VectorT<short, 16> {operand}_p{part}{"{}"};')
+
+            # A structured binding cannot assign into existing vector elements, so the
+            # returned tuple is kept in a named temporary and unpacked with std::get.
+            for operand in (a, b):
+                for i in range(N):
+                    writer(f'const auto {operand}_s{i} = tensorforge::splitFloatBF16({operand}_{i});')
+                    for part in (1, 2, 3):
+                        writer(f'{operand}_p{part}[{i}] = std::get<{part - 1}>({operand}_s{i});')
+
+            writer(f'VectorT<float, {acc_len}> {c}{"{}"};')
+            # Six of the nine part products are accumulated; (2,3), (3,2) and (3,3) are omitted.
+            for pa, pb in ((1, 1), (2, 1), (1, 2), (3, 1), (1, 3), (2, 2)):
+                writer(f'{c} = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32({a}_p{pa}, {b}_p{pb}, {c});')
+
+            # The accumulator only has acc_len elements, so the loop must not run to N.
+            # NOTE(review): the results are local to this scope and {C} is never written (WIP).
+            for j in range(acc_len):
+                writer(f'const auto {c}_{j} = tensorforge::broadcast<32, 16, {m}>({c}[{j}]);')
 
     # TODO: gfx1200, f'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12'
 
diff --git a/tensorforge/backend/instructions/compute/primitives/intel.py b/tensorforge/backend/instructions/compute/primitives/intel.py
@@ -0,0 +1,27 @@
+
+def dpas(writer, C, B, A, rc, sd):
+    """Emit one inline-assembly tf32 DPAS statement.
+
+    :param writer: code writer; called with the generated source line.
+    :param C: variable name used as both the ``%[D]`` output and the ``%[C]`` input.
+    :param B: variable name bound to the ``%[B]`` operand.
+    :param A: variable name bound to the ``%[A]`` operand.
+    :param rc: last suffix of the DPAS mnemonic; equals ``m`` (see below).
+    :param sd: depth suffix of the DPAS mnemonic; ``k * elemsIn32Bit`` (see below).
+    """
+    # cf. https://github.com/intel/intel-graphics-compiler/blob/master/documentation/visa/instructions/DPAS.md
+    # sd == depth == k * elemsIn32Bit
+    # rc == m
+    writer(f'asm("DPAS.tf32.tf32.{sd}.{rc} (16) %[D], %[C], %[B], %[A]" : [D]"=f"({C}) : [C]"f"({C}), [B]"d"({B}), [A]"d"({A}) :);')
+
+def matmul(writer, C, A, B, M, N, K, kx, threads, dtype, sparse, ctx):
+    """Emit a matrix product through :func:`dpas`; incomplete (see TODO)."""
+
+    rc = 8
+    sd = 8
+
+    # NOTE(review): A and B are passed in swapped positions relative to the
+    # dpas(writer, C, B, A, ...) signature -- confirm this is intended.
+    dpas(writer, C, A, B, rc, sd)
+
+    # TODO
diff --git a/tensorforge/include/tensorforge_device/hip.h b/tensorforge/include/tensorforge_device/hip.h
@@ -5,6 +5,8 @@
 #include <hip/hip_cooperative_groups.h>
 
+#include <tuple>
 #include <type_traits>
+#include <utility>
 
 #include "base.h"
 
@@ -807,4 +809,23 @@ class Loader {
 };
 */
 
+/// Splits a float into three bf16 values whose sum approximates the input.
+///
+/// Each stage converts the remaining residual to bf16 and subtracts the
+/// converted value again, so later parts hold what earlier parts lost.
+///
+/// @param input value to split.
+/// @return the raw 16-bit patterns of the three bf16 parts, leading part first.
+__host__ __device__ inline std::tuple<short, short, short> splitFloatBF16(float input) {
+  const auto i1 = static_cast<__bf16>(input);
+  const auto i1r = input - static_cast<float>(i1);
+  const auto i2 = static_cast<__bf16>(i1r);
+  const auto i2r = i1r - static_cast<float>(i2);
+  const auto i3 = static_cast<__bf16>(i2r);
+  // Bit-cast rather than reinterpret_cast: reading a __bf16 through a short
+  // pointer violates strict aliasing.
+  return {__builtin_bit_cast(short, i1), __builtin_bit_cast(short, i2),
+          __builtin_bit_cast(short, i3)};
+}
+
 } // namespace tensorforge