
Commit 1c2f1a8
Merge branch 'main' into sub-group-slm-transpose
2 parents 57b4375 + 24e53d2

File tree: 13 files changed, +354 −12 lines


.github/workflows/e2e-accuracy.yml

Lines changed: 5 additions & 0 deletions
@@ -44,6 +44,10 @@ on:
         - all
         - subset
       default: all
+    check_all_subset_models:
+      description: In "subset" mode, check all subset models
+      type: boolean
+      default: false
     only_one_model:
       description: Run only this one model
       type: string
@@ -125,6 +129,7 @@ jobs:
       test_mode: accuracy
       dtype: ${{ matrix.dtype }}
       models: ${{ inputs.models }}
+      check_all_subset_models: ${{ inputs.check_all_subset_models || false }}
       only_one_model: ${{ inputs.only_one_model }}
       runner_label: ${{ inputs.runner_label }}
       TORCH_COMPILE_DEBUG: ${{ inputs.TORCH_COMPILE_DEBUG }}

.github/workflows/e2e-performance.yml

Lines changed: 5 additions & 0 deletions
@@ -44,6 +44,10 @@ on:
         - all
         - subset
       default: subset
+    check_all_subset_models:
+      description: In "subset" mode, do not fail workflow if one of models failed
+      type: boolean
+      default: false
     only_one_model:
       description: Run only this one model
       type: string
@@ -136,6 +140,7 @@ jobs:
       test_mode: performance
       dtype: ${{ matrix.dtype }}
       models: ${{ inputs.models }}
+      check_all_subset_models: ${{ inputs.check_all_subset_models || false }}
       only_one_model: ${{ inputs.only_one_model }}
       runner_label: ${{ inputs.runner_label }}
       TORCH_COMPILE_DEBUG: ${{ inputs.TORCH_COMPILE_DEBUG }}

.github/workflows/e2e-reusable.yml

Lines changed: 15 additions & 1 deletion
@@ -27,6 +27,10 @@ on:
       description: Run all models or a subset
       type: string
       default: all
+    check_all_subset_models:
+      description: In "subset" mode, check all subset models
+      type: boolean
+      default: false
     only_one_model:
       description: Run only this one model
       type: string
@@ -224,9 +228,19 @@
           if [[ "${{ inputs.only_one_model }}" ]]; then
             bash -e $GITHUB_WORKSPACE/scripts/inductor_xpu_test.sh ${{ inputs.suite }} ${{ inputs.dtype }} ${{ inputs.mode }} ${{ inputs.test_mode }} xpu 0 static 1 0 ${{ inputs.only_one_model }}
           elif [[ "${{ inputs.models }}" == "subset" ]]; then
+            models_subset_file="$GITHUB_WORKSPACE/.github/models/${{ inputs.test_mode }}/${{ inputs.suite }}.txt"
             while read model; do
               bash -e $GITHUB_WORKSPACE/scripts/inductor_xpu_test.sh ${{ inputs.suite }} ${{ inputs.dtype }} ${{ inputs.mode }} ${{ inputs.test_mode }} xpu 0 static 1 0 $model
-            done < $GITHUB_WORKSPACE/.github/models/${{ inputs.test_mode }}/${{ inputs.suite }}.txt
+            done < $models_subset_file
+            if [[ "${{ inputs.check_all_subset_models }}" == true ]]; then
+              python $GITHUB_WORKSPACE/scripts/check_inductor_report.py --models-file="$models_subset_file" \
+                --suite=${{ inputs.suite }} \
+                --dtype=${{ inputs.dtype }} \
+                --mode=${{ inputs.mode }} \
+                --test_mode=${{ inputs.test_mode }} \
+                --device=xpu \
+                --inductor-log-dir="${GITHUB_WORKSPACE}/inductor_log"
+            fi
           else
             bash -e $GITHUB_WORKSPACE/scripts/inductor_xpu_test.sh ${{ inputs.suite }} ${{ inputs.dtype }} ${{ inputs.mode }} ${{ inputs.test_mode }} xpu 0 static 1 0
           fi

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 3 additions & 2 deletions
@@ -171,7 +171,7 @@ def forward(q, k, v, causal, sm_scale):
     assert Lk in {16, 32, 64, 128}
     o = torch.empty_like(q, dtype=torch.float32)
     BLOCK_M = 128
-    BLOCK_N = 64 if Lk <= 64 else 32
+    BLOCK_N = 64
     num_stages = 3
     num_warps = 8 if Lq == 64 else 16
     stage = 3 if causal else 1
@@ -205,7 +205,8 @@ def forward(q, k, v, causal, sm_scale):
         BLOCK_DMODEL=Lk, #
         STAGE=stage, #
         num_warps=num_warps, #
-        num_stages=num_stages #
+        num_stages=num_stages, #
+        grf_mode='large', #
     )
     return o

scripts/check_inductor_report.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+import argparse
+from pathlib import Path
+import csv
+import sys
+
+
+def check_report(suite, dtype, mode, test_mode, device, models_file, inductor_log_dir):
+    inductor_log_dir_leaf = Path(inductor_log_dir) / suite / dtype
+    inductor_report_filename = f"inductor_{suite}_{dtype}_{mode}_{device}_{test_mode}.csv"
+    inductor_report_path = Path(inductor_log_dir_leaf / inductor_report_filename)
+
+    subset = []
+    report = []
+    exitcode = 0
+
+    with open(models_file, encoding="utf-8") as f:
+        subset = f.read().splitlines()
+
+    with open(inductor_report_path, encoding="utf-8") as f:
+        reader = csv.reader(f)
+        report_with_header = []
+        for l in reader:
+            report_with_header.append(l)
+        for r in report_with_header[1:]:
+            if r[0] == device:
+                report.append(r)
+
+    test_list = [r[1] for r in report]
+
+    if test_mode == "performance":
+        for m in subset:
+            if m not in test_list:
+                exitcode = 1
+                print(f"Test is not found in report: {m}")
+
+    if test_mode == "accuracy":
+        test_statuses = [r[3] for r in report]
+        for m in subset:
+            try:
+                idx = test_list.index(m)
+            except ValueError:
+                exitcode = 1
+                print(f"Test is NOT FOUND: {m}")
+                continue
+            if test_statuses[idx] != "pass":
+                exitcode = 1
+                print(f"Test is NOT PASSED: {m}")
+    return exitcode
+
+
+def main():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("--suite", required=True)
+    argparser.add_argument("--dtype", required=True)
+    argparser.add_argument("--mode", required=True, choices=("inference", "training", "inference-no-freezing"))
+    argparser.add_argument("--test_mode", required=True, choices=("performance", "accuracy"))
+    argparser.add_argument("--device", help="i.e. xpu", required=True)
+    argparser.add_argument("--models-file", help="Subset of models list", required=True)
+    argparser.add_argument("--inductor-log-dir", help="Inductor test log directory", default="inductor_log")
+    args = argparser.parse_args()
+    exitcode = check_report(args.suite, args.dtype, args.mode, args.test_mode, args.device, args.models_file,
+                            args.inductor_log_dir)
+    print(f"Report check result: {'SUCCESS' if exitcode == 0 else 'FAIL'}")
+    sys.exit(exitcode)
+
+
+if __name__ == "__main__":
+    main()
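
For reference, a minimal sketch of how the new checker behaves when driven directly from Python rather than from the workflow. The suite, model names, and temporary paths below are hypothetical; the CSV layout (one header row, then device in column 0, model name in column 1, accuracy status in column 3) mirrors the parsing logic above, and the example assumes scripts/ is on PYTHONPATH:

import csv
import tempfile
from pathlib import Path

from check_inductor_report import check_report  # assumes scripts/ is importable

log_dir = Path(tempfile.mkdtemp())
(log_dir / "huggingface" / "float32").mkdir(parents=True)

# Hypothetical accuracy report: header row, then one row per (device, model).
report = (log_dir / "huggingface" / "float32" /
          "inductor_huggingface_float32_inference_xpu_accuracy.csv")
with open(report, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows([
        ["dev", "name", "batch_size", "accuracy"],
        ["xpu", "model_a", "8", "pass"],
        ["xpu", "model_b", "8", "fail"],
    ])

# The subset lists three models: model_b failed and model_c never ran.
models_file = log_dir / "models.txt"
models_file.write_text("model_a\nmodel_b\nmodel_c\n", encoding="utf-8")

# Expect exit code 1, printing "Test is NOT PASSED: model_b"
# and "Test is NOT FOUND: model_c".
assert check_report("huggingface", "float32", "inference", "accuracy",
                    "xpu", models_file, log_dir) == 1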

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -238,7 +238,7 @@ def make_ttgir(mod, metadata, opt, properties):
     intel.passes.ttgpuir.add_rewrite_tensor_pointer(pm)
     intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False)
 
-    passes.ttgpuir.add_coalesce(pm)
+    intel.passes.ttgpuir.add_coalesce(pm)
     intel.passes.ttgpuir.add_remove_layout_conversions(pm)
     passes.ttgpuir.add_optimize_thread_locality(pm)
     passes.ttgpuir.add_optimize_dot_operands(pm, True)

third_party/intel/include/Analysis/AxisInfo.h

Lines changed: 4 additions & 3 deletions
@@ -27,11 +27,12 @@ class AxisInfo {
 public:
   AxisInfo() : AxisInfo({}, {}, {}) {}
 
-  AxisInfo(DimVectorT contiguity, DimVectorT divisibility, DimVectorT constancy)
+  AxisInfo(const DimVectorT &contiguity, const DimVectorT &divisibility,
+           const DimVectorT &constancy)
       : AxisInfo(contiguity, divisibility, constancy, std::nullopt) {}
 
-  AxisInfo(DimVectorT contiguity, DimVectorT divisibility, DimVectorT constancy,
-           std::optional<int64_t> constantValue)
+  AxisInfo(const DimVectorT &contiguity, const DimVectorT &divisibility,
+           const DimVectorT &constancy, std::optional<int64_t> constantValue)
       : contiguity(contiguity), divisibility(divisibility),
         constancy(constancy), constantValue(constantValue) {
     assert(divisibility.size() == contiguity.size());

third_party/intel/include/Dialect/TritonIntelGPU/IR/Utils.h

Lines changed: 28 additions & 2 deletions
@@ -9,11 +9,37 @@
 #ifndef TRITON_DIALECT_TRITON_INTEL_GPU_IR_UTILS_H
 #define TRITON_DIALECT_TRITON_INTEL_GPU_IR_UTILS_H
 
-#include <optional>
-
+#include "intel/include/Analysis/AxisInfo.h"
+#include "mlir/IR/Operation.h"
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include <triton/Tools/Sys/GetEnv.hpp>
 
 namespace mlir::triton::gpu::intel {
+
+/// Calculate the optimal number of elements per thread for a given operation
+/// along an axis with greatest continuity.
+inline unsigned getNumElementsPerThread(
+    Operation *op, SmallVector<unsigned> order,
+    mlir::triton::intel::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+  Value val = getMemAccessPtr(op);
+  Type valTy = val.getType();
+  auto ty =
+      isTensorPointerType(valTy)
+          ? cast<RankedTensorType>(cast<PointerType>(valTy).getPointeeType())
+          : cast<RankedTensorType>(valTy);
+  auto shapePerCTA = getShapePerCTA(ty);
+  mlir::triton::intel::AxisInfo &valInfo = *axisInfoAnalysis.getAxisInfo(val);
+
+  unsigned elemNumBits = getElementBitWidth(ty);
+  unsigned elemNumBytes = std::max(elemNumBits / 8, 1u);
+  unsigned maxMultipleBytes = valInfo.getDivisibility(order[0]);
+  unsigned maxMultiple = std::max(maxMultipleBytes / elemNumBytes, 1u);
+  unsigned maxContig =
+      std::min(valInfo.getContiguity(order[0]), shapePerCTA[order[0]]);
+  unsigned alignment = std::min(maxMultiple, maxContig);
+  return std::min(alignment, 128 / elemNumBits);
+}
+
 /// Check whether transposed reduction should be performed.
 ///
 /// See: https://github.com/intel/intel-xpu-backend-for-triton/issues/1637
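
As a cross-check of the arithmetic in getNumElementsPerThread, here is a small Python sketch of the same computation: divisibility is given in bytes and converted to elements, the contiguous run is clamped to the per-CTA shape, and the result is capped at a 128-bit access per thread. The names mirror the C++ above; the sample numbers are made up:

def num_elements_per_thread(elem_num_bits, divisibility_bytes, contiguity, shape_per_cta):
    # Bytes per element, at least 1 (guards sub-byte element types).
    elem_num_bytes = max(elem_num_bits // 8, 1)
    # Known alignment of the pointer, converted from bytes to elements.
    max_multiple = max(divisibility_bytes // elem_num_bytes, 1)
    # A contiguous run cannot exceed the per-CTA shape along this axis.
    max_contig = min(contiguity, shape_per_cta)
    alignment = min(max_multiple, max_contig)
    # Cap at a 128-bit access per thread.
    return min(alignment, 128 // elem_num_bits)

# fp16 tensor, 16-byte divisibility, 64 contiguous elements, CTA shape 128:
# min(min(16 // 2, 64), 128 // 16) = min(8, 8) = 8 elements per thread.
assert num_elements_per_thread(16, 16, 64, 128) == 8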

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td

Lines changed: 16 additions & 0 deletions
@@ -27,6 +27,22 @@ def TritonIntelGPUAccelerateMatmul
   ];
 }
 
+def TritonIntelGPUCoalesce
+    : Pass<"tritonintelgpu-coalesce", "mlir::ModuleOp"> {
+  let summary = "Intel Coalesce";
+
+  let description = [{
+    The pass analyses loads/stores with type `tensor<tt.ptr<>>` or
+    `tt.ptr<tensor<>>` and replaces the layouts of these operations with
+    coalesced layouts, i.e. cache friendly access patterns.
+    Layout conversions are inserted before and after the load/store op
+    to maintain consistency with the rest of the program.
+  }];
+
+  let dependentDialects = ["mlir::triton::TritonDialect",
+                           "mlir::triton::gpu::TritonGPUDialect"];
+}
+
 def TritonIntelGPUDistributeToWarps
     : Pass<"tritonintelgpu-distribute-to-warps", "mlir::ModuleOp"> {
   let summary = "distribute the thread block workload to the warps";
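
For intuition about what the new pass targets: loads/stores over blocks of consecutive pointers, as in this generic Triton kernel (an illustration, not code from this commit), are the candidates whose layouts get rewritten to coalesced ones:

import triton
import triton.language as tl

@triton.jit
def copy_kernel(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    # Consecutive lanes address consecutive elements; with a coalesced
    # layout these loads/stores lower to wide, cache-friendly accesses.
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(src_ptr + offs, mask=mask)
    tl.store(dst_ptr + offs, x, mask=mask)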

third_party/intel/lib/Analysis/AxisInfo.cpp

Lines changed: 7 additions & 3 deletions
@@ -1010,8 +1010,12 @@ class MakeTensorPtrOpAxisInfoVisitor final
   getAxisInfo(triton::MakeTensorPtrOp op,
               ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
     LDBG("MakeTensorPtrOpAxisInfoVisitor: " << *op);
-    assert(op.getShape().size() == 2 && operands.size() == 7 &&
-           "MakeTensorPtrOp should have 2D shape");
+
+    // TODO: Extend to higher dimension tensor pointers.
+    if (op.getShape().size() != 2)
+      return AxisInfo();
+
+    assert(operands.size() == 7 && "MakeTensorPtrOp should have 2D shape");
 
     AxisInfo ptrInfo = operands[0]->getValue();
     AxisInfo shapeInfo0 = operands[1]->getValue();
@@ -1344,7 +1348,7 @@ void ModuleAxisInfoAnalysis::initialize(FunctionOpInterface funcOp) {
     } else {
       curAxisInfo = axisInfo;
     }
-    (*axisInfoMap)[value] = curAxisInfo;
+    (*axisInfoMap)[value] = std::move(curAxisInfo);
   };
   funcOp.walk([&](Operation *op) {
     for (auto value : op->getResults()) {
