intel
diff --git a/‎.github/pins/ipex.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/pins/ipex.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/build-test-windows.yml‎
Lines changed: 56 additions & 0 deletions b/‎.github/workflows/build-test-windows.yml‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎.github/workflows/build-test.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/build-test.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 2 additions & 1 deletion b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 0 deletions b/‎include/triton/Tools/Sys/GetEnv.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 7 additions & 3 deletions b/‎lib/Dialect/TritonGPU/IR/Dialect.cpp‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 1 addition & 7 deletions b/‎lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp‎
Lines changed: 1 addition & 7 deletions
diff --git a/‎python/setup.py‎
Lines changed: 10 additions & 6 deletions b/‎python/setup.py‎
Lines changed: 10 additions & 6 deletions
@@ -1 +1 @@
-cd132db4e11fbf799a2d3ed2afea100a4afd4efd
+15ef7db18b0a50101b41d9c78780d35ea7937ffc
@@ -0,0 +1,56 @@
+name: Build on Windows
+
+on:
+  workflow_dispatch:
+
+  pull_request:
+    branches:
+      - main
+      - release/**
+  push:
+    branches:
+      - main
+      - release/**
+
+permissions: read-all
+
+env:
+  NEW_WORKSPACE: C:\gh${{ github.run_id }}
+
+jobs:
+  build:
+    name: Build
+    runs-on: avc336
+    steps:
+      - name: Enable long paths
+        run: |
+          git config --system core.longPaths true
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+
+      # Copy workspace to a temporary location with a shorter name.
+      - name: Copy workspace
+        run: |
+          Copy-Item -Path ${{ github.workspace }} -Destination ${{ env.NEW_WORKSPACE }} -Recurse
+
+      # We need ninja >= 1.12.0 to support long names on Windows. At the moment there is no required
+      # version in pypi, so instead of installing ninja with pip we use a preinstalled 1.12.1 on the
+      # runner.
+      - name: Build Triton
+        run: |
+          cd ${{ env.NEW_WORKSPACE }}
+          cd python
+          pip install -U wheel pybind11 certifi cython cmake
+          python -m certifi
+          pip install --no-build-isolation '.[build]'
+
+      - name: Clean
+        if: ${{ always() }}
+        run: |
+          Remove-Item -LiteralPath ${{ env.NEW_WORKSPACE }} -Force -Recurse -ErrorAction Ignore
@@ -43,9 +43,11 @@ on:
   pull_request:
     branches:
       - main
+      - release/**
   push:
     branches:
       - main
+      - release/**
 
 permissions: read-all
 
 
@@ -404,6 +404,7 @@ jobs:
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
+          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
                  --ignore=language/test_line_info.py \
 
@@ -402,6 +402,7 @@ jobs:
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
+          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 16 language runtime \
                  --ignore=language/test_line_info.py \
 
@@ -1142,7 +1142,8 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
     bool isHopper() const;
 
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> shape,
-                                          int bitwidth, int opIdx) const;
+                                          int bitwidth, int kWidth,
+                                          int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
 
     bool supportReduction() const {
 
@@ -28,6 +28,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_DISABLE_LINE_INFO",
     "TRITON_DISABLE_RESHAPE_ENCODING_INFERENCE",
     "TRITON_ENABLE_LLVM_DEBUG",
+    "TRITON_HIP_STREAM_PREFETCH",
     "TRITON_LLVM_DEBUG_ONLY",
     "USE_IR_LOC",
     "NVPTX_ENABLE_DUMP",
 
@@ -953,7 +953,7 @@ DotOperandEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape,
   } else if (auto mma = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent)) {
     if (mma.isAmpere() || mma.isHopper()) {
       auto bitwidth = getPointeeType(eltTy).getIntOrFloatBitWidth();
-      auto rep = mma.getRepForOperand(shape, bitwidth, idx);
+      auto rep = mma.getRepForOperand(shape, bitwidth, kWidth, idx);
       auto sizePerThread = getSizePerThread();
       auto elemsPerKRep = mma.isHopper() ? (kWidth * 2) : (32 / bitwidth * 2);
       if (rank == 3)
@@ -2018,14 +2018,18 @@ NvidiaMmaEncodingAttr::getRepOrderForOperand(int opIdx) const {
 
 SmallVector<int64_t>
 NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> shape, int bitwidth,
-                                        int opIdx) const {
+                                        int kWidth, int opIdx) const {
   auto rank = shape.size();
   auto warpsPerCTA = getWarpsPerCTA();
 
   // {batch, m, n, k}
   // Hopper path never uses the n value, since this method is only invoked
   // for in-RF (dotOpEnc) operands, but WGMMA only supports in A to be in RF
-  SmallVector<int> shapePerWarp = {1, 16, 8, 4 * 64 / bitwidth};
+  // TODO: rep per operand is not accurate for Hopper. It is currently done that
+  // way to allow us to get the correct total number of elements. this will be
+  // fixed when moving to linear layout.
+  SmallVector<int> shapePerWarp = {
+      1, 16, 8, isHopper() ? 4 * 2 * kWidth : 4 * 64 / bitwidth};
   int numRepBatch =
       rank == 3
           ? std::max<int64_t>(1, shape[0] / (shapePerWarp[0] * warpsPerCTA[0]))
 
@@ -38,13 +38,7 @@ void setUseAccFlag(Operation *op, Value useAcc) {
 }
 
 bool isConstantZeroTensor(Value v) {
-  auto constOp = v.getDefiningOp<arith::ConstantOp>();
-  if (!constOp)
-    return false;
-  auto splat = mlir::dyn_cast<SplatElementsAttr>(constOp.getValue());
-  if (!splat)
-    return false;
-  return splat.getSplatValue<FloatAttr>().getValue().convertToFloat() == 0.0f;
+  return (matchPattern(v, m_Zero()) || matchPattern(v, m_AnyZeroFloat()));
 }
 
 std::optional<std::pair<Operation *, int>> findZeroInitOp(Value accUse,
 
@@ -119,13 +119,13 @@ def find_visual_studio(version_ranges):
     for version_range in version_ranges:
         command = [
             str(vswhere), "-version", version_range, "-requires", "Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
-            "-property", "installationPath", "-prerelease"
+            "-products", "*", "-property", "installationPath", "-prerelease"
         ]
 
         try:
             output = subprocess.check_output(command, text=True).strip()
             if output:
-                return output
+                return output.split("\n")[0]
         except subprocess.CalledProcessError:
             continue
 
@@ -146,6 +146,13 @@ def set_env_vars(vs_path, arch="x64"):
             os.environ[var] = value
 
 
+def initialize_visual_studio_env(version_ranges, arch="x64"):
+    vs_path = find_visual_studio(version_ranges)
+    if not vs_path:
+        raise EnvironmentError("Visual Studio not found in specified version ranges.")
+    set_env_vars(vs_path, arch)
+
+
 # Taken from https://github.com/pytorch/pytorch/blob/master/tools/setup_helpers/env.py
 def check_env_flag(name: str, default: str = "") -> bool:
     return os.getenv(name, default).upper() in ["ON", "1", "YES", "TRUE", "Y"]
@@ -447,10 +454,7 @@ def build_extension(self, ext):
         lit_dir = shutil.which('lit')
         ninja_dir = shutil.which('ninja')
         if platform.system() == "Windows":
-            vs_path = find_visual_studio(["[17.0,18.0)", "[16.0,17.0)"])
-            env = set_env_vars(vs_path)
-            if not vs_path:
-                raise EnvironmentError("Visual Studio 2019 or 2022 not found.")
+            initialize_visual_studio_env(["[17.0,18.0)", "[16.0,17.0)"])
         # lit is used by the test suite
         thirdparty_cmake_args = get_thirdparty_packages([get_llvm_package_info()])
         thirdparty_cmake_args += self.get_pybind11_cmake_args()
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-cd132db4e11fbf799a2d3ed2afea100a4afd4efd`
	`1`	`+15ef7db18b0a50101b41d9c78780d35ea7937ffc`
Original file line number	Diff line number	Diff line change
`@@ -38,13 +38,7 @@ void setUseAccFlag(Operation *op, Value useAcc) {`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`bool isConstantZeroTensor(Value v) {`
`41`		`- auto constOp = v.getDefiningOp<arith::ConstantOp>();`
`42`		`- if (!constOp)`
`43`		`- return false;`
`44`		`- auto splat = mlir::dyn_cast<SplatElementsAttr>(constOp.getValue());`
`45`		`- if (!splat)`
`46`		`- return false;`
`47`		`- return splat.getSplatValue<FloatAttr>().getValue().convertToFloat() == 0.0f;`
	`41`	`+ return (matchPattern(v, m_Zero()) \|\| matchPattern(v, m_AnyZeroFloat()));`
`48`	`42`	`}`
`49`	`43`
`50`	`44`	`std::optional<std::pair<Operation *, int>> findZeroInitOp(Value accUse,`