Commit 023102d

Merge commit '8c97e9fbde705a0a9938852375cc45fda7dbd768'
2 parents 31623f7 + 8c97e9f commit 023102d

32 files changed: +65 / -446 lines changed

include/triton/Analysis/Membar.h

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ class OpBuilder;

 /// Callback to allow backend to provide more information on whether a barrier
 /// is needed between two operations. Even though two operations access the same
-/// shared memory thay may not require a barrier in between them.
+/// shared memory they may not require a barrier in between them.
 using MembarFilterFn = std::function<bool(Operation *, Operation *)>;

 struct BlockInfo {
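For context, MembarFilterFn is the hook a backend uses to suppress barriers the analysis would otherwise insert. A minimal sketch of such a filter, assuming the convention that returning true means "no barrier needed between this pair", and using a hypothetical op name purely for illustration:

#include "mlir/IR/Operation.h"

// Sketch only: `MyAsyncCopyOp` is a made-up op name and the policy below is
// invented; a real backend would encode its own knowledge of which pairs of
// shared-memory accesses are already ordered by other means.
MembarFilterFn skipBarrierForAsyncCopies =
    [](mlir::Operation *before, mlir::Operation *after) {
      return mlir::isa<MyAsyncCopyOp>(before) &&
             mlir::isa<MyAsyncCopyOp>(after);
    };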

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 368 deletions
Large diffs are not rendered by default.

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion
@@ -854,7 +854,7 @@ def TT_ElementwiseInlineAsmOp : TT_Op<"elementwise_inline_asm", [
 // Histogram Op
 //
 def TT_HistogramOp : TT_Op<"histogram", [Pure]> {
-  let summary = "return a histgram of the inputs.";
+  let summary = "return a histogram of the inputs.";
   let description = [{
     Return the histogram of the input tensor. The number of bins is equal to
     the dimension of the output tensor. Each bins has a width of 1 and bins
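As a rough scalar illustration of the semantics quoted in the description above (one bin per output element, each bin of width 1), and not the actual lowering of tt.histogram:

#include <cstdint>
#include <vector>

// Minimal sketch, assuming bin i simply counts inputs equal to i.
std::vector<int32_t> histogram(const std::vector<int32_t> &input, int numBins) {
  std::vector<int32_t> bins(numBins, 0);
  for (int32_t v : input)
    if (v >= 0 && v < numBins)
      ++bins[v];
  return bins;
}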

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 1 deletion
@@ -950,7 +950,7 @@ available on AMD Radeon GPUs of RDNA architectures.
 - 1: gfx11
 - 2: gfx12
 - A `warpsPerCTA` parameter characterizes data distribution between warps.
-An important limitation of WMMA for layout is a shape for tiles proccessed
+An important limitation of WMMA for layout is a shape for tiles processed
 by a single warp. It is [16, 16].
 This encoding assumes specific access to matrix elements by threads.

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
   let description = [{
     This operation copies data from global memory to local memory asynchronously.
     This is analogue to tt.load except the data are copied to local memory pointed
-    by by the memory descriptor instread of a distributed tensor. The rest of the
+    by by the memory descriptor instead of a distributed tensor. The rest of the
     operands are the same as tt.load.
   }];

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 2 additions & 2 deletions
@@ -202,7 +202,7 @@ def TritonGPUReduceDataDuplication: Pass<"tritongpu-reduce-data-duplication", "m
 def TritonGPUCombineTensorSelectAndIf: Pass<"tritongpu-combine-tensor-select-and-if", "mlir::ModuleOp"> {
   let summary = "Combine tensor select and if";

-  let description = "For select instruction that uses the same condidtion as the if instruction in the same block "
+  let description = "For select instruction that uses the same condition as the if instruction in the same block "
                     "this pass combines the select into the if instruction, making the select operands returned by the "
                     "then/else yields.";
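A loose C++ analogue of the rewrite this pass description refers to; the real pass works on arith.select and scf.if in MLIR, so the snippet below only mirrors the idea, and all names are invented:

// Before: a select and an if in the same block use the same condition.
int beforeRewrite(bool cond, int a, int b) {
  int v = cond ? a : b;                       // select
  if (cond) { /* then-side work */ } else { /* else-side work */ }
  return v;
}

// After: the select's operands are returned by the then/else branches,
// so only the if remains.
int afterRewrite(bool cond, int a, int b) {
  int v;
  if (cond) { /* then-side work */ v = a; }
  else      { /* else-side work */ v = b; }
  return v;
}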

@@ -211,7 +211,7 @@ def TritonGPUCombineTensorSelectAndIf: Pass<"tritongpu-combine-tensor-select-and
 }

 def TritonGPUOptimizeAccumulatorInit: Pass<"tritongpu-optimize-accumulator-init", "mlir::ModuleOp"> {
-  let summary = "Replace accumulater zero-initialization with the flag indicating first use of the accumulator";
+  let summary = "Replace accumulator zero-initialization with the flag indicating first use of the accumulator";

   let description = "For the dot operations that support accumulator-use flag this pass replaces the zero-initialization "
                     "of the accumulator with the flag indicating the first use of the accumulator.";

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 3 additions & 3 deletions
@@ -136,7 +136,7 @@ def TTNG_InvalBarrierOp : TTNG_Op<"inval_barrier", [DeclareOpInterfaceMethods<Me

   let description = [{
     Invalidate a barrier allocation so that it can be re-used. According to PTX
-    spec this has to be done before any re-use of the memory used by mbarrier.
+    spec this has to be done before any reuse of the memory used by mbarrier.

     https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval
   }];
@@ -213,7 +213,7 @@ def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local",
   let description = [{
     This operation copies data from global memory to local memory
     asynchronously. This is analogue to tt.load except the data are copied to
-    local memory pointed by the memory descriptor instread of a distributed
+    local memory pointed by the memory descriptor instead of a distributed
     tensor. The data copied depends on the global memory descriptor pointed to
     by `desc_ptr`.
   }];
@@ -243,7 +243,7 @@ def TTNG_AsyncTMACopyLocalToGlobalOp : TTNG_Op<"async_tma_copy_local_to_global",
   let description = [{
     This operation copies data from local memory to global memory
     asynchronously. This is analogue to tt.store except the data are copied from
-    local memory pointed by the memory descriptor instread of a distributed
+    local memory pointed by the memory descriptor instead of a distributed
     tensor. The data copied depends on the global memory descriptor pointed to
     by `desc_ptr`.
   }];

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp

Lines changed: 2 additions & 5 deletions
@@ -49,9 +49,6 @@ SmallVector<Value> swizzleIndices(ConversionPatternRewriter &rewriter,

   auto fastIdx = rawIndices[order[0]];
   auto secondIdx = rawIndices[order[1]];
-  // Original algorithm taken from getSwizzledSharedPtrs function
-  // (TritonGPUToLLVMBase.h)
-  //
   // phase = (secondIdx // perPhase) % maxPhase
   // swizzledGroup = ((fastIdx // vec) ^ phase) * vec
   // groupRemainder = fastIdx % vec
@@ -158,7 +155,7 @@ Value computeSwizzledOffset(ConversionPatternRewriter &rewriter, Location loc,
                             ArrayRef<int64_t> opTensorShape,
                             ArrayRef<Value> strides) {
   Value offset = i32_val(0);
-  // Compute unswizzled multi dim coordinates in shared memmory object
+  // Compute unswizzled multi dim coordinates in shared memory object
   SmallVector<Value> elemMultiDimIndices(3);
   elemMultiDimIndices[dim.batch] =
       add(bTileOffset, i32_val(i.bTile * shapePerCTABTile + i.b));
@@ -309,7 +306,7 @@ Value loadFMAOp(Value srcVal, Value llVal, BlockedEncodingAttr dLayout,
                 sizeNonKPerThread);

   // In swizzled memory case basePtr stores pointer to the beginning of shared
-  // memmory object.
+  // memory object.
   //
   // If memory is not swizzled, algorithm breaks element offset pointer into
   // constant and non-constant part. Non-constant (depends on thread id) part is
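The swizzleIndices comment in the first hunk above states the swizzle arithmetic in pseudocode (the // there reads as integer division). A standalone sketch that makes the formula concrete; the parameter values are made up for illustration, not taken from a real layout:

#include <cstdio>

// Mirrors the commented formula:
//   phase          = (secondIdx / perPhase) % maxPhase
//   swizzledGroup  = ((fastIdx / vec) ^ phase) * vec
//   groupRemainder = fastIdx % vec
int swizzleFastIdx(int fastIdx, int secondIdx, int vec, int perPhase, int maxPhase) {
  int phase = (secondIdx / perPhase) % maxPhase;
  int swizzledGroup = ((fastIdx / vec) ^ phase) * vec;
  int groupRemainder = fastIdx % vec;
  return swizzledGroup + groupRemainder;
}

int main() {
  // With illustrative vec = 4, perPhase = 2, maxPhase = 8:
  // phase = (3/2)%8 = 1, swizzledGroup = ((5/4)^1)*4 = 0, remainder = 5%4 = 1 -> 1
  printf("%d\n", swizzleFastIdx(/*fastIdx=*/5, /*secondIdx=*/3, 4, 2, 8));
}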

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ using namespace mlir::triton;
 // NOTE: [Additional Function Arguments]
 // To support use of shared memory and global scratch memory inside of a
 // function, the caller allocates a single large block of the relevant memory
-// and calls the funciton with these extra arguments at the end.
+// and calls the function with these extra arguments at the end.
 // Specifically, the last argument is the global scratch memory allocation and
 // the second to last is the shared memory allocation.
 //
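A hedged illustration of the calling convention the NOTE describes; all names below are invented, and only the ordering of the trailing arguments (shared memory second to last, global scratch last) comes from the comment:

// Illustration only, not the real generated LLVM signature.
void kernelHelper(float *in, int n,
                  char *sharedMemBase,        // second to last: shared memory block
                  char *globalScratchBase) {  // last: global scratch allocation
  // ... the body would carve its workspaces out of these two base pointers ...
}

void kernelEntry(float *in, int n, char *sharedMemBase, char *globalScratchBase) {
  // The caller owns one large allocation of each kind and forwards it as the
  // two extra arguments appended at the end of the call.
  kernelHelper(in, n, sharedMemBase, globalScratchBase);
}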

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -127,7 +127,7 @@ struct ReduceOpConversion
         emitOffsetForLayout(helper.getSrcLayout(), operandType);

     // Thread X might hold the same input value in two registers. Get the
-    // indices in `offsets` that hold unique values, and only accumualte over
+    // indices in `offsets` that hold unique values, and only accumulate over
     // those.
     llvm::MapVector<ArrayRef<unsigned>, int> uniqueOffsets;
     for (int i = 0; i < offsets.size(); ++i) {
@@ -221,7 +221,7 @@ struct ReduceOpConversion
   // For slice layout some ids are duplicated on multiple lanes, so we need to
   // handle the delinearization of laneId in a special way. We need to
   // generalize this part of the logic to work on any kind of linear layout
-  // uniformely.
+  // uniformly.
   SmallVector<Value>
   getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc,
                     ConversionPatternRewriter &rewriter) const {
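A small standalone sketch of the deduplication mentioned in the first hunk above, using std::map in place of llvm::MapVector and plain vectors for the offsets; it only mirrors the intent of keeping one register index per distinct per-thread offset:

#include <map>
#include <vector>

// Keep only the first register index seen for each distinct offset, so a
// value that a thread holds in two registers is accumulated exactly once.
std::map<std::vector<unsigned>, int>
uniqueOffsetIndices(const std::vector<std::vector<unsigned>> &offsets) {
  std::map<std::vector<unsigned>, int> unique;
  for (int i = 0; i < (int)offsets.size(); ++i)
    unique.insert({offsets[i], i}); // insert keeps the first occurrence
  return unique;
}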
