@@ -96,7 +96,7 @@ llvm::cl::opt<bool>
9696 llvm::cl::desc (" force use of wmma operations for tensorcore" ),
9797 llvm::cl::init(false ));
9898
99- // / Flag used to toggle using mma.sync vs wmma when targetting tensorcore.
99+ /// Flag used to toggle using mma.sync vs wmma when targeting tensorcore.
100100llvm::cl::opt<bool >
101101 clGPUUseMMASync (" iree-codegen-llvmgpu-use-mma-sync" ,
102102 llvm::cl::desc (" force use mma sync instead of wmma ops" ),
@@ -160,7 +160,7 @@ static bool needsLoweringConfigPropagation(
160160static SmallVector<TileWorkgroupSizePair>
161161getMatmulConfig (IREE::GPU::TargetAttr target) {
162162 SmallVector<TileWorkgroupSizePair> tileSizes;
163- // Pick tile size so that M*K and K*N dividible by wgSize * \*vecSize=*\4.
163+ // Pick tile size so that M*K and K*N divisible by wgSize * /*vecSize=*/4.
164164 // This way workgroup memory copy don't need to be masked. Once we support
165165 // masked load we can get performance out of more configuration.
166166
@@ -189,7 +189,7 @@ getTensorCoreConfig(SmallVectorImpl<TileWorkgroupSizePair> &tileSizes,
189189 Type elementType, int64_t M, int64_t N, int64_t K) {
190190 // Based on early analysis we found that 128x256x32_3 gives acceptable
191191 // performance across many of the large matrix sizes for f16 and fp32. This
192- // needs to be refined into a better strategy based on empircal data but this
192+ // needs to be refined into a better strategy based on empirical data but this
193193 // gives us a quick solution to achieve performance in the right order of
194194 // magnitude for large square like cases.
195195 int64_t parallelDim = M * N;
@@ -1164,7 +1164,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig(
11641164 // The subgroup distribution in attention is controlled by the second matmul
11651165 // (Parallel dimension distribution is usually (almost always) controlled by
11661166 // the last reduction operation in a dispatch). Since VectorDistribution
1167- // doesn't have logic to set subgroup and thread layouts seperately , we
1167+ // doesn't have logic to set subgroup and thread layouts separately, we
11681168 // explicitly set the subgroup count for the first matmul as well,
11691169 // corresponding to what the second matmul dictates.
11701170
@@ -1624,7 +1624,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
16241624
16251625 // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
16261626 // pipeline, similar to matvec. Note: Because of reassociation in the vector
1627- // reduction pipeline, this may lead to precission loss. If this ever becomes
1627+ // reduction pipeline, this may lead to precision loss. If this ever becomes
16281628 // an issue, we can hide this behind a flag.
16291629 if (llvm::all_equal ({contractionDims->m .size (), contractionDims->n .size (),
16301630 contractionDims->k .size (), size_t {1 }}) &&
@@ -2391,7 +2391,7 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
23912391
23922392 // Workgroup size contains 8 warps. Configured with 8 threads on fastest
23932393 // moving dimension so each thread can execute a vectorized copy of 4
2394- // contigious elements at a time from the 32 block.
2394+ // contiguous elements at a time from the 32 block.
23952395 std::array<int64_t , 3 > workgroupSize = {8 , 32 , 1 };
23962396
23972397 return setOpConfigAndEntryPointFnTranslation (
@@ -2470,7 +2470,7 @@ static LogicalResult setArgmaxUkernelConfig(
24702470}
24712471
24722472// / Decides the tiling and distribution parameters for one convolution
2473- // / dimension. Returns true if we can succesfully deduce.
2473+ /// dimension. Returns true if we can successfully deduce.
24742474// /
24752475// / - `inputDim` is the size of the dimension to be distributed.
24762476// / - `residualThreads` is the remaining threads we can distribute.
@@ -2512,7 +2512,7 @@ static bool distributeToOneDim(const int64_t inputDim,
25122512
25132513// / Decides the tiling and distribution parameters for two convolution window
25142514// / dimensions to two workgroup dimensions as a square. Returns true if we can
2515- // / succesfully deduce.
2515+ /// successfully deduce.
25162516static bool distributeToSquare (const int64_t oh, const int64_t ow,
25172517 int64_t &residualThreads,
25182518 int64_t &residualTilingFactor,
0 commit comments