@@ -1567,6 +1567,133 @@ def NVVM_ElectSyncOp : NVVM_Op<"elect.sync">
15671567 }];
15681568}
15691569
1570+ //===----------------------------------------------------------------------===//
1571+ // Permute Bytes (Prmt)
1572+ //===----------------------------------------------------------------------===//
1573+
// Enum cases for the PTX `prmt` permutation modes. Each case pairs the C++
// enumerator name and its integer value with the keyword used in textual IR.
def PermuteModeDefault : I32EnumAttrCase<"DEFAULT", 0, "default">;
def PermuteModeF4E     : I32EnumAttrCase<"F4E",     1, "f4e">;
def PermuteModeB4E     : I32EnumAttrCase<"B4E",     2, "b4e">;
def PermuteModeRC8     : I32EnumAttrCase<"RC8",     3, "rc8">;
def PermuteModeECL     : I32EnumAttrCase<"ECL",     4, "ecl">;
def PermuteModeECR     : I32EnumAttrCase<"ECR",     5, "ecr">;
def PermuteModeRC16    : I32EnumAttrCase<"RC16",    6, "rc16">;
1582+
// 32-bit enum grouping all permute modes, emitted into the ::mlir::NVVM C++
// namespace. Generation of the specialized attribute is turned off here; the
// dialect attribute wrapping this enum is defined separately.
def PermuteMode : I32EnumAttr<
    "PermuteMode", "NVVM permute mode",
    [
      PermuteModeDefault,
      PermuteModeF4E,
      PermuteModeB4E,
      PermuteModeRC8,
      PermuteModeECL,
      PermuteModeECR,
      PermuteModeRC16
    ]> {
  let cppNamespace = "::mlir::NVVM";
  let genSpecializedAttr = 0;
}
1590+
// NVVM dialect attribute carrying a PermuteMode value. Its mnemonic is
// `permute_mode` and the enum keyword is written between angle brackets.
def PermuteModeAttr
    : EnumAttr<NVVM_Dialect, PermuteMode, "permute_mode"> {
  let assemblyFormat = "`<` $value `>`";
}
1594+
// `nvvm.prmt`: byte permutation over one or two 32-bit sources.
//
// Operands: `$lo` (required), `$hi` (optional), `$selector` (required), plus
// the `$mode` attribute. The textual form prints the mode first, then the
// selector, then `$lo` and, when present, `$hi`. The op is marked `Pure`.
// The verifier and `getIntrinsicIDAndArgs` are only declared here; their
// definitions live outside this file section.
def NVVM_PermuteOp : NVVM_Op<"prmt", [Pure]>,
                     Results<(outs I32:$res)>,
                     Arguments<(ins I32:$lo, Optional<I32>:$hi, I32:$selector,
                                    PermuteModeAttr:$mode)> {
  let summary = "Permute bytes from two 32-bit registers";
  let description = [{
    The `nvvm.prmt` operation builds a 32-bit result by picking bytes from
    one or two 32-bit source operands. Which bytes are picked is controlled
    by the `%selector` operand and by the `mode` attribute.

    The source bytes are numbered as follows. The first source operand
    (`%lo`) holds bytes {b3, b2, b1, b0}. For the `default`, `f4e` and `b4e`
    modes, the second source operand (`%hi`) holds bytes {b7, b6, b5, b4}.
    The remaining modes only select bytes from `%lo`.

    Modes:
    - `default`: Index mode. The low 16 bits of `%selector` are read as four
                 4-bit selection values, one per result byte. The 3 least
                 significant bits of each value pick one of the 8 source
                 bytes; the most significant bit chooses between copying the
                 byte and replicating its sign bit across the result byte.
    - `f4e`    : Forward 4 extract. Result byte `i` is source byte
                 `selector[1:0] + i`.
    - `b4e`    : Backward 4 extract. Result byte `i` is source byte
                 `(selector[1:0] - i) mod 8`.
    - `rc8`    : Replicate 8. The byte of `%lo` selected by `selector[1:0]`
                 is copied into all four result bytes.
    - `ecl`    : Edge clamp left. Result byte `i` is source byte
                 `max(i, selector[1:0])`.
    - `ecr`    : Edge clamp right. Result byte `i` is source byte
                 `min(i, selector[1:0])`.
    - `rc16`   : Replicate 16. The 16-bit half of `%lo` selected by
                 `selector[0]` is copied into both halves of the result.

    For every mode other than `default`, only the 2 least significant bits
    of `%selector` are used. The table below gives, for each value of those
    bits, the source byte numbers that form the result, listed from the most
    significant result byte to the least significant one:

    | Mode   | %selector[1:0] | Output       |
    |--------|----------------|--------------|
    | `f4e`  | 0              | {3, 2, 1, 0} |
    | `f4e`  | 1              | {4, 3, 2, 1} |
    | `f4e`  | 2              | {5, 4, 3, 2} |
    | `f4e`  | 3              | {6, 5, 4, 3} |
    | `b4e`  | 0              | {5, 6, 7, 0} |
    | `b4e`  | 1              | {6, 7, 0, 1} |
    | `b4e`  | 2              | {7, 0, 1, 2} |
    | `b4e`  | 3              | {0, 1, 2, 3} |
    | `rc8`  | 0              | {0, 0, 0, 0} |
    | `rc8`  | 1              | {1, 1, 1, 1} |
    | `rc8`  | 2              | {2, 2, 2, 2} |
    | `rc8`  | 3              | {3, 3, 3, 3} |
    | `ecl`  | 0              | {3, 2, 1, 0} |
    | `ecl`  | 1              | {3, 2, 1, 1} |
    | `ecl`  | 2              | {3, 2, 2, 2} |
    | `ecl`  | 3              | {3, 3, 3, 3} |
    | `ecr`  | 0              | {0, 0, 0, 0} |
    | `ecr`  | 1              | {1, 1, 1, 0} |
    | `ecr`  | 2              | {2, 2, 1, 0} |
    | `ecr`  | 3              | {3, 2, 1, 0} |
    | `rc16` | 0              | {1, 0, 1, 0} |
    | `rc16` | 1              | {3, 2, 3, 2} |
    | `rc16` | 2              | {1, 0, 1, 0} |
    | `rc16` | 3              | {3, 2, 3, 2} |

    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prmt)
  }];

  let assemblyFormat = [{
    $mode $selector `,` $lo (`,` $hi^)? attr-dict `:` type($res)
  }];

  let hasVerifier = 1;

  let extraClassDeclaration = [{
    static mlir::NVVM::IDArgPair
    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
                          llvm::IRBuilderBase &builder);
  }];

  // Translation to LLVM IR: the helper returns the intrinsic ID together
  // with its argument list, and the intrinsic call's value becomes `$res`.
  string llvmBuilder = [{
    auto [id, args] = NVVM::PermuteOp::getIntrinsicIDAndArgs(
        *op, moduleTranslation, builder);
    $res = createIntrinsicCall(builder, id, args);
  }];
}
1696+
15701697def LoadCacheModifierCA : I32EnumAttrCase<"CA", 0, "ca">;
15711698def LoadCacheModifierCG : I32EnumAttrCase<"CG", 1, "cg">;
15721699def LoadCacheModifierCS : I32EnumAttrCase<"CS", 2, "cs">;
0 commit comments