diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index abd7ca5453645..11017fe4e01b4 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -147,7 +147,57 @@ Example: 32-bit PTX for CUDA Driver API: ``nvptx-nvidia-cuda`` Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda`` - +.. _nvptx_arch_hierarchy: + +NVPTX Architecture Hierarchy and Ordering +========================================= + +GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y +('Y' represents version within the architecture) +The architectures have names of the form ``sm_XYz`` where ``X`` represents the +generation number, ``Y`` represents the version within the architecture, and +``z`` represents the optional feature suffix. +If ``X1Y1 <= X2Y2``, then GPU capabilities of ``sm_X1Y1`` are included in ``sm_X2Y2``. +For example, take ``sm_90`` (9 represents ``X``, 0 represents ``Y``, and no feature +suffix) and ``sm_103`` architectures (10 represents ``X``, 3 represents ``Y``, and no +feature suffix). Since 90 <= 103, ``sm_90`` is compatible with ``sm_103``. + +The family-specific variants have the ``f`` feature suffix and they follow the +following order: +``sm_X{Y2}f > sm_X{Y1}f`` iff ``Y2 > Y1`` +``sm_XY{f} > sm_{XY}{}`` + +For example, take ``sm_100f`` (10 represents ``X``, 0 represents ``Y``, and ``f`` +represents ``z``) and ``sm_103f`` (10 represents ``X``, 3 represents ``Y``, and ``f`` +represents ``z``) architecture variants. Since ``Y1 < Y2``, ``sm_100f`` is compatible with +``sm_103f``. Similarly, based on the second rule, ``sm_90`` is compatible with ``sm_103f``. + +As a counter example, take ``sm_100f`` and ``sm_120f`` (12 represents ``X``, 0 +represents ``Y``, and ``f`` represents ``z``) architecture variants. Since both +belong to different families, i.e. ``X1 != X2``, ``sm_100f`` is not compatible with +``sm_120f``. 
+ +The architecture-specific variants have the ``a`` feature suffix and they follow the +following order: +``sm_XY{a} > sm_XY{f} > sm_{XY}{}`` + +For example, take ``sm_103a`` (10 represents ``X``, 3 represents ``Y``, and ``a`` +represents ``z``), ``sm_103f``, and ``sm_103`` architecture variants. The ``sm_103`` is +compatible with ``sm_103a`` and ``sm_103f``, and ``sm_103f`` is compatible with ``sm_103a``. + +Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a') +Arch := X * 10 + Y + +For example, ``sm_103f`` is encoded as 1032 (103 * 10 + 2) and ``sm_103a`` is +encoded as 1033 (103 * 10 + 2 + 1). + +This encoding allows simple partial ordering of the architectures. + +* Compare Family and Arch by dividing FullSMVersion by 100 and 10 + respectively before the comparison. +* Compare within the family by comparing FullSMVersion, given both belong to + the same family. +* Detect ``a`` variants by checking FullSMVersion & 1. .. _nvptx_intrinsics: diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index ff9a187ecf723..83992606bc419 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -33,20 +33,69 @@ class FeaturePTX: SubtargetFeature<"ptx"# version, "PTXVersion", "" # version, "Use PTX version " # version>; - +// NVPTX Architecture Hierarchy and Ordering: +// +// GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y +// ('Y' represents version within the architecture) +// The architectures have names of the form sm_XYz where 'X' represents the +// generation number, 'Y' represents the version within the architecture, and +// 'z' represents the optional feature suffix. +// If X1Y1 <= X2Y2, then GPU capabilities of sm_X1Y1 are included in sm_X2Y2. +// For example, take sm_90 (9 represents 'X', 0 represents 'Y', and no feature +// suffix) and sm_103 architectures (10 represents 'X', 3 represents 'Y', and no +// feature suffix). Since 90 <= 103, sm_90 is compatible with sm_103. 
+// +// The family-specific variants have the 'f' feature suffix and they follow the +// following order: +// sm_X{Y2}f > sm_X{Y1}f iff Y2 > Y1 +// sm_XY{f} > sm_{XY}{} +// +// For example, take sm_100f (10 represents 'X', 0 represents 'Y', and 'f' +// represents 'z') and sm_103f (10 represents 'X', 3 represents 'Y', and 'f' +// represents 'z') architecture variants. Since Y1 < Y2, sm_100f is compatible with +// sm_103f. Similarly, based on the second rule, sm_90 is compatible with sm_103f. +// +// As a counter example, take sm_100f and sm_120f (12 represents 'X', 0 +// represents 'Y', and 'f' represents 'z') architecture variants. Since both +// belong to different families, i.e. X1 != X2, sm_100f is not compatible with +// sm_120f. +// +// The architecture-specific variants have the 'a' feature suffix and they follow the +// following order: +// sm_XY{a} > sm_XY{f} > sm_{XY}{} +// +// For example, take sm_103a (10 represents 'X', 3 represents 'Y', and 'a' +// represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is +// compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a. +// +// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a') +// Arch := X * 10 + Y +// +// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is +// encoded as 1032 (103 * 10 + 2). +// +// This encoding allows simple partial ordering of the architectures. +// + Compare Family and Arch by dividing FullSMVersion by 100 and 10 +// respectively before the comparison. +// + Compare within the family by comparing FullSMVersion, given both belong to +// the same family. +// + Detect 'a' variants by checking FullSMVersion & 1. foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80, 86, 87, - 89, 90, 100, 101, 103, 120, 121] in - def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>; + 89, 90, 100, 101, 103, 120, 121] in { + // Base SM version (e.g. 
FullSMVersion for sm_100 is 1000) + def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>; + + // Family-specific targets which are compatible within same family + // (e.g. FullSMVersion for sm_100f is 1002) + if !ge(sm, 100) then + def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>; -// Arch-specific targets. PTX for these is not compatible with any other -// architectures. -def SM90a : FeatureSM<"90a", 901>; -def SM100a: FeatureSM<"100a", 1001>; -def SM101a: FeatureSM<"101a", 1011>; -def SM103a: FeatureSM<"103a", 1031>; -def SM120a: FeatureSM<"120a", 1201>; -def SM121a: FeatureSM<"121a", 1211>; + // Architecture-specific targets which are incompatible across architectures + // (e.g. FullSMVersion for sm_100a is 1003) + if !ge(sm, 90) then + def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>; +} foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72, 73, 74, 75, 76, 77, 78, @@ -83,14 +132,19 @@ def : Proc<"sm_90", [SM90, PTX78]>; def : Proc<"sm_90a", [SM90a, PTX80]>; def : Proc<"sm_100", [SM100, PTX86]>; def : Proc<"sm_100a", [SM100a, PTX86]>; +def : Proc<"sm_100f", [SM100f, PTX88]>; def : Proc<"sm_101", [SM101, PTX86]>; def : Proc<"sm_101a", [SM101a, PTX86]>; +def : Proc<"sm_101f", [SM101f, PTX88]>; def : Proc<"sm_103", [SM103, PTX88]>; def : Proc<"sm_103a", [SM103a, PTX88]>; +def : Proc<"sm_103f", [SM103f, PTX88]>; def : Proc<"sm_120", [SM120, PTX87]>; def : Proc<"sm_120a", [SM120a, PTX87]>; +def : Proc<"sm_120f", [SM120f, PTX88]>; def : Proc<"sm_121", [SM121, PTX88]>; def : Proc<"sm_121a", [SM121a, PTX88]>; +def : Proc<"sm_121f", [SM121f, PTX88]>; def NVPTXInstrInfo : InstrInfo { } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 5dbdce52f0553..bbe99dec5c445 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -158,10 +158,10 @@ class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version> class hasSM: 
Predicate<"Subtarget->getSmVersion() >= " # version>; // Explicit records for arch-accelerated SM versions -def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; -def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">; -def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">; -def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">; +def hasSM90a : Predicate<"Subtarget->getSmVersion() == 90 && Subtarget->hasArchAccelFeatures()">; +def hasSM100a : Predicate<"Subtarget->getSmVersion() == 100 && Subtarget->hasArchAccelFeatures()">; +def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArchAccelFeatures()">; +def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">; // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index d2eae48826829..8810feaee297a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -108,8 +108,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { switch (FullSmVersion) { default: break; - case 1001: // sm_100a - case 1011: // sm_101a + case 1003: // sm_100a + case 1013: // sm_101a HasTcgen05 = true; break; } @@ -120,9 +120,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { // TODO: Update/tidy-up after the family-conditional support arrives - return ((FullSmVersion == 1001 || FullSmVersion == 1011) && - PTXVersion >= 86) || - (FullSmVersion == 1031 && PTXVersion >= 88); + switch (FullSmVersion) { + case 1003: + case 1013: + return PTXVersion >= 86; + case 1033: + return PTXVersion >= 88; + default: + return false; + } } // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction @@ -136,14 +142,24 @@ class 
NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } - // GPUs with "a" suffix have include architecture-accelerated features that - // are supported on the specified architecture only, hence such targets do not - // follow the onion layer model. hasArchAccelFeatures() allows - // distinguishing such GPU variants from the base GPU architecture. - // - 0 represents base GPU model, - // - non-zero value identifies particular architecture-accelerated variant. - bool hasArchAccelFeatures() const { return getFullSmVersion() % 10; } - + // GPUs with "a" suffix have architecture-accelerated features that are + // supported on the specified architecture only, hence such targets do not + // follow the onion layer model. hasArchAccelFeatures() allows distinguishing + // such GPU variants from the base GPU architecture. + // - false represents non-accelerated architecture. + // - true represents architecture-accelerated variant. + bool hasArchAccelFeatures() const { + return (getFullSmVersion() & 1) && PTXVersion >= 80; + } + // GPUs with 'f' suffix have architecture-accelerated features which are + // portable across all future architectures under same SM major. For example, + // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. + // - false represents non-family-specific architecture. + // - true represents family-specific variant. + bool hasFamilySpecificFeatures() const { + return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 + : hasArchAccelFeatures(); + } // If the user did not provide a target we default to the `sm_30` target. std::string getTargetName() const { return TargetName.empty() ? 
"sm_30" : TargetName; diff --git a/llvm/test/CodeGen/NVPTX/sm-version.ll b/llvm/test/CodeGen/NVPTX/sm-version.ll index 9705a2f3ba730..3a154a1b9ac9c 100644 --- a/llvm/test/CodeGen/NVPTX/sm-version.ll +++ b/llvm/test/CodeGen/NVPTX/sm-version.ll @@ -18,14 +18,19 @@ ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100 | FileCheck %s --check-prefix=SM100 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101 | FileCheck %s --check-prefix=SM101 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103 | FileCheck %s --check-prefix=SM103 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120 | FileCheck %s --check-prefix=SM120 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121 | FileCheck %s --check-prefix=SM121 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a +; RUN: llc < %s -mtriple=nvptx -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_21 | FileCheck %s --check-prefix=SM21 @@ -47,14 +52,19 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a | FileCheck %s --check-prefix=SM90a ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=SM100 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a | FileCheck %s --check-prefix=SM100a +; 
RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f | FileCheck %s --check-prefix=SM100f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101 | FileCheck %s --check-prefix=SM101 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a | FileCheck %s --check-prefix=SM101a +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101f | FileCheck %s --check-prefix=SM101f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103 | FileCheck %s --check-prefix=SM103 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a | FileCheck %s --check-prefix=SM103a +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103f | FileCheck %s --check-prefix=SM103f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120 | FileCheck %s --check-prefix=SM120 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a | FileCheck %s --check-prefix=SM120a +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120f | FileCheck %s --check-prefix=SM120f ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121 | FileCheck %s --check-prefix=SM121 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121a | FileCheck %s --check-prefix=SM121a +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_121f | FileCheck %s --check-prefix=SM121f ; SM20: .version 3.2 ; SM21: .version 3.2 @@ -76,14 +86,19 @@ ; SM90a: .version 8.0 ; SM100: .version 8.6 ; SM100a: .version 8.6 +; SM100f: .version 8.8 ; SM101: .version 8.6 ; SM101a: .version 8.6 +; SM101f: .version 8.8 ; SM103: .version 8.8 ; SM103a: .version 8.8 +; SM103f: .version 8.8 ; SM120: .version 8.7 ; SM120a: .version 8.7 +; SM120f: .version 8.8 ; SM121: .version 8.8 ; SM121a: .version 8.8 +; SM121f: .version 8.8 ; SM20: .target sm_20 ; SM21: .target sm_21 @@ -105,11 +120,16 @@ ; SM90a: .target sm_90a ; SM100: .target sm_100 ; SM100a: .target sm_100a +; SM100f: .target sm_100f ; SM101: .target sm_101 ; SM101a: .target sm_101a +; SM101f: .target sm_101f ; SM103: .target sm_103 ; SM103a: .target sm_103a +; SM103f: .target sm_103f ; SM120: .target sm_120 ; SM120a: .target sm_120a +; SM120f: .target sm_120f ; SM121: .target sm_121 ; SM121a: .target sm_121a +; SM121f: .target sm_121f