@@ -33,20 +33,69 @@ class FeaturePTX<int version>:
3333 SubtargetFeature<"ptx"# version, "PTXVersion",
3434 "" # version,
3535 "Use PTX version " # version>;
36-
36+ // NVPTX Architecture Hierarchy and Ordering:
37+ //
38+ // GPU architectures: sm_2Y/sm_3Y/sm_5Y/sm_6Y/sm_7Y/sm_8Y/sm_9Y/sm_10Y/sm_12Y
39+ // ('Y' represents version within the architecture)
40+ // The architectures have names of the form sm_XYz, where 'X' represents the
41+ // generation number, 'Y' represents the version within the architecture, and
42+ // 'z' represents the optional feature suffix.
43+ // If X1Y1 <= X2Y2, then GPU capabilities of sm_X1Y1 are included in sm_X2Y2.
44+ // For example, take sm_90 (9 represents 'X', 0 represents 'Y', and no feature
45+ // suffix) and sm_103 architectures (10 represents 'X', 3 represents 'Y', and no
46+ // feature suffix). Since 90 <= 103, sm_90 is compatible with sm_103.
47+ //
48+ // The family-specific variants have the 'f' feature suffix and they follow
49+ // the following order:
50+ // sm_X{Y2}f > sm_X{Y1}f iff Y2 > Y1
51+ // sm_XY{f} > sm_{XY}{}
52+ //
53+ // For example, take sm_100f (10 represents 'X', 0 represents 'Y', and 'f'
54+ // represents 'z') and sm_103f (10 represents 'X', 3 represents 'Y', and 'f'
55+ // represents 'z') architecture variants. Since Y1 < Y2, sm_100f is compatible with
56+ // sm_103f. Similarly based on the second rule, sm_90 is compatible with sm_103f.
57+ //
58+ // As a counterexample, take sm_100f and sm_120f (12 represents 'X', 0
59+ // represents 'Y', and 'f' represents 'z') architecture variants. Since they
60+ // belong to different families, i.e. X1 != X2, sm_100f is not compatible with
61+ // sm_120f.
62+ //
63+ // The architecture-specific variants have the 'a' feature suffix and they
64+ // follow the following order:
65+ // sm_XY{a} > sm_XY{f} > sm_{XY}{}
66+ //
67+ // For example, take sm_103a (10 represents 'X', 3 represents 'Y', and 'a'
68+ // represents 'z'), sm_103f, and sm_103 architecture variants. sm_103 is
69+ // compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
70+ //
71+ // Encoding := Arch * 10 + 2 (for 'f' and 'a') + 1 (for 'a' only)
72+ // Arch := X * 10 + Y
73+ //
74+ // For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is
75+ // encoded as 1032 (103 * 10 + 2).
76+ //
77+ // This encoding allows simple partial ordering of the architectures.
78+ // + Compare Family and Arch by dividing FullSMVersion by 100 and 10
79+ // respectively before the comparison.
80+ // + Compare within the family by comparing FullSMVersion, given both belong to
81+ // the same family.
82+ // + Detect 'a' variants by checking FullSMVersion & 1.
3783foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
3884 60, 61, 62, 70, 72, 75, 80, 86, 87,
39- 89, 90, 100, 101, 103, 120, 121] in
40- def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
85+ 89, 90, 100, 101, 103, 120, 121] in {
86+ // Base SM version (e.g. FullSMVersion for sm_100 is 1000)
87+ def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
88+
89+ // Family-specific targets which are compatible within the same family
90+ // (e.g. FullSMVersion for sm_100f is 1002)
91+ if !ge(sm, 100) then
92+ def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
4193
42- // Arch-specific targets. PTX for these is not compatible with any other
43- // architectures.
44- def SM90a : FeatureSM<"90a", 901>;
45- def SM100a: FeatureSM<"100a", 1001>;
46- def SM101a: FeatureSM<"101a", 1011>;
47- def SM103a: FeatureSM<"103a", 1031>;
48- def SM120a: FeatureSM<"120a", 1201>;
49- def SM121a: FeatureSM<"121a", 1211>;
94+ // Architecture-specific targets which are incompatible across architectures
95+ // (e.g. FullSMVersion for sm_100a is 1003)
96+ if !ge(sm, 90) then
97+ def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
98+ }
5099
51100foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
52101 70, 71, 72, 73, 74, 75, 76, 77, 78,
@@ -83,14 +132,19 @@ def : Proc<"sm_90", [SM90, PTX78]>;
83132def : Proc<"sm_90a", [SM90a, PTX80]>;
84133def : Proc<"sm_100", [SM100, PTX86]>;
85134def : Proc<"sm_100a", [SM100a, PTX86]>;
135+ def : Proc<"sm_100f", [SM100f, PTX88]>;
86136def : Proc<"sm_101", [SM101, PTX86]>;
87137def : Proc<"sm_101a", [SM101a, PTX86]>;
138+ def : Proc<"sm_101f", [SM101f, PTX88]>;
88139def : Proc<"sm_103", [SM103, PTX88]>;
89140def : Proc<"sm_103a", [SM103a, PTX88]>;
141+ def : Proc<"sm_103f", [SM103f, PTX88]>;
90142def : Proc<"sm_120", [SM120, PTX87]>;
91143def : Proc<"sm_120a", [SM120a, PTX87]>;
144+ def : Proc<"sm_120f", [SM120f, PTX88]>;
92145def : Proc<"sm_121", [SM121, PTX88]>;
93146def : Proc<"sm_121a", [SM121a, PTX88]>;
147+ def : Proc<"sm_121f", [SM121f, PTX88]>;
94148
95149def NVPTXInstrInfo : InstrInfo {
96150}
0 commit comments