diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
new file mode 100644
index 00000000..30df534f
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -0,0 +1,315 @@
+#--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS]; // one flag per thread; main() only lets a thread take part when its flag is non-zero
+};
+
+StructuredBuffer<half4> In  : register(t0); // per-thread input vectors
+RWStructuredBuffer<half> Out1 : register(u1);  // test scalar (v.x)
+RWStructuredBuffer<half2> Out2 : register(u2); // test half2 (v.xy)
+RWStructuredBuffer<half4> Out3 : register(u3); // test half3 (v.xyz written into a half4 element)
+RWStructuredBuffer<half4> Out4 : register(u4); // test half4 (v)
+RWStructuredBuffer<half4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6); // indexed by mask number in main()
+
+
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            // In and Out* share one element index; ValueSetOffset already includes the ValueSet factor.
+            const uint Idx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x;
+            half4 v = In[Idx];
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask call WaveActiveMax
+                Out1[Idx] = WaveActiveMax( v.x );
+                Out2[Idx].xy = WaveActiveMax( v.xy );
+                Out3[Idx].xyz = WaveActiveMax( v.xyz );
+                Out4[Idx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case: every thread passes the same literal vector
+    Out5[0] = WaveActiveMax(half4(1,2,3,4));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    0x3200, 0x3400, 0x3600, 0x3800,
+    0x3900, 0x3A00, 0x3B00, 0x3BC0,
+    0x2200, 0x2400, 0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    0x2C00, 0x2E00, 0x3000, 0x3200,
+    0x3400, 0x3600, 0x3800, 0x3900,
+    0x3A00, 0x3B00, 0x3BC0, 0x2000,
+    0x2400, 0x2800, 0x2A00, 0x2C00,
+    0x2E00, 0x3000, 0x3200, 0x3400,
+    0x3600, 0x3800, 0x3900, 0x3A00,
+    0x3B00, 0x3BC0, 0x2000, 0x2200,
+    0x2800, 0x2A00, 0x2C00, 0x2E00,
+    0x3000, 0x3200, 0x3400, 0x3600,
+    0x3800, 0x3900, 0x3A00, 0x3B00,
+    0x3BC0, 0x2000, 0x2200, 0x2400,
+    0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    0x3000, 0x2E00, 0x2C00, 0x2A00, 
+    0x3800, 0x3600, 0x3400, 0x3200, 
+    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
+    0x2A00, 0x2800, 0x2400, 0x2200, 
+    0x3200, 0x3000, 0x2E00, 0x2C00, 
+    0x3900, 0x3800, 0x3600, 0x3400, 
+    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
+    0x2C00, 0x2A00, 0x2800, 0x2400, 
+    0x3400, 0x3200, 0x3000, 0x2E00, 
+    0x3A00, 0x3900, 0x3800, 0x3600, 
+    0x2200, 0x2000, 0x3BC0, 0x3B00, 
+    0x2E00, 0x2C00, 0x2A00, 0x2800, 
+    0x3600, 0x3400, 0x3200, 0x3000, 
+    0x3B00, 0x3A00, 0x3900, 0x3800, 
+    0x2400, 0x2200, 0x2000, 0x3BC0 ]
+
+  - Name: Out1
+    Format: Float16
+    Stride: 2
+    # 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 64  
+  - Name: Out2
+    Format: Float16
+    Stride: 4
+    ZeroInitSize: 128
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out5
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 8
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Float16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0x0, 0x0, 0x0, 0x0, 
+    0x3A00, 0x3A00, 0x3A00, 0x3A00,
+    0x2400, 0x0, 0x0, 0x0, 
+    0x0, 0x3800, 0x3800, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x3900, 0x3900, 0x3900, 0x3900, 
+    0x2C00, 0x0, 0x0, 0x0, 
+    0x0, 0x3B00, 0x3B00, 0x0 ]
+  - Name: ExpectedOut2
+    Format: Float16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x3A00, 0x3B00, 0x3A00, 0x3B00, 
+    0x3A00, 0x3B00, 0x3A00, 0x3B00, 
+    0x2400, 0x2800, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x0, 0x0, 0x3800, 0x3900, 
+    0x3800, 0x3900, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 
+    0x3900, 0x3BC0, 0x3900, 0x3BC0,
+    0x3900, 0x3BC0, 0x3900, 0x3BC0, 
+    0x2C00, 0x2A00, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x0, 0x0, 0x3B00, 0x3A00, 
+    0x3B00, 0x3A00, 0x0, 0x0 ]
+  - Name: ExpectedOut3
+    Format: Float16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x2400, 0x2800, 0x2A00, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x0, 
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x2C00, 0x2A00, 0x2800, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x0, 
+            0x0, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut4
+    Format: Float16
+    Stride: 8
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x3900,
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x2400, 0x2800, 0x2A00, 0x2C00, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x3B00, 
+            0x3800, 0x3900, 0x3A00, 0x3B00, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00,
+            0x2C00, 0x2A00, 0x2800, 0x2400,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x3800, 
+            0x3B00, 0x3A00, 0x3900, 0x3800, 
+            0x0, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut5
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
new file mode 100644
index 00000000..b5218126
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp32.test
@@ -0,0 +1,315 @@
+#--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS]; // one flag per thread; main() only lets a thread take part when its flag is non-zero
+};
+
+StructuredBuffer<float4> In  : register(t0); // per-thread input vectors
+RWStructuredBuffer<float> Out1 : register(u1);  // test scalar (v.x)
+RWStructuredBuffer<float2> Out2 : register(u2); // test float2 (v.xy)
+RWStructuredBuffer<float4> Out3 : register(u3); // test float3 (v.xyz written into a float4 element)
+RWStructuredBuffer<float4> Out4 : register(u4); // test float4 (v)
+RWStructuredBuffer<float4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6); // indexed by mask number in main()
+
+
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            // In and Out* share one element index; ValueSetOffset already includes the ValueSet factor.
+            const uint Idx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x;
+            float4 v = In[Idx];
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask call WaveActiveMax
+                Out1[Idx] = WaveActiveMax( v.x );
+                Out2[Idx].xy = WaveActiveMax( v.xy );
+                Out3[Idx].xyz = WaveActiveMax( v.xyz );
+                Out4[Idx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case: every thread passes the same literal vector
+    Out5[0] = WaveActiveMax(float4(1.5,2.5,3.5,4.5));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float32
+    Stride: 16
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9.5, 10.5, 11.5, 12.5,
+    13.5, 14.5, 15.5, 16.5,
+    2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6.5, 7.5, 8.5, 9.5,
+    10.5, 11.5, 12.5, 13.5,
+    14.5, 15.5, 16.5, 1.5,
+    3.5, 4.5, 5.5, 6.5,
+    7.5, 8.5, 9.5, 10.5,
+    11.5, 12.5, 13.5, 14.5,
+    15.5, 16.5, 1.5, 2.5,
+    4.5, 5.5, 6.5, 7.5,
+    8.5, 9.5, 10.5, 11.5,
+    12.5, 13.5, 14.5, 15.5,
+    16.5, 1.5, 2.5, 3.5,
+    4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8.5, 7.5, 6.5, 5.5, 
+    12.5, 11.5, 10.5, 9.5, 
+    16.5, 15.5, 14.5, 13.5, 
+    5.5, 4.5, 3.5, 2.5, 
+    9.5, 8.5, 7.5, 6.5, 
+    13.5, 12.5, 11.5, 10.5, 
+    1.5, 16.5, 15.5, 14.5, 
+    6.5, 5.5, 4.5, 3.5, 
+    10.5, 9.5, 8.5, 7.5, 
+    14.5, 13.5, 12.5, 11.5, 
+    2.5, 1.5, 16.5, 15.5, 
+    7.5, 6.5, 5.5, 4.5, 
+    11.5, 10.5, 9.5, 8.5, 
+    15.5, 14.5, 13.5, 12.5, 
+    3.5, 2.5, 1.5, 16.5 ]
+
+  - Name: Out1
+    Format: Float32
+    Stride: 4
+    # 1 float is 4 bytes, * 4 floats for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 128  
+  - Name: Out2
+    Format: Float32
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out3
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 512
+  - Name: Out4
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 512
+  - Name: Out5
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 16
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Float32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14.5, 14.5, 14.5, 14.5,
+    3.5, 0, 0, 0, 
+    0, 12.5, 12.5, 0, 
+    0, 0, 0, 0, 
+    13.5, 13.5, 13.5, 13.5, 
+    6.5, 0, 0, 0, 
+    0, 15.5, 15.5, 0 ]
+  - Name: ExpectedOut2
+    Format: Float32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0,
+    0, 0, 0, 0,
+    14.5, 15.5, 14.5, 15.5,
+    14.5, 15.5, 14.5, 15.5,
+    3.5, 4.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 12.5, 13.5,
+    12.5, 13.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    13.5, 16.5, 13.5, 16.5,
+    13.5, 16.5, 13.5, 16.5,
+    6.5, 5.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 15.5, 14.5,
+    15.5, 14.5, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Float32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 0,
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            3.5, 4.5, 5.5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            12.5, 13.5, 14.5, 0, 
+            12.5, 13.5, 14.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            13.5, 16.5, 15.5, 0,
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            6.5, 5.5, 4.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 0, 
+            15.5, 14.5, 13.5, 0, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut4
+    Format: Float32
+    Stride: 16
+    Data: [ 0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 13.5,
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            3.5, 4.5, 5.5, 6.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            12.5, 13.5, 14.5, 15.5, 
+            12.5, 13.5, 14.5, 15.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5,
+            6.5, 5.5, 4.5, 3.5,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 12.5, 
+            15.5, 14.5, 13.5, 12.5, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut5
+    Format: Float32
+    Stride: 16 # one float4 element (4 x 4 bytes), same as Out5
+    Data: [ 1.5, 2.5, 3.5, 4.5 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test
new file mode 100644
index 00000000..8c6d1266
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp64.test
@@ -0,0 +1,315 @@
+#--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS]; // one flag per thread; main() only lets a thread take part when its flag is non-zero
+};
+
+StructuredBuffer<double4> In  : register(t0); // per-thread input vectors
+RWStructuredBuffer<double> Out1 : register(u1);  // test scalar (v.x)
+RWStructuredBuffer<double2> Out2 : register(u2); // test double2 (v.xy)
+RWStructuredBuffer<double4> Out3 : register(u3); // test double3 (v.xyz written into a double4 element)
+RWStructuredBuffer<double4> Out4 : register(u4); // test double4 (v)
+RWStructuredBuffer<double4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6); // indexed by mask number in main()
+
+
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            // In and Out* share one element index; ValueSetOffset already includes the ValueSet factor.
+            const uint Idx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x;
+            double4 v = In[Idx];
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask call WaveActiveMax
+                Out1[Idx] = WaveActiveMax( v.x );
+                Out2[Idx].xy = WaveActiveMax( v.xy );
+                Out3[Idx].xyz = WaveActiveMax( v.xyz );
+                Out4[Idx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case: every thread passes the same literal vector
+    Out5[0] = WaveActiveMax(double4(1.5,2.5,3.5,4.5));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float64
+    Stride: 32 
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9.5, 10.5, 11.5, 12.5,
+    13.5, 14.5, 15.5, 16.5,
+    2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6.5, 7.5, 8.5, 9.5,
+    10.5, 11.5, 12.5, 13.5,
+    14.5, 15.5, 16.5, 1.5,
+    3.5, 4.5, 5.5, 6.5,
+    7.5, 8.5, 9.5, 10.5,
+    11.5, 12.5, 13.5, 14.5,
+    15.5, 16.5, 1.5, 2.5,
+    4.5, 5.5, 6.5, 7.5,
+    8.5, 9.5, 10.5, 11.5,
+    12.5, 13.5, 14.5, 15.5,
+    16.5, 1.5, 2.5, 3.5,
+    4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8.5, 7.5, 6.5, 5.5, 
+    12.5, 11.5, 10.5, 9.5, 
+    16.5, 15.5, 14.5, 13.5, 
+    5.5, 4.5, 3.5, 2.5, 
+    9.5, 8.5, 7.5, 6.5, 
+    13.5, 12.5, 11.5, 10.5, 
+    1.5, 16.5, 15.5, 14.5, 
+    6.5, 5.5, 4.5, 3.5, 
+    10.5, 9.5, 8.5, 7.5, 
+    14.5, 13.5, 12.5, 11.5, 
+    2.5, 1.5, 16.5, 15.5, 
+    7.5, 6.5, 5.5, 4.5, 
+    11.5, 10.5, 9.5, 8.5, 
+    15.5, 14.5, 13.5, 12.5, 
+    3.5, 2.5, 1.5, 16.5 ]
+
+  - Name: Out1
+    Format: Float64
+    Stride: 8
+    # 1 double is 8 bytes (the element stride), * 4 doubles for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 256
+  - Name: Out2
+    Format: Float64
+    Stride: 16 # one double2 element (2 x 8 bytes)
+    ZeroInitSize: 512
+  - Name: Out3
+    Format: Float64
+    Stride: 32 # one double4 element (4 x 8 bytes)
+    ZeroInitSize: 1024
+  - Name: Out4
+    Format: Float64
+    Stride: 32 # one double4 element (4 x 8 bytes)
+    ZeroInitSize: 1024
+  - Name: Out5
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 32
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Float64
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14.5, 14.5, 14.5, 14.5,
+    3.5, 0, 0, 0, 
+    0, 12.5, 12.5, 0, 
+    0, 0, 0, 0, 
+    13.5, 13.5, 13.5, 13.5, 
+    6.5, 0, 0, 0, 
+    0, 15.5, 15.5, 0 ]
+  - Name: ExpectedOut2
+    Format: Float64
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0,
+    0, 0, 0, 0,
+    14.5, 15.5, 14.5, 15.5,
+    14.5, 15.5, 14.5, 15.5,
+    3.5, 4.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 12.5, 13.5,
+    12.5, 13.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    13.5, 16.5, 13.5, 16.5,
+    13.5, 16.5, 13.5, 16.5,
+    6.5, 5.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 15.5, 14.5,
+    15.5, 14.5, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Float64
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 0,
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            3.5, 4.5, 5.5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            12.5, 13.5, 14.5, 0, 
+            12.5, 13.5, 14.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            13.5, 16.5, 15.5, 0,
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            6.5, 5.5, 4.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 0, 
+            15.5, 14.5, 13.5, 0, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut4
+    Format: Float64
+    Stride: 16
+    Data: [ 0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 13.5,
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            3.5, 4.5, 5.5, 6.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            12.5, 13.5, 14.5, 15.5, 
+            12.5, 13.5, 14.5, 15.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5,
+            6.5, 5.5, 4.5, 3.5,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 12.5, 
+            15.5, 14.5, 13.5, 12.5, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut5
+    Format: Float64
+    Stride: 32 # one double4 element (4 x 8 bytes), same as Out5
+    Data: [ 1.5, 2.5, 3.5, 4.5 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
new file mode 100644
index 00000000..1cf120c7
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -0,0 +1,315 @@
+#--- source.hlsl
+#define VALUE_SETS 2 // number of input value sets main() loops over
+#define NUM_MASKS 4 // number of thread masks applied to each value set
+#define NUM_THREADS 4 // threads per group, and flags per MaskStruct
+
+struct MaskStruct { // one thread mask; main() indexes Masks with MaskIdx < NUM_MASKS
+    int mask[NUM_THREADS]; // mask[i] != 0: thread i executes the WaveActiveMax calls
+};
+
+StructuredBuffer<int16_t4> In  : register(t0); // one input vector per (value set, mask, thread)
+RWStructuredBuffer<int16_t> Out1 : register(u1);  // scalar result (max of v.x)
+RWStructuredBuffer<int16_t2> Out2 : register(u2); // int16_t2 result (max of v.xy)
+RWStructuredBuffer<int16_t4> Out3 : register(u3); // int16_t3 result, written through .xyz only
+RWStructuredBuffer<int16_t4> Out4 : register(u4); // int16_t4 result (max of v)
+RWStructuredBuffer<int16_t4> Out5 : register(u5); // constant folding: element 0 gets the max of a literal vector
+StructuredBuffer<MaskStruct> Masks  : register(t6); // thread masks, indexed by MaskIdx
+
+
+// For each value set and mask, threads enabled by the mask store WaveActiveMax of their input in Out1..Out4.
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // also this thread's index into In
+            int16_t4 v = In[OutIdx];
+            if (Masks[MaskIdx].mask[tid.x]) { // threads with a zero flag skip the wave ops and write nothing
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case
+    Out5[0] = WaveActiveMax(int16_t4(1,2,3,4));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int16
+    Stride: 8
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
+  - Name: Out1
+    Format: Int16
+    Stride: 2
+    # 1 int16_t is 2 bytes, * 4 int16_ts for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 64  
+  - Name: Out2
+    Format: Int16
+    Stride: 4
+    ZeroInitSize: 128
+  - Name: Out3
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out4
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out5
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 8
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Int16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
+  - Name: ExpectedOut2
+    Format: Int16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int16
+    Stride: 8
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut4
+    Format: Int16
+    Stride: 8
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut5
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 2, 3, 4 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
new file mode 100644
index 00000000..8693e8fb
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -0,0 +1,315 @@
+#--- source.hlsl
+#define VALUE_SETS 2 // number of input value sets main() loops over
+#define NUM_MASKS 4 // number of thread masks applied to each value set
+#define NUM_THREADS 4 // threads per group, and flags per MaskStruct
+
+struct MaskStruct { // one thread mask; main() indexes Masks with MaskIdx < NUM_MASKS
+    int mask[NUM_THREADS]; // mask[i] != 0: thread i executes the WaveActiveMax calls
+};
+
+StructuredBuffer<int4> In  : register(t0); // one input vector per (value set, mask, thread)
+RWStructuredBuffer<int> Out1 : register(u1);  // scalar result (max of v.x)
+RWStructuredBuffer<int2> Out2 : register(u2); // int2 result (max of v.xy)
+RWStructuredBuffer<int4> Out3 : register(u3); // int3 result, written through .xyz only
+RWStructuredBuffer<int4> Out4 : register(u4); // int4 result (max of v)
+RWStructuredBuffer<int4> Out5 : register(u5); // constant folding: element 0 gets the max of a literal vector
+StructuredBuffer<MaskStruct> Masks  : register(t6); // thread masks, indexed by MaskIdx
+
+
+// For each value set and mask, threads enabled by the mask store WaveActiveMax of their input in Out1..Out4.
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // also this thread's index into In
+            int4 v = In[OutIdx];
+            if (Masks[MaskIdx].mask[tid.x]) { // threads with a zero flag skip the wave ops and write nothing
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case
+    Out5[0] = WaveActiveMax(int4(1,2,3,4));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int32
+    Stride: 16
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
+  - Name: Out1
+    Format: Int32
+    Stride: 4
+    # 1 int is 4 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 128  
+  - Name: Out2
+    Format: Int32
+    Stride: 8
+    ZeroInitSize: 256
+  - Name: Out3
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 512
+  - Name: Out4
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 512
+  - Name: Out5
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 16
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Int32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
+  - Name: ExpectedOut2
+    Format: Int32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int32
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut4
+    Format: Int32
+    Stride: 16
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut5
+    Format: Int32
+    Stride: 16 # one int4 (4 * 4 bytes), same as Out5
+    Data: [ 1, 2, 3, 4 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
new file mode 100644
index 00000000..4b0004eb
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -0,0 +1,317 @@
+#--- source.hlsl
+#define VALUE_SETS 2 // number of input value sets main() loops over
+#define NUM_MASKS 4 // number of thread masks applied to each value set
+#define NUM_THREADS 4 // threads per group, and flags per MaskStruct
+
+struct MaskStruct { // one thread mask; main() indexes Masks with MaskIdx < NUM_MASKS
+    int mask[NUM_THREADS]; // mask[i] != 0: thread i executes the WaveActiveMax calls
+};
+
+StructuredBuffer<int64_t4> In  : register(t0); // one input vector per (value set, mask, thread)
+RWStructuredBuffer<int64_t> Out1 : register(u1);  // scalar result (max of v.x)
+RWStructuredBuffer<int64_t2> Out2 : register(u2); // int64_t2 result (max of v.xy)
+RWStructuredBuffer<int64_t4> Out3 : register(u3); // int64_t3 result, written through .xyz only
+RWStructuredBuffer<int64_t4> Out4 : register(u4); // int64_t4 result (max of v)
+RWStructuredBuffer<int64_t4> Out5 : register(u5); // constant folding: element 0 gets the max of a literal vector
+StructuredBuffer<MaskStruct> Masks  : register(t6); // thread masks, indexed by MaskIdx
+
+
+// For each value set and mask, threads enabled by the mask store WaveActiveMax of their input in Out1..Out4.
+[numthreads(NUM_THREADS,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS;
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // also this thread's index into In
+            int64_t4 v = In[OutIdx];
+            if (Masks[MaskIdx].mask[tid.x]) { // threads with a zero flag skip the wave ops and write nothing
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
+    // constant folding case
+    Out5[0] = WaveActiveMax(int64_t4(1,2,3,4));
+}
+
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int64
+    Stride: 32
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
+  - Name: Out1
+    Format: Int64
+    Stride: 8
+    # 1 int64_t is 8 bytes, * 4 int64_ts for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 256
+  - Name: Out2
+    Format: Int64
+    Stride: 16
+    ZeroInitSize: 512
+  - Name: Out3
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 1024
+  - Name: Out4
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 1024
+  - Name: Out5
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 32
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
+  - Name: ExpectedOut1
+    Format: Int64
+    Stride: 32
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
+  - Name: ExpectedOut2
+    Format: Int64
+    Stride: 32
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int64
+    Stride: 32
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut4
+    Format: Int64
+    Stride: 32
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
+  - Name: ExpectedOut5
+    Format: Int64
+    Stride: 32 # one int64_t4 (4 * 8 bytes), same as Out5
+    Data: [ 1, 2, 3, 4 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+
+...
+#--- end
+
+# REQUIRES: Int64
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Bug https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o