Skip to content

Conversation

@jwanggit86
Copy link
Contributor

For GFX10+, image_gather4 instructions that have v[254:255] as dst reg and the d16 bit on can be assembled correctly but the generated binary fails to disassemble. This patch fixes this problem.

For GFX10+, image_gather4 instructions that have v[254:255] as dst
reg and the d16 bit on can be assembled correctly but the generated binary
fails to disassemble. This patch fixes this problem.
@jwanggit86 jwanggit86 added backend:AMDGPU llvm:mc Machine (object) code labels Nov 1, 2024
@jwanggit86 jwanggit86 requested a review from arsenm November 1, 2024 21:58
@llvmbot
Copy link
Member

llvmbot commented Nov 1, 2024

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes

For GFX10+, image_gather4 instructions that have v[254:255] as dst reg and the d16 bit on can be assembled correctly but the generated binary fails to disassemble. This patch fixes this problem.


Full diff: https://github.com/llvm/llvm-project/pull/114609.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+2-2)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt (+6)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt (+6)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vsample.txt (+5)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 5c49a8116ae7fc..dbbc466aee100c 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1508,9 +1508,9 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
       Gather4 = 1 in {
     let VDataDwords = 2 in
-    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
+    defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 1>; /* for packed D16 only */
     let VDataDwords = 4 in
-    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
+    defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
     let VDataDwords = 5 in
     defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
   }
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt
index 8f3da34ea24739..5079f9a3216d41 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt
@@ -443,6 +443,12 @@
 # GFX10: image_gather4 v[16:19], [v8, v9, v10], s[20:27], s[100:103] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x12,0x0f,0x00,0xf1,0x08,0x10,0x25,0x03,0x09,0x0a,0x00,0x00]
 0x12,0x0f,0x00,0xf1,0x08,0x10,0x25,0x03,0x09,0x0a,0x0b,0x0c
 
+# GFX10: image_gather4 v[252:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x08,0x00,0xf1,0x01,0xfc,0x62,0x00]
+0x08,0x08,0x00,0xf1,0x01,0xfc,0x62,0x00
+
+# image_gather4 v[254:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x08,0x00,0xf1,0x01,0xfe,0x62,0x80]
+0x08,0x08,0x00,0xf1,0x01,0xfe,0x62,0x80
+
 # GFX10: image_gather4_cl v[16:19], [v8, v9, v10, v11], s[20:27], s[100:103] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x1a,0x0f,0x04,0xf1,0x08,0x10,0x25,0x03,0x09,0x0a,0x0b,0x00]
 0x1a,0x0f,0x04,0xf1,0x08,0x10,0x25,0x03,0x09,0x0a,0x0b,0x0c
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
index 46488a9aa4ec70..2ced1e829aa843 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_mimg_features.txt
@@ -255,6 +255,12 @@
 # GFX11: image_gather4 v[64:67], v32, s[4:11], s[100:103] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0xbc,0xf0,0x20,0x40,0x01,0x64]
 0x00,0x01,0xbc,0xf0,0x20,0x40,0x01,0x64
 
+# GFX11: image_gather4 v[254:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x04,0x08,0xbe,0xf0,0x01,0xfe,0x02,0x0c]
+0x04,0x08,0xbe,0xf0,0x01,0xfe,0x02,0x0c
+
+# GFX11: image_gather4 v[252:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x08,0xbc,0xf0,0x01,0xfc,0x02,0x0c]
+0x04,0x08,0xbc,0xf0,0x01,0xfc,0x02,0x0c
+
 # GFX11: image_gather4_cl v[64:67], v[32:35], s[4:11], s[100:103] dmask:0x2 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x0c,0x02,0x80,0xf1,0x20,0x40,0x01,0x64]
 0x0c,0x02,0x80,0xf1,0x20,0x40,0x01,0x64
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vsample.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vsample.txt
index e03849ffa83834..ecc7b09bbe915e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vsample.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vsample.txt
@@ -510,6 +510,11 @@
 # GFX12: image_gather4 v[35:38], [v39, v40], s[48:55], s[76:79] dmask:0x2 dim:SQ_RSRC_IMG_2D lwe ; encoding: [0x01,0xc0,0x8b,0xe4,0x23,0x61,0x00,0x26,0x27,0x28,0x00,0x00]
 0x01,0xc0,0x8b,0xe4,0x23,0x61,0x00,0x26,0x27,0x28,0x00,0x00
 
+# GFX12: image_gather4 v[254:255], [v1, v2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x21,0xc0,0x0b,0xe6,0xfe,0x10,0x00,0x06,0x01,0x02,0x00,0x00]
+0x21,0xc0,0x0b,0xe6,0xfe,0x10,0x00,0x06,0x01,0x02,0x00,0x00
+
+# GFX12: image_gather4 v[252:255], [v1, v2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0x0b,0xe6,0xfc,0x10,0x00,0x06,0x01,0x02,0x00,0x00]
+0x01,0xc0,0x0b,0xe6,0xfc,0x10,0x00,0x06,0x01,0x02,0x00,0x00
 # GFX12: image_gather4_l v[64:67], [v32, v33, v34], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0x4c,0xe4,0x40,0x08,0x00,0x02,0x20,0x21,0x22,0x00]
 0x01,0x00,0x4c,0xe4,0x40,0x08,0x00,0x02,0x20,0x21,0x22,0x00
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
index 0a5bafc55f4d43..346ec5853b6c77 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx8_mimg_features.txt
@@ -235,5 +235,5 @@
 # VI: image_gather4 v[252:255], v1, s[8:15], s[12:15] dmask:0x3 ; encoding: [0x00,0x03,0x00,0xf1,0x01,0xfc,0x62,0x00]
 0x00,0x03,0x00,0xf1,0x01,0xfc,0x62,0x00
 
-# VI: image_gather4 v[252:255], v1, s[8:15], s[12:15] dmask:0x1 unorm glc slc tfe lwe da ; encoding: [0x00,0x71,0x03,0xf3,0x01,0xfc,0x62,0x00]
+# VI: image_gather4 v[252:253], v1, s[8:15], s[12:15] dmask:0x1 unorm glc slc tfe lwe da ; encoding: [0x00,0x71,0x03,0xf3,0x01,0xfc,0x62,0x00]
 0x00,0x71,0x03,0xf3,0x01,0xfc,0x62,0x00

Gather4 = 1 in {
let VDataDwords = 2 in
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, 1>; /* for packed D16 only */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use true and comment operand name?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

# GFX10: image_gather4 v[252:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x08,0x00,0xf1,0x01,0xfc,0x62,0x00]
0x08,0x08,0x00,0xf1,0x01,0xfc,0x62,0x00

# image_gather4 v[254:255], v[1:2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x08,0x00,0xf1,0x01,0xfe,0x62,0x80]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing GFX10: prefix

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

0x00,0x03,0x00,0xf1,0x01,0xfc,0x62,0x00

# VI: image_gather4 v[252:255], v1, s[8:15], s[12:15] dmask:0x1 unorm glc slc tfe lwe da ; encoding: [0x00,0x71,0x03,0xf3,0x01,0xfc,0x62,0x00]
# VI: image_gather4 v[252:253], v1, s[8:15], s[12:15] dmask:0x1 unorm glc slc tfe lwe da ; encoding: [0x00,0x71,0x03,0xf3,0x01,0xfc,0x62,0x00]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this change?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previous default was V4 variant so it would start with VReg_128: v[252:255]
Now default is V2 so it starts with VReg_64: v[252:253]
And I think both fail when trying to convert to V5 and VReg_160: v[252:256] which would be invalid.
I'm not sure what the original test was checking since this is one of "incorrect" encodings

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on comment at line 231 it was probably testing that with tfe it will go out of range.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So really this should fail to disassemble, in the same way as any other instruction that uses a vgpr tuple that goes higher than v255. Right?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This currently matches sp3. It does fail on encoding as does sp3.

Looking into it further it seems that we do have inconsistent behavior.

We do not decode VOP when a register is out of bounds (gfx1010 example):

# v_add_f64 v[255:256], v[1:2], v[2:3]
0xff,0x00,0x64,0xd5,0x01,0x05,0x02,0x00

But we do MIMG (gfx1010 example):

# image_load v65, v[253:256], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
0x38,0x18,0x00,0xf0,0xfd,0x41,0x00,0x00 

except llvm produces just v253. Sp3 always decodes to instructions above.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For invalid encodings, I would prefer that we consistently either fail to decode, or print something like v[252:256] /* VGPR tuple out of range */. But that would be for another patch.

I have no objection to the current patch if it fixes disassembly of valid encodings.

0x21,0xc0,0x0b,0xe6,0xfe,0x10,0x00,0x06,0x01,0x02,0x00,0x00

# GFX12: image_gather4 v[252:255], [v1, v2], s[8:15], s[12:15] dmask:0x8 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0x0b,0xe6,0xfc,0x10,0x00,0x06,0x01,0x02,0x00,0x00]
0x01,0xc0,0x0b,0xe6,0xfc,0x10,0x00,0x06,0x01,0x02,0x00,0x00
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing new line

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

Copy link
Collaborator

@mbrkusanin mbrkusanin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with nits

@jwanggit86 jwanggit86 requested review from arsenm and jayfoad November 4, 2024 23:46
@jwanggit86 jwanggit86 merged commit 7780cf0 into llvm:main Nov 5, 2024
8 checks passed
@jwanggit86 jwanggit86 deleted the fix-disassembler-for-image-gather4-with-d16 branch November 5, 2024 18:37
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU llvm:mc Machine (object) code

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[AMDGPU][MC][GFX10] MIMG gather4 opcodes failed to disassemble when d16 is specified

5 participants