-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[LoadStoreVectorizer] Fill gaps in load/store chains to enable vectorization #159388
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
32 commits
Select commit
Hold shift + click to select a range
81b0534
[LoadStoreVectorizer] Fill gaps in loads/stores to enable vectorization
dakersnar b147e23
Clang format
dakersnar c6d98ba
Remove cl opts
dakersnar adeacac
Add context argument to TTI API
dakersnar 47913a3
Update llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
dakersnar b6b87e7
Update tests to test for masked load generation in the LSV
dakersnar 8854d5a
Remove isLegalToWidenLoads API
dakersnar a1d2827
Change LSV to create masked loads
dakersnar bb25df1
Update calls to TTI to match changes in lowering PR
dakersnar 030c0bb
Update tests to match the new masked load/store syntax, moving alignm…
dakersnar 0a0aa2e
Simplify pre-gap-filling TTI legality check
dakersnar 20bf7ad
Clean up comments and simplify some logic
dakersnar 8d0d2e9
More comment improvement
dakersnar cae6020
Add comment to clarify API usage
dakersnar 4113e63
Address review feedback
dakersnar f02c6f8
Rework alignment deriving while gap filling
dakersnar 8240ccb
Fix bug in alignment derive, update test to show improvement
dakersnar 01dad11
Update tests to check for hex pragma
dakersnar 6dc716d
Add more specific asserts, remove if condition
dakersnar 7a05ee3
Update test to account for change in sub-byte element type legalizati…
dakersnar 82b6fcd
Formatting
dakersnar 12a7b5b
Merge remote-tracking branch 'github/main' into github/dkersnar/lsv-g…
dakersnar ccd5893
Fix formatting
dakersnar 551f136
Add redundant element test with gap filling
dakersnar 98d6f23
Merge branch 'main' into github/dkersnar/lsv-gap-fill
dakersnar b9fa06d
Update test to be auto generated
dakersnar 97e8a10
Refactor to prevent extending chain too soon
dakersnar eb6df17
Fix alignment on test, update checks to generate masked load
dakersnar 80b68fd
Merge branch 'main' into github/dkersnar/lsv-gap-fill
dakersnar 37cef5b
Fix overeager alignment upgrading when vectorizer tries to upgrade al…
dakersnar af095c2
Fix test check
dakersnar 0f91e88
Add new test, adjust comments
dakersnar File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
| ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s | ||
| ; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} | ||
|
|
||
| ; This tests the lowering of a case from LoadStoreVectorizer/NVPTX/4x2xhalf.ll | ||
| ; where two adjacent <3 x half> loads are chained together and extended to | ||
| ; 8 x half. The CHECK lines expect a single ld.v4.b32 preceded by a | ||
| ; "used_bytes_mask 0xfff" pragma, while the stores stay as scalar | ||
| ; st.b32/st.b16 instructions. | ||
| define void @halfx3_extend_chain(ptr align 16 captures(none) %rd0) { | ||
| ; CHECK-LABEL: halfx3_extend_chain( | ||
| ; CHECK: { | ||
| ; CHECK-NEXT: .reg .b16 %rs<7>; | ||
| ; CHECK-NEXT: .reg .b32 %r<12>; | ||
| ; CHECK-NEXT: .reg .b64 %rd<2>; | ||
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: // %bb.0: | ||
| ; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_extend_chain_param_0]; | ||
| ; CHECK-NEXT: .pragma "used_bytes_mask 0xfff"; | ||
| ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; | ||
| ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; | ||
| ; CHECK-NEXT: mov.b32 {_, %rs3}, %r2; | ||
| ; CHECK-NEXT: mov.b32 %r5, {%rs3, %rs1}; | ||
| ; CHECK-NEXT: mov.b32 %r6, {%rs2, %rs4}; | ||
| ; CHECK-NEXT: mov.b32 %r7, 0; | ||
| ; CHECK-NEXT: max.f16x2 %r8, %r2, %r7; | ||
| ; CHECK-NEXT: max.f16x2 %r9, %r1, %r7; | ||
| ; CHECK-NEXT: st.b32 [%rd1], %r9; | ||
| ; CHECK-NEXT: mov.b32 {%rs5, _}, %r8; | ||
| ; CHECK-NEXT: st.b16 [%rd1+4], %rs5; | ||
| ; CHECK-NEXT: max.f16x2 %r10, %r6, %r7; | ||
| ; CHECK-NEXT: max.f16x2 %r11, %r5, %r7; | ||
| ; CHECK-NEXT: st.b32 [%rd1+6], %r11; | ||
| ; CHECK-NEXT: mov.b32 {%rs6, _}, %r10; | ||
| ; CHECK-NEXT: st.b16 [%rd1+10], %rs6; | ||
| ; CHECK-NEXT: ret; | ||
| ; First <3 x half>: load from %rd0, keep the lanes that are greater than | ||
| ; zero, replace the others with zero, and store the result back in place. | ||
| %load1 = load <3 x half>, ptr %rd0, align 16 | ||
| %p1 = fcmp ogt <3 x half> %load1, zeroinitializer | ||
| %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer | ||
| store <3 x half> %s1, ptr %rd0, align 16 | ||
| ; Second <3 x half>: same operation, starting 3 halfs (6 bytes) past %rd0. | ||
| ; NOTE(review): %rd0 is declared align 16, so %rd0 + 6 looks only 2-byte | ||
| ; aligned, yet the accesses below are marked align 4 - confirm this is | ||
| ; intended before regenerating the CHECK lines. | ||
| %in2 = getelementptr half, ptr %rd0, i64 3 | ||
| %load2 = load <3 x half>, ptr %in2, align 4 | ||
| %p2 = fcmp ogt <3 x half> %load2, zeroinitializer | ||
| %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer | ||
| store <3 x half> %s2, ptr %in2, align 4 | ||
| ret void | ||
| } | ||
|
|
||
| ; Same computation as @halfx3_extend_chain, but %rd0 and the first load/store | ||
| ; are only 4-byte aligned. The reduced alignment disables the vectorization: | ||
| ; the CHECK lines expect separate ld.b32/ld.b16 loads and no | ||
| ; used_bytes_mask pragma. | ||
| define void @halfx3_no_align(ptr align 4 captures(none) %rd0) { | ||
| ; CHECK-LABEL: halfx3_no_align( | ||
| ; CHECK: { | ||
| ; CHECK-NEXT: .reg .b16 %rs<7>; | ||
| ; CHECK-NEXT: .reg .b32 %r<10>; | ||
| ; CHECK-NEXT: .reg .b64 %rd<2>; | ||
| ; CHECK-EMPTY: | ||
| ; CHECK-NEXT: // %bb.0: | ||
| ; CHECK-NEXT: ld.param.b64 %rd1, [halfx3_no_align_param_0]; | ||
| ; CHECK-NEXT: ld.b16 %rs1, [%rd1+4]; | ||
| ; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; | ||
| ; CHECK-NEXT: ld.b32 %r2, [%rd1]; | ||
| ; CHECK-NEXT: mov.b32 %r3, 0; | ||
| ; CHECK-NEXT: max.f16x2 %r4, %r1, %r3; | ||
| ; CHECK-NEXT: max.f16x2 %r5, %r2, %r3; | ||
| ; CHECK-NEXT: st.b32 [%rd1], %r5; | ||
| ; CHECK-NEXT: mov.b32 {%rs3, _}, %r4; | ||
| ; CHECK-NEXT: st.b16 [%rd1+4], %rs3; | ||
| ; CHECK-NEXT: ld.b16 %rs4, [%rd1+10]; | ||
| ; CHECK-NEXT: mov.b32 %r6, {%rs4, %rs5}; | ||
| ; CHECK-NEXT: ld.b32 %r7, [%rd1+6]; | ||
| ; CHECK-NEXT: max.f16x2 %r8, %r6, %r3; | ||
| ; CHECK-NEXT: max.f16x2 %r9, %r7, %r3; | ||
| ; CHECK-NEXT: st.b32 [%rd1+6], %r9; | ||
| ; CHECK-NEXT: mov.b32 {%rs6, _}, %r8; | ||
| ; CHECK-NEXT: st.b16 [%rd1+10], %rs6; | ||
| ; CHECK-NEXT: ret; | ||
| ; First <3 x half>: load from %rd0, keep the lanes that are greater than | ||
| ; zero, replace the others with zero, and store the result back in place. | ||
| %load1 = load <3 x half>, ptr %rd0, align 4 | ||
| %p1 = fcmp ogt <3 x half> %load1, zeroinitializer | ||
| %s1 = select <3 x i1> %p1, <3 x half> %load1, <3 x half> zeroinitializer | ||
| store <3 x half> %s1, ptr %rd0, align 4 | ||
| ; Second <3 x half>: same operation, starting 3 halfs (6 bytes) past %rd0. | ||
| ; NOTE(review): with %rd0 align 4, %rd0 + 6 looks only 2-byte aligned while | ||
| ; the accesses below are marked align 4 - confirm this is intended. | ||
| %in2 = getelementptr half, ptr %rd0, i64 3 | ||
| %load2 = load <3 x half>, ptr %in2, align 4 | ||
| %p2 = fcmp ogt <3 x half> %load2, zeroinitializer | ||
| %s2 = select <3 x i1> %p2, <3 x half> %load2, <3 x half> zeroinitializer | ||
| store <3 x half> %s2, ptr %in2, align 4 | ||
| ret void | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.